mirror of https://github.com/borgbackup/borg.git
support reading new, improved hashindex header format, fixes #6960

Bit of a pain to work with that code:
- it is C code,
- it still needs to be able to read the old hashindex file format,
- while also supporting the new file format,
- and the hash computed while reading the file causes additional problems, because it expects every place in the file to be read exactly once and in sequential order.

I solved this by separately opening the file in the Python part of the code and checking for the magic: BORG_IDX means the legacy file format and the legacy layout of the hashtable, BORG2IDX means the new file format and the new layout of the hashtable.

Done:
- added a version int32 directly after the magic and set it to 2 (like borg 2). The old header had no version info, but it could be denoted as version 1 in case we ever need that (currently the code decides based on the magic).
- added num_empty, as indicated by a TODO in count_empty, so determining the number of empty buckets no longer needs a full hashtable scan.
- to keep it simpler, I just filled the HashHeader struct with a `char reserved[1024 - 32];`, 1024 being the desired overall header size and 32 the currently used size. This alignment might be useful in case we mmap() the hashindex file one day.
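To make the new on-disk layout concrete, here is a minimal sketch (not part of this commit) of reading the new 1024-byte header from Python with only the standard library. Field order and sizes follow the HashHeader struct added in the diff below; the helper name peek_new_header is made up for this illustration and is not borg API.

# Sketch only: parse the new BORG2IDX header (assumed layout: 8-byte magic,
# then six little-endian int32 fields, zero-padded to 1024 bytes in total).
import struct

HEADER_SIZE = 1024    # fixed overall header size (rest is reserved filler)
HEADER_FMT = "<8s6i"  # magic + version, num_entries, num_buckets, num_empty, key_size, value_size

def peek_new_header(path):  # hypothetical helper, not borg API
    with open(path, "rb") as f:
        raw = f.read(HEADER_SIZE)
    if len(raw) < HEADER_SIZE:
        raise ValueError("truncated header")
    magic, version, num_entries, num_buckets, num_empty, key_size, value_size = \
        struct.unpack_from(HEADER_FMT, raw)
    if magic != b"BORG2IDX" or version != 2:
        raise ValueError(f"unexpected magic/version: {magic!r} / {version}")
    return dict(num_entries=num_entries, num_buckets=num_buckets,
                num_empty=num_empty, key_size=key_size, value_size=value_size)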
parent 5edc53d4d4
commit c29d4a096b

5 changed files with 268 additions and 84 deletions
@@ -19,7 +19,8 @@
 # define BORG_PACKED(x) x __attribute__((packed))
 #endif

-#define MAGIC "BORG_IDX"
+#define MAGIC "BORG2IDX"
+#define MAGIC1 "BORG_IDX"  // legacy
 #define MAGIC_LEN 8

 #define DEBUG 0
@@ -39,6 +40,18 @@ typedef struct {
     int32_t num_buckets;
     int8_t key_size;
     int8_t value_size;
-}) HashHeader;
+}) HashHeader1;
+
+BORG_PACKED(
+typedef struct {
+    char magic[MAGIC_LEN];
+    int32_t version;
+    int32_t num_entries;
+    int32_t num_buckets;
+    int32_t num_empty;
+    int32_t key_size;
+    int32_t value_size;
+    char reserved[1024 - 32];  // filler to 1024 bytes total
+}) HashHeader;

 typedef struct {
@@ -110,8 +123,8 @@ static int hash_sizes[] = {
 #define EPRINTF_PATH(path, msg, ...) fprintf(stderr, "hashindex: %s: " msg " (%s)\n", path, ##__VA_ARGS__, strerror(errno))

 #ifndef BORG_NO_PYTHON
-static HashIndex *hashindex_read(PyObject *file_py, int permit_compact);
-static void hashindex_write(HashIndex *index, PyObject *file_py);
+static HashIndex *hashindex_read(PyObject *file_py, int permit_compact, int legacy);
+static void hashindex_write(HashIndex *index, PyObject *file_py, int legacy);
 #endif

 static uint64_t hashindex_compact(HashIndex *index);
@@ -265,9 +278,7 @@ int shrink_size(int current){

 int
 count_empty(HashIndex *index)
-{ /* count empty (never used) buckets. this does NOT include deleted buckets (tombstones).
-   * TODO: if we ever change HashHeader, save the count there so we do not need this function.
-   */
+{ /* count empty (never used) buckets. this does NOT include deleted buckets (tombstones). */
     int i, count = 0, capacity = index->num_buckets;
     for(i = 0; i < capacity; i++) {
         if(BUCKET_IS_EMPTY(index, i))
@@ -276,19 +287,16 @@ count_empty(HashIndex *index)
     return count;
 }

-/* Public API */
-
-#ifndef BORG_NO_PYTHON
-static HashIndex *
-hashindex_read(PyObject *file_py, int permit_compact)
+HashIndex *
+read_hashheader1(PyObject *file_py)
 {
-    Py_ssize_t length, buckets_length, bytes_read;
+    Py_ssize_t bytes_read, length, buckets_length;
     Py_buffer header_buffer;
-    PyObject *header_bytes, *length_object, *bucket_bytes, *tmp;
-    HashHeader *header;
+    PyObject *header_bytes, *length_object, *tmp;
     HashIndex *index = NULL;
+    HashHeader1 *header;

-    header_bytes = PyObject_CallMethod(file_py, "read", "n", (Py_ssize_t)sizeof(HashHeader));
+    header_bytes = PyObject_CallMethod(file_py, "read", "n", (Py_ssize_t)sizeof(*header));
     if(!header_bytes) {
         assert(PyErr_Occurred());
         goto fail;
@@ -299,11 +307,11 @@ hashindex_read(PyObject *file_py, int permit_compact)
         /* TypeError, not a bytes() object */
         goto fail_decref_header;
     }
-    if(bytes_read != sizeof(HashHeader)) {
+    if(bytes_read != sizeof(*header)) {
         /* Truncated file */
         /* Note: %zd is the format for Py_ssize_t, %zu is for size_t */
         PyErr_Format(PyExc_ValueError, "Could not read header (expected %zu, but read %zd bytes)",
-                     sizeof(HashHeader), bytes_read);
+                     sizeof(*header), bytes_read);
         goto fail_decref_header;
     }

@@ -334,7 +342,111 @@ hashindex_read(PyObject *file_py, int permit_compact)
         goto fail_decref_header;
     }

-    tmp = PyObject_CallMethod(file_py, "seek", "ni", (Py_ssize_t)sizeof(HashHeader), SEEK_SET);
+    tmp = PyObject_CallMethod(file_py, "seek", "ni", (Py_ssize_t)sizeof(*header), SEEK_SET);
+    Py_XDECREF(tmp);
+    if(PyErr_Occurred()) {
+        goto fail_decref_header;
+    }
+
+    /* Set up the in-memory header */
+    if(!(index = malloc(sizeof(HashIndex)))) {
+        PyErr_NoMemory();
+        goto fail_decref_header;
+    }
+
+    PyObject_GetBuffer(header_bytes, &header_buffer, PyBUF_SIMPLE);
+    if(PyErr_Occurred()) {
+        goto fail_free_index;
+    }
+
+    header = (HashHeader1*) header_buffer.buf;
+    if(memcmp(header->magic, MAGIC1, MAGIC_LEN)) {
+        PyErr_Format(PyExc_ValueError, "Unknown MAGIC in header");
+        goto fail_release_header_buffer;
+    }
+
+    buckets_length = (Py_ssize_t)_le32toh(header->num_buckets) * (header->key_size + header->value_size);
+    if((Py_ssize_t)length != (Py_ssize_t)sizeof(*header) + buckets_length) {
+        PyErr_Format(PyExc_ValueError, "Incorrect file length (expected %zd, got %zd)",
+                     sizeof(*header) + buckets_length, length);
+        goto fail_release_header_buffer;
+    }
+
+    index->num_entries = _le32toh(header->num_entries);
+    index->num_buckets = _le32toh(header->num_buckets);
+    index->num_empty = -1;  // unknown, needs counting
+    index->key_size = header->key_size;
+    index->value_size = header->value_size;
+
+fail_release_header_buffer:
+    PyBuffer_Release(&header_buffer);
+fail_free_index:
+    if(PyErr_Occurred()) {
+        free(index);
+        index = NULL;
+    }
+fail_decref_header:
+    Py_DECREF(header_bytes);
+fail:
+    return index;
+}
+
+HashIndex *
+read_hashheader(PyObject *file_py)
+{
+    Py_ssize_t bytes_read, length, buckets_length;
+    Py_buffer header_buffer;
+    PyObject *header_bytes, *length_object, *tmp;
+    HashIndex *index = NULL;
+    HashHeader *header;
+
+    header_bytes = PyObject_CallMethod(file_py, "read", "n", (Py_ssize_t)sizeof(*header));
+    if(!header_bytes) {
+        assert(PyErr_Occurred());
+        goto fail;
+    }
+
+    bytes_read = PyBytes_Size(header_bytes);
+    if(PyErr_Occurred()) {
+        /* TypeError, not a bytes() object */
+        goto fail_decref_header;
+    }
+    if(bytes_read != sizeof(*header)) {
+        /* Truncated file */
+        /* Note: %zd is the format for Py_ssize_t, %zu is for size_t */
+        PyErr_Format(PyExc_ValueError, "Could not read header (expected %zu, but read %zd bytes)",
+                     sizeof(*header), bytes_read);
+        goto fail_decref_header;
+    }
+
+    /*
+     * Hash the header
+     * If the header is corrupted this bails before doing something stupid (like allocating 3.8 TB of memory)
+     */
+    tmp = PyObject_CallMethod(file_py, "hash_part", "s", "HashHeader");
+    Py_XDECREF(tmp);
+    if(PyErr_Occurred()) {
+        if(PyErr_ExceptionMatches(PyExc_AttributeError)) {
+            /* Be able to work with regular file objects which do not have a hash_part method. */
+            PyErr_Clear();
+        } else {
+            goto fail_decref_header;
+        }
+    }
+
+    /* Find length of file */
+    length_object = PyObject_CallMethod(file_py, "seek", "ni", (Py_ssize_t)0, SEEK_END);
+    if(PyErr_Occurred()) {
+        goto fail_decref_header;
+    }
+    length = PyNumber_AsSsize_t(length_object, PyExc_OverflowError);
+    Py_DECREF(length_object);
+    if(PyErr_Occurred()) {
+        /* This shouldn't generally happen; but can if seek() returns something that's not a number */
+        goto fail_decref_header;
+    }
+
+    tmp = PyObject_CallMethod(file_py, "seek", "ni", (Py_ssize_t)sizeof(*header), SEEK_SET);
     Py_XDECREF(tmp);
     if(PyErr_Occurred()) {
         goto fail_decref_header;
@@ -357,17 +469,58 @@ hashindex_read(PyObject *file_py, int permit_compact)
         goto fail_release_header_buffer;
     }

-    buckets_length = (Py_ssize_t)_le32toh(header->num_buckets) * (header->key_size + header->value_size);
-    if((Py_ssize_t)length != (Py_ssize_t)sizeof(HashHeader) + buckets_length) {
+    buckets_length = (Py_ssize_t)_le32toh(header->num_buckets) *
+                     (_le32toh(header->key_size) + _le32toh(header->value_size));
+    if ((Py_ssize_t)length != (Py_ssize_t)sizeof(*header) + buckets_length) {
         PyErr_Format(PyExc_ValueError, "Incorrect file length (expected %zd, got %zd)",
-                     sizeof(HashHeader) + buckets_length, length);
+                     sizeof(*header) + buckets_length, length);
         goto fail_release_header_buffer;
     }

     index->num_entries = _le32toh(header->num_entries);
     index->num_buckets = _le32toh(header->num_buckets);
-    index->key_size = header->key_size;
-    index->value_size = header->value_size;
+    index->num_empty = _le32toh(header->num_empty);
+    index->key_size = _le32toh(header->key_size);
+    index->value_size = _le32toh(header->value_size);
+
+    int header_version = _le32toh(header->version);
+    if (header_version != 2) {
+        PyErr_Format(PyExc_ValueError, "Unsupported header version (expected %d, got %d)",
+                     2, header_version);
+        goto fail_release_header_buffer;
+    }
+
+fail_release_header_buffer:
+    PyBuffer_Release(&header_buffer);
+fail_free_index:
+    if(PyErr_Occurred()) {
+        free(index);
+        index = NULL;
+    }
+fail_decref_header:
+    Py_DECREF(header_bytes);
+fail:
+    return index;
+}
+
+/* Public API */
+
+#ifndef BORG_NO_PYTHON
+static HashIndex *
+hashindex_read(PyObject *file_py, int permit_compact, int legacy)
+{
+    Py_ssize_t buckets_length, bytes_read;
+    PyObject *bucket_bytes;
+    HashIndex *index = NULL;
+
+    if (legacy)
+        index = read_hashheader1(file_py);
+    else
+        index = read_hashheader(file_py);
+
+    if (!index)
+        goto fail;
+
     index->bucket_size = index->key_size + index->value_size;
     index->lower_limit = get_lower_limit(index->num_buckets);
     index->upper_limit = get_upper_limit(index->num_buckets);
@@ -381,10 +534,11 @@ hashindex_read(PyObject *file_py, int permit_compact)
      * will issue multiple underlying reads if necessary. This supports indices
      * >2 GB on Linux. We also compare lengths later.
      */
+    buckets_length = (Py_ssize_t)(index->num_buckets) * (index->key_size + index->value_size);
     bucket_bytes = PyObject_CallMethod(file_py, "read", "n", buckets_length);
     if(!bucket_bytes) {
         assert(PyErr_Occurred());
-        goto fail_release_header_buffer;
+        goto fail_free_index;
     }
     bytes_read = PyBytes_Size(bucket_bytes);
     if(PyErr_Occurred()) {
@@ -404,7 +558,8 @@ hashindex_read(PyObject *file_py, int permit_compact)

     if(!permit_compact) {
         index->min_empty = get_min_empty(index->num_buckets);
-        index->num_empty = count_empty(index);
+        if (index->num_empty == -1)  // we read a legacy index without num_empty value
+            index->num_empty = count_empty(index);

         if(index->num_empty < index->min_empty) {
             /* too many tombstones here / not enough empty buckets, do a same-size rebuild */
@@ -426,15 +581,11 @@ fail_free_buckets:
     }
 fail_decref_buckets:
     Py_DECREF(bucket_bytes);
-fail_release_header_buffer:
-    PyBuffer_Release(&header_buffer);
 fail_free_index:
     if(PyErr_Occurred()) {
         free(index);
         index = NULL;
     }
-fail_decref_header:
-    Py_DECREF(header_bytes);
 fail:
     return index;
 }
@@ -481,33 +632,37 @@ hashindex_free(HashIndex *index)
     free(index);
 }

-#ifndef BORG_NO_PYTHON
-static void
-hashindex_write(HashIndex *index, PyObject *file_py)
+int
+write_hashheader(HashIndex *index, PyObject *file_py)
 {
-    PyObject *length_object, *buckets_view, *tmp;
+    PyObject *length_object, *tmp;
     Py_ssize_t length;
-    Py_ssize_t buckets_length = (Py_ssize_t)index->num_buckets * index->bucket_size;
+
+    _Static_assert(sizeof(HashHeader) == 1024, "HashHeader struct should be exactly 1024 bytes in size");
+
     HashHeader header = {
         .magic = MAGIC,
+        .version = _htole32(2),
         .num_entries = _htole32(index->num_entries),
         .num_buckets = _htole32(index->num_buckets),
-        .key_size = index->key_size,
-        .value_size = index->value_size
+        .num_empty = _htole32(index->num_empty),
+        .key_size = _htole32(index->key_size),
+        .value_size = _htole32(index->value_size),
+        .reserved = {0}
     };

-    length_object = PyObject_CallMethod(file_py, "write", "y#", &header, (Py_ssize_t)sizeof(HashHeader));
+    length_object = PyObject_CallMethod(file_py, "write", "y#", &header, (Py_ssize_t)sizeof(header));
     if(PyErr_Occurred()) {
-        return;
+        return 0;
     }
     length = PyNumber_AsSsize_t(length_object, PyExc_OverflowError);
     Py_DECREF(length_object);
     if(PyErr_Occurred()) {
-        return;
+        return 0;
     }
-    if(length != sizeof(HashHeader)) {
+    if(length != sizeof(header)) {
         PyErr_SetString(PyExc_ValueError, "Failed to write header");
-        return;
+        return 0;
     }

     /*
@@ -520,9 +675,24 @@ hashindex_write(HashIndex *index, PyObject *file_py)
             /* Be able to work with regular file objects which do not have a hash_part method. */
             PyErr_Clear();
         } else {
-            return;
+            return 0;
         }
     }
+    return 1;
+}
+
+#ifndef BORG_NO_PYTHON
+static void
+hashindex_write(HashIndex *index, PyObject *file_py, int legacy)
+{
+    PyObject *length_object, *buckets_view;
+    Py_ssize_t length;
+    Py_ssize_t buckets_length = (Py_ssize_t)index->num_buckets * index->bucket_size;
+
+    assert(!legacy);  // we do not ever write legacy hashindexes
+
+    if(!write_hashheader(index, file_py))
+        return;

     /* Note: explicitly construct view; BuildValue can convert (pointer, length) to Python objects, but copies them for doing so */
     buckets_view = PyMemoryView_FromMemory((char*)index->buckets, buckets_length, PyBUF_READ);
@@ -698,6 +868,7 @@ hashindex_compact(HashIndex *index)
     }

     index->num_buckets = index->num_entries;
+    index->num_empty = 0;
     return saved_size;
 }

@@ -17,12 +17,12 @@ cdef extern from "_hashindex.c":
         uint32_t version
         char hash[16]

-    HashIndex *hashindex_read(object file_py, int permit_compact) except *
+    HashIndex *hashindex_read(object file_py, int permit_compact, int legacy) except *
     HashIndex *hashindex_init(int capacity, int key_size, int value_size)
     void hashindex_free(HashIndex *index)
     int hashindex_len(HashIndex *index)
     int hashindex_size(HashIndex *index)
-    void hashindex_write(HashIndex *index, object file_py) except *
+    void hashindex_write(HashIndex *index, object file_py, int legacy) except *
     unsigned char *hashindex_get(HashIndex *index, unsigned char *key)
     unsigned char *hashindex_next_key(HashIndex *index, unsigned char *key)
     int hashindex_delete(HashIndex *index, unsigned char *key)
@@ -75,21 +75,21 @@ assert _MAX_VALUE % 2 == 1
 def hashindex_variant(fn):
     """peek into an index file and find out what it is"""
     with open(fn, 'rb') as f:
-        hh = f.read(18)  # len(HashHeader)
-    magic = hh[0:8]
+        magic = f.read(8)  # MAGIC_LEN
     if magic == b'BORG_IDX':
-        key_size = hh[16]
-        value_size = hh[17]
-        return f'k{key_size}_v{value_size}'
+        return 1  # legacy
+    if magic == b'BORG2IDX':
+        return 2
     if magic == b'12345678':  # used by unit tests
-        return 'k32_v16'  # just return the current variant
-    raise ValueError(f'unknown hashindex format, magic: {magic!r}')
+        return 2  # just return the current variant
+    raise ValueError(f'unknown hashindex magic: {magic!r}')


 @cython.internal
 cdef class IndexBase:
     cdef HashIndex *index
     cdef int key_size
+    legacy = 0

     _key_size = 32

@@ -101,9 +101,9 @@ cdef class IndexBase:
         if path:
             if isinstance(path, (str, bytes)):
                 with open(path, 'rb') as fd:
-                    self.index = hashindex_read(fd, permit_compact)
+                    self.index = hashindex_read(fd, permit_compact, self.legacy)
             else:
-                self.index = hashindex_read(path, permit_compact)
+                self.index = hashindex_read(path, permit_compact, self.legacy)
             assert self.index, 'hashindex_read() returned NULL with no exception set'
         else:
             if usable is not None:
@@ -123,9 +123,9 @@ cdef class IndexBase:
     def write(self, path):
         if isinstance(path, (str, bytes)):
             with open(path, 'wb') as fd:
-                hashindex_write(self.index, fd)
+                hashindex_write(self.index, fd, self.legacy)
         else:
-            hashindex_write(self.index, path)
+            hashindex_write(self.index, path, self.legacy)

     def clear(self):
         hashindex_free(self.index)
@@ -314,6 +314,7 @@ cdef class NSKeyIterator:


 cdef class NSIndex1(IndexBase):  # legacy borg 1.x
+    legacy = 1
     value_size = 8

     def __getitem__(self, key):
@@ -543,9 +543,9 @@ def open_index(self, transaction_id, auto_recover=True):
         integrity_data = self._read_integrity(transaction_id, "index")
         try:
             with IntegrityCheckedFile(index_path, write=False, integrity_data=integrity_data) as fd:
-                if variant == "k32_v16":
+                if variant == 2:
                     return NSIndex.read(fd)
-                if variant == "k32_v8":  # legacy
+                if variant == 1:  # legacy
                     return NSIndex1.read(fd)
         except (ValueError, OSError, FileIntegrityError) as exc:
             logger.warning("Repository index missing or corrupted, trying to recover from: %s", exc)
@@ -108,15 +108,21 @@ def test_corrupted_ancillary(self, index, sync, data, error):

     def make_index_with_refcount(self, refcount):
         index_data = io.BytesIO()
-        index_data.write(b"BORG_IDX")
+        index_data.write(b"BORG2IDX")
+        # version
+        index_data.write((2).to_bytes(4, "little"))
         # num_entries
         index_data.write((1).to_bytes(4, "little"))
         # num_buckets
         index_data.write((1).to_bytes(4, "little"))
+        # num_empty
+        index_data.write((0).to_bytes(4, "little"))
         # key_size
-        index_data.write((32).to_bytes(1, "little"))
+        index_data.write((32).to_bytes(4, "little"))
         # value_size
-        index_data.write((3 * 4).to_bytes(1, "little"))
+        index_data.write((3 * 4).to_bytes(4, "little"))
+        # reserved
+        index_data.write(bytes(1024 - 32))

         index_data.write(H(0))
         index_data.write(refcount.to_bytes(4, "little"))
@@ -86,12 +86,12 @@ def _generic_test(self, cls, make_value, sha):

     def test_nsindex(self):
         self._generic_test(
-            NSIndex, lambda x: (x, x, x), "7d70671d0b7e9d2f51b2691ecf35184b9f8ecc1202cceb2748c905c8fc04c256"
+            NSIndex, lambda x: (x, x, x), "0d7880dbe02b64f03c471e60e193a1333879b4f23105768b10c9222accfeac5e"
         )

     def test_chunkindex(self):
         self._generic_test(
-            ChunkIndex, lambda x: (x, x), "85f72b036c692c8266e4f51ccf0cff2147204282b5e316ae508d30a448d88fef"
+            ChunkIndex, lambda x: (x, x), "5915fcf986da12e5f3ac68e05242b9c729e6101b0460b1d4e4a9e9f7cdf1b7da"
         )

     def test_resize(self):
@@ -252,7 +252,7 @@ def test_chunk_indexer(self):
 class HashIndexSizeTestCase(BaseTestCase):
     def test_size_on_disk(self):
         idx = ChunkIndex()
-        assert idx.size() == 18 + 1031 * (32 + 2 * 4)
+        assert idx.size() == 1024 + 1031 * (32 + 2 * 4)

     def test_size_on_disk_accurate(self):
         idx = ChunkIndex()
@@ -368,12 +368,12 @@ def test_keyerror(self):


 class HashIndexDataTestCase(BaseTestCase):
-    # This bytestring was created with borg2-pre 2022-06-10
+    # This bytestring was created with borg2-pre 2022-09-30
     HASHINDEX = (
-        b"eJzt0LEJg1AYhdE/JqBjOEJMNhBBrAQrO9ewc+HsoG+CPMsEz1cfbnHbceqXoZvvEVE+IuoqMu2pnOE4"
-        b"juM4juM4juM4juM4juM4juM4juM4juM4juM4juM4juM4juM4juM4juM4juM4juM4juM4juM4juM4juM4"
-        b"juM4juM4juM4jruie36vuSVT5N0rzW0n9t7r5z9+4TiO4ziO4ziO4ziO4ziO4ziO4ziO4ziO4ziO4ziO"
-        b"4ziO4ziO4ziO4ziO4ziO437LHbSVHGw="
+        b"eJzt0DEKgwAMQNFoBXsMj9DqDUQoToKTR3Hzwr2DZi+0HS19HwIZHhnST/OjHYeljIhLTl1FVDlN7te"
+        b"Q9M/tGcdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHM"
+        b"dxHMdxHMdxHMdxHMdxHMdxHPfqbu+7F2nKz67Nc9sX97r1+Rt/4TiO4ziO4ziO4ziO4ziO4ziO4ziO4"
+        b"ziO4ziO4ziO4ziO4ziO4ziO4ziO4ziO487lDoRvHEk="
     )

     def _serialize_hashindex(self, idx):
@@ -439,17 +439,23 @@ def test_integrity_checked_file(self):


 class HashIndexCompactTestCase(HashIndexDataTestCase):
-    def index(self, num_entries, num_buckets):
+    def index(self, num_entries, num_buckets, num_empty):
         index_data = io.BytesIO()
-        index_data.write(b"BORG_IDX")
+        index_data.write(b"BORG2IDX")
+        # version
+        index_data.write((2).to_bytes(4, "little"))
         # num_entries
         index_data.write(num_entries.to_bytes(4, "little"))
         # num_buckets
        index_data.write(num_buckets.to_bytes(4, "little"))
+        # num_empty
+        index_data.write(num_empty.to_bytes(4, "little"))
         # key_size
-        index_data.write((32).to_bytes(1, "little"))
+        index_data.write((32).to_bytes(4, "little"))
         # value_size
-        index_data.write((3 * 4).to_bytes(1, "little"))
+        index_data.write((3 * 4).to_bytes(4, "little"))
+        # reserved
+        index_data.write(bytes(1024 - 32))

         self.index_data = index_data

@@ -481,7 +487,7 @@ def write_deleted(self, key):
         self.write_entry(key, 0xFFFFFFFE, 0, 0)

     def test_simple(self):
-        self.index(num_entries=3, num_buckets=6)
+        self.index(num_entries=3, num_buckets=6, num_empty=2)
         self.write_entry(H2(0), 1, 2, 3)
         self.write_deleted(H2(1))
         self.write_empty(H2(2))
@@ -491,14 +497,14 @@ def test_simple(self):

         compact_index = self.index_from_data_compact_to_data()

-        self.index(num_entries=3, num_buckets=3)
+        self.index(num_entries=3, num_buckets=3, num_empty=0)
         self.write_entry(H2(0), 1, 2, 3)
         self.write_entry(H2(3), 5, 6, 7)
         self.write_entry(H2(4), 8, 9, 10)
         assert compact_index == self.index_data.getvalue()

     def test_first_empty(self):
-        self.index(num_entries=3, num_buckets=6)
+        self.index(num_entries=3, num_buckets=6, num_empty=2)
         self.write_deleted(H2(1))
         self.write_entry(H2(0), 1, 2, 3)
         self.write_empty(H2(2))
@@ -508,14 +514,14 @@ def test_first_empty(self):

         compact_index = self.index_from_data_compact_to_data()

-        self.index(num_entries=3, num_buckets=3)
+        self.index(num_entries=3, num_buckets=3, num_empty=0)
         self.write_entry(H2(0), 1, 2, 3)
         self.write_entry(H2(3), 5, 6, 7)
         self.write_entry(H2(4), 8, 9, 10)
         assert compact_index == self.index_data.getvalue()

     def test_last_used(self):
-        self.index(num_entries=3, num_buckets=6)
+        self.index(num_entries=3, num_buckets=6, num_empty=2)
         self.write_deleted(H2(1))
         self.write_entry(H2(0), 1, 2, 3)
         self.write_empty(H2(2))
@@ -525,14 +531,14 @@ def test_last_used(self):

         compact_index = self.index_from_data_compact_to_data()

-        self.index(num_entries=3, num_buckets=3)
+        self.index(num_entries=3, num_buckets=3, num_empty=0)
         self.write_entry(H2(0), 1, 2, 3)
         self.write_entry(H2(3), 5, 6, 7)
         self.write_entry(H2(4), 8, 9, 10)
         assert compact_index == self.index_data.getvalue()

     def test_too_few_empty_slots(self):
-        self.index(num_entries=3, num_buckets=6)
+        self.index(num_entries=3, num_buckets=6, num_empty=2)
         self.write_deleted(H2(1))
         self.write_entry(H2(0), 1, 2, 3)
         self.write_entry(H2(3), 5, 6, 7)
@@ -542,14 +548,14 @@ def test_too_few_empty_slots(self):

         compact_index = self.index_from_data_compact_to_data()

-        self.index(num_entries=3, num_buckets=3)
+        self.index(num_entries=3, num_buckets=3, num_empty=0)
         self.write_entry(H2(0), 1, 2, 3)
         self.write_entry(H2(3), 5, 6, 7)
         self.write_entry(H2(4), 8, 9, 10)
         assert compact_index == self.index_data.getvalue()

     def test_empty(self):
-        self.index(num_entries=0, num_buckets=6)
+        self.index(num_entries=0, num_buckets=6, num_empty=3)
         self.write_deleted(H2(1))
         self.write_empty(H2(0))
         self.write_deleted(H2(3))
@@ -559,7 +565,7 @@ def test_empty(self):

         compact_index = self.index_from_data_compact_to_data()

-        self.index(num_entries=0, num_buckets=0)
+        self.index(num_entries=0, num_buckets=0, num_empty=0)
         assert compact_index == self.index_data.getvalue()

     def test_merge(self):
@@ -569,7 +575,7 @@ def test_merge(self):
         idx1[H(2)] = 2, 200
         idx1[H(3)] = 3, 300
         idx1.compact()
-        assert idx1.size() == 18 + 3 * (32 + 2 * 4)
+        assert idx1.size() == 1024 + 3 * (32 + 2 * 4)

         master.merge(idx1)
         assert master[H(1)] == (1, 100)
@@ -612,7 +618,7 @@ def HH(x, y, z):
     for y in range(700):  # stay below max load to not trigger resize
         idx[HH(0, y, 0)] = (0, y, 0)

-    assert idx.size() == 1031 * 48 + 18  # 1031 buckets + header
+    assert idx.size() == 1024 + 1031 * 48  # header + 1031 buckets

     # delete lots of the collisions, creating lots of tombstones
     for y in range(400):  # stay above min load to not trigger resize