From c29d4a096bba4018e14208b54fe24bdc886f4f3d Mon Sep 17 00:00:00 2001
From: TW
Date: Sun, 2 Oct 2022 14:35:21 +0200
Subject: [PATCH] Hashindex header work, fixes #6960 (#7064)

Support reading the new, improved hashindex header format, fixes #6960.

This code was a bit of a pain to work with:
- it is C code,
- it still needs to be able to read the old hashindex file format,
- while also supporting the new file format,
- the hash computed while reading the file causes additional problems, because it expects all places in the file to be read exactly once and in sequential order.

I solved this by separately opening the file in the Python part of the code and checking for the magic: BORG_IDX means the legacy file format and the legacy layout of the hashtable, BORG2IDX means the new file format and the new layout of the hashtable.

Done:
- added a version int32 directly after the magic and set it to 2 (like borg 2). The old header had no version info, but could be denoted as version 1 in case we ever need it (currently the reader decides based on the magic).
- added num_empty, as indicated by a TODO in count_empty, so determining the number of empty buckets no longer needs a full hashtable scan.
- to keep it simple, I just filled the HashHeader struct with a `char reserved[1024 - 32];`, 1024 being the desired overall header size and 32 being the currently used size. This alignment might be useful in case we mmap() the hashindex file one day.
---
 src/borg/_hashindex.c           | 255 ++++++++++++++++++++++++++------
 src/borg/hashindex.pyx          |  27 ++--
 src/borg/repository.py          |   4 +-
 src/borg/testsuite/cache.py     |  12 +-
 src/borg/testsuite/hashindex.py |  54 ++++---
 5 files changed, 268 insertions(+), 84 deletions(-)

diff --git a/src/borg/_hashindex.c b/src/borg/_hashindex.c index dd704d2de..5a006f913 100644 --- a/src/borg/_hashindex.c +++ b/src/borg/_hashindex.c @@ -19,7 +19,8 @@ # define BORG_PACKED(x) x __attribute__((packed)) #endif -#define MAGIC "BORG_IDX" +#define MAGIC "BORG2IDX" +#define MAGIC1 "BORG_IDX" // legacy #define MAGIC_LEN 8 #define DEBUG 0 @@ -39,6 +40,18 @@ typedef struct { int32_t num_buckets; int8_t key_size; int8_t value_size; +}) HashHeader1; + +BORG_PACKED( +typedef struct { + char magic[MAGIC_LEN]; + int32_t version; + int32_t num_entries; + int32_t num_buckets; + int32_t num_empty; + int32_t key_size; + int32_t value_size; + char reserved[1024 - 32]; // filler to 1024 bytes total }) HashHeader; typedef struct { @@ -110,8 +123,8 @@ static int hash_sizes[] = { #define EPRINTF_PATH(path, msg, ...) fprintf(stderr, "hashindex: %s: " msg " (%s)\n", path, ##__VA_ARGS__, strerror(errno)) #ifndef BORG_NO_PYTHON -static HashIndex *hashindex_read(PyObject *file_py, int permit_compact); -static void hashindex_write(HashIndex *index, PyObject *file_py); +static HashIndex *hashindex_read(PyObject *file_py, int permit_compact, int legacy); +static void hashindex_write(HashIndex *index, PyObject *file_py, int legacy); #endif static uint64_t hashindex_compact(HashIndex *index); @@ -265,9 +278,7 @@ int shrink_size(int current){ int count_empty(HashIndex *index) -{ /* count empty (never used) buckets. this does NOT include deleted buckets (tombstones). - * TODO: if we ever change HashHeader, save the count there so we do not need this function. - */ +{ /* count empty (never used) buckets. this does NOT include deleted buckets (tombstones). 
*/ int i, count = 0, capacity = index->num_buckets; for(i = 0; i < capacity; i++) { if(BUCKET_IS_EMPTY(index, i)) @@ -276,19 +287,16 @@ count_empty(HashIndex *index) return count; } -/* Public API */ - -#ifndef BORG_NO_PYTHON -static HashIndex * -hashindex_read(PyObject *file_py, int permit_compact) +HashIndex * +read_hashheader1(PyObject *file_py) { - Py_ssize_t length, buckets_length, bytes_read; + Py_ssize_t bytes_read, length, buckets_length; Py_buffer header_buffer; - PyObject *header_bytes, *length_object, *bucket_bytes, *tmp; - HashHeader *header; + PyObject *header_bytes, *length_object, *tmp; HashIndex *index = NULL; + HashHeader1 *header; - header_bytes = PyObject_CallMethod(file_py, "read", "n", (Py_ssize_t)sizeof(HashHeader)); + header_bytes = PyObject_CallMethod(file_py, "read", "n", (Py_ssize_t)sizeof(*header)); if(!header_bytes) { assert(PyErr_Occurred()); goto fail; @@ -299,11 +307,11 @@ hashindex_read(PyObject *file_py, int permit_compact) /* TypeError, not a bytes() object */ goto fail_decref_header; } - if(bytes_read != sizeof(HashHeader)) { + if(bytes_read != sizeof(*header)) { /* Truncated file */ /* Note: %zd is the format for Py_ssize_t, %zu is for size_t */ PyErr_Format(PyExc_ValueError, "Could not read header (expected %zu, but read %zd bytes)", - sizeof(HashHeader), bytes_read); + sizeof(*header), bytes_read); goto fail_decref_header; } @@ -334,7 +342,111 @@ hashindex_read(PyObject *file_py, int permit_compact) goto fail_decref_header; } - tmp = PyObject_CallMethod(file_py, "seek", "ni", (Py_ssize_t)sizeof(HashHeader), SEEK_SET); + tmp = PyObject_CallMethod(file_py, "seek", "ni", (Py_ssize_t)sizeof(*header), SEEK_SET); + Py_XDECREF(tmp); + if(PyErr_Occurred()) { + goto fail_decref_header; + } + + /* Set up the in-memory header */ + if(!(index = malloc(sizeof(HashIndex)))) { + PyErr_NoMemory(); + goto fail_decref_header; + } + + PyObject_GetBuffer(header_bytes, &header_buffer, PyBUF_SIMPLE); + if(PyErr_Occurred()) { + goto fail_free_index; + } + + header = (HashHeader1*) header_buffer.buf; + if(memcmp(header->magic, MAGIC1, MAGIC_LEN)) { + PyErr_Format(PyExc_ValueError, "Unknown MAGIC in header"); + goto fail_release_header_buffer; + } + + buckets_length = (Py_ssize_t)_le32toh(header->num_buckets) * (header->key_size + header->value_size); + if((Py_ssize_t)length != (Py_ssize_t)sizeof(*header) + buckets_length) { + PyErr_Format(PyExc_ValueError, "Incorrect file length (expected %zd, got %zd)", + sizeof(*header) + buckets_length, length); + goto fail_release_header_buffer; + } + + index->num_entries = _le32toh(header->num_entries); + index->num_buckets = _le32toh(header->num_buckets); + index->num_empty = -1; // unknown, needs counting + index->key_size = header->key_size; + index->value_size = header->value_size; + +fail_release_header_buffer: + PyBuffer_Release(&header_buffer); +fail_free_index: + if(PyErr_Occurred()) { + free(index); + index = NULL; + } +fail_decref_header: + Py_DECREF(header_bytes); +fail: + return index; +} + +HashIndex * +read_hashheader(PyObject *file_py) +{ + Py_ssize_t bytes_read, length, buckets_length; + Py_buffer header_buffer; + PyObject *header_bytes, *length_object, *tmp; + HashIndex *index = NULL; + HashHeader *header; + + header_bytes = PyObject_CallMethod(file_py, "read", "n", (Py_ssize_t)sizeof(*header)); + if(!header_bytes) { + assert(PyErr_Occurred()); + goto fail; + } + + bytes_read = PyBytes_Size(header_bytes); + if(PyErr_Occurred()) { + /* TypeError, not a bytes() object */ + goto fail_decref_header; + } + if(bytes_read != 
sizeof(*header)) { + /* Truncated file */ + /* Note: %zd is the format for Py_ssize_t, %zu is for size_t */ + PyErr_Format(PyExc_ValueError, "Could not read header (expected %zu, but read %zd bytes)", + sizeof(*header), bytes_read); + goto fail_decref_header; + } + + /* + * Hash the header + * If the header is corrupted this bails before doing something stupid (like allocating 3.8 TB of memory) + */ + tmp = PyObject_CallMethod(file_py, "hash_part", "s", "HashHeader"); + Py_XDECREF(tmp); + if(PyErr_Occurred()) { + if(PyErr_ExceptionMatches(PyExc_AttributeError)) { + /* Be able to work with regular file objects which do not have a hash_part method. */ + PyErr_Clear(); + } else { + goto fail_decref_header; + } + } + + /* Find length of file */ + length_object = PyObject_CallMethod(file_py, "seek", "ni", (Py_ssize_t)0, SEEK_END); + if(PyErr_Occurred()) { + goto fail_decref_header; + } + length = PyNumber_AsSsize_t(length_object, PyExc_OverflowError); + Py_DECREF(length_object); + if(PyErr_Occurred()) { + /* This shouldn't generally happen; but can if seek() returns something that's not a number */ + goto fail_decref_header; + } + + tmp = PyObject_CallMethod(file_py, "seek", "ni", (Py_ssize_t)sizeof(*header), SEEK_SET); Py_XDECREF(tmp); if(PyErr_Occurred()) { goto fail_decref_header; @@ -357,17 +469,58 @@ hashindex_read(PyObject *file_py, int permit_compact) goto fail_release_header_buffer; } - buckets_length = (Py_ssize_t)_le32toh(header->num_buckets) * (header->key_size + header->value_size); - if((Py_ssize_t)length != (Py_ssize_t)sizeof(HashHeader) + buckets_length) { + buckets_length = (Py_ssize_t)_le32toh(header->num_buckets) * + (_le32toh(header->key_size) + _le32toh(header->value_size)); + if ((Py_ssize_t)length != (Py_ssize_t)sizeof(*header) + buckets_length) { PyErr_Format(PyExc_ValueError, "Incorrect file length (expected %zd, got %zd)", - sizeof(HashHeader) + buckets_length, length); + sizeof(*header) + buckets_length, length); goto fail_release_header_buffer; } index->num_entries = _le32toh(header->num_entries); index->num_buckets = _le32toh(header->num_buckets); - index->key_size = header->key_size; - index->value_size = header->value_size; + index->num_empty = _le32toh(header->num_empty); + index->key_size = _le32toh(header->key_size); + index->value_size = _le32toh(header->value_size); + + int header_version = _le32toh(header->version); + if (header_version != 2) { + PyErr_Format(PyExc_ValueError, "Unsupported header version (expected %d, got %d)", + 2, header_version); + goto fail_release_header_buffer; + } + +fail_release_header_buffer: + PyBuffer_Release(&header_buffer); +fail_free_index: + if(PyErr_Occurred()) { + free(index); + index = NULL; + } +fail_decref_header: + Py_DECREF(header_bytes); +fail: + return index; +} + +/* Public API */ + +#ifndef BORG_NO_PYTHON +static HashIndex * +hashindex_read(PyObject *file_py, int permit_compact, int legacy) +{ + Py_ssize_t buckets_length, bytes_read; + PyObject *bucket_bytes; + HashIndex *index = NULL; + + if (legacy) + index = read_hashheader1(file_py); + else + index = read_hashheader(file_py); + + if (!index) + goto fail; + index->bucket_size = index->key_size + index->value_size; index->lower_limit = get_lower_limit(index->num_buckets); index->upper_limit = get_upper_limit(index->num_buckets); @@ -381,10 +534,11 @@ hashindex_read(PyObject *file_py, int permit_compact) * will issue multiple underlying reads if necessary. This supports indices * >2 GB on Linux. We also compare lengths later. 
*/ + buckets_length = (Py_ssize_t)(index->num_buckets) * (index->key_size + index->value_size); bucket_bytes = PyObject_CallMethod(file_py, "read", "n", buckets_length); if(!bucket_bytes) { assert(PyErr_Occurred()); - goto fail_release_header_buffer; + goto fail_free_index; } bytes_read = PyBytes_Size(bucket_bytes); if(PyErr_Occurred()) { @@ -404,7 +558,8 @@ hashindex_read(PyObject *file_py, int permit_compact) if(!permit_compact) { index->min_empty = get_min_empty(index->num_buckets); - index->num_empty = count_empty(index); + if (index->num_empty == -1) // we read a legacy index without num_empty value + index->num_empty = count_empty(index); if(index->num_empty < index->min_empty) { /* too many tombstones here / not enough empty buckets, do a same-size rebuild */ @@ -426,15 +581,11 @@ fail_free_buckets: } fail_decref_buckets: Py_DECREF(bucket_bytes); -fail_release_header_buffer: - PyBuffer_Release(&header_buffer); fail_free_index: if(PyErr_Occurred()) { free(index); index = NULL; } -fail_decref_header: - Py_DECREF(header_bytes); fail: return index; } @@ -481,33 +632,37 @@ hashindex_free(HashIndex *index) free(index); } -#ifndef BORG_NO_PYTHON -static void -hashindex_write(HashIndex *index, PyObject *file_py) +int +write_hashheader(HashIndex *index, PyObject *file_py) { - PyObject *length_object, *buckets_view, *tmp; + PyObject *length_object, *tmp; Py_ssize_t length; - Py_ssize_t buckets_length = (Py_ssize_t)index->num_buckets * index->bucket_size; + + _Static_assert(sizeof(HashHeader) == 1024, "HashHeader struct should be exactly 1024 bytes in size"); + HashHeader header = { .magic = MAGIC, + .version = _htole32(2), .num_entries = _htole32(index->num_entries), .num_buckets = _htole32(index->num_buckets), - .key_size = index->key_size, - .value_size = index->value_size + .num_empty = _htole32(index->num_empty), + .key_size = _htole32(index->key_size), + .value_size = _htole32(index->value_size), + .reserved = {0} }; - length_object = PyObject_CallMethod(file_py, "write", "y#", &header, (Py_ssize_t)sizeof(HashHeader)); + length_object = PyObject_CallMethod(file_py, "write", "y#", &header, (Py_ssize_t)sizeof(header)); if(PyErr_Occurred()) { - return; + return 0; } length = PyNumber_AsSsize_t(length_object, PyExc_OverflowError); Py_DECREF(length_object); if(PyErr_Occurred()) { - return; + return 0; } - if(length != sizeof(HashHeader)) { + if(length != sizeof(header)) { PyErr_SetString(PyExc_ValueError, "Failed to write header"); - return; + return 0; } /* @@ -520,9 +675,24 @@ hashindex_write(HashIndex *index, PyObject *file_py) /* Be able to work with regular file objects which do not have a hash_part method. 
*/ PyErr_Clear(); } else { - return; + return 0; } } + return 1; +} + +#ifndef BORG_NO_PYTHON +static void +hashindex_write(HashIndex *index, PyObject *file_py, int legacy) +{ + PyObject *length_object, *buckets_view; + Py_ssize_t length; + Py_ssize_t buckets_length = (Py_ssize_t)index->num_buckets * index->bucket_size; + + assert(!legacy); // we do not ever write legacy hashindexes + + if(!write_hashheader(index, file_py)) + return; /* Note: explicitly construct view; BuildValue can convert (pointer, length) to Python objects, but copies them for doing so */ buckets_view = PyMemoryView_FromMemory((char*)index->buckets, buckets_length, PyBUF_READ); @@ -698,6 +868,7 @@ hashindex_compact(HashIndex *index) } index->num_buckets = index->num_entries; + index->num_empty = 0; return saved_size; } diff --git a/src/borg/hashindex.pyx b/src/borg/hashindex.pyx index ded5005ab..f17ce62bd 100644 --- a/src/borg/hashindex.pyx +++ b/src/borg/hashindex.pyx @@ -17,12 +17,12 @@ cdef extern from "_hashindex.c": uint32_t version char hash[16] - HashIndex *hashindex_read(object file_py, int permit_compact) except * + HashIndex *hashindex_read(object file_py, int permit_compact, int legacy) except * HashIndex *hashindex_init(int capacity, int key_size, int value_size) void hashindex_free(HashIndex *index) int hashindex_len(HashIndex *index) int hashindex_size(HashIndex *index) - void hashindex_write(HashIndex *index, object file_py) except * + void hashindex_write(HashIndex *index, object file_py, int legacy) except * unsigned char *hashindex_get(HashIndex *index, unsigned char *key) unsigned char *hashindex_next_key(HashIndex *index, unsigned char *key) int hashindex_delete(HashIndex *index, unsigned char *key) @@ -75,21 +75,21 @@ assert _MAX_VALUE % 2 == 1 def hashindex_variant(fn): """peek into an index file and find out what it is""" with open(fn, 'rb') as f: - hh = f.read(18) # len(HashHeader) - magic = hh[0:8] + magic = f.read(8) # MAGIC_LEN if magic == b'BORG_IDX': - key_size = hh[16] - value_size = hh[17] - return f'k{key_size}_v{value_size}' + return 1 # legacy + if magic == b'BORG2IDX': + return 2 if magic == b'12345678': # used by unit tests - return 'k32_v16' # just return the current variant - raise ValueError(f'unknown hashindex format, magic: {magic!r}') + return 2 # just return the current variant + raise ValueError(f'unknown hashindex magic: {magic!r}') @cython.internal cdef class IndexBase: cdef HashIndex *index cdef int key_size + legacy = 0 _key_size = 32 @@ -101,9 +101,9 @@ cdef class IndexBase: if path: if isinstance(path, (str, bytes)): with open(path, 'rb') as fd: - self.index = hashindex_read(fd, permit_compact) + self.index = hashindex_read(fd, permit_compact, self.legacy) else: - self.index = hashindex_read(path, permit_compact) + self.index = hashindex_read(path, permit_compact, self.legacy) assert self.index, 'hashindex_read() returned NULL with no exception set' else: if usable is not None: @@ -123,9 +123,9 @@ cdef class IndexBase: def write(self, path): if isinstance(path, (str, bytes)): with open(path, 'wb') as fd: - hashindex_write(self.index, fd) + hashindex_write(self.index, fd, self.legacy) else: - hashindex_write(self.index, path) + hashindex_write(self.index, path, self.legacy) def clear(self): hashindex_free(self.index) @@ -314,6 +314,7 @@ cdef class NSKeyIterator: cdef class NSIndex1(IndexBase): # legacy borg 1.x + legacy = 1 value_size = 8 def __getitem__(self, key): diff --git a/src/borg/repository.py b/src/borg/repository.py index 477864482..be7a0c70f 100644 --- 
a/src/borg/repository.py +++ b/src/borg/repository.py @@ -543,9 +543,9 @@ def open_index(self, transaction_id, auto_recover=True): integrity_data = self._read_integrity(transaction_id, "index") try: with IntegrityCheckedFile(index_path, write=False, integrity_data=integrity_data) as fd: - if variant == "k32_v16": + if variant == 2: return NSIndex.read(fd) - if variant == "k32_v8": # legacy + if variant == 1: # legacy return NSIndex1.read(fd) except (ValueError, OSError, FileIntegrityError) as exc: logger.warning("Repository index missing or corrupted, trying to recover from: %s", exc) diff --git a/src/borg/testsuite/cache.py b/src/borg/testsuite/cache.py index 3699bf3a7..f97b6e504 100644 --- a/src/borg/testsuite/cache.py +++ b/src/borg/testsuite/cache.py @@ -108,15 +108,21 @@ def test_corrupted_ancillary(self, index, sync, data, error): def make_index_with_refcount(self, refcount): index_data = io.BytesIO() - index_data.write(b"BORG_IDX") + index_data.write(b"BORG2IDX") + # version + index_data.write((2).to_bytes(4, "little")) # num_entries index_data.write((1).to_bytes(4, "little")) # num_buckets index_data.write((1).to_bytes(4, "little")) + # num_empty + index_data.write((0).to_bytes(4, "little")) # key_size - index_data.write((32).to_bytes(1, "little")) + index_data.write((32).to_bytes(4, "little")) # value_size - index_data.write((3 * 4).to_bytes(1, "little")) + index_data.write((3 * 4).to_bytes(4, "little")) + # reserved + index_data.write(bytes(1024 - 32)) index_data.write(H(0)) index_data.write(refcount.to_bytes(4, "little")) diff --git a/src/borg/testsuite/hashindex.py b/src/borg/testsuite/hashindex.py index ce2616dc6..d4fc908e0 100644 --- a/src/borg/testsuite/hashindex.py +++ b/src/borg/testsuite/hashindex.py @@ -86,12 +86,12 @@ def _generic_test(self, cls, make_value, sha): def test_nsindex(self): self._generic_test( - NSIndex, lambda x: (x, x, x), "7d70671d0b7e9d2f51b2691ecf35184b9f8ecc1202cceb2748c905c8fc04c256" + NSIndex, lambda x: (x, x, x), "0d7880dbe02b64f03c471e60e193a1333879b4f23105768b10c9222accfeac5e" ) def test_chunkindex(self): self._generic_test( - ChunkIndex, lambda x: (x, x), "85f72b036c692c8266e4f51ccf0cff2147204282b5e316ae508d30a448d88fef" + ChunkIndex, lambda x: (x, x), "5915fcf986da12e5f3ac68e05242b9c729e6101b0460b1d4e4a9e9f7cdf1b7da" ) def test_resize(self): @@ -252,7 +252,7 @@ def test_chunk_indexer(self): class HashIndexSizeTestCase(BaseTestCase): def test_size_on_disk(self): idx = ChunkIndex() - assert idx.size() == 18 + 1031 * (32 + 2 * 4) + assert idx.size() == 1024 + 1031 * (32 + 2 * 4) def test_size_on_disk_accurate(self): idx = ChunkIndex() @@ -368,12 +368,12 @@ def test_keyerror(self): class HashIndexDataTestCase(BaseTestCase): - # This bytestring was created with borg2-pre 2022-06-10 + # This bytestring was created with borg2-pre 2022-09-30 HASHINDEX = ( - b"eJzt0LEJg1AYhdE/JqBjOEJMNhBBrAQrO9ewc+HsoG+CPMsEz1cfbnHbceqXoZvvEVE+IuoqMu2pnOE4" - b"juM4juM4juM4juM4juM4juM4juM4juM4juM4juM4juM4juM4juM4juM4juM4juM4juM4juM4juM4juM4" - b"juM4juM4juM4jruie36vuSVT5N0rzW0n9t7r5z9+4TiO4ziO4ziO4ziO4ziO4ziO4ziO4ziO4ziO4ziO" - b"4ziO4ziO4ziO4ziO4ziO437LHbSVHGw=" + b"eJzt0DEKgwAMQNFoBXsMj9DqDUQoToKTR3Hzwr2DZi+0HS19HwIZHhnST/OjHYeljIhLTl1FVDlN7te" + b"Q9M/tGcdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHM" + b"dxHMdxHMdxHMdxHMdxHMdxHPfqbu+7F2nKz67Nc9sX97r1+Rt/4TiO4ziO4ziO4ziO4ziO4ziO4ziO4" + b"ziO4ziO4ziO4ziO4ziO4ziO4ziO4ziO487lDoRvHEk=" ) def _serialize_hashindex(self, idx): @@ -439,17 +439,23 @@ def test_integrity_checked_file(self): class 
HashIndexCompactTestCase(HashIndexDataTestCase): - def index(self, num_entries, num_buckets): + def index(self, num_entries, num_buckets, num_empty): index_data = io.BytesIO() - index_data.write(b"BORG_IDX") + index_data.write(b"BORG2IDX") + # version + index_data.write((2).to_bytes(4, "little")) # num_entries index_data.write(num_entries.to_bytes(4, "little")) # num_buckets index_data.write(num_buckets.to_bytes(4, "little")) + # num_empty + index_data.write(num_empty.to_bytes(4, "little")) # key_size - index_data.write((32).to_bytes(1, "little")) + index_data.write((32).to_bytes(4, "little")) # value_size - index_data.write((3 * 4).to_bytes(1, "little")) + index_data.write((3 * 4).to_bytes(4, "little")) + # reserved + index_data.write(bytes(1024 - 32)) self.index_data = index_data @@ -481,7 +487,7 @@ def write_deleted(self, key): self.write_entry(key, 0xFFFFFFFE, 0, 0) def test_simple(self): - self.index(num_entries=3, num_buckets=6) + self.index(num_entries=3, num_buckets=6, num_empty=2) self.write_entry(H2(0), 1, 2, 3) self.write_deleted(H2(1)) self.write_empty(H2(2)) @@ -491,14 +497,14 @@ def test_simple(self): compact_index = self.index_from_data_compact_to_data() - self.index(num_entries=3, num_buckets=3) + self.index(num_entries=3, num_buckets=3, num_empty=0) self.write_entry(H2(0), 1, 2, 3) self.write_entry(H2(3), 5, 6, 7) self.write_entry(H2(4), 8, 9, 10) assert compact_index == self.index_data.getvalue() def test_first_empty(self): - self.index(num_entries=3, num_buckets=6) + self.index(num_entries=3, num_buckets=6, num_empty=2) self.write_deleted(H2(1)) self.write_entry(H2(0), 1, 2, 3) self.write_empty(H2(2)) @@ -508,14 +514,14 @@ def test_first_empty(self): compact_index = self.index_from_data_compact_to_data() - self.index(num_entries=3, num_buckets=3) + self.index(num_entries=3, num_buckets=3, num_empty=0) self.write_entry(H2(0), 1, 2, 3) self.write_entry(H2(3), 5, 6, 7) self.write_entry(H2(4), 8, 9, 10) assert compact_index == self.index_data.getvalue() def test_last_used(self): - self.index(num_entries=3, num_buckets=6) + self.index(num_entries=3, num_buckets=6, num_empty=2) self.write_deleted(H2(1)) self.write_entry(H2(0), 1, 2, 3) self.write_empty(H2(2)) @@ -525,14 +531,14 @@ def test_last_used(self): compact_index = self.index_from_data_compact_to_data() - self.index(num_entries=3, num_buckets=3) + self.index(num_entries=3, num_buckets=3, num_empty=0) self.write_entry(H2(0), 1, 2, 3) self.write_entry(H2(3), 5, 6, 7) self.write_entry(H2(4), 8, 9, 10) assert compact_index == self.index_data.getvalue() def test_too_few_empty_slots(self): - self.index(num_entries=3, num_buckets=6) + self.index(num_entries=3, num_buckets=6, num_empty=2) self.write_deleted(H2(1)) self.write_entry(H2(0), 1, 2, 3) self.write_entry(H2(3), 5, 6, 7) @@ -542,14 +548,14 @@ def test_too_few_empty_slots(self): compact_index = self.index_from_data_compact_to_data() - self.index(num_entries=3, num_buckets=3) + self.index(num_entries=3, num_buckets=3, num_empty=0) self.write_entry(H2(0), 1, 2, 3) self.write_entry(H2(3), 5, 6, 7) self.write_entry(H2(4), 8, 9, 10) assert compact_index == self.index_data.getvalue() def test_empty(self): - self.index(num_entries=0, num_buckets=6) + self.index(num_entries=0, num_buckets=6, num_empty=3) self.write_deleted(H2(1)) self.write_empty(H2(0)) self.write_deleted(H2(3)) @@ -559,7 +565,7 @@ def test_empty(self): compact_index = self.index_from_data_compact_to_data() - self.index(num_entries=0, num_buckets=0) + self.index(num_entries=0, num_buckets=0, num_empty=0) assert 
compact_index == self.index_data.getvalue() def test_merge(self): @@ -569,7 +575,7 @@ def test_merge(self): idx1[H(2)] = 2, 200 idx1[H(3)] = 3, 300 idx1.compact() - assert idx1.size() == 18 + 3 * (32 + 2 * 4) + assert idx1.size() == 1024 + 3 * (32 + 2 * 4) master.merge(idx1) assert master[H(1)] == (1, 100) @@ -612,7 +618,7 @@ def HH(x, y, z): for y in range(700): # stay below max load to not trigger resize idx[HH(0, y, 0)] = (0, y, 0) - assert idx.size() == 1031 * 48 + 18 # 1031 buckets + header + assert idx.size() == 1024 + 1031 * 48 # header + 1031 buckets # delete lots of the collisions, creating lots of tombstones for y in range(400): # stay above min load to not trigger resize
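
For reference, not part of the patch: the sketch below shows in plain Python how the two on-disk header layouts differ and how a reader can tell them apart by peeking at the magic, which is roughly what hashindex_variant() plus the read_hashheader1()/read_hashheader() pair above do. The struct formats follow the packed, little-endian HashHeader1/HashHeader definitions in _hashindex.c; the names read_hashindex_header and Header are made up for illustration, and the sketch reads a plain local file, so it skips the hash_part() integrity hashing that IntegrityCheckedFile adds in the real read path.

```python
# Illustrative sketch only (not part of the patch): parse a hashindex header,
# handling both the legacy BORG_IDX layout and the new BORG2IDX layout.
# Layouts follow HashHeader1 / HashHeader in _hashindex.c: packed structs,
# little-endian integers. read_hashindex_header() and Header are hypothetical
# helper names; the real code goes through hashindex_read().
import struct
from collections import namedtuple

LEGACY_FMT = "<8sii2b"   # magic, num_entries, num_buckets, key_size, value_size -> 18 bytes
NEW_FMT = "<8s6i992x"    # magic, version, num_entries, num_buckets, num_empty,
                         # key_size, value_size, 992 reserved bytes -> 1024 bytes

Header = namedtuple("Header", "magic version num_entries num_buckets num_empty key_size value_size")

def read_hashindex_header(path):
    """Peek at a hashindex file and return its header fields."""
    with open(path, "rb") as f:
        magic = f.read(8)
        f.seek(0)
        if magic == b"BORG_IDX":  # legacy: no version field, no num_empty
            m, entries, buckets, ksize, vsize = struct.unpack(LEGACY_FMT, f.read(18))
            return Header(m, 1, entries, buckets, -1, ksize, vsize)  # -1: empty buckets must be counted
        if magic == b"BORG2IDX":
            m, version, entries, buckets, empty, ksize, vsize = struct.unpack(NEW_FMT, f.read(1024))
            if version != 2:
                raise ValueError(f"unsupported header version: {version}")
            return Header(m, version, entries, buckets, empty, ksize, vsize)
        raise ValueError(f"unknown hashindex magic: {magic!r}")
```

With this, reading the stored num_empty of a new-format index is a one-liner (e.g. read_hashindex_header(path).num_empty, with path being whatever index file you point it at), while a legacy file reports -1 there, mirroring the "unknown, needs counting" marker set by read_hashheader1() before count_empty() runs.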