diff --git a/src/borg/hashindex.pyx b/src/borg/hashindex.pyx
index 6e9404f10..d4afe5f45 100644
--- a/src/borg/hashindex.pyx
+++ b/src/borg/hashindex.pyx
@@ -3,9 +3,10 @@ from collections import namedtuple
 cimport cython
 from libc.stdint cimport uint32_t, UINT32_MAX, uint64_t
 from libc.string cimport memcpy
-from cpython.buffer cimport PyBUF_SIMPLE, PyObject_GetBuffer, PyBuffer_Release
 from cpython.bytes cimport PyBytes_FromStringAndSize, PyBytes_CheckExact, PyBytes_GET_SIZE, PyBytes_AS_STRING
 
+from borghash cimport _borghash
+
 API_VERSION = '1.2_01'
 
 
@@ -349,109 +350,63 @@ cdef class NSKeyIterator1:  # legacy borg 1.x
 ChunkIndexEntry = namedtuple('ChunkIndexEntry', 'refcount size')
 
 
-cdef class ChunkIndex(IndexBase):
+class ChunkIndex:
     """
     Mapping of 32 byte keys to (refcount, size), which are all 32-bit unsigned.
-
-    The reference count cannot overflow. If an overflow would occur, the refcount
-    is fixed to MAX_VALUE and will neither increase nor decrease by incref(), decref()
-    or add().
-
-    Prior signed 32-bit overflow is handled correctly for most cases: All values
-    from UINT32_MAX (2**32-1, inclusive) to MAX_VALUE (exclusive) are reserved and either
-    cause silent data loss (-1, -2) or will raise an AssertionError when accessed.
-    Other values are handled correctly. Note that previously the refcount could also reach
-    0 by *increasing* it.
-
-    Assigning refcounts in this reserved range is an invalid operation and raises AssertionError.
     """
+    MAX_VALUE = 2**32 - 1  # borghash has the full uint32_t range
 
-    value_size = 8
-
-    def __getitem__(self, key):
-        assert len(key) == self.key_size
-        data = <uint32_t *> hashindex_get(self.index, <unsigned char *>key)
-        if not data:
-            raise KeyError(key)
-        cdef uint32_t refcount = _le32toh(data[0])
-        assert refcount <= _MAX_VALUE, "invalid reference count"
-        return ChunkIndexEntry(refcount, _le32toh(data[1]))
+    def __init__(self, capacity=1000, path=None, permit_compact=False, usable=None):
+        if path:
+            self.ht = _borghash.HashTableNT.read(path)
+        else:
+            if usable is not None:
+                capacity = usable * 2  # load factor 0.5
+            self.ht = _borghash.HashTableNT(key_size=32, value_format="<II", value_type=ChunkIndexEntry, capacity=capacity)
 
     def __setitem__(self, key, value):
-        assert len(key) == self.key_size
-        cdef uint32_t[2] data
-        cdef uint32_t refcount = value[0]
-        assert refcount <= _MAX_VALUE, "invalid reference count"
-        data[0] = _htole32(refcount)
-        data[1] = _htole32(value[1])
-        if not hashindex_set(self.index, <unsigned char *>key, <void *>data):
-            raise Exception('hashindex_set failed')
+        if not isinstance(value, ChunkIndexEntry) and isinstance(value, tuple):
+            value = ChunkIndexEntry(*value)
+        self.ht[key] = value
+
+    def __getitem__(self, key):
+        return self.ht[key]
+
+    def __delitem__(self, key):
+        del self.ht[key]
 
     def __contains__(self, key):
-        assert len(key) == self.key_size
-        data = <uint32_t *> hashindex_get(self.index, <unsigned char *>key)
-        if data != NULL:
-            assert _le32toh(data[0]) <= _MAX_VALUE, "invalid reference count"
-        return data != NULL
+        return key in self.ht
 
-    def iteritems(self, marker=None):
-        cdef const unsigned char *key
-        iter = ChunkKeyIterator(self.key_size)
-        iter.idx = self
-        iter.index = self.index
-        if marker:
-            key = hashindex_get(self.index, <unsigned char *>marker)
-            if marker is None:
-                raise IndexError
-            iter.key = key - self.key_size
-        return iter
+    def __len__(self):
+        return len(self.ht)
+
+    def iteritems(self):
+        yield from self.ht.iteritems()
 
     def add(self, key, refs, size):
-        assert len(key) == self.key_size
-        cdef uint32_t[2] data
-        data[0] = _htole32(refs)
-        data[1] = _htole32(size)
-        self._add(<unsigned char *> key, data)
+        v = self.get(key, ChunkIndexEntry(0, 0))
+        refcount = min(self.MAX_VALUE, v.refcount + refs)
+        self[key] = v._replace(refcount=refcount, size=size)
 
-    cdef _add(self, unsigned char *key, uint32_t *data):
-        cdef uint64_t refcount1, refcount2, result64
-        values = <uint32_t *> hashindex_get(self.index, key)
-        if values:
-            refcount1 = _le32toh(values[0])
-            refcount2 = _le32toh(data[0])
-            assert refcount1 <= _MAX_VALUE, "invalid reference count"
-            assert refcount2 <= _MAX_VALUE, "invalid reference count"
-            result64 = refcount1 + refcount2
-            values[0] = _htole32(min(result64, _MAX_VALUE))
-            values[1] = data[1]
-        else:
-            if not hashindex_set(self.index, key, data):
-                raise Exception('hashindex_set failed')
+    def get(self, key, default=None):
+        try:
+            return self[key]
+        except KeyError:
+            return default
 
+    def compact(self):
+        pass
 
-cdef class ChunkKeyIterator:
-    cdef ChunkIndex idx
-    cdef HashIndex *index
-    cdef const unsigned char *key
-    cdef int key_size
-    cdef int exhausted
+    def clear(self):
+        pass
 
-    def __cinit__(self, key_size):
-        self.key = NULL
-        self.key_size = key_size
-        self.exhausted = 0
+    @classmethod
+    def read(cls, path, permit_compact=False):
+        return cls(path=path)
 
-    def __iter__(self):
-        return self
+    def write(self, path):
+        self.ht.write(path)
 
-    def __next__(self):
-        if self.exhausted:
-            raise StopIteration
-        self.key = hashindex_next_key(self.index, self.key)
-        if not self.key:
-            self.exhausted = 1
-            raise StopIteration
-        cdef uint32_t *value = <uint32_t *> (self.key + self.key_size)
-        cdef uint32_t refcount = _le32toh(value[0])
-        assert refcount <= _MAX_VALUE, "invalid reference count"
-        return (<char *>self.key)[:self.key_size], ChunkIndexEntry(refcount, _le32toh(value[1]))
+    def size(self):
+        return self.ht.size()
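Illustration (not part of the patch): the new pure-Python `ChunkIndex.add()` above keeps the old clamp-at-MAX_VALUE refcount semantics via `min(self.MAX_VALUE, v.refcount + refs)`. A minimal usage sketch, assuming borg is built with the borghash dependency so `borg.hashindex` imports cleanly and `HashTableNT` hands back `ChunkIndexEntry` namedtuples:

```python
# Sketch of the ChunkIndex behaviour shown in the diff above.
from borg.hashindex import ChunkIndex, ChunkIndexEntry

idx = ChunkIndex()
key = bytes(32)  # keys are exactly 32 bytes
idx[key] = ChunkIndexEntry(refcount=1, size=42)
idx.add(key, 2, 42)  # add() accumulates refcounts: now (3, 42)
assert idx[key] == (3, 42)

# add() still clamps at MAX_VALUE instead of wrapping around uint32_t:
idx.add(key, ChunkIndex.MAX_VALUE, 42)
assert idx[key].refcount == ChunkIndex.MAX_VALUE
```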
diff --git a/src/borg/selftest.py b/src/borg/selftest.py
index cb1ef7a8c..7c5d4e2c6 100644
--- a/src/borg/selftest.py
+++ b/src/borg/selftest.py
@@ -21,19 +21,13 @@ import time
 from unittest import TestResult, TestSuite, defaultTestLoader
 
-from .testsuite.hashindex_test import HashIndexDataTestCase, HashIndexRefcountingTestCase, HashIndexTestCase
+from .testsuite.hashindex_test import HashIndexRefcountingTestCase
 from .testsuite.crypto_test import CryptoTestCase
 from .testsuite.chunker_test import ChunkerTestCase
 
-SELFTEST_CASES = [
-    HashIndexDataTestCase,
-    HashIndexRefcountingTestCase,
-    HashIndexTestCase,
-    CryptoTestCase,
-    ChunkerTestCase,
-]
+SELFTEST_CASES = [HashIndexRefcountingTestCase, CryptoTestCase, ChunkerTestCase]
 
-SELFTEST_COUNT = 19
+SELFTEST_COUNT = 13
 
 
 class SelfTestResult(TestResult):
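Illustration (not part of the patch): `SELFTEST_COUNT` must equal the number of test methods the loader collects from `SELFTEST_CASES`; dropping `HashIndexDataTestCase` and `HashIndexTestCase` plus the removed `test_setitem_raises` accounts for the six fewer tests (19 to 13). A hypothetical cross-check helper, `count_selftest_cases`, built on the same `unittest` machinery that `selftest.py` already imports:

```python
from unittest import TestSuite, defaultTestLoader

def count_selftest_cases(cases):
    # Count the test methods defaultTestLoader would collect from the cases.
    suite = TestSuite(defaultTestLoader.loadTestsFromTestCase(case) for case in cases)
    return suite.countTestCases()

# e.g.: assert count_selftest_cases(SELFTEST_CASES) == SELFTEST_COUNT
```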
diff --git a/src/borg/testsuite/hashindex_test.py b/src/borg/testsuite/hashindex_test.py
index 6bf3a2ab1..c85ecbd31 100644
--- a/src/borg/testsuite/hashindex_test.py
+++ b/src/borg/testsuite/hashindex_test.py
@@ -1,16 +1,11 @@
 # Note: these tests are part of the self test, do not use or import pytest functionality here.
 # See borg.selftest for details. If you add/remove test methods, update SELFTEST_COUNT
 
-import base64
 import hashlib
-import io
-import os
-import tempfile
-import zlib
+import struct
 
 from ..hashindex import NSIndex, ChunkIndex
-from ..crypto.file_integrity import IntegrityCheckedFile, FileIntegrityError
-from . import BaseTestCase, unopened_tempfile
+from . import BaseTestCase
 
 
 def H(x):
@@ -23,361 +18,21 @@ def H2(x):
     return hashlib.sha256(H(x)).digest()
 
 
-class HashIndexTestCase(BaseTestCase):
-    def _generic_test(self, cls, make_value, sha):
-        idx = cls()
-        self.assert_equal(len(idx), 0)
-        # Test set
-        for x in range(100):
-            idx[H(x)] = make_value(x)
-        self.assert_equal(len(idx), 100)
-        for x in range(100):
-            self.assert_equal(idx[H(x)], make_value(x))
-        # Test update
-        for x in range(100):
-            idx[H(x)] = make_value(x * 2)
-        self.assert_equal(len(idx), 100)
-        for x in range(100):
-            self.assert_equal(idx[H(x)], make_value(x * 2))
-        # Test delete
-        for x in range(50):
-            del idx[H(x)]
-        # Test some keys still in there
-        for x in range(50, 100):
-            assert H(x) in idx
-        # Test some keys not there any more
-        for x in range(50):
-            assert H(x) not in idx
-        # Test delete non-existing key
-        for x in range(50):
-            self.assert_raises(KeyError, idx.__delitem__, H(x))
-        self.assert_equal(len(idx), 50)
-        with unopened_tempfile() as filepath:
-            idx.write(filepath)
-            del idx
-            # Verify file contents
-            with open(filepath, "rb") as fd:
-                self.assert_equal(hashlib.sha256(fd.read()).hexdigest(), sha)
-            # Make sure we can open the file
-            idx = cls.read(filepath)
-            self.assert_equal(len(idx), 50)
-            for x in range(50, 100):
-                self.assert_equal(idx[H(x)], make_value(x * 2))
-            idx.clear()
-            self.assert_equal(len(idx), 0)
-            idx.write(filepath)
-            del idx
-            self.assert_equal(len(cls.read(filepath)), 0)
-        idx = cls()
-        # Test setdefault - set non-existing key
-        idx.setdefault(H(0), make_value(42))
-        assert H(0) in idx
-        assert idx[H(0)] == make_value(42)
-        # Test setdefault - do not set existing key
-        idx.setdefault(H(0), make_value(23))
-        assert H(0) in idx
-        assert idx[H(0)] == make_value(42)
-        # Test setdefault - get-like return value, key not present
-        assert idx.setdefault(H(1), make_value(23)) == make_value(23)
-        # Test setdefault - get-like return value, key present
-        assert idx.setdefault(H(0), make_value(23)) == make_value(42)
-        # clean up setdefault test
-        del idx
-
-    def test_nsindex(self):
-        self._generic_test(
-            NSIndex, lambda x: (x, x, x), "640b909cf07884cc11fdf5431ffc27dee399770ceadecce31dffecd130a311a3"
-        )
-
-    def test_chunkindex(self):
-        self._generic_test(
-            ChunkIndex, lambda x: (x, x), "5915fcf986da12e5f3ac68e05242b9c729e6101b0460b1d4e4a9e9f7cdf1b7da"
-        )
-
-    def test_resize(self):
-        n = 2000  # Must be >= MIN_BUCKETS
-        with unopened_tempfile() as filepath:
-            idx = NSIndex()
-            idx.write(filepath)
-            initial_size = os.path.getsize(filepath)
-            self.assert_equal(len(idx), 0)
-            for x in range(n):
-                idx[H(x)] = x, x, x
-            idx.write(filepath)
-            assert initial_size < os.path.getsize(filepath)
-            for x in range(n):
-                del idx[H(x)]
-            self.assert_equal(len(idx), 0)
-            idx.write(filepath)
-            self.assert_equal(initial_size, os.path.getsize(filepath))
-
-    def test_iteritems(self):
-        idx = NSIndex()
-        for x in range(100):
-            idx[H(x)] = x, x, x
-        iterator = idx.iteritems()
-        all = list(iterator)
-        self.assert_equal(len(all), 100)
-        # iterator is already exhausted by list():
-        self.assert_raises(StopIteration, next, iterator)
-        second_half = list(idx.iteritems(marker=all[49][0]))
-        self.assert_equal(len(second_half), 50)
-        self.assert_equal(second_half, all[50:])
-
-
-class HashIndexExtraTestCase(BaseTestCase):
-    """These tests are separate because they should not become part of the selftest."""
-
-    def test_chunk_indexer(self):
-        # see _hashindex.c hash_sizes, we want to be close to the max. load
-        # because interesting errors happen there.
-        key_count = int(65537 * ChunkIndex.MAX_LOAD_FACTOR) - 10
-        index = ChunkIndex(key_count)
-        all_keys = [hashlib.sha256(H(k)).digest() for k in range(key_count)]
-        # we're gonna delete 1/3 of all_keys, so let's split them 2/3 and 1/3:
-        keys, to_delete_keys = all_keys[0 : (2 * key_count // 3)], all_keys[(2 * key_count // 3) :]
-
-        for i, key in enumerate(keys):
-            index[key] = (i, i)
-        for i, key in enumerate(to_delete_keys):
-            index[key] = (i, i)
-
-        for key in to_delete_keys:
-            del index[key]
-        for i, key in enumerate(keys):
-            assert index[key] == (i, i)
-        for key in to_delete_keys:
-            assert index.get(key) is None
-
-        # now delete every key still in the index
-        for key in keys:
-            del index[key]
-        # the index should now be empty
-        assert list(index.iteritems()) == []
-
-
-class HashIndexSizeTestCase(BaseTestCase):
-    def test_size_on_disk(self):
-        idx = ChunkIndex()
-        assert idx.size() == 1024 + 1031 * (32 + 2 * 4)
-
-    def test_size_on_disk_accurate(self):
-        idx = ChunkIndex()
-        for i in range(1234):
-            idx[H(i)] = i, i**2
-        with unopened_tempfile() as filepath:
-            idx.write(filepath)
-            size = os.path.getsize(filepath)
-        assert idx.size() == size
-
-
 class HashIndexRefcountingTestCase(BaseTestCase):
     def test_chunkindex_add(self):
-        idx1 = ChunkIndex()
-        idx1.add(H(1), 5, 6)
-        assert idx1[H(1)] == (5, 6)
-        idx1.add(H(1), 1, 2)
-        assert idx1[H(1)] == (6, 2)
-
-    def test_setitem_raises(self):
-        idx1 = ChunkIndex()
-        with self.assert_raises(AssertionError):
-            idx1[H(1)] = ChunkIndex.MAX_VALUE + 1, 0
+        chunks = ChunkIndex()
+        x = H2(1)
+        chunks.add(x, 5, 6)
+        assert chunks[x] == (5, 6)
+        chunks.add(x, 1, 2)
+        assert chunks[x] == (6, 2)
 
     def test_keyerror(self):
-        idx = ChunkIndex()
+        chunks = ChunkIndex()
         with self.assert_raises(KeyError):
-            idx[H(1)]
-        with self.assert_raises(OverflowError):
-            idx.add(H(1), -1, 0)
-
-
-class HashIndexDataTestCase(BaseTestCase):
-    # This bytestring was created with borg2-pre 2022-09-30
-    HASHINDEX = (
-        b"eJzt0DEKgwAMQNFoBXsMj9DqDUQoToKTR3Hzwr2DZi+0HS19HwIZHhnST/OjHYeljIhLTl1FVDlN7te"
-        b"Q9M/tGcdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHM"
-        b"dxHMdxHMdxHMdxHMdxHMdxHPfqbu+7F2nKz67Nc9sX97r1+Rt/4TiO4ziO4ziO4ziO4ziO4ziO4ziO4"
-        b"ziO4ziO4ziO4ziO4ziO4ziO4ziO4ziO487lDoRvHEk="
-    )
-
-    def _serialize_hashindex(self, idx):
-        with tempfile.TemporaryDirectory() as tempdir:
-            file = os.path.join(tempdir, "idx")
-            idx.write(file)
-            with open(file, "rb") as f:
-                return self._pack(f.read())
-
-    def _deserialize_hashindex(self, bytestring):
-        with tempfile.TemporaryDirectory() as tempdir:
-            file = os.path.join(tempdir, "idx")
-            with open(file, "wb") as f:
-                f.write(self._unpack(bytestring))
-            return ChunkIndex.read(file)
-
-    def _pack(self, bytestring):
-        return base64.b64encode(zlib.compress(bytestring))
-
-    def _unpack(self, bytestring):
-        return zlib.decompress(base64.b64decode(bytestring))
-
-    def test_identical_creation(self):
-        idx1 = ChunkIndex()
-        idx1[H(1)] = 1, 2
-        idx1[H(2)] = 2**31 - 1, 0
-        idx1[H(3)] = 4294962296, 0  # 4294962296 is -5000 interpreted as an uint32_t
-
-        serialized = self._serialize_hashindex(idx1)
-        assert self._unpack(serialized) == self._unpack(self.HASHINDEX)
-
-
-class HashIndexIntegrityTestCase(HashIndexDataTestCase):
-    def write_integrity_checked_index(self, tempdir):
-        idx = self._deserialize_hashindex(self.HASHINDEX)
-        file = os.path.join(tempdir, "idx")
-        with IntegrityCheckedFile(path=file, write=True) as fd:
-            idx.write(fd)
-        integrity_data = fd.integrity_data
-        assert "final" in integrity_data
-        assert "HashHeader" in integrity_data
-        return file, integrity_data
-
-    def test_integrity_checked_file(self):
-        with tempfile.TemporaryDirectory() as tempdir:
-            file, integrity_data = self.write_integrity_checked_index(tempdir)
-            with open(file, "r+b") as fd:
-                fd.write(b"Foo")
-            with self.assert_raises(FileIntegrityError):
-                with IntegrityCheckedFile(path=file, write=False, integrity_data=integrity_data) as fd:
-                    ChunkIndex.read(fd)
-
-
-class HashIndexCompactTestCase(HashIndexDataTestCase):
-    def index(self, num_entries, num_buckets, num_empty):
-        index_data = io.BytesIO()
-        index_data.write(b"BORG2IDX")
-        # version
-        index_data.write((2).to_bytes(4, "little"))
-        # num_entries
-        index_data.write(num_entries.to_bytes(4, "little"))
-        # num_buckets
-        index_data.write(num_buckets.to_bytes(4, "little"))
-        # num_empty
-        index_data.write(num_empty.to_bytes(4, "little"))
-        # key_size
-        index_data.write((32).to_bytes(4, "little"))
-        # value_size
-        index_data.write((3 * 4).to_bytes(4, "little"))
-        # reserved
-        index_data.write(bytes(1024 - 32))
-
-        self.index_data = index_data
-
-    def index_from_data(self):
-        self.index_data.seek(0)
-        # Since we are trying to carefully control the layout of the hashindex,
-        # we set permit_compact to prevent hashindex_read from resizing the hash table.
-        index = ChunkIndex.read(self.index_data, permit_compact=True)
-        return index
-
-    def write_entry(self, key, *values):
-        self.index_data.write(key)
-        for value in values:
-            self.index_data.write(value.to_bytes(4, "little"))
-
-    def write_empty(self, key):
-        self.write_entry(key, 0xFFFFFFFF, 0, 0)
-
-    def write_deleted(self, key):
-        self.write_entry(key, 0xFFFFFFFE, 0, 0)
-
-    def compare_indexes(self, idx1, idx2):
-        """Check that the two hash tables contain the same data. idx1
-        is allowed to have "mis-filed" entries, because we only need to
-        iterate over it. But idx2 needs to support lookup."""
-        for k, v in idx1.iteritems():
-            assert v == idx2[k]
-        assert len(idx1) == len(idx2)
-
-    def compare_compact(self, layout):
-        """A generic test of a hashindex with the specified layout. layout should
-        be a string consisting only of the characters '*' (filled), 'D' (deleted)
-        and 'E' (empty).
-        """
-        num_buckets = len(layout)
-        num_empty = layout.count("E")
-        num_entries = layout.count("*")
-        self.index(num_entries=num_entries, num_buckets=num_buckets, num_empty=num_empty)
-        k = 0
-        for c in layout:
-            if c == "D":
-                self.write_deleted(H2(k))
-            elif c == "E":
-                self.write_empty(H2(k))
-            else:
-                assert c == "*"
-                self.write_entry(H2(k), 3 * k + 1, 3 * k + 2, 3 * k + 3)
-            k += 1
-        idx = self.index_from_data()
-        cpt = self.index_from_data()
-        cpt.compact()
-        # Note that idx is not a valid hash table, since the entries are not
-        # stored where they should be. So lookups of the form idx[k] can fail.
-        # But cpt is a valid hash table, since there are no empty buckets.
-        assert idx.size() == 1024 + num_buckets * (32 + 3 * 4)
-        assert cpt.size() == 1024 + num_entries * (32 + 3 * 4)
-        self.compare_indexes(idx, cpt)
-
-    def test_simple(self):
-        self.compare_compact("*DE**E")
-
-    def test_first_empty(self):
-        self.compare_compact("D*E**E")
-
-    def test_last_used(self):
-        self.compare_compact("D*E*E*")
-
-    def test_too_few_empty_slots(self):
-        self.compare_compact("D**EE*")
-
-    def test_empty(self):
-        self.compare_compact("DEDEED")
-
-    def test_num_buckets_zero(self):
-        self.compare_compact("")
-
-    def test_already_compact(self):
-        self.compare_compact("***")
-
-    def test_all_at_front(self):
-        self.compare_compact("*DEEED")
-        self.compare_compact("**DEED")
-        self.compare_compact("***EED")
-        self.compare_compact("****ED")
-        self.compare_compact("*****D")
-
-    def test_all_at_back(self):
-        self.compare_compact("EDEEE*")
-        self.compare_compact("DEDE**")
-        self.compare_compact("DED***")
-        self.compare_compact("ED****")
-        self.compare_compact("D*****")
-
-
-class NSIndexTestCase(BaseTestCase):
-    def test_nsindex_segment_limit(self):
-        idx = NSIndex()
-        with self.assert_raises(AssertionError):
-            idx[H(1)] = NSIndex.MAX_VALUE + 1, 0, 0
-        assert H(1) not in idx
-        idx[H(2)] = NSIndex.MAX_VALUE, 0, 0
-        assert H(2) in idx
-
-
-class AllIndexTestCase(BaseTestCase):
-    def test_max_load_factor(self):
-        assert NSIndex.MAX_LOAD_FACTOR < 1.0
-        assert ChunkIndex.MAX_LOAD_FACTOR < 1.0
+        chunks[H(1)]
+        with self.assert_raises(struct.error):
+            chunks.add(H(1), -1, 0)
 
 
 class IndexCorruptionTestCase(BaseTestCase):
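Illustration (not part of the patch): the updated `test_keyerror` expects `struct.error` rather than `OverflowError` because the borghash-backed table packs values with `struct` using an unsigned 32-bit layout (the `"<II"` value format is the reconstruction assumed in the constructor call above), so a negative refcount now fails at pack time:

```python
import struct

# A negative refcount cannot be packed into an unsigned 32-bit field,
# so struct raises struct.error before anything reaches the hash table.
try:
    struct.pack("<II", -1, 0)
except struct.error as exc:
    print("rejected:", exc)  # e.g. "argument out of range"
```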