Mirror of https://github.com/borgbackup/borg.git

ChunkIndex: use borghash.HashTableNT

Also:
- remove most hashindex tests; borghash has its own tests
- add a small wrapper class ChunkIndex around HashTableNT to
  adapt API differences and add some special methods.

Note: I needed to manually copy the .pxd files from borghash
into cwd, because they were not found:
- ./borghash.pxd
- borghash/_borghash.pxd
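
As a rough sketch of what the wrapper adapts: the new ChunkIndex keeps borg's old
mapping-style API while delegating storage to borghash. The constructor arguments
and method names below are taken from the diff; the key bytes and file path are
purely illustrative, and the sketch assumes borghash is installed:

    # Hedged sketch, assuming the ChunkIndex wrapper from this commit is importable.
    from borg.hashindex import ChunkIndex

    idx = ChunkIndex(usable=1000)     # capacity = usable * 2, i.e. load factor 0.5
    key = bytes(32)                   # keys are 32-byte chunk ids
    idx.add(key, 1, 4096)             # refcount += 1, size := 4096
    idx.add(key, 2, 4096)
    assert idx[key] == (3, 4096)      # ChunkIndexEntry(refcount=3, size=4096)

    idx.write("/tmp/chunks.idx")      # delegates to HashTableNT.write()
    idx2 = ChunkIndex.read("/tmp/chunks.idx")
    assert idx2[key] == idx[key]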
Thomas Waldmann 2024-10-24 23:35:32 +02:00
parent f7f2f23a7c
commit ad5b18008d
3 changed files with 60 additions and 456 deletions

src/borg/hashindex.pyx

@@ -3,9 +3,10 @@ from collections import namedtuple
 cimport cython
 from libc.stdint cimport uint32_t, UINT32_MAX, uint64_t
 from libc.string cimport memcpy
-from cpython.buffer cimport PyBUF_SIMPLE, PyObject_GetBuffer, PyBuffer_Release
 from cpython.bytes cimport PyBytes_FromStringAndSize, PyBytes_CheckExact, PyBytes_GET_SIZE, PyBytes_AS_STRING
 
+from borghash cimport _borghash
+
 API_VERSION = '1.2_01'
@@ -349,109 +350,63 @@ cdef class NSKeyIterator1:  # legacy borg 1.x
 ChunkIndexEntry = namedtuple('ChunkIndexEntry', 'refcount size')
 
-cdef class ChunkIndex(IndexBase):
+class ChunkIndex:
     """
     Mapping of 32 byte keys to (refcount, size), which are all 32-bit unsigned.
-
-    The reference count cannot overflow. If an overflow would occur, the refcount
-    is fixed to MAX_VALUE and will neither increase nor decrease by incref(), decref()
-    or add().
-
-    Prior signed 32-bit overflow is handled correctly for most cases: All values
-    from UINT32_MAX (2**32-1, inclusive) to MAX_VALUE (exclusive) are reserved and either
-    cause silent data loss (-1, -2) or will raise an AssertionError when accessed.
-    Other values are handled correctly. Note that previously the refcount could also reach
-    0 by *increasing* it.
-
-    Assigning refcounts in this reserved range is an invalid operation and raises AssertionError.
     """
+    MAX_VALUE = 2**32 - 1  # borghash has the full uint32_t range
 
-    value_size = 8
-
-    def __getitem__(self, key):
-        assert len(key) == self.key_size
-        data = <uint32_t *>hashindex_get(self.index, <unsigned char *>key)
-        if not data:
-            raise KeyError(key)
-        cdef uint32_t refcount = _le32toh(data[0])
-        assert refcount <= _MAX_VALUE, "invalid reference count"
-        return ChunkIndexEntry(refcount, _le32toh(data[1]))
+    def __init__(self, capacity=1000, path=None, permit_compact=False, usable=None):
+        if path:
+            self.ht = _borghash.HashTableNT.read(path)
+        else:
+            if usable is not None:
+                capacity = usable * 2  # load factor 0.5
+            self.ht = _borghash.HashTableNT(key_size=32, value_format="<II", namedtuple_type=ChunkIndexEntry, capacity=capacity)
 
     def __setitem__(self, key, value):
-        assert len(key) == self.key_size
-        cdef uint32_t[2] data
-        cdef uint32_t refcount = value[0]
-        assert refcount <= _MAX_VALUE, "invalid reference count"
-        data[0] = _htole32(refcount)
-        data[1] = _htole32(value[1])
-        if not hashindex_set(self.index, <unsigned char *>key, data):
-            raise Exception('hashindex_set failed')
+        if not isinstance(value, ChunkIndexEntry) and isinstance(value, tuple):
+            value = ChunkIndexEntry(*value)
+        self.ht[key] = value
+
+    def __getitem__(self, key):
+        return self.ht[key]
+
+    def __delitem__(self, key):
+        del self.ht[key]
 
     def __contains__(self, key):
-        assert len(key) == self.key_size
-        data = <uint32_t *>hashindex_get(self.index, <unsigned char *>key)
-        if data != NULL:
-            assert _le32toh(data[0]) <= _MAX_VALUE, "invalid reference count"
-        return data != NULL
+        return key in self.ht
 
-    def iteritems(self, marker=None):
-        cdef const unsigned char *key
-        iter = ChunkKeyIterator(self.key_size)
-        iter.idx = self
-        iter.index = self.index
-        if marker:
-            key = hashindex_get(self.index, <unsigned char *>marker)
-            if marker is None:
-                raise IndexError
-            iter.key = key - self.key_size
-        return iter
+    def __len__(self):
+        return len(self.ht)
+
+    def iteritems(self):
+        yield from self.ht.iteritems()
 
     def add(self, key, refs, size):
-        assert len(key) == self.key_size
-        cdef uint32_t[2] data
-        data[0] = _htole32(refs)
-        data[1] = _htole32(size)
-        self._add(<unsigned char*> key, data)
-
-    cdef _add(self, unsigned char *key, uint32_t *data):
-        cdef uint64_t refcount1, refcount2, result64
-        values = <uint32_t*> hashindex_get(self.index, key)
-        if values:
-            refcount1 = _le32toh(values[0])
-            refcount2 = _le32toh(data[0])
-            assert refcount1 <= _MAX_VALUE, "invalid reference count"
-            assert refcount2 <= _MAX_VALUE, "invalid reference count"
-            result64 = refcount1 + refcount2
-            values[0] = _htole32(min(result64, _MAX_VALUE))
-            values[1] = data[1]
-        else:
-            if not hashindex_set(self.index, key, data):
-                raise Exception('hashindex_set failed')
+        v = self.get(key, ChunkIndexEntry(0, 0))
+        refcount = min(self.MAX_VALUE, v.refcount + refs)
+        self[key] = v._replace(refcount=refcount, size=size)
 
+    def get(self, key, default=None):
+        try:
+            return self[key]
+        except KeyError:
+            return default
 
-cdef class ChunkKeyIterator:
-    cdef ChunkIndex idx
-    cdef HashIndex *index
-    cdef const unsigned char *key
-    cdef int key_size
-    cdef int exhausted
+    def compact(self):
+        pass
 
-    def __cinit__(self, key_size):
-        self.key = NULL
-        self.key_size = key_size
-        self.exhausted = 0
+    def clear(self):
+        pass
 
-    def __iter__(self):
-        return self
+    @classmethod
+    def read(cls, path, permit_compact=False):
+        return cls(path=path)
 
-    def __next__(self):
-        if self.exhausted:
-            raise StopIteration
-        self.key = hashindex_next_key(self.index, <unsigned char *>self.key)
-        if not self.key:
-            self.exhausted = 1
-            raise StopIteration
-        cdef uint32_t *value = <uint32_t *>(self.key + self.key_size)
-        cdef uint32_t refcount = _le32toh(value[0])
-        assert refcount <= _MAX_VALUE, "invalid reference count"
-        return (<char *>self.key)[:self.key_size], ChunkIndexEntry(refcount, _le32toh(value[1]))
+    def write(self, path):
+        self.ht.write(path)
+
+    def size(self):
+        return self.ht.size()
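
One behavioral detail of the new add() worth noting: the refcount still saturates at
MAX_VALUE instead of wrapping, now enforced in plain Python via min(). A small sketch
using only the wrapper's public API as shown above (the key value is hypothetical):

    # Sketch: refcounts clamp at MAX_VALUE = 2**32 - 1 rather than overflowing.
    from borg.hashindex import ChunkIndex

    idx = ChunkIndex()
    key = b"\x42" * 32
    idx[key] = (ChunkIndex.MAX_VALUE - 1, 123)  # start just below the cap
    idx.add(key, 5, 123)                        # would exceed uint32 without clamping
    assert idx[key].refcount == ChunkIndex.MAX_VALUE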

src/borg/selftest.py

@@ -21,19 +21,13 @@
 import time
 from unittest import TestResult, TestSuite, defaultTestLoader
 
-from .testsuite.hashindex_test import HashIndexDataTestCase, HashIndexRefcountingTestCase, HashIndexTestCase
+from .testsuite.hashindex_test import HashIndexRefcountingTestCase
 from .testsuite.crypto_test import CryptoTestCase
 from .testsuite.chunker_test import ChunkerTestCase
 
-SELFTEST_CASES = [
-    HashIndexDataTestCase,
-    HashIndexRefcountingTestCase,
-    HashIndexTestCase,
-    CryptoTestCase,
-    ChunkerTestCase,
-]
+SELFTEST_CASES = [HashIndexRefcountingTestCase, CryptoTestCase, ChunkerTestCase]
 
-SELFTEST_COUNT = 19
+SELFTEST_COUNT = 13
 
 class SelfTestResult(TestResult):
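
SELFTEST_COUNT drops from 19 to 13 because six test methods leave the selftest: four
from HashIndexTestCase, one from HashIndexDataTestCase, and the dropped
test_setitem_raises. The count can be cross-checked with stock unittest machinery
(a hedged sketch; only SELFTEST_CASES and SELFTEST_COUNT come from the file above):

    # Verify SELFTEST_COUNT matches the test methods actually collected.
    from unittest import TestSuite, defaultTestLoader
    from borg.selftest import SELFTEST_CASES, SELFTEST_COUNT

    suite = TestSuite(defaultTestLoader.loadTestsFromTestCase(case) for case in SELFTEST_CASES)
    assert suite.countTestCases() == SELFTEST_COUNT  # 13 after this commit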

src/borg/testsuite/hashindex_test.py

@@ -1,16 +1,11 @@
 # Note: these tests are part of the self test, do not use or import pytest functionality here.
 # See borg.selftest for details. If you add/remove test methods, update SELFTEST_COUNT
 
-import base64
 import hashlib
-import io
-import os
-import tempfile
-import zlib
+import struct
 
 from ..hashindex import NSIndex, ChunkIndex
-from ..crypto.file_integrity import IntegrityCheckedFile, FileIntegrityError
-from . import BaseTestCase, unopened_tempfile
+from . import BaseTestCase
 
 
 def H(x):
@@ -23,361 +18,21 @@ def H2(x):
     return hashlib.sha256(H(x)).digest()
 
-class HashIndexTestCase(BaseTestCase):
-    def _generic_test(self, cls, make_value, sha):
-        idx = cls()
-        self.assert_equal(len(idx), 0)
-        # Test set
-        for x in range(100):
-            idx[H(x)] = make_value(x)
-        self.assert_equal(len(idx), 100)
-        for x in range(100):
-            self.assert_equal(idx[H(x)], make_value(x))
-        # Test update
-        for x in range(100):
-            idx[H(x)] = make_value(x * 2)
-        self.assert_equal(len(idx), 100)
-        for x in range(100):
-            self.assert_equal(idx[H(x)], make_value(x * 2))
-        # Test delete
-        for x in range(50):
-            del idx[H(x)]
-        # Test some keys still in there
-        for x in range(50, 100):
-            assert H(x) in idx
-        # Test some keys not there any more
-        for x in range(50):
-            assert H(x) not in idx
-        # Test delete non-existing key
-        for x in range(50):
-            self.assert_raises(KeyError, idx.__delitem__, H(x))
-        self.assert_equal(len(idx), 50)
-        with unopened_tempfile() as filepath:
-            idx.write(filepath)
-            del idx
-            # Verify file contents
-            with open(filepath, "rb") as fd:
-                self.assert_equal(hashlib.sha256(fd.read()).hexdigest(), sha)
-            # Make sure we can open the file
-            idx = cls.read(filepath)
-            self.assert_equal(len(idx), 50)
-            for x in range(50, 100):
-                self.assert_equal(idx[H(x)], make_value(x * 2))
-            idx.clear()
-            self.assert_equal(len(idx), 0)
-            idx.write(filepath)
-            del idx
-            self.assert_equal(len(cls.read(filepath)), 0)
-        idx = cls()
-        # Test setdefault - set non-existing key
-        idx.setdefault(H(0), make_value(42))
-        assert H(0) in idx
-        assert idx[H(0)] == make_value(42)
-        # Test setdefault - do not set existing key
-        idx.setdefault(H(0), make_value(23))
-        assert H(0) in idx
-        assert idx[H(0)] == make_value(42)
-        # Test setdefault - get-like return value, key not present
-        assert idx.setdefault(H(1), make_value(23)) == make_value(23)
-        # Test setdefault - get-like return value, key present
-        assert idx.setdefault(H(0), make_value(23)) == make_value(42)
-        # clean up setdefault test
-        del idx
-
-    def test_nsindex(self):
-        self._generic_test(
-            NSIndex, lambda x: (x, x, x), "640b909cf07884cc11fdf5431ffc27dee399770ceadecce31dffecd130a311a3"
-        )
-
-    def test_chunkindex(self):
-        self._generic_test(
-            ChunkIndex, lambda x: (x, x), "5915fcf986da12e5f3ac68e05242b9c729e6101b0460b1d4e4a9e9f7cdf1b7da"
-        )
-
-    def test_resize(self):
-        n = 2000  # Must be >= MIN_BUCKETS
-        with unopened_tempfile() as filepath:
-            idx = NSIndex()
-            idx.write(filepath)
-            initial_size = os.path.getsize(filepath)
-            self.assert_equal(len(idx), 0)
-            for x in range(n):
-                idx[H(x)] = x, x, x
-            idx.write(filepath)
-            assert initial_size < os.path.getsize(filepath)
-            for x in range(n):
-                del idx[H(x)]
-            self.assert_equal(len(idx), 0)
-            idx.write(filepath)
-            self.assert_equal(initial_size, os.path.getsize(filepath))
-
-    def test_iteritems(self):
-        idx = NSIndex()
-        for x in range(100):
-            idx[H(x)] = x, x, x
-        iterator = idx.iteritems()
-        all = list(iterator)
-        self.assert_equal(len(all), 100)
-        # iterator is already exhausted by list():
-        self.assert_raises(StopIteration, next, iterator)
-        second_half = list(idx.iteritems(marker=all[49][0]))
-        self.assert_equal(len(second_half), 50)
-        self.assert_equal(second_half, all[50:])
-
-
-class HashIndexExtraTestCase(BaseTestCase):
-    """These tests are separate because they should not become part of the selftest."""
-
-    def test_chunk_indexer(self):
-        # see _hashindex.c hash_sizes, we want to be close to the max. load
-        # because interesting errors happen there.
-        key_count = int(65537 * ChunkIndex.MAX_LOAD_FACTOR) - 10
-        index = ChunkIndex(key_count)
-        all_keys = [hashlib.sha256(H(k)).digest() for k in range(key_count)]
-        # we're gonna delete 1/3 of all_keys, so let's split them 2/3 and 1/3:
-        keys, to_delete_keys = all_keys[0 : (2 * key_count // 3)], all_keys[(2 * key_count // 3) :]
-        for i, key in enumerate(keys):
-            index[key] = (i, i)
-        for i, key in enumerate(to_delete_keys):
-            index[key] = (i, i)
-        for key in to_delete_keys:
-            del index[key]
-        for i, key in enumerate(keys):
-            assert index[key] == (i, i)
-        for key in to_delete_keys:
-            assert index.get(key) is None
-        # now delete every key still in the index
-        for key in keys:
-            del index[key]
-        # the index should now be empty
-        assert list(index.iteritems()) == []
-
-
-class HashIndexSizeTestCase(BaseTestCase):
-    def test_size_on_disk(self):
-        idx = ChunkIndex()
-        assert idx.size() == 1024 + 1031 * (32 + 2 * 4)
-
-    def test_size_on_disk_accurate(self):
-        idx = ChunkIndex()
-        for i in range(1234):
-            idx[H(i)] = i, i**2
-        with unopened_tempfile() as filepath:
-            idx.write(filepath)
-            size = os.path.getsize(filepath)
-        assert idx.size() == size
-
-
 class HashIndexRefcountingTestCase(BaseTestCase):
     def test_chunkindex_add(self):
-        idx1 = ChunkIndex()
-        idx1.add(H(1), 5, 6)
-        assert idx1[H(1)] == (5, 6)
-        idx1.add(H(1), 1, 2)
-        assert idx1[H(1)] == (6, 2)
-
-    def test_setitem_raises(self):
-        idx1 = ChunkIndex()
-        with self.assert_raises(AssertionError):
-            idx1[H(1)] = ChunkIndex.MAX_VALUE + 1, 0
+        chunks = ChunkIndex()
+        x = H2(1)
+        chunks.add(x, 5, 6)
+        assert chunks[x] == (5, 6)
+        chunks.add(x, 1, 2)
+        assert chunks[x] == (6, 2)
 
     def test_keyerror(self):
-        idx = ChunkIndex()
+        chunks = ChunkIndex()
         with self.assert_raises(KeyError):
-            idx[H(1)]
-        with self.assert_raises(OverflowError):
-            idx.add(H(1), -1, 0)
+            chunks[H(1)]
+        with self.assert_raises(struct.error):
+            chunks.add(H(1), -1, 0)
 
-
-class HashIndexDataTestCase(BaseTestCase):
-    # This bytestring was created with borg2-pre 2022-09-30
-    HASHINDEX = (
-        b"eJzt0DEKgwAMQNFoBXsMj9DqDUQoToKTR3Hzwr2DZi+0HS19HwIZHhnST/OjHYeljIhLTl1FVDlN7te"
-        b"Q9M/tGcdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHM"
-        b"dxHMdxHMdxHMdxHMdxHMdxHPfqbu+7F2nKz67Nc9sX97r1+Rt/4TiO4ziO4ziO4ziO4ziO4ziO4ziO4"
-        b"ziO4ziO4ziO4ziO4ziO4ziO4ziO4ziO487lDoRvHEk="
-    )
-
-    def _serialize_hashindex(self, idx):
-        with tempfile.TemporaryDirectory() as tempdir:
-            file = os.path.join(tempdir, "idx")
-            idx.write(file)
-            with open(file, "rb") as f:
-                return self._pack(f.read())
-
-    def _deserialize_hashindex(self, bytestring):
-        with tempfile.TemporaryDirectory() as tempdir:
-            file = os.path.join(tempdir, "idx")
-            with open(file, "wb") as f:
-                f.write(self._unpack(bytestring))
-            return ChunkIndex.read(file)
-
-    def _pack(self, bytestring):
-        return base64.b64encode(zlib.compress(bytestring))
-
-    def _unpack(self, bytestring):
-        return zlib.decompress(base64.b64decode(bytestring))
-
-    def test_identical_creation(self):
-        idx1 = ChunkIndex()
-        idx1[H(1)] = 1, 2
-        idx1[H(2)] = 2**31 - 1, 0
-        idx1[H(3)] = 4294962296, 0  # 4294962296 is -5000 interpreted as an uint32_t
-
-        serialized = self._serialize_hashindex(idx1)
-        assert self._unpack(serialized) == self._unpack(self.HASHINDEX)
-
-
-class HashIndexIntegrityTestCase(HashIndexDataTestCase):
-    def write_integrity_checked_index(self, tempdir):
-        idx = self._deserialize_hashindex(self.HASHINDEX)
-        file = os.path.join(tempdir, "idx")
-        with IntegrityCheckedFile(path=file, write=True) as fd:
-            idx.write(fd)
-        integrity_data = fd.integrity_data
-        assert "final" in integrity_data
-        assert "HashHeader" in integrity_data
-        return file, integrity_data
-
-    def test_integrity_checked_file(self):
-        with tempfile.TemporaryDirectory() as tempdir:
-            file, integrity_data = self.write_integrity_checked_index(tempdir)
-            with open(file, "r+b") as fd:
-                fd.write(b"Foo")
-            with self.assert_raises(FileIntegrityError):
-                with IntegrityCheckedFile(path=file, write=False, integrity_data=integrity_data) as fd:
-                    ChunkIndex.read(fd)
-
-
-class HashIndexCompactTestCase(HashIndexDataTestCase):
-    def index(self, num_entries, num_buckets, num_empty):
-        index_data = io.BytesIO()
-        index_data.write(b"BORG2IDX")
-        # version
-        index_data.write((2).to_bytes(4, "little"))
-        # num_entries
-        index_data.write(num_entries.to_bytes(4, "little"))
-        # num_buckets
-        index_data.write(num_buckets.to_bytes(4, "little"))
-        # num_empty
-        index_data.write(num_empty.to_bytes(4, "little"))
-        # key_size
-        index_data.write((32).to_bytes(4, "little"))
-        # value_size
-        index_data.write((3 * 4).to_bytes(4, "little"))
-        # reserved
-        index_data.write(bytes(1024 - 32))
-
-        self.index_data = index_data
-
-    def index_from_data(self):
-        self.index_data.seek(0)
-        # Since we are trying to carefully control the layout of the hashindex,
-        # we set permit_compact to prevent hashindex_read from resizing the hash table.
-        index = ChunkIndex.read(self.index_data, permit_compact=True)
-        return index
-
-    def write_entry(self, key, *values):
-        self.index_data.write(key)
-        for value in values:
-            self.index_data.write(value.to_bytes(4, "little"))
-
-    def write_empty(self, key):
-        self.write_entry(key, 0xFFFFFFFF, 0, 0)
-
-    def write_deleted(self, key):
-        self.write_entry(key, 0xFFFFFFFE, 0, 0)
-
-    def compare_indexes(self, idx1, idx2):
-        """Check that the two hash tables contain the same data. idx1
-        is allowed to have "mis-filed" entries, because we only need to
-        iterate over it. But idx2 needs to support lookup."""
-        for k, v in idx1.iteritems():
-            assert v == idx2[k]
-        assert len(idx1) == len(idx2)
-
-    def compare_compact(self, layout):
-        """A generic test of a hashindex with the specified layout. layout should
-        be a string consisting only of the characters '*' (filled), 'D' (deleted)
-        and 'E' (empty).
-        """
-        num_buckets = len(layout)
-        num_empty = layout.count("E")
-        num_entries = layout.count("*")
-        self.index(num_entries=num_entries, num_buckets=num_buckets, num_empty=num_empty)
-        k = 0
-        for c in layout:
-            if c == "D":
-                self.write_deleted(H2(k))
-            elif c == "E":
-                self.write_empty(H2(k))
-            else:
-                assert c == "*"
-                self.write_entry(H2(k), 3 * k + 1, 3 * k + 2, 3 * k + 3)
-            k += 1
-        idx = self.index_from_data()
-        cpt = self.index_from_data()
-        cpt.compact()
-        # Note that idx is not a valid hash table, since the entries are not
-        # stored where they should be. So lookups of the form idx[k] can fail.
-        # But cpt is a valid hash table, since there are no empty buckets.
-        assert idx.size() == 1024 + num_buckets * (32 + 3 * 4)
-        assert cpt.size() == 1024 + num_entries * (32 + 3 * 4)
-        self.compare_indexes(idx, cpt)
-
-    def test_simple(self):
-        self.compare_compact("*DE**E")
-
-    def test_first_empty(self):
-        self.compare_compact("D*E**E")
-
-    def test_last_used(self):
-        self.compare_compact("D*E*E*")
-
-    def test_too_few_empty_slots(self):
-        self.compare_compact("D**EE*")
-
-    def test_empty(self):
-        self.compare_compact("DEDEED")
-
-    def test_num_buckets_zero(self):
-        self.compare_compact("")
-
-    def test_already_compact(self):
-        self.compare_compact("***")
-
-    def test_all_at_front(self):
-        self.compare_compact("*DEEED")
-        self.compare_compact("**DEED")
-        self.compare_compact("***EED")
-        self.compare_compact("****ED")
-        self.compare_compact("*****D")
-
-    def test_all_at_back(self):
-        self.compare_compact("EDEEE*")
-        self.compare_compact("DEDE**")
-        self.compare_compact("DED***")
-        self.compare_compact("ED****")
-        self.compare_compact("D*****")
-
-
-class NSIndexTestCase(BaseTestCase):
-    def test_nsindex_segment_limit(self):
-        idx = NSIndex()
-        with self.assert_raises(AssertionError):
-            idx[H(1)] = NSIndex.MAX_VALUE + 1, 0, 0
-        assert H(1) not in idx
-        idx[H(2)] = NSIndex.MAX_VALUE, 0, 0
-        assert H(2) in idx
-
-
-class AllIndexTestCase(BaseTestCase):
-    def test_max_load_factor(self):
-        assert NSIndex.MAX_LOAD_FACTOR < 1.0
-        assert ChunkIndex.MAX_LOAD_FACTOR < 1.0
-
-
 class IndexCorruptionTestCase(BaseTestCase):
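
The expected exception in test_keyerror changes from OverflowError to struct.error
because values are now serialized through the "<II" struct format (two little-endian
uint32s), so a negative refcount is rejected at pack time. This can be reproduced
with the struct module alone, no borg imports needed:

    import struct

    struct.pack("<II", 3, 4096)     # fine: refcount=3, size=4096
    try:
        struct.pack("<II", -1, 0)   # a negative refcount cannot be packed
    except struct.error as e:
        print("rejected as expected:", e)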