mirror of https://github.com/borgbackup/borg.git
Merge pull request #1581 from enkore/issue/1580
hashindex: export max load factor to Python-space
This commit is contained in:
commit
3714be0d79
|
@ -854,8 +854,9 @@ class ArchiveChecker:
|
||||||
"""Fetch a list of all object keys from repository
|
"""Fetch a list of all object keys from repository
|
||||||
"""
|
"""
|
||||||
# Explicitly set the initial hash table capacity to avoid performance issues
|
# Explicitly set the initial hash table capacity to avoid performance issues
|
||||||
# due to hash table "resonance"
|
# due to hash table "resonance".
|
||||||
capacity = int(len(self.repository) * 1.35 + 1) # > len * 1.0 / HASH_MAX_LOAD (see _hashindex.c)
|
# Since reconstruction of archive items can add some new chunks, add 10 % headroom
|
||||||
|
capacity = int(len(self.repository) / ChunkIndex.MAX_LOAD_FACTOR * 1.1)
|
||||||
self.chunks = ChunkIndex(capacity)
|
self.chunks = ChunkIndex(capacity)
|
||||||
marker = None
|
marker = None
|
||||||
while True:
|
while True:
|
||||||
|
|
|
@ -23,6 +23,8 @@ cdef extern from "_hashindex.c":
|
||||||
uint32_t _htole32(uint32_t v)
|
uint32_t _htole32(uint32_t v)
|
||||||
uint32_t _le32toh(uint32_t v)
|
uint32_t _le32toh(uint32_t v)
|
||||||
|
|
||||||
|
double HASH_MAX_LOAD
|
||||||
|
|
||||||
|
|
||||||
cdef _NoDefault = object()
|
cdef _NoDefault = object()
|
||||||
|
|
||||||
|
@ -45,7 +47,6 @@ assert UINT32_MAX == 2**32-1
|
||||||
|
|
||||||
# module-level constant because cdef's in classes can't have default values
|
# module-level constant because cdef's in classes can't have default values
|
||||||
cdef uint32_t _MAX_VALUE = 2**32-1025
|
cdef uint32_t _MAX_VALUE = 2**32-1025
|
||||||
MAX_VALUE = _MAX_VALUE
|
|
||||||
|
|
||||||
assert _MAX_VALUE % 2 == 1
|
assert _MAX_VALUE % 2 == 1
|
||||||
|
|
||||||
|
@ -54,6 +55,9 @@ cdef class IndexBase:
|
||||||
cdef HashIndex *index
|
cdef HashIndex *index
|
||||||
cdef int key_size
|
cdef int key_size
|
||||||
|
|
||||||
|
MAX_LOAD_FACTOR = HASH_MAX_LOAD
|
||||||
|
MAX_VALUE = _MAX_VALUE
|
||||||
|
|
||||||
def __cinit__(self, capacity=0, path=None, key_size=32):
|
def __cinit__(self, capacity=0, path=None, key_size=32):
|
||||||
self.key_size = key_size
|
self.key_size = key_size
|
||||||
if path:
|
if path:
|
||||||
|
@ -280,7 +284,7 @@ cdef class ChunkIndex(IndexBase):
|
||||||
unique_chunks += 1
|
unique_chunks += 1
|
||||||
values = <uint32_t*> (key + self.key_size)
|
values = <uint32_t*> (key + self.key_size)
|
||||||
refcount = _le32toh(values[0])
|
refcount = _le32toh(values[0])
|
||||||
assert refcount <= MAX_VALUE, "invalid reference count"
|
assert refcount <= _MAX_VALUE, "invalid reference count"
|
||||||
chunks += refcount
|
chunks += refcount
|
||||||
unique_size += _le32toh(values[1])
|
unique_size += _le32toh(values[1])
|
||||||
unique_csize += _le32toh(values[2])
|
unique_csize += _le32toh(values[2])
|
||||||
|
@ -340,5 +344,5 @@ cdef class ChunkKeyIterator:
|
||||||
raise StopIteration
|
raise StopIteration
|
||||||
cdef uint32_t *value = <uint32_t *>(self.key + self.key_size)
|
cdef uint32_t *value = <uint32_t *>(self.key + self.key_size)
|
||||||
cdef uint32_t refcount = _le32toh(value[0])
|
cdef uint32_t refcount = _le32toh(value[0])
|
||||||
assert refcount <= MAX_VALUE, "invalid reference count"
|
assert refcount <= _MAX_VALUE, "invalid reference count"
|
||||||
return (<char *>self.key)[:self.key_size], (refcount, _le32toh(value[1]), _le32toh(value[2]))
|
return (<char *>self.key)[:self.key_size], (refcount, _le32toh(value[1]), _le32toh(value[2]))
|
||||||
|
|
|
@ -124,16 +124,16 @@ class HashIndexTestCase(BaseTestCase):
|
||||||
class HashIndexRefcountingTestCase(BaseTestCase):
|
class HashIndexRefcountingTestCase(BaseTestCase):
|
||||||
def test_chunkindex_limit(self):
|
def test_chunkindex_limit(self):
|
||||||
idx = ChunkIndex()
|
idx = ChunkIndex()
|
||||||
idx[H(1)] = hashindex.MAX_VALUE - 1, 1, 2
|
idx[H(1)] = ChunkIndex.MAX_VALUE - 1, 1, 2
|
||||||
|
|
||||||
# 5 is arbitrary, any number of incref/decrefs shouldn't move it once it's limited
|
# 5 is arbitrary, any number of incref/decrefs shouldn't move it once it's limited
|
||||||
for i in range(5):
|
for i in range(5):
|
||||||
# first incref to move it to the limit
|
# first incref to move it to the limit
|
||||||
refcount, *_ = idx.incref(H(1))
|
refcount, *_ = idx.incref(H(1))
|
||||||
assert refcount == hashindex.MAX_VALUE
|
assert refcount == ChunkIndex.MAX_VALUE
|
||||||
for i in range(5):
|
for i in range(5):
|
||||||
refcount, *_ = idx.decref(H(1))
|
refcount, *_ = idx.decref(H(1))
|
||||||
assert refcount == hashindex.MAX_VALUE
|
assert refcount == ChunkIndex.MAX_VALUE
|
||||||
|
|
||||||
def _merge(self, refcounta, refcountb):
|
def _merge(self, refcounta, refcountb):
|
||||||
def merge(refcount1, refcount2):
|
def merge(refcount1, refcount2):
|
||||||
|
@ -152,23 +152,23 @@ class HashIndexRefcountingTestCase(BaseTestCase):
|
||||||
def test_chunkindex_merge_limit1(self):
|
def test_chunkindex_merge_limit1(self):
|
||||||
# Check that it does *not* limit at MAX_VALUE - 1
|
# Check that it does *not* limit at MAX_VALUE - 1
|
||||||
# (MAX_VALUE is odd)
|
# (MAX_VALUE is odd)
|
||||||
half = hashindex.MAX_VALUE // 2
|
half = ChunkIndex.MAX_VALUE // 2
|
||||||
assert self._merge(half, half) == hashindex.MAX_VALUE - 1
|
assert self._merge(half, half) == ChunkIndex.MAX_VALUE - 1
|
||||||
|
|
||||||
def test_chunkindex_merge_limit2(self):
|
def test_chunkindex_merge_limit2(self):
|
||||||
# 3000000000 + 2000000000 > MAX_VALUE
|
# 3000000000 + 2000000000 > MAX_VALUE
|
||||||
assert self._merge(3000000000, 2000000000) == hashindex.MAX_VALUE
|
assert self._merge(3000000000, 2000000000) == ChunkIndex.MAX_VALUE
|
||||||
|
|
||||||
def test_chunkindex_merge_limit3(self):
|
def test_chunkindex_merge_limit3(self):
|
||||||
# Crossover point: both addition and limit semantics will yield the same result
|
# Crossover point: both addition and limit semantics will yield the same result
|
||||||
half = hashindex.MAX_VALUE // 2
|
half = ChunkIndex.MAX_VALUE // 2
|
||||||
assert self._merge(half + 1, half) == hashindex.MAX_VALUE
|
assert self._merge(half + 1, half) == ChunkIndex.MAX_VALUE
|
||||||
|
|
||||||
def test_chunkindex_merge_limit4(self):
|
def test_chunkindex_merge_limit4(self):
|
||||||
# Beyond crossover, result of addition would be 2**31
|
# Beyond crossover, result of addition would be 2**31
|
||||||
half = hashindex.MAX_VALUE // 2
|
half = ChunkIndex.MAX_VALUE // 2
|
||||||
assert self._merge(half + 2, half) == hashindex.MAX_VALUE
|
assert self._merge(half + 2, half) == ChunkIndex.MAX_VALUE
|
||||||
assert self._merge(half + 1, half + 1) == hashindex.MAX_VALUE
|
assert self._merge(half + 1, half + 1) == ChunkIndex.MAX_VALUE
|
||||||
|
|
||||||
def test_chunkindex_add(self):
|
def test_chunkindex_add(self):
|
||||||
idx1 = ChunkIndex()
|
idx1 = ChunkIndex()
|
||||||
|
@ -179,17 +179,17 @@ class HashIndexRefcountingTestCase(BaseTestCase):
|
||||||
|
|
||||||
def test_incref_limit(self):
|
def test_incref_limit(self):
|
||||||
idx1 = ChunkIndex()
|
idx1 = ChunkIndex()
|
||||||
idx1[H(1)] = (hashindex.MAX_VALUE, 6, 7)
|
idx1[H(1)] = (ChunkIndex.MAX_VALUE, 6, 7)
|
||||||
idx1.incref(H(1))
|
idx1.incref(H(1))
|
||||||
refcount, *_ = idx1[H(1)]
|
refcount, *_ = idx1[H(1)]
|
||||||
assert refcount == hashindex.MAX_VALUE
|
assert refcount == ChunkIndex.MAX_VALUE
|
||||||
|
|
||||||
def test_decref_limit(self):
|
def test_decref_limit(self):
|
||||||
idx1 = ChunkIndex()
|
idx1 = ChunkIndex()
|
||||||
idx1[H(1)] = hashindex.MAX_VALUE, 6, 7
|
idx1[H(1)] = ChunkIndex.MAX_VALUE, 6, 7
|
||||||
idx1.decref(H(1))
|
idx1.decref(H(1))
|
||||||
refcount, *_ = idx1[H(1)]
|
refcount, *_ = idx1[H(1)]
|
||||||
assert refcount == hashindex.MAX_VALUE
|
assert refcount == ChunkIndex.MAX_VALUE
|
||||||
|
|
||||||
def test_decref_zero(self):
|
def test_decref_zero(self):
|
||||||
idx1 = ChunkIndex()
|
idx1 = ChunkIndex()
|
||||||
|
@ -209,7 +209,7 @@ class HashIndexRefcountingTestCase(BaseTestCase):
|
||||||
def test_setitem_raises(self):
|
def test_setitem_raises(self):
|
||||||
idx1 = ChunkIndex()
|
idx1 = ChunkIndex()
|
||||||
with pytest.raises(AssertionError):
|
with pytest.raises(AssertionError):
|
||||||
idx1[H(1)] = hashindex.MAX_VALUE + 1, 0, 0
|
idx1[H(1)] = ChunkIndex.MAX_VALUE + 1, 0, 0
|
||||||
|
|
||||||
def test_keyerror(self):
|
def test_keyerror(self):
|
||||||
idx = ChunkIndex()
|
idx = ChunkIndex()
|
||||||
|
@ -266,13 +266,18 @@ class HashIndexDataTestCase(BaseTestCase):
|
||||||
idx2 = ChunkIndex()
|
idx2 = ChunkIndex()
|
||||||
idx2[H(3)] = 2**32 - 123456, 6, 7
|
idx2[H(3)] = 2**32 - 123456, 6, 7
|
||||||
idx1.merge(idx2)
|
idx1.merge(idx2)
|
||||||
assert idx1[H(3)] == (hashindex.MAX_VALUE, 0, 0)
|
assert idx1[H(3)] == (ChunkIndex.MAX_VALUE, 0, 0)
|
||||||
|
|
||||||
|
|
||||||
def test_nsindex_segment_limit():
|
def test_nsindex_segment_limit():
|
||||||
idx = NSIndex()
|
idx = NSIndex()
|
||||||
with pytest.raises(AssertionError):
|
with pytest.raises(AssertionError):
|
||||||
idx[H(1)] = hashindex.MAX_VALUE + 1, 0
|
idx[H(1)] = NSIndex.MAX_VALUE + 1, 0
|
||||||
assert H(1) not in idx
|
assert H(1) not in idx
|
||||||
idx[H(2)] = hashindex.MAX_VALUE, 0
|
idx[H(2)] = NSIndex.MAX_VALUE, 0
|
||||||
assert H(2) in idx
|
assert H(2) in idx
|
||||||
|
|
||||||
|
|
||||||
|
def test_max_load_factor():
|
||||||
|
assert NSIndex.MAX_LOAD_FACTOR < 1.0
|
||||||
|
assert ChunkIndex.MAX_LOAD_FACTOR < 1.0
|
||||||
|
|
Loading…
Reference in New Issue