From 9fe0140d94dcb7bc65f02cb400ea6a294b0a2ac7 Mon Sep 17 00:00:00 2001 From: Marian Beermann Date: Wed, 7 Sep 2016 16:08:07 +0200 Subject: [PATCH 1/3] hashindex: export max load factor to Python-space --- borg/hashindex.pyx | 3 +++ borg/testsuite/hashindex.py | 5 +++++ 2 files changed, 8 insertions(+) diff --git a/borg/hashindex.pyx b/borg/hashindex.pyx index 59741ad6e..ce1dac047 100644 --- a/borg/hashindex.pyx +++ b/borg/hashindex.pyx @@ -23,6 +23,8 @@ cdef extern from "_hashindex.c": uint32_t _htole32(uint32_t v) uint32_t _le32toh(uint32_t v) + double HASH_MAX_LOAD + cdef _NoDefault = object() @@ -54,6 +56,7 @@ cdef class IndexBase: cdef HashIndex *index cdef int key_size + MAX_LOAD_FACTOR = HASH_MAX_LOAD def __cinit__(self, capacity=0, path=None, key_size=32): self.key_size = key_size if path: diff --git a/borg/testsuite/hashindex.py b/borg/testsuite/hashindex.py index 75cd80227..4a6bd4432 100644 --- a/borg/testsuite/hashindex.py +++ b/borg/testsuite/hashindex.py @@ -276,3 +276,8 @@ def test_nsindex_segment_limit(): assert H(1) not in idx idx[H(2)] = hashindex.MAX_VALUE, 0 assert H(2) in idx + + +def test_max_load_factor(): + assert NSIndex.MAX_LOAD_FACTOR < 1 + assert ChunkIndex.MAX_LOAD_FACTOR < 1 From 197552526ff52c2a0473c6a000e34597c8a90ac3 Mon Sep 17 00:00:00 2001 From: Marian Beermann Date: Wed, 7 Sep 2016 16:08:35 +0200 Subject: [PATCH 2/3] hashindex: make MAX_VALUE a class constant --- borg/hashindex.pyx | 7 ++++--- borg/testsuite/hashindex.py | 38 ++++++++++++++++++------------------- 2 files changed, 23 insertions(+), 22 deletions(-) diff --git a/borg/hashindex.pyx b/borg/hashindex.pyx index ce1dac047..c32c4dd1a 100644 --- a/borg/hashindex.pyx +++ b/borg/hashindex.pyx @@ -47,7 +47,6 @@ assert UINT32_MAX == 2**32-1 # module-level constant because cdef's in classes can't have default values cdef uint32_t _MAX_VALUE = 2**32-1025 -MAX_VALUE = _MAX_VALUE assert _MAX_VALUE % 2 == 1 @@ -57,6 +56,8 @@ cdef class IndexBase: cdef int key_size MAX_LOAD_FACTOR = HASH_MAX_LOAD + MAX_VALUE = _MAX_VALUE + def __cinit__(self, capacity=0, path=None, key_size=32): self.key_size = key_size if path: @@ -283,7 +284,7 @@ cdef class ChunkIndex(IndexBase): unique_chunks += 1 values = (key + self.key_size) refcount = _le32toh(values[0]) - assert refcount <= MAX_VALUE, "invalid reference count" + assert refcount <= _MAX_VALUE, "invalid reference count" chunks += refcount unique_size += _le32toh(values[1]) unique_csize += _le32toh(values[2]) @@ -343,5 +344,5 @@ cdef class ChunkKeyIterator: raise StopIteration cdef uint32_t *value = (self.key + self.key_size) cdef uint32_t refcount = _le32toh(value[0]) - assert refcount <= MAX_VALUE, "invalid reference count" + assert refcount <= _MAX_VALUE, "invalid reference count" return (self.key)[:self.key_size], (refcount, _le32toh(value[1]), _le32toh(value[2])) diff --git a/borg/testsuite/hashindex.py b/borg/testsuite/hashindex.py index 4a6bd4432..b81cbf47f 100644 --- a/borg/testsuite/hashindex.py +++ b/borg/testsuite/hashindex.py @@ -124,16 +124,16 @@ class HashIndexTestCase(BaseTestCase): class HashIndexRefcountingTestCase(BaseTestCase): def test_chunkindex_limit(self): idx = ChunkIndex() - idx[H(1)] = hashindex.MAX_VALUE - 1, 1, 2 + idx[H(1)] = ChunkIndex.MAX_VALUE - 1, 1, 2 # 5 is arbitray, any number of incref/decrefs shouldn't move it once it's limited for i in range(5): # first incref to move it to the limit refcount, *_ = idx.incref(H(1)) - assert refcount == hashindex.MAX_VALUE + assert refcount == ChunkIndex.MAX_VALUE for i in range(5): refcount, *_ = idx.decref(H(1)) - assert refcount == hashindex.MAX_VALUE + assert refcount == ChunkIndex.MAX_VALUE def _merge(self, refcounta, refcountb): def merge(refcount1, refcount2): @@ -152,23 +152,23 @@ class HashIndexRefcountingTestCase(BaseTestCase): def test_chunkindex_merge_limit1(self): # Check that it does *not* limit at MAX_VALUE - 1 # (MAX_VALUE is odd) - half = hashindex.MAX_VALUE // 2 - assert self._merge(half, half) == hashindex.MAX_VALUE - 1 + half = ChunkIndex.MAX_VALUE // 2 + assert self._merge(half, half) == ChunkIndex.MAX_VALUE - 1 def test_chunkindex_merge_limit2(self): # 3000000000 + 2000000000 > MAX_VALUE - assert self._merge(3000000000, 2000000000) == hashindex.MAX_VALUE + assert self._merge(3000000000, 2000000000) == ChunkIndex.MAX_VALUE def test_chunkindex_merge_limit3(self): # Crossover point: both addition and limit semantics will yield the same result - half = hashindex.MAX_VALUE // 2 - assert self._merge(half + 1, half) == hashindex.MAX_VALUE + half = ChunkIndex.MAX_VALUE // 2 + assert self._merge(half + 1, half) == ChunkIndex.MAX_VALUE def test_chunkindex_merge_limit4(self): # Beyond crossover, result of addition would be 2**31 - half = hashindex.MAX_VALUE // 2 - assert self._merge(half + 2, half) == hashindex.MAX_VALUE - assert self._merge(half + 1, half + 1) == hashindex.MAX_VALUE + half = ChunkIndex.MAX_VALUE // 2 + assert self._merge(half + 2, half) == ChunkIndex.MAX_VALUE + assert self._merge(half + 1, half + 1) == ChunkIndex.MAX_VALUE def test_chunkindex_add(self): idx1 = ChunkIndex() @@ -179,17 +179,17 @@ class HashIndexRefcountingTestCase(BaseTestCase): def test_incref_limit(self): idx1 = ChunkIndex() - idx1[H(1)] = (hashindex.MAX_VALUE, 6, 7) + idx1[H(1)] = (ChunkIndex.MAX_VALUE, 6, 7) idx1.incref(H(1)) refcount, *_ = idx1[H(1)] - assert refcount == hashindex.MAX_VALUE + assert refcount == ChunkIndex.MAX_VALUE def test_decref_limit(self): idx1 = ChunkIndex() - idx1[H(1)] = hashindex.MAX_VALUE, 6, 7 + idx1[H(1)] = ChunkIndex.MAX_VALUE, 6, 7 idx1.decref(H(1)) refcount, *_ = idx1[H(1)] - assert refcount == hashindex.MAX_VALUE + assert refcount == ChunkIndex.MAX_VALUE def test_decref_zero(self): idx1 = ChunkIndex() @@ -209,7 +209,7 @@ class HashIndexRefcountingTestCase(BaseTestCase): def test_setitem_raises(self): idx1 = ChunkIndex() with pytest.raises(AssertionError): - idx1[H(1)] = hashindex.MAX_VALUE + 1, 0, 0 + idx1[H(1)] = ChunkIndex.MAX_VALUE + 1, 0, 0 def test_keyerror(self): idx = ChunkIndex() @@ -266,15 +266,15 @@ class HashIndexDataTestCase(BaseTestCase): idx2 = ChunkIndex() idx2[H(3)] = 2**32 - 123456, 6, 7 idx1.merge(idx2) - assert idx1[H(3)] == (hashindex.MAX_VALUE, 0, 0) + assert idx1[H(3)] == (ChunkIndex.MAX_VALUE, 0, 0) def test_nsindex_segment_limit(): idx = NSIndex() with pytest.raises(AssertionError): - idx[H(1)] = hashindex.MAX_VALUE + 1, 0 + idx[H(1)] = NSIndex.MAX_VALUE + 1, 0 assert H(1) not in idx - idx[H(2)] = hashindex.MAX_VALUE, 0 + idx[H(2)] = NSIndex.MAX_VALUE, 0 assert H(2) in idx From be3616b6b391ae16709260c0b199f20be2330ef7 Mon Sep 17 00:00:00 2001 From: Marian Beermann Date: Fri, 9 Sep 2016 16:11:06 +0200 Subject: [PATCH 3/3] ArchiveChecker: use MAX_LOAD_FACTOR constant --- borg/archive.py | 5 +++-- borg/testsuite/hashindex.py | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/borg/archive.py b/borg/archive.py index a3a133171..e6dd39557 100644 --- a/borg/archive.py +++ b/borg/archive.py @@ -853,8 +853,9 @@ class ArchiveChecker: """Fetch a list of all object keys from repository """ # Explicitly set the initial hash table capacity to avoid performance issues - # due to hash table "resonance" - capacity = int(len(self.repository) * 1.35 + 1) # > len * 1.0 / HASH_MAX_LOAD (see _hashindex.c) + # due to hash table "resonance". + # Since reconstruction of archive items can add some new chunks, add 10 % headroom + capacity = int(len(self.repository) / ChunkIndex.MAX_LOAD_FACTOR * 1.1) self.chunks = ChunkIndex(capacity) marker = None while True: diff --git a/borg/testsuite/hashindex.py b/borg/testsuite/hashindex.py index b81cbf47f..629ae4e57 100644 --- a/borg/testsuite/hashindex.py +++ b/borg/testsuite/hashindex.py @@ -279,5 +279,5 @@ def test_nsindex_segment_limit(): def test_max_load_factor(): - assert NSIndex.MAX_LOAD_FACTOR < 1 - assert ChunkIndex.MAX_LOAD_FACTOR < 1 + assert NSIndex.MAX_LOAD_FACTOR < 1.0 + assert ChunkIndex.MAX_LOAD_FACTOR < 1.0