From 9fe0140d94dcb7bc65f02cb400ea6a294b0a2ac7 Mon Sep 17 00:00:00 2001
From: Marian Beermann <public@enkore.de>
Date: Wed, 7 Sep 2016 16:08:07 +0200
Subject: [PATCH 1/3] hashindex: export max load factor to Python-space

---
 borg/hashindex.pyx          | 3 +++
 borg/testsuite/hashindex.py | 5 +++++
 2 files changed, 8 insertions(+)
diff --git a/borg/hashindex.pyx b/borg/hashindex.pyx
index 59741ad6e..ce1dac047 100644
--- a/borg/hashindex.pyx
+++ b/borg/hashindex.pyx
@@ -23,6 +23,8 @@ cdef extern from "_hashindex.c":
     uint32_t _htole32(uint32_t v)
     uint32_t _le32toh(uint32_t v)
 
+    double HASH_MAX_LOAD
+
 
 cdef _NoDefault = object()
 
@@ -54,6 +56,7 @@ cdef class IndexBase:
     cdef HashIndex *index
     cdef int key_size
 
+    MAX_LOAD_FACTOR = HASH_MAX_LOAD
     def __cinit__(self, capacity=0, path=None, key_size=32):
         self.key_size = key_size
         if path:
diff --git a/borg/testsuite/hashindex.py b/borg/testsuite/hashindex.py
index 75cd80227..4a6bd4432 100644
--- a/borg/testsuite/hashindex.py
+++ b/borg/testsuite/hashindex.py
@@ -276,3 +276,8 @@ def test_nsindex_segment_limit():
     assert H(1) not in idx
     idx[H(2)] = hashindex.MAX_VALUE, 0
     assert H(2) in idx
+
+
+def test_max_load_factor():
+    assert NSIndex.MAX_LOAD_FACTOR < 1
+    assert ChunkIndex.MAX_LOAD_FACTOR < 1

From 197552526ff52c2a0473c6a000e34597c8a90ac3 Mon Sep 17 00:00:00 2001
From: Marian Beermann <public@enkore.de>
Date: Wed, 7 Sep 2016 16:08:35 +0200
Subject: [PATCH 2/3] hashindex: make MAX_VALUE a class constant

---
 borg/hashindex.pyx          |  7 ++++---
 borg/testsuite/hashindex.py | 38 ++++++++++++++++++-------------------
 2 files changed, 23 insertions(+), 22 deletions(-)

diff --git a/borg/hashindex.pyx b/borg/hashindex.pyx
index ce1dac047..c32c4dd1a 100644
--- a/borg/hashindex.pyx
+++ b/borg/hashindex.pyx
@@ -47,7 +47,6 @@ assert UINT32_MAX == 2**32-1
 
 # module-level constant because cdef's in classes can't have default values
 cdef uint32_t _MAX_VALUE = 2**32-1025
-MAX_VALUE = _MAX_VALUE
 
 assert _MAX_VALUE % 2 == 1
 
@@ -57,6 +56,8 @@ cdef class IndexBase:
     cdef int key_size
 
     MAX_LOAD_FACTOR = HASH_MAX_LOAD
+    MAX_VALUE = _MAX_VALUE
+
     def __cinit__(self, capacity=0, path=None, key_size=32):
         self.key_size = key_size
         if path:
@@ -283,7 +284,7 @@ cdef class ChunkIndex(IndexBase):
             unique_chunks += 1
             values = <uint32_t*> (key + self.key_size)
             refcount = _le32toh(values[0])
-            assert refcount <= MAX_VALUE, "invalid reference count"
+            assert refcount <= _MAX_VALUE, "invalid reference count"
             chunks += refcount
             unique_size += _le32toh(values[1])
             unique_csize += _le32toh(values[2])
@@ -343,5 +344,5 @@ cdef class ChunkKeyIterator:
             raise StopIteration
         cdef uint32_t *value = <uint32_t *>(self.key + self.key_size)
         cdef uint32_t refcount = _le32toh(value[0])
-        assert refcount <= MAX_VALUE, "invalid reference count"
+        assert refcount <= _MAX_VALUE, "invalid reference count"
         return (<char *>self.key)[:self.key_size], (refcount, _le32toh(value[1]), _le32toh(value[2]))
diff --git a/borg/testsuite/hashindex.py b/borg/testsuite/hashindex.py
index 4a6bd4432..b81cbf47f 100644
--- a/borg/testsuite/hashindex.py
+++ b/borg/testsuite/hashindex.py
@@ -124,16 +124,16 @@ class HashIndexTestCase(BaseTestCase):
 class HashIndexRefcountingTestCase(BaseTestCase):
     def test_chunkindex_limit(self):
         idx = ChunkIndex()
-        idx[H(1)] = hashindex.MAX_VALUE - 1, 1, 2
+        idx[H(1)] = ChunkIndex.MAX_VALUE - 1, 1, 2
 
         # 5 is arbitray, any number of incref/decrefs shouldn't move it once it's limited
         for i in range(5):
             # first incref to move it to the limit
             refcount, *_ = idx.incref(H(1))
-            assert refcount == hashindex.MAX_VALUE
+            assert refcount == ChunkIndex.MAX_VALUE
         for i in range(5):
             refcount, *_ = idx.decref(H(1))
-            assert refcount == hashindex.MAX_VALUE
+            assert refcount == ChunkIndex.MAX_VALUE
 
     def _merge(self, refcounta, refcountb):
         def merge(refcount1, refcount2):
@@ -152,23 +152,23 @@ class HashIndexRefcountingTestCase(BaseTestCase):
     def test_chunkindex_merge_limit1(self):
         # Check that it does *not* limit at MAX_VALUE - 1
         # (MAX_VALUE is odd)
-        half = hashindex.MAX_VALUE // 2
-        assert self._merge(half, half) == hashindex.MAX_VALUE - 1
+        half = ChunkIndex.MAX_VALUE // 2
+        assert self._merge(half, half) == ChunkIndex.MAX_VALUE - 1
 
     def test_chunkindex_merge_limit2(self):
         # 3000000000 + 2000000000 > MAX_VALUE
-        assert self._merge(3000000000, 2000000000) == hashindex.MAX_VALUE
+        assert self._merge(3000000000, 2000000000) == ChunkIndex.MAX_VALUE
 
     def test_chunkindex_merge_limit3(self):
         # Crossover point: both addition and limit semantics will yield the same result
-        half = hashindex.MAX_VALUE // 2
-        assert self._merge(half + 1, half) == hashindex.MAX_VALUE
+        half = ChunkIndex.MAX_VALUE // 2
+        assert self._merge(half + 1, half) == ChunkIndex.MAX_VALUE
 
     def test_chunkindex_merge_limit4(self):
         # Beyond crossover, result of addition would be 2**31
-        half = hashindex.MAX_VALUE // 2
-        assert self._merge(half + 2, half) == hashindex.MAX_VALUE
-        assert self._merge(half + 1, half + 1) == hashindex.MAX_VALUE
+        half = ChunkIndex.MAX_VALUE // 2
+        assert self._merge(half + 2, half) == ChunkIndex.MAX_VALUE
+        assert self._merge(half + 1, half + 1) == ChunkIndex.MAX_VALUE
 
     def test_chunkindex_add(self):
         idx1 = ChunkIndex()
@@ -179,17 +179,17 @@ class HashIndexRefcountingTestCase(BaseTestCase):
 
     def test_incref_limit(self):
         idx1 = ChunkIndex()
-        idx1[H(1)] = (hashindex.MAX_VALUE, 6, 7)
+        idx1[H(1)] = (ChunkIndex.MAX_VALUE, 6, 7)
         idx1.incref(H(1))
         refcount, *_ = idx1[H(1)]
-        assert refcount == hashindex.MAX_VALUE
+        assert refcount == ChunkIndex.MAX_VALUE
 
     def test_decref_limit(self):
         idx1 = ChunkIndex()
-        idx1[H(1)] = hashindex.MAX_VALUE, 6, 7
+        idx1[H(1)] = ChunkIndex.MAX_VALUE, 6, 7
         idx1.decref(H(1))
         refcount, *_ = idx1[H(1)]
-        assert refcount == hashindex.MAX_VALUE
+        assert refcount == ChunkIndex.MAX_VALUE
 
     def test_decref_zero(self):
         idx1 = ChunkIndex()
@@ -209,7 +209,7 @@ class HashIndexRefcountingTestCase(BaseTestCase):
     def test_setitem_raises(self):
         idx1 = ChunkIndex()
         with pytest.raises(AssertionError):
-            idx1[H(1)] = hashindex.MAX_VALUE + 1, 0, 0
+            idx1[H(1)] = ChunkIndex.MAX_VALUE + 1, 0, 0
 
     def test_keyerror(self):
         idx = ChunkIndex()
@@ -266,15 +266,15 @@ class HashIndexDataTestCase(BaseTestCase):
         idx2 = ChunkIndex()
         idx2[H(3)] = 2**32 - 123456, 6, 7
         idx1.merge(idx2)
-        assert idx1[H(3)] == (hashindex.MAX_VALUE, 0, 0)
+        assert idx1[H(3)] == (ChunkIndex.MAX_VALUE, 0, 0)
 
 
 def test_nsindex_segment_limit():
     idx = NSIndex()
     with pytest.raises(AssertionError):
-        idx[H(1)] = hashindex.MAX_VALUE + 1, 0
+        idx[H(1)] = NSIndex.MAX_VALUE + 1, 0
     assert H(1) not in idx
-    idx[H(2)] = hashindex.MAX_VALUE, 0
+    idx[H(2)] = NSIndex.MAX_VALUE, 0
     assert H(2) in idx
 
 

From be3616b6b391ae16709260c0b199f20be2330ef7 Mon Sep 17 00:00:00 2001
From: Marian Beermann <public@enkore.de>
Date: Fri, 9 Sep 2016 16:11:06 +0200
Subject: [PATCH 3/3] ArchiveChecker: use MAX_LOAD_FACTOR constant

---
 borg/archive.py             | 5 +++--
 borg/testsuite/hashindex.py | 4 ++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/borg/archive.py b/borg/archive.py
index a3a133171..e6dd39557 100644
--- a/borg/archive.py
+++ b/borg/archive.py
@@ -853,8 +853,9 @@ class ArchiveChecker:
         """Fetch a list of all object keys from repository
         """
         # Explicitly set the initial hash table capacity to avoid performance issues
-        # due to hash table "resonance"
-        capacity = int(len(self.repository) * 1.35 + 1)  # > len * 1.0 / HASH_MAX_LOAD (see _hashindex.c)
+        # due to hash table "resonance".
+        # Reconstructing archive items can add some new chunks, so add 10% headroom on top.
+        capacity = int(len(self.repository) / ChunkIndex.MAX_LOAD_FACTOR * 1.1)
         self.chunks = ChunkIndex(capacity)
         marker = None
         while True:
diff --git a/borg/testsuite/hashindex.py b/borg/testsuite/hashindex.py
index b81cbf47f..629ae4e57 100644
--- a/borg/testsuite/hashindex.py
+++ b/borg/testsuite/hashindex.py
@@ -279,5 +279,5 @@ def test_nsindex_segment_limit():
 
 
 def test_max_load_factor():
-    assert NSIndex.MAX_LOAD_FACTOR < 1
-    assert ChunkIndex.MAX_LOAD_FACTOR < 1
+    assert NSIndex.MAX_LOAD_FACTOR < 1.0
+    assert ChunkIndex.MAX_LOAD_FACTOR < 1.0