Merge pull request #3898 from ThomasWaldmann/move-ht-load-calculation

move capacity calculation to IndexBase, fixes #2646
2024-12-26 17:57:59 +00:00 · 2018-06-14 22:23:10 +02:00 · 2018-06-14 22:23:10 +02:00 · 3bdfa869d6
commit 3bdfa869d6
parent 37f2c8944f de113bab23
3 changed files with 12 additions and 12 deletions
--- a/src/borg/archive.py
+++ b/src/borg/archive.py
@@ -1332,11 +1332,10 @@ def check(self, repository, repair=False, archive=None, first=0, last=0, sort_by
    def init_chunks(self):
        """Fetch a list of all object keys from repository
        """
-        # Explicitly set the initial hash table capacity to avoid performance issues
+        # Explicitly set the initial usable hash table capacity to avoid performance issues
        # due to hash table "resonance".
-        # Since reconstruction of archive items can add some new chunks, add 10 % headroom
-        capacity = int(len(self.repository) / ChunkIndex.MAX_LOAD_FACTOR * 1.1)
-        self.chunks = ChunkIndex(capacity)
+        # Since reconstruction of archive items can add some new chunks, add 10 % headroom.
+        self.chunks = ChunkIndex(usable=len(self.repository) * 1.1)
        marker = None
        while True:
            result = self.repository.list(limit=LIST_SCAN_LIMIT, marker=marker)
--- a/src/borg/cache.py
+++ b/src/borg/cache.py
@@ -780,11 +780,11 @@ def create_master_idx(chunk_idx):
            # deallocates old hashindex, creates empty hashindex:
            chunk_idx.clear()
            cleanup_outdated(cached_ids - archive_ids)
-            # Explicitly set the initial hash table capacity to avoid performance issues
+            # Explicitly set the usable initial hash table capacity to avoid performance issues
            # due to hash table "resonance".
-            master_index_capacity = int(len(self.repository) / ChunkIndex.MAX_LOAD_FACTOR)
+            master_index_capacity = len(self.repository)
            if archive_ids:
-                chunk_idx = None if not self.do_cache else ChunkIndex(master_index_capacity)
+                chunk_idx = None if not self.do_cache else ChunkIndex(usable=master_index_capacity)
                pi = ProgressIndicatorPercent(total=len(archive_ids), step=0.1,
                                              msg='%3.0f%% Syncing chunks cache. Processing archive %s',
                                              msgid='cache.sync')
@@ -805,7 +805,7 @@ def create_master_idx(chunk_idx):
                        logger.info("Merging into master chunks index ...")
                        chunk_idx.merge(archive_chunk_idx)
                    else:
-                        chunk_idx = chunk_idx or ChunkIndex(master_index_capacity)
+                        chunk_idx = chunk_idx or ChunkIndex(usable=master_index_capacity)
                        logger.info('Fetching archive index for %s ...', archive_name)
                        fetch_and_build_idx(archive_id, decrypted_repository, chunk_idx)
                if not self.do_cache:
@@ -1087,12 +1087,11 @@ def rollback(self):

    def begin_txn(self):
        self._txn_active = True
-        # Explicitly set the initial hash table capacity to avoid performance issues
+        # Explicitly set the initial usable hash table capacity to avoid performance issues
        # due to hash table "resonance".
        # Since we're creating an archive, add 10 % from the start.
        num_chunks = len(self.repository)
-        capacity = int(num_chunks / ChunkIndex.MAX_LOAD_FACTOR * 1.1)
-        self.chunks = ChunkIndex(capacity)
+        self.chunks = ChunkIndex(usable=num_chunks * 1.1)
        pi = ProgressIndicatorPercent(total=num_chunks, msg='Downloading chunk list... %3.0f%%',
                                      msgid='cache.download_chunks')
        t0 = perf_counter()
--- a/src/borg/hashindex.pyx
+++ b/src/borg/hashindex.pyx
@@ -84,7 +84,7 @@ cdef class IndexBase:
    MAX_LOAD_FACTOR = HASH_MAX_LOAD
    MAX_VALUE = _MAX_VALUE

-    def __cinit__(self, capacity=0, path=None, permit_compact=False):
+    def __cinit__(self, capacity=0, path=None, permit_compact=False, usable=None):
        self.key_size = self._key_size
        if path:
            if isinstance(path, (str, bytes)):
@@ -94,6 +94,8 @@ cdef class IndexBase:
                self.index = hashindex_read(path, permit_compact)
            assert self.index, 'hashindex_read() returned NULL with no exception set'
        else:
+            if usable is not None:
+                capacity = int(usable / self.MAX_LOAD_FACTOR)
            self.index = hashindex_init(capacity, self.key_size, self.value_size)
            if not self.index:
                raise Exception('hashindex_init failed')