mirror of https://github.com/borgbackup/borg.git
misc. hash table tuning
BUCKET_UPPER_LIMIT: 90% load degrades hash table performance severely, so I lowered it to 75%, which is a common value (Java uses 75%, Python uses 66%). I chose the higher of the two because we also should not consume too much memory, considering that RAM usage is already rather high.

MIN_BUCKETS: I can't explain why, but benchmarks showed that choosing 2^N as the table size severely degrades performance (by 3 orders of magnitude!). A prime starting value improves this a lot, even though we still use the grow-by-2x algorithm afterwards.

hashindex_resize: removed the hashindex_get() call, as we already know that the value is stored at address key + key_size.

hashindex_init: do not calloc X*Y elements of size 1, but rather X elements of size Y. This makes the code simpler; I am not sure whether it affects performance.

The tests needed fixing because the resulting hashtable blob is of course different after the above changes, so its SHA hash changed.
parent a8227aeda0
commit 610300c1ce
2 changed files with 8 additions and 9 deletions
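The load-factor reasoning in the message can be made concrete with a small standalone sketch. This is not borg's hashindex code: the toy table, its fixed bucket count, and the toy_* / avg_probes_at_load names are all illustrative. It only demonstrates the general effect that with open addressing and linear probing, lookups touch more and more slots as the table fills up, which is why capping the load at 0.75 instead of 0.90 helps.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Toy open-addressing table with linear probing; 0 marks an empty slot.
 * Purely illustrative -- not borg's hashindex, just the same general
 * collision scheme the load-factor argument applies to. */
#define TOY_BUCKETS 1024

static uint32_t toy[TOY_BUCKETS];

static void toy_insert(uint32_t key)
{
    uint32_t idx = key % TOY_BUCKETS;
    while(toy[idx] != 0 && toy[idx] != key)
        idx = (idx + 1) % TOY_BUCKETS;        /* linear probing */
    toy[idx] = key;
}

/* Number of slots touched until the (present) key is found. */
static int toy_probes(uint32_t key)
{
    uint32_t idx = key % TOY_BUCKETS;
    int probes = 1;
    while(toy[idx] != key) {
        idx = (idx + 1) % TOY_BUCKETS;
        probes++;
    }
    return probes;
}

static double avg_probes_at_load(double load)
{
    int n = (int)(load * TOY_BUCKETS), i;
    uint32_t *keys = malloc(n * sizeof *keys);
    long total = 0;

    for(i = 0; i < TOY_BUCKETS; i++)          /* start from an empty table */
        toy[i] = 0;
    srand(42);                                /* deterministic toy data */
    for(i = 0; i < n; i++) {
        keys[i] = ((uint32_t)rand() << 16) ^ (uint32_t)rand();
        if(keys[i] == 0)
            keys[i] = 1;                      /* 0 is the empty marker */
        toy_insert(keys[i]);
    }
    for(i = 0; i < n; i++)
        total += toy_probes(keys[i]);
    free(keys);
    return (double)total / n;
}

int main(void)
{
    printf("avg probes per lookup at 75%% load: %.2f\n", avg_probes_at_load(0.75));
    printf("avg probes per lookup at 90%% load: %.2f\n", avg_probes_at_load(0.90));
    return 0;
}

For linear probing, the classical estimate for a successful lookup is about (1 + 1/(1 - a)) / 2 probes at load factor a, i.e. roughly 2.5 probes at 0.75 but 5.5 at 0.90.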
@@ -44,8 +44,8 @@ typedef struct {
 #define DELETED _htole32(0xfffffffe)
 #define MAX_BUCKET_SIZE 512
 #define BUCKET_LOWER_LIMIT .25
-#define BUCKET_UPPER_LIMIT .90
-#define MIN_BUCKETS 1024
+#define BUCKET_UPPER_LIMIT .75  /* don't go higher than 0.75, otherwise performance severely suffers! */
+#define MIN_BUCKETS 1031  /* must be prime, otherwise performance breaks down! */
 #define MAX(x, y) ((x) > (y) ? (x): (y))
 #define BUCKET_ADDR(index, idx) (index->buckets + (idx * index->bucket_size))
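The MIN_BUCKETS comment above only records the observation that power-of-two table sizes performed badly; the commit author says he cannot explain it. One plausible (but unconfirmed) mechanism is that key % num_buckets throws away all but the low bits when num_buckets is a power of two, so keys sharing a common stride pile into a few slots, while a prime bucket count spreads the same keys over the whole table. The sketch below is illustrative only; distinct_slots is a hypothetical helper, not part of borg.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* How many different buckets do the keys 0, stride, 2*stride, ... hit?
 * With a power-of-two bucket count they can reach at most
 * num_buckets / gcd(stride, num_buckets) distinct slots; with a prime
 * bucket count and a stride not divisible by it, the first num_buckets
 * keys all land in different slots. */
static int distinct_slots(uint32_t num_buckets, uint32_t stride, uint32_t nkeys)
{
    unsigned char *seen = calloc(num_buckets, 1);
    int distinct = 0;
    uint32_t i;

    for(i = 0; i < nkeys; i++) {
        uint32_t slot = (i * stride) % num_buckets;
        if(!seen[slot]) {
            seen[slot] = 1;
            distinct++;
        }
    }
    free(seen);
    return distinct;
}

int main(void)
{
    /* 1000 keys with stride 32: */
    printf("2^10 = 1024 buckets: %d distinct slots used\n", distinct_slots(1024, 32, 1000));
    printf("prime  1031 buckets: %d distinct slots used\n", distinct_slots(1031, 32, 1000));
    return 0;
}

Whether this is the effect the benchmarks actually hit depends on how structured the hash values fed into the modulo were; the commit only reports the measurement.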
@@ -113,12 +113,13 @@ hashindex_resize(HashIndex *index, int capacity)
 {
     HashIndex *new;
     void *key = NULL;
+    int32_t key_size = index->key_size;
 
-    if(!(new = hashindex_init(capacity, index->key_size, index->value_size))) {
+    if(!(new = hashindex_init(capacity, key_size, index->value_size))) {
         return 0;
     }
     while((key = hashindex_next_key(index, key))) {
-        hashindex_set(new, key, hashindex_get(index, key));
+        hashindex_set(new, key, key + key_size);
     }
     free(index->buckets);
     index->buckets = new->buckets;
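The hashindex_set(new, key, key + key_size) line relies on the bucket layout described in the commit message: each bucket stores the key bytes immediately followed by the value bytes, so a pointer to a stored key (as returned by hashindex_next_key()) also locates that entry's value at key + key_size, and the hashindex_get() lookup in the old code was redundant. A minimal layout sketch, with hypothetical helper names (bucket_value and write_bucket are not borg functions):

#include <string.h>

/* One bucket = key_size bytes of key immediately followed by value_size
 * bytes of value, packed back to back in the buckets array. */

/* Given a pointer to a stored key, the value starts key_size bytes later.
 * This is exactly what the resize loop above exploits. */
static void *bucket_value(void *key, int key_size)
{
    return (char *)key + key_size;
}

/* Writing one bucket makes the layout explicit: key first, value right after. */
static void write_bucket(char *bucket, const void *key, int key_size,
                         const void *value, int value_size)
{
    memcpy(bucket, key, key_size);
    memcpy(bucket + key_size, value, value_size);
}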
@@ -218,7 +219,6 @@ fail:
 static HashIndex *
 hashindex_init(int capacity, int key_size, int value_size)
 {
-    off_t buckets_length;
     HashIndex *index;
     int i;
     capacity = MAX(MIN_BUCKETS, capacity);
@@ -227,8 +227,7 @@ hashindex_init(int capacity, int key_size, int value_size)
         EPRINTF("malloc header failed");
         return NULL;
     }
-    buckets_length = (off_t)capacity * (key_size + value_size);
-    if(!(index->buckets = calloc(buckets_length, 1))) {
+    if(!(index->buckets = calloc(capacity, key_size + value_size))) {
         EPRINTF("malloc buckets failed");
         free(index);
         return NULL;
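The calloc() change in the hunk above requests exactly the same number of zeroed bytes; only the expression changes. A short note on the two forms (general C behaviour, not borg-specific; alloc_buckets is a hypothetical wrapper):

#include <stdlib.h>

/* calloc(nmemb, size) zeroes nmemb * size bytes, and on common
 * implementations it also checks that multiplication for overflow.
 * Expressing the allocation as "capacity elements of bucket_size bytes
 * each" therefore drops the separate off_t buckets_length temporary
 * without changing the amount of memory allocated. */
static void *alloc_buckets(size_t capacity, size_t bucket_size)
{
    /* old form: calloc((size_t)capacity * bucket_size, 1) */
    /* new form: */
    return calloc(capacity, bucket_size);
}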
@@ -51,11 +51,11 @@ def _generic_test(self, cls, make_value, sha):
 
     def test_nsindex(self):
         self._generic_test(NSIndex, lambda x: (x, x),
-                           '861d6d60069ea45e39d36bed2bdc1d0c07981e0641955f897ac6848be429abac')
+                           '80fba5b40f8cf12f1486f1ba33c9d852fb2b41a5b5961d3b9d1228cf2aa9c4c9')
 
     def test_chunkindex(self):
         self._generic_test(ChunkIndex, lambda x: (x, x, x),
-                           '69464bd0ebbc5866b9f95d838bc48617d21bfe3dcf294682a5c21a2ef6b9dc0b')
+                           '1d71865e72e3c3af18d3c7216b6fa7b014695eaa3ed7f14cf9cd02fba75d1c95')
 
     def test_resize(self):
         n = 2000  # Must be >= MIN_BUCKETS