From fd0649767a734083cb38d72d8e12def8b0557c64 Mon Sep 17 00:00:00 2001
From: Thomas Waldmann
Date: Fri, 3 Mar 2017 15:41:08 +0100
Subject: [PATCH] hashindex: rebuild hashtable if we have too little empty buckets, fixes #2246

if there are too many deleted buckets (tombstones), hashtable
performance goes down the drain.

in the worst case of 0 empty buckets and lots of tombstones,
this results in full table scans for new / unknown keys.

thus we make sure we always have a good amount of empty buckets.
---
 src/borg/_hashindex.c | 45 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/src/borg/_hashindex.c b/src/borg/_hashindex.c
index adcb90fd7..51290c5a1 100644
--- a/src/borg/_hashindex.c
+++ b/src/borg/_hashindex.c
@@ -47,11 +47,13 @@ typedef struct {
     void *buckets;
     int num_entries;
     int num_buckets;
+    int num_empty;
     int key_size;
     int value_size;
     off_t bucket_size;
     int lower_limit;
     int upper_limit;
+    int min_empty;
 } HashIndex;
 
 /* prime (or w/ big prime factors) hash table sizes
@@ -77,6 +79,7 @@ static int hash_sizes[] = {
 
 #define HASH_MIN_LOAD .25
 #define HASH_MAX_LOAD .75  /* don't go higher than 0.75, otherwise performance severely suffers! */
+#define HASH_MAX_EFF_LOAD .93
 
 #define MAX(x, y) ((x) > (y) ? (x): (y))
 #define NELEMS(x) (sizeof(x) / sizeof((x)[0]))
@@ -171,8 +174,10 @@ hashindex_resize(HashIndex *index, int capacity)
     free(index->buckets);
     index->buckets = new->buckets;
     index->num_buckets = new->num_buckets;
+    index->num_empty = index->num_buckets - index->num_entries;
     index->lower_limit = new->lower_limit;
     index->upper_limit = new->upper_limit;
+    index->min_empty = new->min_empty;
     free(new);
     return 1;
 }
@@ -191,6 +196,11 @@ int get_upper_limit(int num_buckets){
     return (int)(num_buckets * HASH_MAX_LOAD);
 }
 
+int get_min_empty(int num_buckets){
+    /* Differently from load, the effective load also considers tombstones (deleted buckets). */
+    return (int)(num_buckets * (1.0 - HASH_MAX_EFF_LOAD));
+}
+
 int size_idx(int size){
     /* find the hash_sizes index with entry >= size */
     int elems = NELEMS(hash_sizes);
@@ -224,6 +234,19 @@ int shrink_size(int current){
     return hash_sizes[i];
 }
 
+int
+count_empty(HashIndex *index)
+{   /* count empty (never used) buckets. this does NOT include deleted buckets (tombstones).
+     * TODO: if we ever change HashHeader, save the count there so we do not need this function.
+     */
+    int i, count = 0, capacity = index->num_buckets;
+    for(i = 0; i < capacity; i++) {
+        if(BUCKET_IS_EMPTY(index, i))
+            count++;
+    }
+    return count;
+}
+
 /* Public API */
 static HashIndex *
 hashindex_read(const char *path)
@@ -303,6 +326,17 @@ hashindex_read(const char *path)
     index->bucket_size = index->key_size + index->value_size;
     index->lower_limit = get_lower_limit(index->num_buckets);
     index->upper_limit = get_upper_limit(index->num_buckets);
+    index->min_empty = get_min_empty(index->num_buckets);
+    index->num_empty = count_empty(index);
+    if(index->num_empty < index->min_empty) {
+        /* too many tombstones here / not enough empty buckets, do a same-size rebuild */
+        if(!hashindex_resize(index, index->num_buckets)) {
+            free(index->buckets);
+            free(index);
+            index = NULL;
+            goto fail;
+        }
+    }
 fail:
     if(fclose(fd) < 0) {
         EPRINTF_PATH(path, "fclose failed");
@@ -330,9 +364,11 @@ hashindex_init(int capacity, int key_size, int value_size)
     index->key_size = key_size;
     index->value_size = value_size;
     index->num_buckets = capacity;
+    index->num_empty = capacity;
     index->bucket_size = index->key_size + index->value_size;
     index->lower_limit = get_lower_limit(index->num_buckets);
     index->upper_limit = get_upper_limit(index->num_buckets);
+    index->min_empty = get_min_empty(index->num_buckets);
     for(i = 0; i < capacity; i++) {
         BUCKET_MARK_EMPTY(index, i);
     }
@@ -406,6 +442,15 @@ hashindex_set(HashIndex *index, const void *key, const void *value)
         while(!BUCKET_IS_EMPTY(index, idx) && !BUCKET_IS_DELETED(index, idx)) {
             idx = (idx + 1) % index->num_buckets;
         }
+        if(BUCKET_IS_EMPTY(index, idx)){
+            index->num_empty--;
+            if(index->num_empty < index->min_empty) {
+                /* too many tombstones here / not enough empty buckets, do a same-size rebuild */
+                if(!hashindex_resize(index, index->num_buckets)) {
+                    return 0;
+                }
+            }
+        }
         ptr = BUCKET_ADDR(index, idx);
         memcpy(ptr, key, index->key_size);
         memcpy(ptr + index->key_size, value, index->value_size);