From b2117bfd4f9b1a08a1e7b7d812d4eb5a2aad7b9e Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Wed, 8 Feb 2023 04:40:27 +0100 Subject: [PATCH 01/10] _hashindex: easier to understand code, dubious loops removed, asserts hashindex_index returns the perfect hashtable index, but does not check what's in the bucket there, so we had these loops afterwards to search for an empty or deleted bucket. problem: if the HT were completely filled with no empty and no deleted buckets, that loop would never end. due to our HT resizing, it can never happen, but still not pretty. when using hashindex_lookup (as also used some lines above), the code is easier to understand, because (after we resized the HT), we freshly create the same situation as after the first call of that function: - return value < 0, because we (still) can not find the key - start_idx will point to an empty bucket Thus, we do not need the problematic loops we had there. Modified the checks to make sure we really have an empty or deleted bucket before overwriting it with data. Added some additional asserts to make sure the code behaves. --- src/borg/_hashindex.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/src/borg/_hashindex.c b/src/borg/_hashindex.c index 7cd4cf31a..0f2b0c5b1 100644 --- a/src/borg/_hashindex.c +++ b/src/borg/_hashindex.c @@ -562,23 +562,23 @@ static int hashindex_set(HashIndex *index, const unsigned char *key, const void *value) { int start_idx; - int idx = hashindex_lookup(index, key, &start_idx); + int idx = hashindex_lookup(index, key, &start_idx); /* if idx < 0: start_idx -> EMPTY or DELETED */ uint8_t *ptr; if(idx < 0) { if(index->num_entries > index->upper_limit) { + /* hashtable too full, grow it! */ if(!hashindex_resize(index, grow_size(index->num_buckets))) { return 0; } - start_idx = hashindex_index(index, key); + /* we have just built a fresh hashtable and removed all tombstones, + * so we only have EMPTY or USED buckets, but no DELETED ones any more. + */ + idx = hashindex_lookup(index, key, &start_idx); + assert(idx < 0); + assert(BUCKET_IS_EMPTY(index, start_idx)); } idx = start_idx; - while(!BUCKET_IS_EMPTY(index, idx) && !BUCKET_IS_DELETED(index, idx)) { - idx++; - if (idx >= index->num_buckets){ - idx -= index->num_buckets; - } - } if(BUCKET_IS_EMPTY(index, idx)){ index->num_empty--; if(index->num_empty < index->min_empty) { @@ -589,14 +589,15 @@ hashindex_set(HashIndex *index, const unsigned char *key, const void *value) /* we have just built a fresh hashtable and removed all tombstones, * so we only have EMPTY or USED buckets, but no DELETED ones any more. */ - idx = start_idx = hashindex_index(index, key); - while(!BUCKET_IS_EMPTY(index, idx)) { - idx++; - if (idx >= index->num_buckets){ - idx -= index->num_buckets; - } - } + idx = hashindex_lookup(index, key, &start_idx); + assert(idx < 0); + assert(BUCKET_IS_EMPTY(index, start_idx)); + idx = start_idx; } + } else if(BUCKET_IS_DELETED(index, idx)) { + /* as expected, nothing to do */ + } else { + assert(0); /* bucket is full, must not happen! */ } ptr = BUCKET_ADDR(index, idx); memcpy(ptr, key, index->key_size); From ecf5ad43edd82b9546c971c1b48f1e116c94372b Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Wed, 8 Feb 2023 15:05:18 +0100 Subject: [PATCH 02/10] hashindex: simplify assert --- src/borg/_hashindex.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/borg/_hashindex.c b/src/borg/_hashindex.c index 0f2b0c5b1..71180a595 100644 --- a/src/borg/_hashindex.c +++ b/src/borg/_hashindex.c @@ -594,10 +594,9 @@ hashindex_set(HashIndex *index, const unsigned char *key, const void *value) assert(BUCKET_IS_EMPTY(index, start_idx)); idx = start_idx; } - } else if(BUCKET_IS_DELETED(index, idx)) { - /* as expected, nothing to do */ } else { - assert(0); /* bucket is full, must not happen! */ + /* Bucket must be either EMPTY (see above) or DELETED. */ + assert(BUCKET_IS_DELETED(index, idx)); } ptr = BUCKET_ADDR(index, idx); memcpy(ptr, key, index->key_size); From 3a44894aca3fc419c173b14c69c0bdaed3fbf49e Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Wed, 8 Feb 2023 15:09:13 +0100 Subject: [PATCH 03/10] bugfix: do not resize hashindex with wrong num_empty otherwise we would lose the decrement operation on num_empty. --- src/borg/_hashindex.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/borg/_hashindex.c b/src/borg/_hashindex.c index 71180a595..477bfb1bf 100644 --- a/src/borg/_hashindex.c +++ b/src/borg/_hashindex.c @@ -580,8 +580,7 @@ hashindex_set(HashIndex *index, const unsigned char *key, const void *value) } idx = start_idx; if(BUCKET_IS_EMPTY(index, idx)){ - index->num_empty--; - if(index->num_empty < index->min_empty) { + if(index->num_empty <= index->min_empty) { /* too many tombstones here / not enough empty buckets, do a same-size rebuild */ if(!hashindex_resize(index, index->num_buckets)) { return 0; @@ -594,6 +593,7 @@ hashindex_set(HashIndex *index, const unsigned char *key, const void *value) assert(BUCKET_IS_EMPTY(index, start_idx)); idx = start_idx; } + index->num_empty--; } else { /* Bucket must be either EMPTY (see above) or DELETED. */ assert(BUCKET_IS_DELETED(index, idx)); From 32519617b6b6b35270215263d0087b4828bffa4d Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Wed, 8 Feb 2023 15:33:58 +0100 Subject: [PATCH 04/10] more comments for hashindex_lookup --- src/borg/_hashindex.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/borg/_hashindex.c b/src/borg/_hashindex.c index 477bfb1bf..1489c68f7 100644 --- a/src/borg/_hashindex.c +++ b/src/borg/_hashindex.c @@ -148,22 +148,24 @@ static int hashindex_lookup(HashIndex *index, const unsigned char *key, int *start_idx) { int didx = -1; - int start = hashindex_index(index, key); + int start = hashindex_index(index, key); /* perfect index for this key, if there is no collision. */ int idx = start; for(;;) { if(BUCKET_IS_EMPTY(index, idx)) { - break; + break; /* if we encounter an empty bucket, we do not need to look any further. */ } if(BUCKET_IS_DELETED(index, idx)) { if(didx == -1) { - didx = idx; + didx = idx; /* remember the index of the first deleted bucket. */ } } else if(BUCKET_MATCHES_KEY(index, idx, key)) { + /* we found the bucket with the key we are looking for! */ if (didx != -1) { // note: although lookup is logically a read-only operation, - // we optimize (change) the hashindex here "on the fly". + // we optimize (change) the hashindex here "on the fly": + // swap this full bucket with a previous deleted/tombstone bucket. memcpy(BUCKET_ADDR(index, didx), BUCKET_ADDR(index, idx), index->bucket_size); BUCKET_MARK_DELETED(index, idx); idx = didx; @@ -175,10 +177,16 @@ hashindex_lookup(HashIndex *index, const unsigned char *key, int *start_idx) idx -= index->num_buckets; } if(idx == start) { + /* we have done a full pass over all buckets. */ break; } } + /* we get here if we did not find a bucket with the key we searched for. */ if (start_idx != NULL) { + /* by giving a non-NULL pointer in start_idx, caller can request to + * get the index of the first empty or deleted bucket we encountered, + * e.g. to add a new entry for that key into that bucket. + */ (*start_idx) = (didx == -1) ? idx : didx; } return -1; From 51189e1383576679e7c52d0d4d62854209fc9ab0 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Wed, 8 Feb 2023 16:04:15 +0100 Subject: [PATCH 05/10] if HT is full with entries and tombstones: give up/fail early --- src/borg/_hashindex.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/borg/_hashindex.c b/src/borg/_hashindex.c index 1489c68f7..2a44d96da 100644 --- a/src/borg/_hashindex.c +++ b/src/borg/_hashindex.c @@ -177,8 +177,14 @@ hashindex_lookup(HashIndex *index, const unsigned char *key, int *start_idx) idx -= index->num_buckets; } if(idx == start) { - /* we have done a full pass over all buckets. */ - break; + /* We have done a full pass over all buckets. + * - We did not find a bucket with the key we searched for. + * - We did not find an empty bucket either. + * So all buckets are either full or deleted/tombstones. + * This is an invalid state we never should get into, see + * upper_limit and min_empty. + */ + assert(0); /* should never happen - something is wrong here. */ } } /* we get here if we did not find a bucket with the key we searched for. */ From 241eaec4133deb23ad5eda7c982a92aceac3b9f9 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Wed, 8 Feb 2023 16:13:30 +0100 Subject: [PATCH 06/10] implement ht idx wrap around less strangely, add comment --- src/borg/_hashindex.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/borg/_hashindex.c b/src/borg/_hashindex.c index 2a44d96da..31473051a 100644 --- a/src/borg/_hashindex.c +++ b/src/borg/_hashindex.c @@ -173,8 +173,8 @@ hashindex_lookup(HashIndex *index, const unsigned char *key, int *start_idx) return idx; } idx++; - if (idx >= index->num_buckets) { - idx -= index->num_buckets; + if (idx >= index->num_buckets) { /* triggers at == already */ + idx = 0; } if(idx == start) { /* We have done a full pass over all buckets. From ec32413b5e15f5583c4c45f2d956e381140ad41a Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Wed, 8 Feb 2023 16:23:10 +0100 Subject: [PATCH 07/10] hashindex: always have at least 1 empty bucket avoid rounding / integer conversion issues bringing this down to 0. --- src/borg/_hashindex.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/borg/_hashindex.c b/src/borg/_hashindex.c index 31473051a..aff440974 100644 --- a/src/borg/_hashindex.c +++ b/src/borg/_hashindex.c @@ -241,8 +241,10 @@ int get_upper_limit(int num_buckets){ } int get_min_empty(int num_buckets){ - /* Differently from load, the effective load also considers tombstones (deleted buckets). */ - return (int)(num_buckets * (1.0 - HASH_MAX_EFF_LOAD)); + /* Differently from load, the effective load also considers tombstones (deleted buckets). + * We always add 1, so this never can return 0 (0 empty buckets would be a bad HT state). + */ + return 1 + (int)(num_buckets * (1.0 - HASH_MAX_EFF_LOAD)); } int size_idx(int size){ From d57fafadadcedac7ec55379757b0e24e27f5e20f Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Wed, 8 Feb 2023 19:50:44 +0100 Subject: [PATCH 08/10] Simplify full HT scan assertion --- src/borg/_hashindex.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/src/borg/_hashindex.c b/src/borg/_hashindex.c index aff440974..e40d6dfba 100644 --- a/src/borg/_hashindex.c +++ b/src/borg/_hashindex.c @@ -176,16 +176,14 @@ hashindex_lookup(HashIndex *index, const unsigned char *key, int *start_idx) if (idx >= index->num_buckets) { /* triggers at == already */ idx = 0; } - if(idx == start) { - /* We have done a full pass over all buckets. - * - We did not find a bucket with the key we searched for. - * - We did not find an empty bucket either. - * So all buckets are either full or deleted/tombstones. - * This is an invalid state we never should get into, see - * upper_limit and min_empty. - */ - assert(0); /* should never happen - something is wrong here. */ - } + /* When idx == start, we have done a full pass over all buckets. + * - We did not find a bucket with the key we searched for. + * - We did not find an empty bucket either. + * So all buckets are either full or deleted/tombstones. + * This is an invalid state we never should get into, see + * upper_limit and min_empty. + */ + assert(idx != start); } /* we get here if we did not find a bucket with the key we searched for. */ if (start_idx != NULL) { From f1d68fe4b3227c103875bea03465bee2f241b22d Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Wed, 8 Feb 2023 19:59:51 +0100 Subject: [PATCH 09/10] hashindex: simplify size_idx function Thanks to @jdchristensen for the code. --- src/borg/_hashindex.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/src/borg/_hashindex.c b/src/borg/_hashindex.c index e40d6dfba..7c73e109f 100644 --- a/src/borg/_hashindex.c +++ b/src/borg/_hashindex.c @@ -246,16 +246,10 @@ int get_min_empty(int num_buckets){ } int size_idx(int size){ - /* find the hash_sizes index with entry >= size */ - int elems = NELEMS(hash_sizes); - int entry, i=0; - do{ - entry = hash_sizes[i++]; - }while((entry < size) && (i < elems)); - if (i >= elems) - return elems - 1; - i--; - return i; + /* find the smallest hash_sizes index with entry >= size */ + int i = NELEMS(hash_sizes) - 1; + while(i >= 0 && hash_sizes[i] >= size) i--; + return i + 1; } int fit_size(int current){ From a7ce1db52903c5cf1169b6aa8b61e4fa77c841d2 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Wed, 8 Feb 2023 22:45:11 +0100 Subject: [PATCH 10/10] add num_entries assertion --- src/borg/_hashindex.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/borg/_hashindex.c b/src/borg/_hashindex.c index 7c73e109f..deeadaab9 100644 --- a/src/borg/_hashindex.c +++ b/src/borg/_hashindex.c @@ -213,6 +213,8 @@ hashindex_resize(HashIndex *index, int capacity) return 0; } } + assert(index->num_entries == new->num_entries); + hashindex_free_buckets(index); index->buckets = new->buckets; index->num_buckets = new->num_buckets;