From c9573c04ac56907dae16b96376cc364749e70a27 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Wed, 8 Feb 2023 04:40:27 +0100 Subject: [PATCH 01/10] _hashindex: easier to understand code, dubious loops removed, asserts hashindex_index returns the perfect hashtable index, but does not check what's in the bucket there, so we had these loops afterwards to search for an empty or deleted bucket. problem: if the HT were completely filled with no empty and no deleted buckets, that loop would never end. due to our HT resizing, it can never happen, but still not pretty. when using hashindex_lookup (as also used some lines above), the code is easier to understand, because (after we resized the HT), we freshly create the same situation as after the first call of that function: - return value < 0, because we (still) can not find the key - start_idx will point to an empty bucket Thus, we do not need the problematic loops we had there. Modified the checks to make sure we really have an empty or deleted bucket before overwriting it with data. Added some additional asserts to make sure the code behaves. --- src/borg/_hashindex.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/src/borg/_hashindex.c b/src/borg/_hashindex.c index 5a006f913..6b632b37b 100644 --- a/src/borg/_hashindex.c +++ b/src/borg/_hashindex.c @@ -731,23 +731,23 @@ static int hashindex_set(HashIndex *index, const unsigned char *key, const void *value) { int start_idx; - int idx = hashindex_lookup(index, key, &start_idx); + int idx = hashindex_lookup(index, key, &start_idx); /* if idx < 0: start_idx -> EMPTY or DELETED */ uint8_t *ptr; if(idx < 0) { if(index->num_entries > index->upper_limit) { + /* hashtable too full, grow it! */ if(!hashindex_resize(index, grow_size(index->num_buckets))) { return 0; } - start_idx = hashindex_index(index, key); + /* we have just built a fresh hashtable and removed all tombstones, + * so we only have EMPTY or USED buckets, but no DELETED ones any more. + */ + idx = hashindex_lookup(index, key, &start_idx); + assert(idx < 0); + assert(BUCKET_IS_EMPTY(index, start_idx)); } idx = start_idx; - while(!BUCKET_IS_EMPTY(index, idx) && !BUCKET_IS_DELETED(index, idx)) { - idx++; - if (idx >= index->num_buckets){ - idx -= index->num_buckets; - } - } if(BUCKET_IS_EMPTY(index, idx)){ index->num_empty--; if(index->num_empty < index->min_empty) { @@ -758,14 +758,15 @@ hashindex_set(HashIndex *index, const unsigned char *key, const void *value) /* we have just built a fresh hashtable and removed all tombstones, * so we only have EMPTY or USED buckets, but no DELETED ones any more. */ - idx = start_idx = hashindex_index(index, key); - while(!BUCKET_IS_EMPTY(index, idx)) { - idx++; - if (idx >= index->num_buckets){ - idx -= index->num_buckets; - } - } + idx = hashindex_lookup(index, key, &start_idx); + assert(idx < 0); + assert(BUCKET_IS_EMPTY(index, start_idx)); + idx = start_idx; } + } else if(BUCKET_IS_DELETED(index, idx)) { + /* as expected, nothing to do */ + } else { + assert(0); /* bucket is full, must not happen! */ } ptr = BUCKET_ADDR(index, idx); memcpy(ptr, key, index->key_size); From 83e6b4269eb7cd608861d8a87abe7bd0512c00a5 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Wed, 8 Feb 2023 15:05:18 +0100 Subject: [PATCH 02/10] hashindex: simplify assert --- src/borg/_hashindex.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/borg/_hashindex.c b/src/borg/_hashindex.c index 6b632b37b..da378f45c 100644 --- a/src/borg/_hashindex.c +++ b/src/borg/_hashindex.c @@ -763,10 +763,9 @@ hashindex_set(HashIndex *index, const unsigned char *key, const void *value) assert(BUCKET_IS_EMPTY(index, start_idx)); idx = start_idx; } - } else if(BUCKET_IS_DELETED(index, idx)) { - /* as expected, nothing to do */ } else { - assert(0); /* bucket is full, must not happen! */ + /* Bucket must be either EMPTY (see above) or DELETED. */ + assert(BUCKET_IS_DELETED(index, idx)); } ptr = BUCKET_ADDR(index, idx); memcpy(ptr, key, index->key_size); From 9bf352d00c3017867ca971777ad63d257df7dd91 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Wed, 8 Feb 2023 15:09:13 +0100 Subject: [PATCH 03/10] bugfix: do not resize hashindex with wrong num_empty otherwise we would lose the decrement operation on num_empty. --- src/borg/_hashindex.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/borg/_hashindex.c b/src/borg/_hashindex.c index da378f45c..7eb1f5ea0 100644 --- a/src/borg/_hashindex.c +++ b/src/borg/_hashindex.c @@ -749,8 +749,7 @@ hashindex_set(HashIndex *index, const unsigned char *key, const void *value) } idx = start_idx; if(BUCKET_IS_EMPTY(index, idx)){ - index->num_empty--; - if(index->num_empty < index->min_empty) { + if(index->num_empty <= index->min_empty) { /* too many tombstones here / not enough empty buckets, do a same-size rebuild */ if(!hashindex_resize(index, index->num_buckets)) { return 0; @@ -763,6 +762,7 @@ hashindex_set(HashIndex *index, const unsigned char *key, const void *value) assert(BUCKET_IS_EMPTY(index, start_idx)); idx = start_idx; } + index->num_empty--; } else { /* Bucket must be either EMPTY (see above) or DELETED. */ assert(BUCKET_IS_DELETED(index, idx)); From 0098ac9e6364e98737a811f4a2d654517d7aa001 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Wed, 8 Feb 2023 15:33:58 +0100 Subject: [PATCH 04/10] more comments for hashindex_lookup --- src/borg/_hashindex.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/borg/_hashindex.c b/src/borg/_hashindex.c index 7eb1f5ea0..4e90c55ab 100644 --- a/src/borg/_hashindex.c +++ b/src/borg/_hashindex.c @@ -160,22 +160,24 @@ static int hashindex_lookup(HashIndex *index, const unsigned char *key, int *start_idx) { int didx = -1; - int start = hashindex_index(index, key); + int start = hashindex_index(index, key); /* perfect index for this key, if there is no collision. */ int idx = start; for(;;) { if(BUCKET_IS_EMPTY(index, idx)) { - break; + break; /* if we encounter an empty bucket, we do not need to look any further. */ } if(BUCKET_IS_DELETED(index, idx)) { if(didx == -1) { - didx = idx; + didx = idx; /* remember the index of the first deleted bucket. */ } } else if(BUCKET_MATCHES_KEY(index, idx, key)) { + /* we found the bucket with the key we are looking for! */ if (didx != -1) { // note: although lookup is logically a read-only operation, - // we optimize (change) the hashindex here "on the fly". + // we optimize (change) the hashindex here "on the fly": + // swap this full bucket with a previous deleted/tombstone bucket. memcpy(BUCKET_ADDR(index, didx), BUCKET_ADDR(index, idx), index->bucket_size); BUCKET_MARK_DELETED(index, idx); idx = didx; @@ -187,10 +189,16 @@ hashindex_lookup(HashIndex *index, const unsigned char *key, int *start_idx) idx -= index->num_buckets; } if(idx == start) { + /* we have done a full pass over all buckets. */ break; } } + /* we get here if we did not find a bucket with the key we searched for. */ if (start_idx != NULL) { + /* by giving a non-NULL pointer in start_idx, caller can request to + * get the index of the first empty or deleted bucket we encountered, + * e.g. to add a new entry for that key into that bucket. + */ (*start_idx) = (didx == -1) ? idx : didx; } return -1; From 907da00931c542eda1af39805f01080e581e4423 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Wed, 8 Feb 2023 16:04:15 +0100 Subject: [PATCH 05/10] if HT is full with entries and tombstones: give up/fail early --- src/borg/_hashindex.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/borg/_hashindex.c b/src/borg/_hashindex.c index 4e90c55ab..8f8b4ac9d 100644 --- a/src/borg/_hashindex.c +++ b/src/borg/_hashindex.c @@ -189,8 +189,14 @@ hashindex_lookup(HashIndex *index, const unsigned char *key, int *start_idx) idx -= index->num_buckets; } if(idx == start) { - /* we have done a full pass over all buckets. */ - break; + /* We have done a full pass over all buckets. + * - We did not find a bucket with the key we searched for. + * - We did not find an empty bucket either. + * So all buckets are either full or deleted/tombstones. + * This is an invalid state we never should get into, see + * upper_limit and min_empty. + */ + assert(0); /* should never happen - something is wrong here. */ } } /* we get here if we did not find a bucket with the key we searched for. */ From 4f8c4aea19d58fd86901e3b2df745ef99555e8e4 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Wed, 8 Feb 2023 16:13:30 +0100 Subject: [PATCH 06/10] implement ht idx wrap around less strangely, add comment --- src/borg/_hashindex.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/borg/_hashindex.c b/src/borg/_hashindex.c index 8f8b4ac9d..5b973f247 100644 --- a/src/borg/_hashindex.c +++ b/src/borg/_hashindex.c @@ -185,8 +185,8 @@ hashindex_lookup(HashIndex *index, const unsigned char *key, int *start_idx) return idx; } idx++; - if (idx >= index->num_buckets) { - idx -= index->num_buckets; + if (idx >= index->num_buckets) { /* triggers at == already */ + idx = 0; } if(idx == start) { /* We have done a full pass over all buckets. From 4fc7815f1191d558a944861f1e4d0e7129af7ded Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Wed, 8 Feb 2023 16:23:10 +0100 Subject: [PATCH 07/10] hashindex: always have at least 1 empty bucket avoid rounding / integer conversion issues bringing this down to 0. --- src/borg/_hashindex.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/borg/_hashindex.c b/src/borg/_hashindex.c index 5b973f247..2e68dbefc 100644 --- a/src/borg/_hashindex.c +++ b/src/borg/_hashindex.c @@ -253,8 +253,10 @@ int get_upper_limit(int num_buckets){ } int get_min_empty(int num_buckets){ - /* Differently from load, the effective load also considers tombstones (deleted buckets). */ - return (int)(num_buckets * (1.0 - HASH_MAX_EFF_LOAD)); + /* Differently from load, the effective load also considers tombstones (deleted buckets). + * We always add 1, so this never can return 0 (0 empty buckets would be a bad HT state). + */ + return 1 + (int)(num_buckets * (1.0 - HASH_MAX_EFF_LOAD)); } int size_idx(int size){ From a13d53ec1ed25f194a72e5148abee77abf027f1e Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Wed, 8 Feb 2023 19:50:44 +0100 Subject: [PATCH 08/10] Simplify full HT scan assertion --- src/borg/_hashindex.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/src/borg/_hashindex.c b/src/borg/_hashindex.c index 2e68dbefc..183a2a29f 100644 --- a/src/borg/_hashindex.c +++ b/src/borg/_hashindex.c @@ -188,16 +188,14 @@ hashindex_lookup(HashIndex *index, const unsigned char *key, int *start_idx) if (idx >= index->num_buckets) { /* triggers at == already */ idx = 0; } - if(idx == start) { - /* We have done a full pass over all buckets. - * - We did not find a bucket with the key we searched for. - * - We did not find an empty bucket either. - * So all buckets are either full or deleted/tombstones. - * This is an invalid state we never should get into, see - * upper_limit and min_empty. - */ - assert(0); /* should never happen - something is wrong here. */ - } + /* When idx == start, we have done a full pass over all buckets. + * - We did not find a bucket with the key we searched for. + * - We did not find an empty bucket either. + * So all buckets are either full or deleted/tombstones. + * This is an invalid state we never should get into, see + * upper_limit and min_empty. + */ + assert(idx != start); } /* we get here if we did not find a bucket with the key we searched for. */ if (start_idx != NULL) { From b79766a9333c97498cd1edd2bef53ff8579a0ce7 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Wed, 8 Feb 2023 19:59:51 +0100 Subject: [PATCH 09/10] hashindex: simplify size_idx function Thanks to @jdchristensen for the code. --- src/borg/_hashindex.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/src/borg/_hashindex.c b/src/borg/_hashindex.c index 183a2a29f..2550a2ca7 100644 --- a/src/borg/_hashindex.c +++ b/src/borg/_hashindex.c @@ -258,16 +258,10 @@ int get_min_empty(int num_buckets){ } int size_idx(int size){ - /* find the hash_sizes index with entry >= size */ - int elems = NELEMS(hash_sizes); - int entry, i=0; - do{ - entry = hash_sizes[i++]; - }while((entry < size) && (i < elems)); - if (i >= elems) - return elems - 1; - i--; - return i; + /* find the smallest hash_sizes index with entry >= size */ + int i = NELEMS(hash_sizes) - 1; + while(i >= 0 && hash_sizes[i] >= size) i--; + return i + 1; } int fit_size(int current){ From 316bb7c937a8df5605840ff273bc1dc2e360e578 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Wed, 8 Feb 2023 22:45:11 +0100 Subject: [PATCH 10/10] add num_entries assertion --- src/borg/_hashindex.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/borg/_hashindex.c b/src/borg/_hashindex.c index 2550a2ca7..5c4318dac 100644 --- a/src/borg/_hashindex.c +++ b/src/borg/_hashindex.c @@ -225,6 +225,8 @@ hashindex_resize(HashIndex *index, int capacity) return 0; } } + assert(index->num_entries == new->num_entries); + hashindex_free_buckets(index); index->buckets = new->buckets; index->num_buckets = new->num_buckets;