hashindex: rebuild hashtable if we have too few empty buckets, fixes #2246

if there are too many deleted buckets (tombstones), hashtable performance goes down the drain:
a lookup only stops early when it hits an empty bucket, so in the worst case of 0 empty buckets
and lots of tombstones, every lookup of a new / unknown key degrades to a full table scan.
thus we make sure we always have a good amount of empty buckets.
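
For illustration (an editorial sketch, not part of the commit): with open addressing, a lookup for an absent key can only stop early at an EMPTY bucket, never at a DELETED one, because the key might still be stored behind the tombstone. The bucket states and helper below are hypothetical, not borg's actual macros:

    /* sketch: linear probing over buckets that are EMPTY, USED or DELETED */
    enum bucket_state { EMPTY, USED, DELETED };

    /* returns how many buckets a miss has to touch before it can stop */
    int probes_for_miss(enum bucket_state *state, int num_buckets, int start_idx)
    {
        int idx = start_idx, probes = 1;
        while(state[idx] != EMPTY) {       /* tombstones do NOT stop the probe */
            idx = (idx + 1) % num_buckets;
            if(++probes > num_buckets)
                return num_buckets;        /* 0 empty buckets: full table scan */
        }
        return probes;
    }

With zero EMPTY buckets, every miss costs num_buckets probes; guaranteeing min_empty empty buckets bounds that cost.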
Thomas Waldmann 2017-03-03 15:41:08 +01:00
parent 8b5ad3819c
commit c5cd8828b1
1 changed file with 45 additions and 0 deletions


@@ -47,11 +47,13 @@ typedef struct {
     void *buckets;
     int num_entries;
     int num_buckets;
+    int num_empty;
     int key_size;
     int value_size;
     off_t bucket_size;
     int lower_limit;
     int upper_limit;
+    int min_empty;
 } HashIndex;
 
 /* prime (or w/ big prime factors) hash table sizes
@@ -77,6 +79,7 @@ static int hash_sizes[] = {
 #define HASH_MIN_LOAD .25
 #define HASH_MAX_LOAD .75  /* don't go higher than 0.75, otherwise performance severely suffers! */
+#define HASH_MAX_EFF_LOAD .93
 
 #define MAX(x, y) ((x) > (y) ? (x): (y))
 #define NELEMS(x) (sizeof(x) / sizeof((x)[0]))
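
For orientation (editorial, with made-up numbers): HASH_MAX_LOAD caps the plain load num_entries / num_buckets, while the new HASH_MAX_EFF_LOAD caps the effective load (num_entries + tombstones) / num_buckets, which is the same as demanding that at least 7% of all buckets stay EMPTY:

    #include <stdio.h>

    /* worked example: the plain load is fine, but the effective load is not */
    int main(void)
    {
        int num_buckets = 1000;
        int num_entries = 700;     /* load = 0.70 <= HASH_MAX_LOAD (0.75) */
        int num_tombstones = 250;  /* effective load = 0.95 > HASH_MAX_EFF_LOAD (0.93) */
        int num_empty = num_buckets - num_entries - num_tombstones;
        int min_empty = (int)(num_buckets * (1.0 - 0.93));
        /* prints: num_empty=50 min_empty=70 rebuild=1 */
        printf("num_empty=%d min_empty=%d rebuild=%d\n",
               num_empty, min_empty, num_empty < min_empty);
        return 0;
    }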
@@ -167,8 +170,10 @@ hashindex_resize(HashIndex *index, int capacity)
     free(index->buckets);
     index->buckets = new->buckets;
     index->num_buckets = new->num_buckets;
+    index->num_empty = index->num_buckets - index->num_entries;
     index->lower_limit = new->lower_limit;
     index->upper_limit = new->upper_limit;
+    index->min_empty = new->min_empty;
     free(new);
     return 1;
 }
@@ -187,6 +192,11 @@ int get_upper_limit(int num_buckets){
     return (int)(num_buckets * HASH_MAX_LOAD);
 }
 
+int get_min_empty(int num_buckets){
+    /* Differently from load, the effective load also considers tombstones (deleted buckets). */
+    return (int)(num_buckets * (1.0 - HASH_MAX_EFF_LOAD));
+}
+
 int size_idx(int size){
     /* find the hash_sizes index with entry >= size */
     int elems = NELEMS(hash_sizes);
@@ -220,6 +230,19 @@ int shrink_size(int current){
     return hash_sizes[i];
 }
 
+int
+count_empty(HashIndex *index)
+{   /* count empty (never used) buckets. this does NOT include deleted buckets (tombstones).
+     * TODO: if we ever change HashHeader, save the count there so we do not need this function.
+     */
+    int i, count = 0, capacity = index->num_buckets;
+    for(i = 0; i < capacity; i++) {
+        if(BUCKET_IS_EMPTY(index, i))
+            count++;
+    }
+    return count;
+}
+
 /* Public API */
 static HashIndex *
 hashindex_read(const char *path)
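
A cost note (editorial, not from the commit): count_empty() is a full O(num_buckets) scan, but it runs only once, at hashindex_read() time; afterwards num_empty is maintained incrementally by hashindex_set() and hashindex_resize(). Deletions need no bookkeeping here, since turning a USED bucket into a tombstone leaves the number of EMPTY buckets unchanged. The TODO above points at the natural follow-up: persist the count in HashHeader and skip the scan.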
@@ -299,6 +322,17 @@ hashindex_read(const char *path)
     index->bucket_size = index->key_size + index->value_size;
     index->lower_limit = get_lower_limit(index->num_buckets);
     index->upper_limit = get_upper_limit(index->num_buckets);
+    index->min_empty = get_min_empty(index->num_buckets);
+    index->num_empty = count_empty(index);
+    if(index->num_empty < index->min_empty) {
+        /* too many tombstones here / not enough empty buckets, do a same-size rebuild */
+        if(!hashindex_resize(index, index->num_buckets)) {
+            free(index->buckets);
+            free(index);
+            index = NULL;
+            goto fail;
+        }
+    }
 fail:
     if(fclose(fd) < 0) {
         EPRINTF_PATH(path, "fclose failed");
@@ -326,9 +360,11 @@ hashindex_init(int capacity, int key_size, int value_size)
     index->key_size = key_size;
     index->value_size = value_size;
     index->num_buckets = capacity;
+    index->num_empty = capacity;
     index->bucket_size = index->key_size + index->value_size;
     index->lower_limit = get_lower_limit(index->num_buckets);
     index->upper_limit = get_upper_limit(index->num_buckets);
+    index->min_empty = get_min_empty(index->num_buckets);
     for(i = 0; i < capacity; i++) {
         BUCKET_MARK_EMPTY(index, i);
     }
@@ -400,6 +436,15 @@ hashindex_set(HashIndex *index, const void *key, const void *value)
         while(!BUCKET_IS_EMPTY(index, idx) && !BUCKET_IS_DELETED(index, idx)) {
             idx = (idx + 1) % index->num_buckets;
         }
+        if(BUCKET_IS_EMPTY(index, idx)){
+            index->num_empty--;
+            if(index->num_empty < index->min_empty) {
+                /* too many tombstones here / not enough empty buckets, do a same-size rebuild */
+                if(!hashindex_resize(index, index->num_buckets)) {
+                    return 0;
+                }
+            }
+        }
         ptr = BUCKET_ADDR(index, idx);
         memcpy(ptr, key, index->key_size);
         memcpy(ptr + index->key_size, value, index->value_size);
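
One caveat worth flagging (editorial note, not a claim beyond what the diff shows): hashindex_resize() replaces the bucket array and relocates every entry, so after a same-size rebuild the idx found by the probe above refers to the old layout, and the memcpy calls would write to a stale position. A hedged sketch of the re-probe this needs, assuming hashindex_index() is the helper that maps a key to its initial bucket, as used earlier in hashindex_set():

                if(!hashindex_resize(index, index->num_buckets)) {
                    return 0;
                }
                /* the rebuild relocated every entry, so idx is stale: re-probe.
                 * the fresh table contains no tombstones, so stopping at the
                 * first EMPTY bucket is enough to find a free slot. */
                idx = hashindex_index(index, key);
                while(!BUCKET_IS_EMPTY(index, idx)) {
                    idx = (idx + 1) % index->num_buckets;
                }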