/*
 * Hash table sizing for HashIndex.
 *
 * Table sizes (in number of buckets) are taken from the fixed hash_sizes[]
 * table below instead of being doubled/halved. hash_sizes[0] is the minimum
 * table size, the last entry is the maximum. The table is generated by the
 * developer tool borg/hash_sizes.py.
 *
 * The sizing helpers are defined here, directly after the table and macros
 * they depend on, so that every caller further down the file (hashindex_read,
 * hashindex_init, hashindex_set, hashindex_delete) sees a real declaration
 * instead of relying on an implicit one.
 */

/* prime (or w/ big prime factors) hash table sizes - otherwise performance breaks down! */
static const int hash_sizes[] = {
    1031, 2053, 4099, 8209, 16411, 32771, 65537, 131101, 262147, 445649,
    757607, 1287917, 2189459, 3065243, 4291319, 6007867, 8410991,
    11775359, 16485527, 23079703, 27695653, 33234787, 39881729, 47858071,
    57429683, 68915617, 82698751, 99238507, 119086189, 144378011, 157223263,
    173476439, 190253911, 209915011, 230493629, 253169431, 278728861,
    306647623, 337318939, 370742809, 408229973, 449387209, 493428073,
    543105119, 596976533, 657794869, 722676499, 795815791, 874066969,
    962279771, 1057701643, 1164002657, 1280003147, 1407800297, 1548442699,
    1703765389, 1873768367, 2062383853, /* 32bit int ends about here */
};

#define EMPTY _htole32(0xffffffff)
#define DELETED _htole32(0xfffffffe)
#define MAX_BUCKET_SIZE 512
#define BUCKET_LOWER_LIMIT .25
#define BUCKET_UPPER_LIMIT .75 /* don't go higher than 0.75, otherwise performance severely suffers! */
#define MAX(x, y) ((x) > (y) ? (x): (y))
/* number of elements of a real array (not of a pointer!) */
#define NELEMS(x) (sizeof(x) / sizeof((x)[0]))

#define BUCKET_ADDR(index, idx) (index->buckets + (idx * index->bucket_size))
#define BUCKET_IS_DELETED(index, idx) (*((uint32_t *)(BUCKET_ADDR(index, idx) + index->key_size)) == DELETED)

/*
 * Entry count below which a table of num_buckets buckets gets shrunk.
 *
 * Returns 0 for tables at (or below) the minimum size hash_sizes[0], so such
 * a table is never shrunk; otherwise num_buckets * BUCKET_LOWER_LIMIT,
 * truncated to int.
 *
 * hashindex_read() and hashindex_init() store the result in
 * index->lower_limit; hashindex_delete() shrinks when num_entries drops
 * below it.
 */
int get_lower_limit(int num_buckets){
    int min_buckets = hash_sizes[0];
    if (num_buckets <= min_buckets)
        return 0;
    return (int)(num_buckets * BUCKET_LOWER_LIMIT);
}

/*
 * Entry count above which a table of num_buckets buckets gets grown.
 *
 * Returns num_buckets * BUCKET_UPPER_LIMIT, truncated to int. For a table at
 * (or beyond) the maximum size the limit is the maximum size itself;
 * presumably num_entries can never exceed that, so hashindex_set() does not
 * attempt to grow such a table any further.
 *
 * hashindex_read() and hashindex_init() store the result in
 * index->upper_limit; hashindex_set() grows when num_entries exceeds it.
 */
int get_upper_limit(int num_buckets){
    int max_buckets = hash_sizes[NELEMS(hash_sizes) - 1];
    if (num_buckets >= max_buckets)
        return max_buckets;
    return (int)(num_buckets * BUCKET_UPPER_LIMIT);
}

/*
 * Index of the first hash_sizes[] entry that is >= size.
 *
 * If size is bigger than every entry, the index of the last (biggest) entry
 * is returned, so the result is always a valid index.
 */
int size_idx(int size){
    int last = (int)NELEMS(hash_sizes) - 1;
    int i;
    /* only scan up to the second-last entry: the last one is the fallback anyway */
    for (i = 0; i < last; i++) {
        if (hash_sizes[i] >= size)
            return i;
    }
    return last;
}

/*
 * Smallest table size that is >= current (clamped to the biggest size).
 * hashindex_init() uses this to round the requested capacity.
 */
int fit_size(int current){
    int i = size_idx(current);
    return hash_sizes[i];
}

/*
 * Table size one step above the entry that fits current (clamped to the
 * biggest size). hashindex_set() resizes to grow_size(index->num_buckets).
 *
 * NOTE(review): if current is not itself a table entry (e.g. a bucket count
 * read from a file - TODO confirm whether that can happen), size_idx() already
 * points at a bigger entry and the "+ 1" skips one step: grow_size(2000) is
 * 4099, not 2053.
 */
int grow_size(int current){
    int i = size_idx(current) + 1;
    int elems = NELEMS(hash_sizes);
    if (i >= elems)
        return hash_sizes[elems - 1];
    return hash_sizes[i];
}

/*
 * Table size one step below the entry that fits current (clamped to the
 * smallest size). hashindex_delete() resizes to
 * shrink_size(index->num_buckets).
 */
int shrink_size(int current){
    int i = size_idx(current) - 1;
    if (i < 0)
        return hash_sizes[0];
    return hash_sizes[i];
}
"""
Compute hashtable sizes with nice properties
- prime sizes (for small to medium sizes)
- 2 prime-factor sizes (for big sizes)
- fast growth for small sizes
- slow growth for big sizes

Note:
    this is just a tool for developers.
    within borgbackup, it is just used to generate hash_sizes definition for _hashindex.c.
"""

from collections import namedtuple

K, M, G = 2**10, 2**20, 2**30

# hash table size (in number of buckets)
start, end_p1, end_p2 = 1 * K, 127 * M, 2 * G - 10 * M  # stay well below 2^31 - 1

Policy = namedtuple("Policy", "upto grow")

policies = [
    # which growth factor to use when growing a hashtable of size < upto
    # grow fast (*2.0) at the start so we do not have to resize too often (expensive).
    # grow slow (*1.1) for huge hash tables (do not jump too much in memory usage)
    Policy(256*K, 2.0),
    Policy(2*M, 1.7),
    Policy(16*M, 1.4),
    Policy(128*M, 1.2),
    Policy(2*G-1, 1.1),
]


# slightly modified version of:
# http://www.macdevcenter.com/pub/a/python/excerpt/pythonckbk_chap1/index1.html?page=2
def eratosthenes():
    """Yields the sequence of prime numbers via the Sieve of Eratosthenes."""
    D = {}  # map each composite integer to its first-found prime factor
    q = 2  # q gets 2, 3, 4, 5, ... ad infinitum
    while True:
        p = D.pop(q, None)
        if p is None:
            # q not a key in D, so q is prime, therefore, yield it
            yield q
            # mark q squared as not-prime (with q as first-found prime factor)
            D[q * q] = q
        else:
            # let x <- smallest (N*p)+q which wasn't yet known to be composite
            # we just learned x is composite, with p first-found prime factor,
            # since p is the first-found prime factor of q -- find and mark it
            x = p + q
            while x in D:
                x += p
            D[x] = p
        q += 1


def two_prime_factors(pfix=65537):
    """Yields numbers with 2 prime factors pfix and p."""
    for p in eratosthenes():
        yield pfix * p


def get_grow_factor(size):
    """Return the growth factor of the first policy whose ``upto`` is above ``size``.

    Raises ValueError if ``size`` is not covered by any policy (instead of
    silently returning None, which would only fail later when the factor is
    used in a multiplication).
    """
    for p in policies:
        if size < p.upto:
            return p.grow
    raise ValueError("no growth policy covers a hash table size of %d" % size)


def find_bigger_prime(gen, i):
    """Advance generator ``gen`` and return the first value it yields that is >= ``i``.

    The generator is consumed up to and including the returned value.
    """
    while True:
        p = next(gen)
        if p >= i:
            return p


def _collect_sizes(gen, i, end, sizes):
    """Append table sizes taken from ``gen`` to ``sizes`` while ``i < end``.

    ``i`` is the target size; it is multiplied by the policy's growth factor
    after each step. Returns the first target size that is >= ``end``.
    """
    while i < end:
        grow_factor = get_grow_factor(i)
        sizes.append(find_bigger_prime(gen, i))
        i = int(i * grow_factor)
    return i


def main():
    """Print the C definition of the ``hash_sizes`` table to stdout."""
    sizes = []
    # phase 1: real primes
    i = _collect_sizes(eratosthenes(), start, end_p1, sizes)
    # phase 2: products of two primes (for lower ram consumption of this tool)
    _collect_sizes(two_prime_factors(), i, end_p2, sizes)

    print("""\
static int hash_sizes[] = {
    %s
};
""" % ', '.join(str(size) for size in sizes))


if __name__ == '__main__':
    main()