From d88df3edc645d9048ec4647d5c889d2bc2a7e335 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sat, 9 Jan 2016 16:07:54 +0100 Subject: [PATCH 1/6] hashtable size follows a growth policy, fixes #527 also: refactor / dedupe some code into functions --- borg/_hashindex.c | 76 +++++++++++++++++++++++++++++---- borg/hash_sizes.py | 103 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 171 insertions(+), 8 deletions(-) create mode 100644 borg/hash_sizes.py diff --git a/borg/_hashindex.c b/borg/_hashindex.c index 16adbdfc4..9fb7266ec 100644 --- a/borg/_hashindex.c +++ b/borg/_hashindex.c @@ -40,13 +40,26 @@ typedef struct { int upper_limit; } HashIndex; +/* prime (or w/ big prime factors) hash table sizes - otherwise performance breaks down! */ +static int hash_sizes[] = { + 1031, 2053, 4099, 8209, 16411, 32771, 65537, 131101, 262147, 445649, + 757607, 1287917, 2189459, 3065243, 4291319, 6007867, 8410991, + 11775359, 16485527, 23079703, 27695653, 33234787, 39881729, 47858071, + 57429683, 68915617, 82698751, 99238507, 119086189, 144378011, 157223263, + 173476439, 190253911, 209915011, 230493629, 253169431, 278728861, + 306647623, 337318939, 370742809, 408229973, 449387209, 493428073, + 543105119, 596976533, 657794869, 722676499, 795815791, 874066969, + 962279771, 1057701643, 1164002657, 1280003147, 1407800297, 1548442699, + 1703765389, 1873768367, 2062383853, /* 32bit int ends about here */ +}; + #define EMPTY _htole32(0xffffffff) #define DELETED _htole32(0xfffffffe) #define MAX_BUCKET_SIZE 512 #define BUCKET_LOWER_LIMIT .25 #define BUCKET_UPPER_LIMIT .75 /* don't go higher than 0.75, otherwise performance severely suffers! */ -#define MIN_BUCKETS 1031 /* must be prime, otherwise performance breaks down! */ #define MAX(x, y) ((x) > (y) ? 
(x): (y)) +#define NELEMS(x) (sizeof(x) / sizeof((x)[0])) #define BUCKET_ADDR(index, idx) (index->buckets + (idx * index->bucket_size)) #define BUCKET_IS_DELETED(index, idx) (*((uint32_t *)(BUCKET_ADDR(index, idx) + index->key_size)) == DELETED) @@ -207,8 +220,8 @@ hashindex_read(const char *path) index->key_size = header.key_size; index->value_size = header.value_size; index->bucket_size = index->key_size + index->value_size; - index->lower_limit = index->num_buckets > MIN_BUCKETS ? ((int)(index->num_buckets * BUCKET_LOWER_LIMIT)) : 0; - index->upper_limit = (int)(index->num_buckets * BUCKET_UPPER_LIMIT); + index->lower_limit = get_lower_limit(index->num_buckets); + index->upper_limit = get_upper_limit(index->num_buckets); fail: if(fclose(fd) < 0) { EPRINTF_PATH(path, "fclose failed"); @@ -216,12 +229,59 @@ fail: return index; } +int get_lower_limit(int num_buckets){ + int min_buckets = hash_sizes[0]; + if (num_buckets <= min_buckets) + return 0; + return (int)(num_buckets * BUCKET_LOWER_LIMIT); +} + +int get_upper_limit(int num_buckets){ + int max_buckets = hash_sizes[NELEMS(hash_sizes) - 1]; + if (num_buckets >= max_buckets) + return max_buckets; + return (int)(num_buckets * BUCKET_UPPER_LIMIT); +} + +int size_idx(int size){ + /* find the hash_sizes index with entry >= size */ + int elems = NELEMS(hash_sizes); + int entry, i=0; + do{ + entry = hash_sizes[i++]; + }while((entry < size) && (i < elems)); + if (i >= elems) + return elems - 1; + i--; + return i; +} + +int fit_size(int current){ + int i = size_idx(current); + return hash_sizes[i]; +} + +int grow_size(int current){ + int i = size_idx(current) + 1; + int elems = NELEMS(hash_sizes); + if (i >= elems) + return hash_sizes[elems - 1]; + return hash_sizes[i]; +} + +int shrink_size(int current){ + int i = size_idx(current) - 1; + if (i < 0) + return hash_sizes[0]; + return hash_sizes[i]; +} + static HashIndex * hashindex_init(int capacity, int key_size, int value_size) { HashIndex *index; int i; - capacity = 
MAX(MIN_BUCKETS, capacity); + capacity = fit_size(capacity); if(!(index = malloc(sizeof(HashIndex)))) { EPRINTF("malloc header failed"); @@ -237,8 +297,8 @@ hashindex_init(int capacity, int key_size, int value_size) index->value_size = value_size; index->num_buckets = capacity; index->bucket_size = index->key_size + index->value_size; - index->lower_limit = index->num_buckets > MIN_BUCKETS ? ((int)(index->num_buckets * BUCKET_LOWER_LIMIT)) : 0; - index->upper_limit = (int)(index->num_buckets * BUCKET_UPPER_LIMIT); + index->lower_limit = get_lower_limit(index->num_buckets); + index->upper_limit = get_upper_limit(index->num_buckets); for(i = 0; i < capacity; i++) { BUCKET_MARK_EMPTY(index, i); } @@ -302,7 +362,7 @@ hashindex_set(HashIndex *index, const void *key, const void *value) if(idx < 0) { if(index->num_entries > index->upper_limit) { - if(!hashindex_resize(index, index->num_buckets * 2)) { + if(!hashindex_resize(index, grow_size(index->num_buckets))) { return 0; } } @@ -332,7 +392,7 @@ hashindex_delete(HashIndex *index, const void *key) BUCKET_MARK_DELETED(index, idx); index->num_entries -= 1; if(index->num_entries < index->lower_limit) { - if(!hashindex_resize(index, index->num_buckets / 2)) { + if(!hashindex_resize(index, shrink_size(index->num_buckets))) { return 0; } } diff --git a/borg/hash_sizes.py b/borg/hash_sizes.py new file mode 100644 index 000000000..68e6e160a --- /dev/null +++ b/borg/hash_sizes.py @@ -0,0 +1,103 @@ +""" +Compute hashtable sizes with nice properties +- prime sizes (for small to medium sizes) +- 2 prime-factor sizes (for big sizes) +- fast growth for small sizes +- slow growth for big sizes + +Note: + this is just a tool for developers. + within borgbackup, it is just used to generate hash_sizes definition for _hashindex.c. 
+""" + +from collections import namedtuple + +K, M, G = 2**10, 2**20, 2**30 + +# hash table size (in number of buckets) +start, end_p1, end_p2 = 1 * K, 127 * M, 2 * G - 10 * M # stay well below 2^31 - 1 + +Policy = namedtuple("Policy", "upto grow") + +policies = [ + # which growth factor to use when growing a hashtable of size < upto + # grow fast (*2.0) at the start so we do not have to resize too often (expensive). + # grow slow (*1.1) for huge hash tables (do not jump too much in memory usage) + Policy(256*K, 2.0), + Policy(2*M, 1.7), + Policy(16*M, 1.4), + Policy(128*M, 1.2), + Policy(2*G-1, 1.1), +] + + +# slightly modified version of: +# http://www.macdevcenter.com/pub/a/python/excerpt/pythonckbk_chap1/index1.html?page=2 +def eratosthenes(): + """Yields the sequence of prime numbers via the Sieve of Eratosthenes.""" + D = {} # map each composite integer to its first-found prime factor + q = 2 # q gets 2, 3, 4, 5, ... ad infinitum + while True: + p = D.pop(q, None) + if p is None: + # q not a key in D, so q is prime, therefore, yield it + yield q + # mark q squared as not-prime (with q as first-found prime factor) + D[q * q] = q + else: + # let x <- smallest (N*p)+q which wasn't yet known to be composite + # we just learned x is composite, with p first-found prime factor, + # since p is the first-found prime factor of q -- find and mark it + x = p + q + while x in D: + x += p + D[x] = p + q += 1 + + +def two_prime_factors(pfix=65537): + """Yields numbers with 2 prime factors pfix and p.""" + for p in eratosthenes(): + yield pfix * p + + +def get_grow_factor(size): + for p in policies: + if size < p.upto: + return p.grow + + +def find_bigger_prime(gen, i): + while True: + p = next(gen) + if p >= i: + return p + + +def main(): + sizes = [] + i = start + + gen = eratosthenes() + while i < end_p1: + grow_factor = get_grow_factor(i) + p = find_bigger_prime(gen, i) + sizes.append(p) + i = int(i * grow_factor) + + gen = two_prime_factors() # for lower ram consumption 
+ while i < end_p2: + grow_factor = get_grow_factor(i) + p = find_bigger_prime(gen, i) + sizes.append(p) + i = int(i * grow_factor) + + print("""\ +static int hash_sizes[] = { + %s +}; +""" % ', '.join(str(size) for size in sizes)) + + +if __name__ == '__main__': + main() From 91cde721b4158f191936aceaace13bf3080f9599 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sat, 9 Jan 2016 16:16:41 +0100 Subject: [PATCH 2/6] hashindex: minor refactor - rename BUCKET_(LOWER|UPPER)_LIMIT to HASH_(MIN|MAX)_LOAD as this value is usually called the hash table's minimum/maximum load factor. - remove MAX_BUCKET_SIZE (not used) - regroup/reorder definitions --- borg/_hashindex.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/borg/_hashindex.c b/borg/_hashindex.c index 9fb7266ec..b4db4fdaf 100644 --- a/borg/_hashindex.c +++ b/borg/_hashindex.c @@ -53,20 +53,22 @@ static int hash_sizes[] = { 1703765389, 1873768367, 2062383853, /* 32bit int ends about here */ }; -#define EMPTY _htole32(0xffffffff) -#define DELETED _htole32(0xfffffffe) -#define MAX_BUCKET_SIZE 512 -#define BUCKET_LOWER_LIMIT .25 -#define BUCKET_UPPER_LIMIT .75 /* don't go higher than 0.75, otherwise performance severely suffers! */ +#define HASH_MIN_LOAD .25 +#define HASH_MAX_LOAD .75 /* don't go higher than 0.75, otherwise performance severely suffers! */ + #define MAX(x, y) ((x) > (y) ? 
(x): (y)) #define NELEMS(x) (sizeof(x) / sizeof((x)[0])) + +#define EMPTY _htole32(0xffffffff) +#define DELETED _htole32(0xfffffffe) + #define BUCKET_ADDR(index, idx) (index->buckets + (idx * index->bucket_size)) +#define BUCKET_MATCHES_KEY(index, idx, key) (memcmp(key, BUCKET_ADDR(index, idx), index->key_size) == 0) + #define BUCKET_IS_DELETED(index, idx) (*((uint32_t *)(BUCKET_ADDR(index, idx) + index->key_size)) == DELETED) #define BUCKET_IS_EMPTY(index, idx) (*((uint32_t *)(BUCKET_ADDR(index, idx) + index->key_size)) == EMPTY) -#define BUCKET_MATCHES_KEY(index, idx, key) (memcmp(key, BUCKET_ADDR(index, idx), index->key_size) == 0) - #define BUCKET_MARK_DELETED(index, idx) (*((uint32_t *)(BUCKET_ADDR(index, idx) + index->key_size)) = DELETED) #define BUCKET_MARK_EMPTY(index, idx) (*((uint32_t *)(BUCKET_ADDR(index, idx) + index->key_size)) = EMPTY) @@ -233,14 +235,14 @@ int get_lower_limit(int num_buckets){ int min_buckets = hash_sizes[0]; if (num_buckets <= min_buckets) return 0; - return (int)(num_buckets * BUCKET_LOWER_LIMIT); + return (int)(num_buckets * HASH_MIN_LOAD); } int get_upper_limit(int num_buckets){ int max_buckets = hash_sizes[NELEMS(hash_sizes) - 1]; if (num_buckets >= max_buckets) return max_buckets; - return (int)(num_buckets * BUCKET_UPPER_LIMIT); + return (int)(num_buckets * HASH_MAX_LOAD); } int size_idx(int size){ From 09665805e8b77229056ce90149b144c2e583b45c Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sat, 9 Jan 2016 17:27:45 +0100 Subject: [PATCH 3/6] move func defs to avoid implicit declaration compiler warning --- borg/_hashindex.c | 94 +++++++++++++++++++++++------------------------ 1 file changed, 47 insertions(+), 47 deletions(-) diff --git a/borg/_hashindex.c b/borg/_hashindex.c index b4db4fdaf..6dde62fe3 100644 --- a/borg/_hashindex.c +++ b/borg/_hashindex.c @@ -145,6 +145,53 @@ hashindex_resize(HashIndex *index, int capacity) return 1; } +int get_lower_limit(int num_buckets){ + int min_buckets = hash_sizes[0]; + if 
(num_buckets <= min_buckets) + return 0; + return (int)(num_buckets * HASH_MIN_LOAD); +} + +int get_upper_limit(int num_buckets){ + int max_buckets = hash_sizes[NELEMS(hash_sizes) - 1]; + if (num_buckets >= max_buckets) + return max_buckets; + return (int)(num_buckets * HASH_MAX_LOAD); +} + +int size_idx(int size){ + /* find the hash_sizes index with entry >= size */ + int elems = NELEMS(hash_sizes); + int entry, i=0; + do{ + entry = hash_sizes[i++]; + }while((entry < size) && (i < elems)); + if (i >= elems) + return elems - 1; + i--; + return i; +} + +int fit_size(int current){ + int i = size_idx(current); + return hash_sizes[i]; +} + +int grow_size(int current){ + int i = size_idx(current) + 1; + int elems = NELEMS(hash_sizes); + if (i >= elems) + return hash_sizes[elems - 1]; + return hash_sizes[i]; +} + +int shrink_size(int current){ + int i = size_idx(current) - 1; + if (i < 0) + return hash_sizes[0]; + return hash_sizes[i]; +} + /* Public API */ static HashIndex * hashindex_read(const char *path) @@ -231,53 +278,6 @@ fail: return index; } -int get_lower_limit(int num_buckets){ - int min_buckets = hash_sizes[0]; - if (num_buckets <= min_buckets) - return 0; - return (int)(num_buckets * HASH_MIN_LOAD); -} - -int get_upper_limit(int num_buckets){ - int max_buckets = hash_sizes[NELEMS(hash_sizes) - 1]; - if (num_buckets >= max_buckets) - return max_buckets; - return (int)(num_buckets * HASH_MAX_LOAD); -} - -int size_idx(int size){ - /* find the hash_sizes index with entry >= size */ - int elems = NELEMS(hash_sizes); - int entry, i=0; - do{ - entry = hash_sizes[i++]; - }while((entry < size) && (i < elems)); - if (i >= elems) - return elems - 1; - i--; - return i; -} - -int fit_size(int current){ - int i = size_idx(current); - return hash_sizes[i]; -} - -int grow_size(int current){ - int i = size_idx(current) + 1; - int elems = NELEMS(hash_sizes); - if (i >= elems) - return hash_sizes[elems - 1]; - return hash_sizes[i]; -} - -int shrink_size(int current){ - int i = 
size_idx(current) - 1; - if (i < 0) - return hash_sizes[0]; - return hash_sizes[i]; -} - static HashIndex * hashindex_init(int capacity, int key_size, int value_size) { From 083f5e31efd082aa738f374680756ba57f25a3db Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Thu, 14 Jan 2016 03:20:17 +0100 Subject: [PATCH 4/6] hashindex: fix upper limit use num_buckets (== fully use what we currently have allocated) --- borg/_hashindex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/borg/_hashindex.c b/borg/_hashindex.c index 6dde62fe3..247454d26 100644 --- a/borg/_hashindex.c +++ b/borg/_hashindex.c @@ -155,7 +155,7 @@ int get_lower_limit(int num_buckets){ int get_upper_limit(int num_buckets){ int max_buckets = hash_sizes[NELEMS(hash_sizes) - 1]; if (num_buckets >= max_buckets) - return max_buckets; + return num_buckets; return (int)(num_buckets * HASH_MAX_LOAD); } From 5cb47cbedda7e4800679d36ddb416b9b7e40bb51 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Thu, 14 Jan 2016 03:56:12 +0100 Subject: [PATCH 5/6] hashindex: explain hash_sizes --- borg/_hashindex.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/borg/_hashindex.c b/borg/_hashindex.c index 247454d26..f1aa0aa8c 100644 --- a/borg/_hashindex.c +++ b/borg/_hashindex.c @@ -40,7 +40,15 @@ typedef struct { int upper_limit; } HashIndex; -/* prime (or w/ big prime factors) hash table sizes - otherwise performance breaks down! */ +/* prime (or w/ big prime factors) hash table sizes + * not sure we need primes for borg's usage (as we have a hash function based + * on sha256, we can assume an even, seemingly random distribution of values), + * but OTOH primes don't harm. + * also, growth of the sizes starts with fast-growing 2x steps, but slows down + * more and more down to 1.1x. this is to avoid huge jumps in memory allocation, + * like e.g. 4G -> 8G. + * these values are generated by hash_sizes.py. 
+ */ static int hash_sizes[] = { 1031, 2053, 4099, 8209, 16411, 32771, 65537, 131101, 262147, 445649, 757607, 1287917, 2189459, 3065243, 4291319, 6007867, 8410991, From ac9d2964a0ed45d74a52a8ddf42b282dea660adf Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Fri, 15 Jan 2016 19:52:19 +0100 Subject: [PATCH 6/6] exclude hash_sizes.py from coverage testing this is a one-time tool for developers to generate a value table for borg. the tool is not used at borg runtime. --- .coveragerc | 1 + 1 file changed, 1 insertion(+) diff --git a/.coveragerc b/.coveragerc index 077e3e6f4..e2e8fe401 100644 --- a/.coveragerc +++ b/.coveragerc @@ -8,6 +8,7 @@ omit = */borg/fuse.py */borg/support/* */borg/testsuite/* + */borg/hash_sizes.py [report] exclude_lines =