diff --git a/src/borg/_chunker.c b/src/borg/_chunker.c index 20941cb07..75599c5b1 100644 --- a/src/borg/_chunker.c +++ b/src/borg/_chunker.c @@ -247,7 +247,7 @@ static PyObject * chunker_process(Chunker *c) { uint32_t sum, chunk_mask = c->chunk_mask; - size_t n = 0, old_last, min_size = c->min_size, window_size = c->window_size; + size_t n, old_last, min_size = c->min_size, window_size = c->window_size; if(c->done) { if(c->bytes_read == c->bytes_yielded) @@ -282,15 +282,19 @@ chunker_process(Chunker *c) */ c->position += min_size; c->remaining -= min_size; - n += min_size; sum = buzhash(c->data + c->position, window_size, c->table); while(c->remaining > c->window_size && (sum & chunk_mask)) { - sum = buzhash_update(sum, c->data[c->position], - c->data[c->position + window_size], - window_size, c->table); - c->position++; - c->remaining--; - n++; + uint8_t *p = c->data + c->position; + uint8_t *stop_at = p + c->remaining - window_size; + size_t did_bytes; + while (p < stop_at && (sum & chunk_mask)) { + sum = buzhash_update(sum, p[0], p[window_size], + window_size, c->table); + p++; + } + did_bytes = p - (c->data + c->position); + c->position += did_bytes; + c->remaining -= did_bytes; if(c->remaining <= window_size) { if(!chunker_fill(c)) { return NULL; diff --git a/src/borg/testsuite/chunker_slow.py b/src/borg/testsuite/chunker_slow.py new file mode 100644 index 000000000..2739a735a --- /dev/null +++ b/src/borg/testsuite/chunker_slow.py @@ -0,0 +1,39 @@ +from io import BytesIO +from binascii import unhexlify + +from ..chunker import Chunker +from ..crypto.low_level import blake2b_256 +from ..constants import * # NOQA +from . 
import BaseTestCase


class ChunkerRegressionTestCase(BaseTestCase):
    """Regression guard that pins the chunker's cut points to a known digest."""

    def test_chunkpoints_unchanged(self):
        """Chunk fixed pseudo-random data under many settings and compare one digest.

        Every parameter combination is chunked, each chunk is hashed, the
        per-run chunk hashes are hashed again, and finally all run digests are
        folded into a single value that is compared with a pinned constant.
        """

        def lcg_bytes(length):
            # Deterministic linear congruential generator: the same input
            # bytes are produced on every run, so the digest is reproducible.
            state = 1
            buf = bytearray(length)
            for idx in range(length):
                state = (state * 1103515245 + 12345) & 0x7FFFFFFF
                buf[idx] = state & 0xFF
            return buf

        payload = lcg_bytes(100000)

        # Same iteration order as five nested loops: window size outermost,
        # seed innermost. Each tuple is laid out in Chunker's positional order.
        parameter_sets = [
            (seed, min_exp, max_exp, mask_bits, window)
            for window in (65, 129, HASH_WINDOW_SIZE, 7351)
            for min_exp in (4, 6, 7, 11, 12)
            for max_exp in (15, 17)
            if min_exp < max_exp
            for mask_bits in (4, 7, 10, 12)
            for seed in (1849058162, 1234567653)
        ]

        run_digests = []
        for parameters in parameter_sets:
            chunker = Chunker(*parameters)
            stream = BytesIO(payload)
            chunk_digests = b''.join(
                blake2b_256(b'', piece) for piece in chunker.chunkify(stream, -1)
            )
            run_digests.append(blake2b_256(b'', chunk_digests))

        # The "correct" hash below matches the existing chunker behavior.
        # Future chunker optimisations must not change this, or existing repos will bloat.
        expected = unhexlify("b559b0ac8df8daaa221201d018815114241ea5c6609d98913cd2246a702af4e3")
        self.assert_equal(blake2b_256(b'', b''.join(run_digests)), expected)