chunker: do not buzhash if not needed, fixes #1021

For small remainders of files (last chunk), we do not need to buzhash if it
is already clear that there is not enough left (we want at least min_size big
chunks).

Small files are handled by same code - as they only give 1 chunk, that is
the last chunk (see above).

See "Cases" considerations below.

For big files, we do not need to buzhash the first min_size bytes of a chunk -
we do not want to cut there anyway, so we can start buzhashing at offset
min_size.

Cases (before this change)
--------------------------

- A) remaining <= window_size

  - would do 2 chunker_fill calls (both line 253) and trigger eof with the 2nd call
  - no buzhashing
  - result is 1 <remaining> length chunk

- B) window_size < remaining <= min_size:

  - the chunker would do 1 chunker_fill call (line 253) that would read the entire remaining file (but not trigger eof yet)
  - would compute all possible remaining - window_size + 1 buzhashes, but without a chance for a cut,
    because there is also the n < min_size condition
  - would do another chunker_fill call (line 282), but not get more data, so loop ends
  - result is 1 <remaining> length chunk

- C) file > min_size:

  - normal chunking

Cases (after this change)
-------------------------

- A) similar to above A), but up to remaining < min_size + window_size + 1,
  so it does not buzhash if there is no chance for a cut.

- B) see C) above
This commit is contained in:
Thomas Waldmann 2016-05-21 23:16:18 +02:00
parent 1a0277021c
commit 8834f6fdbd
3 changed files with 14 additions and 4 deletions

View File

@ -249,11 +249,12 @@ chunker_process(Chunker *c)
PyErr_SetString(PyExc_Exception, "chunkifier byte count mismatch");
return NULL;
}
while(c->remaining <= window_size && !c->eof) {
while(c->remaining < min_size + window_size + 1 && !c->eof) { /* see assert in Chunker init */
if(!chunker_fill(c)) {
return NULL;
}
}
/* here we either are at eof ... */
if(c->eof) {
c->done = 1;
if(c->remaining) {
@ -268,8 +269,15 @@ chunker_process(Chunker *c)
return NULL;
}
}
/* ... or we have at least min_size + window_size + 1 bytes remaining.
* We do not want to "cut" a chunk smaller than min_size and the hash
* window starts at the potential cutting place.
*/
c->position += min_size;
c->remaining -= min_size;
n += min_size;
sum = buzhash(c->data + c->position, window_size, c->table);
while(c->remaining > c->window_size && ((sum & chunk_mask) || n < min_size)) {
while(c->remaining > c->window_size && (sum & chunk_mask)) {
sum = buzhash_update(sum, c->data[c->position],
c->data[c->position + window_size],
window_size, c->table);

View File

@ -23,6 +23,8 @@ cdef class Chunker:
def __cinit__(self, int seed, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int hash_window_size):
min_size = 1 << chunk_min_exp
max_size = 1 << chunk_max_exp
# see chunker_process, first while loop condition, first term must be able to get True:
assert hash_window_size + min_size + 1 <= max_size, "too small max_size"
hash_mask = (1 << hash_mask_bits) - 1
self.chunker = chunker_init(hash_window_size, hash_mask, min_size, max_size, seed & 0xffffffff)

View File

@ -1491,9 +1491,9 @@ class ArchiverTestCase(ArchiverTestCaseBase):
self.cmd('create', self.repository_location + '::test', 'input')
archive_before = self.cmd('list', self.repository_location + '::test', '--format', '{sha512}')
with patch.object(Cache, 'add_chunk', self._test_recreate_chunker_interrupt_patch()):
self.cmd('recreate', '-pv', '--chunker-params', '10,12,11,4095', self.repository_location)
self.cmd('recreate', '-pv', '--chunker-params', '10,13,11,4095', self.repository_location)
assert 'test.recreate' in self.cmd('list', self.repository_location)
output = self.cmd('recreate', '-svp', '--debug', '--chunker-params', '10,12,11,4095', self.repository_location)
output = self.cmd('recreate', '-svp', '--debug', '--chunker-params', '10,13,11,4095', self.repository_location)
assert 'Found test.recreate, will resume' in output
assert 'Copied 1 chunks from a partially processed item' in output
archive_after = self.cmd('list', self.repository_location + '::test', '--format', '{sha512}')