mirror of https://github.com/borgbackup/borg.git
chunker: do not buzhash if not needed, fixes #1021
For small remainders of files (last chunk), we do not need to buzhash if it is already clear that there is not enough left (we want at least min_size big chunks). Small files are handled by same code - as they only give 1 chunk, that is the last chunk (see above). See "Cases" considerations below. For big files, we do not need to buzhash the first min_size bytes of a chunk - we do not want to cut there anyway, so we can start buzhashing at offset min_size. Cases (before this change) -------------------------- - A) remaining <= window_size - would do 2 chunker_fill calls (both line 253) and trigger eof with the 2nd call - no buzhashing - result is 1 <remaining> length chunk - B) window_size < remaining <= min_size: - the chunker would do 1 chunker_fill call (line 253) that would read the entire remaining file (but not trigger eof yet) - would compute all possible remaining - window_size + 1 buzhashes, but without a chance for a cut, because there is also the n < min_size condition - would do another chunker_fill call (line 282), but not get more data, so loop ends - result is 1 <remaining> length chunk - C) file > min_size: - normal chunking Cases (after this change) ------------------------- - A) similar to above A), but up to remaining < min_size + window_size + 1, so it does not buzhash if there is no chance for a cut. - B) see C) above
This commit is contained in:
parent
1a0277021c
commit
8834f6fdbd
|
@ -249,11 +249,12 @@ chunker_process(Chunker *c)
|
|||
PyErr_SetString(PyExc_Exception, "chunkifier byte count mismatch");
|
||||
return NULL;
|
||||
}
|
||||
while(c->remaining <= window_size && !c->eof) {
|
||||
while(c->remaining < min_size + window_size + 1 && !c->eof) { /* see assert in Chunker init */
|
||||
if(!chunker_fill(c)) {
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
/* here we either are at eof ... */
|
||||
if(c->eof) {
|
||||
c->done = 1;
|
||||
if(c->remaining) {
|
||||
|
@ -268,8 +269,15 @@ chunker_process(Chunker *c)
|
|||
return NULL;
|
||||
}
|
||||
}
|
||||
/* ... or we have at least min_size + window_size + 1 bytes remaining.
|
||||
* We do not want to "cut" a chunk smaller than min_size and the hash
|
||||
* window starts at the potential cutting place.
|
||||
*/
|
||||
c->position += min_size;
|
||||
c->remaining -= min_size;
|
||||
n += min_size;
|
||||
sum = buzhash(c->data + c->position, window_size, c->table);
|
||||
while(c->remaining > c->window_size && ((sum & chunk_mask) || n < min_size)) {
|
||||
while(c->remaining > c->window_size && (sum & chunk_mask)) {
|
||||
sum = buzhash_update(sum, c->data[c->position],
|
||||
c->data[c->position + window_size],
|
||||
window_size, c->table);
|
||||
|
|
|
@ -23,6 +23,8 @@ cdef class Chunker:
|
|||
def __cinit__(self, int seed, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int hash_window_size):
|
||||
min_size = 1 << chunk_min_exp
|
||||
max_size = 1 << chunk_max_exp
|
||||
# see chunker_process, first while loop condition, first term must be able to get True:
|
||||
assert hash_window_size + min_size + 1 <= max_size, "too small max_size"
|
||||
hash_mask = (1 << hash_mask_bits) - 1
|
||||
self.chunker = chunker_init(hash_window_size, hash_mask, min_size, max_size, seed & 0xffffffff)
|
||||
|
||||
|
|
|
@ -1491,9 +1491,9 @@ class ArchiverTestCase(ArchiverTestCaseBase):
|
|||
self.cmd('create', self.repository_location + '::test', 'input')
|
||||
archive_before = self.cmd('list', self.repository_location + '::test', '--format', '{sha512}')
|
||||
with patch.object(Cache, 'add_chunk', self._test_recreate_chunker_interrupt_patch()):
|
||||
self.cmd('recreate', '-pv', '--chunker-params', '10,12,11,4095', self.repository_location)
|
||||
self.cmd('recreate', '-pv', '--chunker-params', '10,13,11,4095', self.repository_location)
|
||||
assert 'test.recreate' in self.cmd('list', self.repository_location)
|
||||
output = self.cmd('recreate', '-svp', '--debug', '--chunker-params', '10,12,11,4095', self.repository_location)
|
||||
output = self.cmd('recreate', '-svp', '--debug', '--chunker-params', '10,13,11,4095', self.repository_location)
|
||||
assert 'Found test.recreate, will resume' in output
|
||||
assert 'Copied 1 chunks from a partially processed item' in output
|
||||
archive_after = self.cmd('list', self.repository_location + '::test', '--format', '{sha512}')
|
||||
|
|
Loading…
Reference in New Issue