diff --git a/src/borg/archive.py b/src/borg/archive.py
index 5581cf2fc..87ec4bc31 100644
--- a/src/borg/archive.py
+++ b/src/borg/archive.py
@@ -19,7 +19,7 @@ from .logger import create_logger
 logger = create_logger()
 
 from . import xattr
-from .chunker import Chunker
+from .chunker import get_chunker, max_chunk_size
 from .cache import ChunkListEntry
 from .crypto.key import key_factory
 from .compress import Compressor, CompressionSpec
@@ -242,7 +242,7 @@ class ChunkBuffer:
         self.packer = msgpack.Packer()
         self.chunks = []
         self.key = key
-        self.chunker = Chunker(self.key.chunk_seed, *chunker_params)
+        self.chunker = get_chunker(*chunker_params, seed=self.key.chunk_seed)
 
     def add(self, item):
         self.buffer.write(self.packer.pack(item.as_dict()))
@@ -610,7 +610,7 @@ Utilization of max. archive size: {csize_max:.0%}
             if hardlink_set:
                 return
             if sparse and self.zeros is None:
-                self.zeros = b'\0' * (1 << self.chunker_params[1])
+                self.zeros = b'\0' * max_chunk_size(*self.chunker_params)
             with backup_io('open'):
                 fd = open(path, 'wb')
             with fd:
@@ -1058,7 +1058,7 @@ class FilesystemObjectProcessors:
         self.hard_links = {}
         self.stats = Statistics()  # threading: done by cache (including progress)
         self.cwd = os.getcwd()
-        self.chunker = Chunker(key.chunk_seed, *chunker_params)
+        self.chunker = get_chunker(*chunker_params, seed=key.chunk_seed)
 
     @contextmanager
     def create_helper(self, path, st, status=None, hardlinkable=True):
@@ -1920,6 +1920,9 @@ class ArchiveRecreater:
         target = self.create_target_archive(target_name)
         # If the archives use the same chunker params, then don't rechunkify
         source_chunker_params = tuple(archive.metadata.get('chunker_params', []))
+        if len(source_chunker_params) == 4 and isinstance(source_chunker_params[0], int):
+            # this is a borg < 1.2 chunker_params tuple, no chunker algo specified, but we only had buzhash:
+            source_chunker_params = ('buzhash', ) + source_chunker_params
         target.recreate_rechunkify = self.rechunkify and source_chunker_params != target.chunker_params
         if target.recreate_rechunkify:
             logger.debug('Rechunking archive from %s to %s', source_chunker_params or '(unknown)', target.chunker_params)
@@ -1927,7 +1930,7 @@ class ArchiveRecreater:
             cache=self.cache, key=self.key, add_item=target.add_item,
             write_checkpoint=target.write_checkpoint, checkpoint_interval=self.checkpoint_interval,
             rechunkify=target.recreate_rechunkify).process_file_chunks
-        target.chunker = Chunker(self.key.chunk_seed, *target.chunker_params)
+        target.chunker = get_chunker(*target.chunker_params, seed=self.key.chunk_seed)
         return target
 
     def create_target_archive(self, name):
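
The archive.py call sites above change because a chunker-params tuple now leads with the algorithm name, so it can no longer be splatted positionally into `Chunker()`. A minimal sketch of the new call convention, using the default values that `constants.py` (below) defines; `get_chunker()` and `max_chunk_size()` themselves are added in `chunker.pyx` further down:

```python
# values taken from constants.py below; helpers are added in chunker.pyx below
CHUNKER_PARAMS = ('buzhash', 19, 23, 21, 4095)  # ALGO, CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE

# borg < 1.2:  chunker = Chunker(seed, 19, 23, 21, 4095)
# borg >= 1.2: chunker = get_chunker('buzhash', 19, 23, 21, 4095, seed=seed)

# the sparse-extraction zero buffer is sized per algorithm now:
# max_chunk_size('buzhash', 19, 23, 21, 4095) == 1 << 23, replacing 1 << chunker_params[1]
assert 1 << CHUNKER_PARAMS[2] == 8 * 1024 * 1024  # 8 MiB max. chunk size
```
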
diff --git a/src/borg/archiver.py b/src/borg/archiver.py
index 395189264..207585e24 100644
--- a/src/borg/archiver.py
+++ b/src/borg/archiver.py
@@ -3151,8 +3151,8 @@ class Archiver:
                                    help='write checkpoint every SECONDS seconds (Default: 1800)')
         archive_group.add_argument('--chunker-params', metavar='PARAMS', dest='chunker_params',
                                    type=ChunkerParams, default=CHUNKER_PARAMS,
-                                   help='specify the chunker parameters (CHUNK_MIN_EXP, CHUNK_MAX_EXP, '
-                                        'HASH_MASK_BITS, HASH_WINDOW_SIZE). default: %d,%d,%d,%d' % CHUNKER_PARAMS)
+                                   help='specify the chunker parameters (ALGO, CHUNK_MIN_EXP, CHUNK_MAX_EXP, '
+                                        'HASH_MASK_BITS, HASH_WINDOW_SIZE). default: %s,%d,%d,%d,%d' % CHUNKER_PARAMS)
         archive_group.add_argument('-C', '--compression', metavar='COMPRESSION', dest='compression',
                                    type=CompressionSpec, default=CompressionSpec('lz4'),
                                    help='select compression algorithm, see the output of the '
@@ -3768,9 +3768,9 @@ class Archiver:
                                         'do not recompress.')
         archive_group.add_argument('--chunker-params', metavar='PARAMS', dest='chunker_params',
                                    type=ChunkerParams, default=CHUNKER_PARAMS,
-                                   help='specify the chunker parameters (CHUNK_MIN_EXP, CHUNK_MAX_EXP, '
+                                   help='specify the chunker parameters (ALGO, CHUNK_MIN_EXP, CHUNK_MAX_EXP, '
                                         'HASH_MASK_BITS, HASH_WINDOW_SIZE) or `default` to use the current defaults. '
-                                        'default: %d,%d,%d,%d' % CHUNKER_PARAMS)
+                                        'default: %s,%d,%d,%d,%d' % CHUNKER_PARAMS)
         subparser.add_argument('location', metavar='REPOSITORY_OR_ARCHIVE', nargs='?', default='',
                                type=location_validator(),
 
diff --git a/src/borg/chunker.pyx b/src/borg/chunker.pyx
index d2b44f686..5558155e1 100644
--- a/src/borg/chunker.pyx
+++ b/src/borg/chunker.pyx
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-API_VERSION = '1.1_01'
+API_VERSION = '1.1_02'
 
 from libc.stdlib cimport free
 
@@ -18,6 +18,17 @@ cdef extern from "_chunker.c":
 
 
 cdef class Chunker:
+    """
+    Content-Defined Chunker, variable chunk sizes.
+
+    This chunker does quite some effort to mostly cut the same-content chunks, even if
+    the content moves to a different offset inside the file. It uses the buzhash
+    rolling-hash algorithm to identify the chunk cutting places by looking at the
+    content inside the moving window and computing the rolling hash value over the
+    window contents. If the last n bits of the rolling hash are 0, a chunk is cut.
+    Additionally it obeys some more criteria, like a minimum and maximum chunk size.
+    It also uses a per-repo random seed to avoid some chunk length fingerprinting attacks.
+    """
     cdef _Chunker *chunker
 
     def __cinit__(self, int seed, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int hash_window_size):
@@ -50,6 +61,20 @@ cdef class Chunker:
         return chunker_process(self.chunker)
 
 
+def get_chunker(algo, *params, **kw):
+    if algo == 'buzhash':
+        seed = kw['seed']
+        return Chunker(seed, *params)
+    raise TypeError('unsupported chunker algo %r' % algo)
+
+
+def max_chunk_size(algo, *params):
+    # see also parseformat.ChunkerParams return values
+    if algo == 'buzhash':
+        return 1 << params[1]
+    raise TypeError('unsupported chunker algo %r' % algo)
+
+
 def buzhash(data, unsigned long seed):
     cdef uint32_t *table
     cdef uint32_t sum
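
The new `Chunker` docstring above states the cut criterion in prose. For illustration only (borg's real chunker is the C implementation in `_chunker.c`), here is a deliberately naive pure-Python sketch of that cutting rule; `zlib.crc32` over the window is a stand-in for the rolling buzhash and is recomputed per byte instead of rolled, so only the logic is faithful, not the performance:

```python
import zlib

def toy_chunkify(data, chunk_min_exp=4, chunk_max_exp=8, hash_mask_bits=5,
                 hash_window_size=7, seed=0):
    """Cut a chunk when the low hash_mask_bits bits of the window hash are 0,
    obeying minimum/maximum chunk sizes, as described in the Chunker docstring."""
    min_size, max_size = 1 << chunk_min_exp, 1 << chunk_max_exp
    mask = (1 << hash_mask_bits) - 1
    start = 0
    for i in range(len(data)):
        size = i + 1 - start
        window = data[max(start, i + 1 - hash_window_size):i + 1]
        h = zlib.crc32(window, seed)  # stand-in for the rolling buzhash
        if size >= max_size or (size >= min_size and (h & mask) == 0):
            yield data[start:i + 1]
            start = i + 1
    if start < len(data):
        yield data[start:]  # trailing chunk, may be shorter than min_size

data = b'The quick brown fox jumps over the lazy dog. ' * 100
chunks = list(toy_chunkify(data))
assert b''.join(chunks) == data            # chunking must be lossless
assert max(len(c) for c in chunks) <= 256  # 1 << chunk_max_exp
```
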
diff --git a/src/borg/constants.py b/src/borg/constants.py
index a2fce0954..7055a44e5 100644
--- a/src/borg/constants.py
+++ b/src/borg/constants.py
@@ -60,10 +60,10 @@ HASH_WINDOW_SIZE = 0xfff  # 4095B
 HASH_MASK_BITS = 21  # results in ~2MiB chunks statistically
 
 # defaults, use --chunker-params to override
-CHUNKER_PARAMS = (CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE)
+CHUNKER_PARAMS = ('buzhash', CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE)
 
 # chunker params for the items metadata stream, finer granularity
-ITEMS_CHUNKER_PARAMS = (15, 19, 17, HASH_WINDOW_SIZE)
+ITEMS_CHUNKER_PARAMS = ('buzhash', 15, 19, 17, HASH_WINDOW_SIZE)
 
 # operating mode of the files cache (for fast skipping of unchanged files)
 DEFAULT_FILES_CACHE_MODE_UI = 'ctime,size,inode'
diff --git a/src/borg/helpers/checks.py b/src/borg/helpers/checks.py
index 2bd2883ed..f52e0ede3 100644
--- a/src/borg/helpers/checks.py
+++ b/src/borg/helpers/checks.py
@@ -27,7 +27,7 @@ def check_extension_modules():
     from .. import platform, compress, item, chunker, hashindex
     if hashindex.API_VERSION != '1.1_07':
         raise ExtensionModuleError
-    if chunker.API_VERSION != '1.1_01':
+    if chunker.API_VERSION != '1.1_02':
         raise ExtensionModuleError
     if compress.API_VERSION != '1.1_06':
         raise ExtensionModuleError
@@ -35,5 +35,5 @@
         raise ExtensionModuleError
     if platform.API_VERSION != platform.OS_API_VERSION or platform.API_VERSION != '1.2_03':
         raise ExtensionModuleError
-    if item.API_VERSION != '1.1_03':
+    if item.API_VERSION != '1.1_04':
         raise ExtensionModuleError
diff --git a/src/borg/helpers/parseformat.py b/src/borg/helpers/parseformat.py
index a407763bc..f2291f683 100644
--- a/src/borg/helpers/parseformat.py
+++ b/src/borg/helpers/parseformat.py
@@ -108,12 +108,20 @@ def timestamp(s):
 
 
 def ChunkerParams(s):
-    if s.strip().lower() == "default":
+    params = s.strip().split(',')
+    count = len(params)
+    if count == 0:
+        raise ValueError('no chunker params given')
+    algo = params[0].lower()
+    if algo == 'default' and count == 1:  # default
         return CHUNKER_PARAMS
-    chunk_min, chunk_max, chunk_mask, window_size = s.split(',')
-    if int(chunk_max) > 23:
-        raise ValueError('max. chunk size exponent must not be more than 23 (2^23 = 8MiB max. chunk size)')
-    return int(chunk_min), int(chunk_max), int(chunk_mask), int(window_size)
+    # this must stay last as it deals with old-style compat mode (no algorithm, 4 params, buzhash):
+    if algo == 'buzhash' and count == 5 or count == 4:  # [buzhash, ]chunk_min, chunk_max, chunk_mask, window_size
+        chunk_min, chunk_max, chunk_mask, window_size = [int(p) for p in params[count - 4:]]
+        if chunk_max > 23:
+            raise ValueError('max. chunk size exponent must not be more than 23 (2^23 = 8MiB max. chunk size)')
+        return 'buzhash', chunk_min, chunk_max, chunk_mask, window_size
+    raise ValueError('invalid chunker params')
 
 
 def FilesCacheMode(s):
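
The compat logic in the new `ChunkerParams` is easiest to read off its accepted inputs; the pairs below restate the updated tests in `testsuite/helpers.py` at the end of this patch (the import path is assumed from borg's usual helper re-exports):

```python
from borg.helpers import ChunkerParams  # assumed import path

assert ChunkerParams('default') == ('buzhash', 19, 23, 21, 4095)
assert ChunkerParams('buzhash,19,23,21,4095') == ('buzhash', 19, 23, 21, 4095)
# old-style 4-value params (borg < 1.2) still parse, with the algo implied:
assert ChunkerParams('19,23,21,4095') == ('buzhash', 19, 23, 21, 4095)
# rejected: unknown algo name, and a max. chunk size exponent over 23:
for bad in ('crap,1,2,3,4', '19,24,21,4095'):
    try:
        ChunkerParams(bad)
        assert False, 'expected ValueError'
    except ValueError:
        pass
```
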
diff --git a/src/borg/item.pyx b/src/borg/item.pyx
index 5e477e98f..754896114 100644
--- a/src/borg/item.pyx
+++ b/src/borg/item.pyx
@@ -12,7 +12,7 @@ cdef extern from "_item.c":
     object _optr_to_object(object bytes)
 
 
-API_VERSION = '1.1_03'
+API_VERSION = '1.1_04'
 
 
 class PropDict:
@@ -325,6 +325,18 @@ class Key(PropDict):
     tam_required = PropDict._make_property('tam_required', bool)
 
 
+def tuple_encode(t):
+    """encode a tuple that might contain str items"""
+    # we have str, but want to give bytes to msgpack.pack
+    return tuple(safe_encode(e) if isinstance(e, str) else e for e in t)
+
+
+def tuple_decode(t):
+    """decode a tuple that might contain bytes items"""
+    # we get bytes objects from msgpack.unpack, but want str
+    return tuple(safe_decode(e) if isinstance(e, bytes) else e for e in t)
+
+
 class ArchiveItem(PropDict):
     """
     ArchiveItem abstraction that deals with validation and the low-level details internally:
@@ -353,7 +365,7 @@ class ArchiveItem(PropDict):
     time = PropDict._make_property('time', str, 'surrogate-escaped str', encode=safe_encode, decode=safe_decode)
     time_end = PropDict._make_property('time_end', str, 'surrogate-escaped str', encode=safe_encode, decode=safe_decode)
     comment = PropDict._make_property('comment', str, 'surrogate-escaped str', encode=safe_encode, decode=safe_decode)
-    chunker_params = PropDict._make_property('chunker_params', tuple)
+    chunker_params = PropDict._make_property('chunker_params', tuple, 'chunker-params tuple', encode=tuple_encode, decode=tuple_decode)
     recreate_source_id = PropDict._make_property('recreate_source_id', bytes)
     recreate_cmdline = PropDict._make_property('recreate_cmdline', list)  # list of s-e-str
     recreate_args = PropDict._make_property('recreate_args', list)  # list of s-e-str
diff --git a/src/borg/testsuite/chunker.py b/src/borg/testsuite/chunker.py
index 2a14bd604..3d56fea60 100644
--- a/src/borg/testsuite/chunker.py
+++ b/src/borg/testsuite/chunker.py
@@ -1,6 +1,6 @@
 from io import BytesIO
 
-from ..chunker import Chunker, buzhash, buzhash_update
+from ..chunker import Chunker, get_chunker, buzhash, buzhash_update
 from ..constants import *  # NOQA
 from . import BaseTestCase
 
@@ -41,5 +41,6 @@ class ChunkerTestCase(BaseTestCase):
                 self.input = self.input[:-1]
                 return self.input[:1]
 
-        reconstructed = b''.join(Chunker(0, *CHUNKER_PARAMS).chunkify(SmallReadFile()))
+        chunker = get_chunker(*CHUNKER_PARAMS, seed=0)
+        reconstructed = b''.join(chunker.chunkify(SmallReadFile()))
         assert reconstructed == b'a' * 20
diff --git a/src/borg/testsuite/helpers.py b/src/borg/testsuite/helpers.py
index 2d329c075..400d168dd 100644
--- a/src/borg/testsuite/helpers.py
+++ b/src/borg/testsuite/helpers.py
@@ -309,10 +309,14 @@ class FormatTimedeltaTestCase(BaseTestCase):
 
 
 def test_chunkerparams():
-    assert ChunkerParams('19,23,21,4095') == (19, 23, 21, 4095)
-    assert ChunkerParams('10,23,16,4095') == (10, 23, 16, 4095)
+    assert ChunkerParams('default') == ('buzhash', 19, 23, 21, 4095)
+    assert ChunkerParams('19,23,21,4095') == ('buzhash', 19, 23, 21, 4095)
+    assert ChunkerParams('buzhash,19,23,21,4095') == ('buzhash', 19, 23, 21, 4095)
+    assert ChunkerParams('10,23,16,4095') == ('buzhash', 10, 23, 16, 4095)
     with pytest.raises(ValueError):
         ChunkerParams('19,24,21,4095')
+    with pytest.raises(ValueError):
+        ChunkerParams('crap,1,2,3,4')
 
 
 class MakePathSafeTestCase(BaseTestCase):
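
As for the `encode=tuple_encode, decode=tuple_decode` hooks added to `chunker_params` in `item.pyx`: msgpack hands back `bytes` where the tuple held `str`, so the algorithm name needs a round trip around (un)packing. A self-contained sketch with inline stand-ins for borg's `safe_encode`/`safe_decode` (which are utf-8 with surrogateescape error handling):

```python
def tuple_encode(t):
    # str -> bytes before msgpack.pack (stand-in for item.pyx's safe_encode)
    return tuple(e.encode('utf-8', 'surrogateescape') if isinstance(e, str) else e for e in t)

def tuple_decode(t):
    # bytes -> str after msgpack.unpack (stand-in for item.pyx's safe_decode)
    return tuple(e.decode('utf-8', 'surrogateescape') if isinstance(e, bytes) else e for e in t)

params = ('buzhash', 19, 23, 21, 4095)
packed = tuple_encode(params)
assert packed == (b'buzhash', 19, 23, 21, 4095)
assert tuple_decode(packed) == params
# old archives stored a plain 4-int tuple; it passes through both hooks unchanged:
assert tuple_decode(tuple_encode((19, 23, 21, 4095))) == (19, 23, 21, 4095)
```
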