prepare to support multiple chunkers

Thomas Waldmann 2019-01-05 04:38:06 +01:00
parent de55d763a4
commit c4ffbd2a17
9 changed files with 78 additions and 25 deletions

View File

@@ -19,7 +19,7 @@ from .logger import create_logger
logger = create_logger()
from . import xattr
from .chunker import Chunker
from .chunker import get_chunker, max_chunk_size
from .cache import ChunkListEntry
from .crypto.key import key_factory
from .compress import Compressor, CompressionSpec
@@ -242,7 +242,7 @@ class ChunkBuffer:
self.packer = msgpack.Packer()
self.chunks = []
self.key = key
self.chunker = Chunker(self.key.chunk_seed, *chunker_params)
self.chunker = get_chunker(*chunker_params, seed=self.key.chunk_seed)
def add(self, item):
self.buffer.write(self.packer.pack(item.as_dict()))
@@ -610,7 +610,7 @@ Utilization of max. archive size: {csize_max:.0%}
if hardlink_set:
return
if sparse and self.zeros is None:
self.zeros = b'\0' * (1 << self.chunker_params[1])
self.zeros = b'\0' * max_chunk_size(*self.chunker_params)
with backup_io('open'):
fd = open(path, 'wb')
with fd:
@@ -1058,7 +1058,7 @@ class FilesystemObjectProcessors:
self.hard_links = {}
self.stats = Statistics() # threading: done by cache (including progress)
self.cwd = os.getcwd()
self.chunker = Chunker(key.chunk_seed, *chunker_params)
self.chunker = get_chunker(*chunker_params, seed=key.chunk_seed)
@contextmanager
def create_helper(self, path, st, status=None, hardlinkable=True):
@@ -1920,6 +1920,9 @@ class ArchiveRecreater:
target = self.create_target_archive(target_name)
# If the archives use the same chunker params, then don't rechunkify
source_chunker_params = tuple(archive.metadata.get('chunker_params', []))
if len(source_chunker_params) == 4 and isinstance(source_chunker_params[0], int):
# this is a borg < 1.2 chunker_params tuple, no chunker algo specified, but we only had buzhash:
source_chunker_params = ('buzhash', ) + source_chunker_params
target.recreate_rechunkify = self.rechunkify and source_chunker_params != target.chunker_params
if target.recreate_rechunkify:
logger.debug('Rechunking archive from %s to %s', source_chunker_params or '(unknown)', target.chunker_params)
@@ -1927,7 +1930,7 @@
cache=self.cache, key=self.key,
add_item=target.add_item, write_checkpoint=target.write_checkpoint,
checkpoint_interval=self.checkpoint_interval, rechunkify=target.recreate_rechunkify).process_file_chunks
target.chunker = Chunker(self.key.chunk_seed, *target.chunker_params)
target.chunker = get_chunker(*target.chunker_params, seed=self.key.chunk_seed)
return target
def create_target_archive(self, name):
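
Illustration (not part of the commit): with the new-style default parameters, the two helpers used above boil down to the following. The important detail is that positional indexing into chunker_params no longer works once the algorithm name occupies slot 0, which is why the sparse-file zeros buffer now asks max_chunk_size() instead of shifting by chunker_params[1].

chunker_params = ('buzhash', 19, 23, 21, 4095)   # new-style defaults, see constants.py below

# old: Chunker(seed, 19, 23, 21, 4095)
# new: the factory consumes the algorithm name and passes the rest through:
#     chunker = get_chunker(*chunker_params, seed=key.chunk_seed)

# old: 1 << chunker_params[1] picked CHUNK_MAX_EXP; with 'buzhash' in slot 0 that
# index would now point at CHUNK_MIN_EXP, so the algorithm-aware helper is used:
#     zeros = b'\0' * max_chunk_size(*chunker_params)
assert 1 << chunker_params[2] == 8 * 1024 * 1024   # 2**23 = 8 MiB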

View File

@@ -3151,8 +3151,8 @@ class Archiver:
help='write checkpoint every SECONDS seconds (Default: 1800)')
archive_group.add_argument('--chunker-params', metavar='PARAMS', dest='chunker_params',
type=ChunkerParams, default=CHUNKER_PARAMS,
help='specify the chunker parameters (CHUNK_MIN_EXP, CHUNK_MAX_EXP, '
'HASH_MASK_BITS, HASH_WINDOW_SIZE). default: %d,%d,%d,%d' % CHUNKER_PARAMS)
help='specify the chunker parameters (ALGO, CHUNK_MIN_EXP, CHUNK_MAX_EXP, '
'HASH_MASK_BITS, HASH_WINDOW_SIZE). default: %s,%d,%d,%d,%d' % CHUNKER_PARAMS)
archive_group.add_argument('-C', '--compression', metavar='COMPRESSION', dest='compression',
type=CompressionSpec, default=CompressionSpec('lz4'),
help='select compression algorithm, see the output of the '
@@ -3768,9 +3768,9 @@ class Archiver:
'do not recompress.')
archive_group.add_argument('--chunker-params', metavar='PARAMS', dest='chunker_params',
type=ChunkerParams, default=CHUNKER_PARAMS,
help='specify the chunker parameters (CHUNK_MIN_EXP, CHUNK_MAX_EXP, '
help='specify the chunker parameters (ALGO, CHUNK_MIN_EXP, CHUNK_MAX_EXP, '
'HASH_MASK_BITS, HASH_WINDOW_SIZE) or `default` to use the current defaults. '
'default: %d,%d,%d,%d' % CHUNKER_PARAMS)
'default: %s,%d,%d,%d,%d' % CHUNKER_PARAMS)
subparser.add_argument('location', metavar='REPOSITORY_OR_ARCHIVE', nargs='?', default='',
type=location_validator(),
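
Why the help-string format changed (illustrative sketch, not part of the commit): CHUNKER_PARAMS now starts with a string, so the old all-%d format string cannot render it any more, and the help text gains ALGO as the leading field.

chunker_params = ('buzhash', 19, 23, 21, 4095)
assert '%s,%d,%d,%d,%d' % chunker_params == 'buzhash,19,23,21,4095'
# '%d,%d,%d,%d' % chunker_params would raise a TypeError (str where %d expects a number)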

View File

@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
API_VERSION = '1.1_01'
API_VERSION = '1.1_02'
from libc.stdlib cimport free
@@ -18,6 +18,17 @@ cdef extern from "_chunker.c":
cdef class Chunker:
"""
Content-Defined Chunker, variable chunk sizes.
This chunker does quite some effort to mostly cut the same-content chunks, even if
the content moves to a different offset inside the file. It uses the buzhash
rolling-hash algorithm to identify the chunk cutting places by looking at the
content inside the moving window and computing the rolling hash value over the
window contents. If the last n bits of the rolling hash are 0, a chunk is cut.
Additionally it obeys some more criteria, like a minimum and maximum chunk size.
It also uses a per-repo random seed to avoid some chunk length fingerprinting attacks.
"""
cdef _Chunker *chunker
def __cinit__(self, int seed, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int hash_window_size):
@@ -50,6 +61,20 @@ cdef class Chunker:
return chunker_process(self.chunker)
def get_chunker(algo, *params, **kw):
if algo == 'buzhash':
seed = kw['seed']
return Chunker(seed, *params)
raise TypeError('unsupported chunker algo %r' % algo)
def max_chunk_size(algo, *params):
# see also parseformat.ChunkerParams return values
if algo == 'buzhash':
return 1 << params[1]
raise TypeError('unsupported chunker algo %r' % algo)
def buzhash(data, unsigned long seed):
cdef uint32_t *table
cdef uint32_t sum
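
To make the docstring above concrete, here is a rough pure-Python sketch of the cut decision it describes; it is not the _chunker.c implementation, just the idea. window_hash stands in for the buzhash value over the current window (with the per-repo seed already folded in), and the defaults are the new-style CHUNKER_PARAMS exponents. The factory itself is called the same way as in archive.py: get_chunker('buzhash', 19, 23, 21, 4095, seed=key.chunk_seed).

def want_cut(window_hash, chunk_size, chunk_min_exp=19, chunk_max_exp=23, hash_mask_bits=21):
    if chunk_size < (1 << chunk_min_exp):    # never cut below the minimum chunk size
        return False
    if chunk_size >= (1 << chunk_max_exp):   # always cut once the maximum size is reached
        return True
    # "if the last n bits of the rolling hash are 0, a chunk is cut":
    return (window_hash & ((1 << hash_mask_bits) - 1)) == 0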

View File

@@ -60,10 +60,10 @@ HASH_WINDOW_SIZE = 0xfff # 4095B
HASH_MASK_BITS = 21 # results in ~2MiB chunks statistically
# defaults, use --chunker-params to override
CHUNKER_PARAMS = (CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE)
CHUNKER_PARAMS = ('buzhash', CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE)
# chunker params for the items metadata stream, finer granularity
ITEMS_CHUNKER_PARAMS = (15, 19, 17, HASH_WINDOW_SIZE)
ITEMS_CHUNKER_PARAMS = ('buzhash', 15, 19, 17, HASH_WINDOW_SIZE)
# operating mode of the files cache (for fast skipping of unchanged files)
DEFAULT_FILES_CACHE_MODE_UI = 'ctime,size,inode'
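
For orientation (worked example, not new behaviour), the exponents in these tuples translate to the following byte sizes; the concrete defaults match the test expectations at the bottom of this commit.

ALGO, CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE = ('buzhash', 19, 23, 21, 4095)
assert 1 << CHUNK_MIN_EXP == 512 * 1024          # minimum chunk size: 512 KiB
assert 1 << CHUNK_MAX_EXP == 8 * 1024 * 1024     # maximum chunk size: 8 MiB
assert 1 << HASH_MASK_BITS == 2 * 1024 * 1024    # ~2 MiB chunks statistically
# ITEMS_CHUNKER_PARAMS = ('buzhash', 15, 19, 17, 4095) -> 32 KiB min, 512 KiB max, ~128 KiB statistically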

View File

@@ -27,7 +27,7 @@ def check_extension_modules():
from .. import platform, compress, item, chunker, hashindex
if hashindex.API_VERSION != '1.1_07':
raise ExtensionModuleError
if chunker.API_VERSION != '1.1_01':
if chunker.API_VERSION != '1.1_02':
raise ExtensionModuleError
if compress.API_VERSION != '1.1_06':
raise ExtensionModuleError
@@ -35,5 +35,5 @@ def check_extension_modules():
raise ExtensionModuleError
if platform.API_VERSION != platform.OS_API_VERSION or platform.API_VERSION != '1.2_03':
raise ExtensionModuleError
if item.API_VERSION != '1.1_03':
if item.API_VERSION != '1.1_04':
raise ExtensionModuleError

View File

@@ -108,12 +108,20 @@ def timestamp(s):
def ChunkerParams(s):
if s.strip().lower() == "default":
params = s.strip().split(',')
count = len(params)
if count == 0:
raise ValueError('no chunker params given')
algo = params[0].lower()
if algo == 'default' and count == 1: # default
return CHUNKER_PARAMS
chunk_min, chunk_max, chunk_mask, window_size = s.split(',')
if int(chunk_max) > 23:
raise ValueError('max. chunk size exponent must not be more than 23 (2^23 = 8MiB max. chunk size)')
return int(chunk_min), int(chunk_max), int(chunk_mask), int(window_size)
# this must stay last as it deals with old-style compat mode (no algorithm, 4 params, buzhash):
if algo == 'buzhash' and count == 5 or count == 4: # [buzhash, ]chunk_min, chunk_max, chunk_mask, window_size
chunk_min, chunk_max, chunk_mask, window_size = [int(p) for p in params[count - 4:]]
if chunk_max > 23:
raise ValueError('max. chunk size exponent must not be more than 23 (2^23 = 8MiB max. chunk size)')
return 'buzhash', chunk_min, chunk_max, chunk_mask, window_size
raise ValueError('invalid chunker params')
def FilesCacheMode(s):
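
Two details of the parser above are easy to misread (illustration only, not part of the commit). The condition algo == 'buzhash' and count == 5 or count == 4 groups as (algo == 'buzhash' and count == 5) or (count == 4), which is what lets the old four-value form through no matter what ends up in params[0]. And the params[count - 4:] slice selects the last four fields in both spellings:

for raw in ('buzhash,19,23,21,4095', '19,23,21,4095'):   # new-style and old-style input
    params = raw.split(',')
    count = len(params)
    assert [int(p) for p in params[count - 4:]] == [19, 23, 21, 4095]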

View File

@@ -12,7 +12,7 @@ cdef extern from "_item.c":
object _optr_to_object(object bytes)
API_VERSION = '1.1_03'
API_VERSION = '1.1_04'
class PropDict:
@@ -325,6 +325,18 @@ class Key(PropDict):
tam_required = PropDict._make_property('tam_required', bool)
def tuple_encode(t):
"""encode a tuple that might contain str items"""
# we have str, but want to give bytes to msgpack.pack
return tuple(safe_encode(e) if isinstance(e, str) else e for e in t)
def tuple_decode(t):
"""decode a tuple that might contain bytes items"""
# we get bytes objects from msgpack.unpack, but want str
return tuple(safe_decode(e) if isinstance(e, bytes) else e for e in t)
class ArchiveItem(PropDict):
"""
ArchiveItem abstraction that deals with validation and the low-level details internally:
@@ -353,7 +365,7 @@ class ArchiveItem(PropDict):
time = PropDict._make_property('time', str, 'surrogate-escaped str', encode=safe_encode, decode=safe_decode)
time_end = PropDict._make_property('time_end', str, 'surrogate-escaped str', encode=safe_encode, decode=safe_decode)
comment = PropDict._make_property('comment', str, 'surrogate-escaped str', encode=safe_encode, decode=safe_decode)
chunker_params = PropDict._make_property('chunker_params', tuple)
chunker_params = PropDict._make_property('chunker_params', tuple, 'chunker-params tuple', encode=tuple_encode, decode=tuple_decode)
recreate_source_id = PropDict._make_property('recreate_source_id', bytes)
recreate_cmdline = PropDict._make_property('recreate_cmdline', list) # list of s-e-str
recreate_args = PropDict._make_property('recreate_args', list) # list of s-e-str
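
A minimal round-trip sketch of what tuple_encode/tuple_decode are for (not part of the commit): chunker_params now mixes str and int, so the str element has to become bytes for msgpack and come back as str afterwards. The sketch uses the plain msgpack-python package directly, whereas borg goes through its own msgpack wrapper, and it approximates safe_encode/safe_decode with utf-8 plus surrogateescape.

import msgpack

def tuple_encode(t):
    # str -> bytes before packing (stand-in for safe_encode)
    return tuple(e.encode('utf-8', 'surrogateescape') if isinstance(e, str) else e for e in t)

def tuple_decode(t):
    # bytes -> str after unpacking (stand-in for safe_decode)
    return tuple(e.decode('utf-8', 'surrogateescape') if isinstance(e, bytes) else e for e in t)

chunker_params = ('buzhash', 19, 23, 21, 4095)
packed = msgpack.packb(tuple_encode(chunker_params))
assert tuple_decode(msgpack.unpackb(packed)) == chunker_params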

View File

@@ -1,6 +1,6 @@
from io import BytesIO
from ..chunker import Chunker, buzhash, buzhash_update
from ..chunker import Chunker, get_chunker, buzhash, buzhash_update
from ..constants import * # NOQA
from . import BaseTestCase
@@ -41,5 +41,6 @@ class ChunkerTestCase(BaseTestCase):
self.input = self.input[:-1]
return self.input[:1]
reconstructed = b''.join(Chunker(0, *CHUNKER_PARAMS).chunkify(SmallReadFile()))
chunker = get_chunker(*CHUNKER_PARAMS, seed=0)
reconstructed = b''.join(chunker.chunkify(SmallReadFile()))
assert reconstructed == b'a' * 20

View File

@@ -309,10 +309,14 @@ class FormatTimedeltaTestCase(BaseTestCase):
def test_chunkerparams():
assert ChunkerParams('19,23,21,4095') == (19, 23, 21, 4095)
assert ChunkerParams('10,23,16,4095') == (10, 23, 16, 4095)
assert ChunkerParams('default') == ('buzhash', 19, 23, 21, 4095)
assert ChunkerParams('19,23,21,4095') == ('buzhash', 19, 23, 21, 4095)
assert ChunkerParams('buzhash,19,23,21,4095') == ('buzhash', 19, 23, 21, 4095)
assert ChunkerParams('10,23,16,4095') == ('buzhash', 10, 23, 16, 4095)
with pytest.raises(ValueError):
ChunkerParams('19,24,21,4095')
with pytest.raises(ValueError):
ChunkerParams('crap,1,2,3,4')
class MakePathSafeTestCase(BaseTestCase):