prepare to support multiple chunkers

Thomas Waldmann 2019-01-05 04:38:06 +01:00
parent de55d763a4
commit c4ffbd2a17
9 changed files with 78 additions and 25 deletions

View File

@@ -19,7 +19,7 @@ from .logger import create_logger
logger = create_logger()
from . import xattr
from .chunker import Chunker
from .chunker import get_chunker, max_chunk_size
from .cache import ChunkListEntry
from .crypto.key import key_factory
from .compress import Compressor, CompressionSpec
@@ -242,7 +242,7 @@ class ChunkBuffer:
self.packer = msgpack.Packer()
self.chunks = []
self.key = key
self.chunker = Chunker(self.key.chunk_seed, *chunker_params)
self.chunker = get_chunker(*chunker_params, seed=self.key.chunk_seed)
def add(self, item):
self.buffer.write(self.packer.pack(item.as_dict()))
@@ -610,7 +610,7 @@ Utilization of max. archive size: {csize_max:.0%}
if hardlink_set:
return
if sparse and self.zeros is None:
self.zeros = b'\0' * (1 << self.chunker_params[1])
self.zeros = b'\0' * max_chunk_size(*self.chunker_params)
with backup_io('open'):
fd = open(path, 'wb')
with fd:
@@ -1058,7 +1058,7 @@ class FilesystemObjectProcessors:
self.hard_links = {}
self.stats = Statistics() # threading: done by cache (including progress)
self.cwd = os.getcwd()
self.chunker = Chunker(key.chunk_seed, *chunker_params)
self.chunker = get_chunker(*chunker_params, seed=key.chunk_seed)
@contextmanager
def create_helper(self, path, st, status=None, hardlinkable=True):
@@ -1920,6 +1920,9 @@ class ArchiveRecreater:
target = self.create_target_archive(target_name)
# If the archives use the same chunker params, then don't rechunkify
source_chunker_params = tuple(archive.metadata.get('chunker_params', []))
if len(source_chunker_params) == 4 and isinstance(source_chunker_params[0], int):
# this is a borg < 1.2 chunker_params tuple, no chunker algo specified, but we only had buzhash:
source_chunker_params = ('buzhash', ) + source_chunker_params
target.recreate_rechunkify = self.rechunkify and source_chunker_params != target.chunker_params
if target.recreate_rechunkify:
logger.debug('Rechunking archive from %s to %s', source_chunker_params or '(unknown)', target.chunker_params)
@@ -1927,7 +1930,7 @@
cache=self.cache, key=self.key,
add_item=target.add_item, write_checkpoint=target.write_checkpoint,
checkpoint_interval=self.checkpoint_interval, rechunkify=target.recreate_rechunkify).process_file_chunks
target.chunker = Chunker(self.key.chunk_seed, *target.chunker_params)
target.chunker = get_chunker(*target.chunker_params, seed=self.key.chunk_seed)
return target
def create_target_archive(self, name):
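
Illustration (not part of the commit): with the new-style default parameters, the two helpers used above boil down to the following. The important detail is that positional indexing into chunker_params no longer works once the algorithm name occupies slot 0, which is why the sparse-file zeros buffer now asks max_chunk_size() instead of shifting by chunker_params[1].

chunker_params = ('buzhash', 19, 23, 21, 4095)   # new-style defaults, see constants.py below

# old: Chunker(seed, 19, 23, 21, 4095)
# new: the factory consumes the algorithm name and passes the rest through:
#     chunker = get_chunker(*chunker_params, seed=key.chunk_seed)

# old: 1 << chunker_params[1] picked CHUNK_MAX_EXP; with 'buzhash' in slot 0 that
# index would now point at CHUNK_MIN_EXP, so the algorithm-aware helper is used:
#     zeros = b'\0' * max_chunk_size(*chunker_params)
assert 1 << chunker_params[2] == 8 * 1024 * 1024   # 2**23 = 8 MiB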

View File

@@ -3151,8 +3151,8 @@ class Archiver:
help='write checkpoint every SECONDS seconds (Default: 1800)')
archive_group.add_argument('--chunker-params', metavar='PARAMS', dest='chunker_params',
type=ChunkerParams, default=CHUNKER_PARAMS,
help='specify the chunker parameters (CHUNK_MIN_EXP, CHUNK_MAX_EXP, '
'HASH_MASK_BITS, HASH_WINDOW_SIZE). default: %d,%d,%d,%d' % CHUNKER_PARAMS)
help='specify the chunker parameters (ALGO, CHUNK_MIN_EXP, CHUNK_MAX_EXP, '
'HASH_MASK_BITS, HASH_WINDOW_SIZE). default: %s,%d,%d,%d,%d' % CHUNKER_PARAMS)
archive_group.add_argument('-C', '--compression', metavar='COMPRESSION', dest='compression',
type=CompressionSpec, default=CompressionSpec('lz4'),
help='select compression algorithm, see the output of the '
@@ -3768,9 +3768,9 @@ class Archiver:
'do not recompress.')
archive_group.add_argument('--chunker-params', metavar='PARAMS', dest='chunker_params',
type=ChunkerParams, default=CHUNKER_PARAMS,
help='specify the chunker parameters (CHUNK_MIN_EXP, CHUNK_MAX_EXP, '
help='specify the chunker parameters (ALGO, CHUNK_MIN_EXP, CHUNK_MAX_EXP, '
'HASH_MASK_BITS, HASH_WINDOW_SIZE) or `default` to use the current defaults. '
'default: %d,%d,%d,%d' % CHUNKER_PARAMS)
'default: %s,%d,%d,%d,%d' % CHUNKER_PARAMS)
subparser.add_argument('location', metavar='REPOSITORY_OR_ARCHIVE', nargs='?', default='',
type=location_validator(),
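
Why the help-string format changed (illustrative sketch, not part of the commit): CHUNKER_PARAMS now starts with a string, so the old all-%d format string cannot render it any more, and the help text gains ALGO as the leading field.

chunker_params = ('buzhash', 19, 23, 21, 4095)
assert '%s,%d,%d,%d,%d' % chunker_params == 'buzhash,19,23,21,4095'
# '%d,%d,%d,%d' % chunker_params would raise a TypeError (str where %d expects a number)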

View File

@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
API_VERSION = '1.1_01'
API_VERSION = '1.1_02'
from libc.stdlib cimport free
@@ -18,6 +18,17 @@ cdef extern from "_chunker.c":
cdef class Chunker:
"""
Content-Defined Chunker, variable chunk sizes.
This chunker does quite some effort to mostly cut the same-content chunks, even if
the content moves to a different offset inside the file. It uses the buzhash
rolling-hash algorithm to identify the chunk cutting places by looking at the
content inside the moving window and computing the rolling hash value over the
window contents. If the last n bits of the rolling hash are 0, a chunk is cut.
Additionally it obeys some more criteria, like a minimum and maximum chunk size.
It also uses a per-repo random seed to avoid some chunk length fingerprinting attacks.
"""
cdef _Chunker *chunker
def __cinit__(self, int seed, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int hash_window_size):
@@ -50,6 +61,20 @@ cdef class Chunker:
return chunker_process(self.chunker)
def get_chunker(algo, *params, **kw):
if algo == 'buzhash':
seed = kw['seed']
return Chunker(seed, *params)
raise TypeError('unsupported chunker algo %r' % algo)
def max_chunk_size(algo, *params):
# see also parseformat.ChunkerParams return values
if algo == 'buzhash':
return 1 << params[1]
raise TypeError('unsupported chunker algo %r' % algo)
def buzhash(data, unsigned long seed):
cdef uint32_t *table
cdef uint32_t sum
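
To make the docstring above concrete, here is a rough pure-Python sketch of the cut decision it describes; it is not the _chunker.c implementation, just the idea. window_hash stands in for the buzhash value over the current window (with the per-repo seed already folded in), and the defaults are the new-style CHUNKER_PARAMS exponents. The factory itself is called the same way as in archive.py: get_chunker('buzhash', 19, 23, 21, 4095, seed=key.chunk_seed).

def want_cut(window_hash, chunk_size, chunk_min_exp=19, chunk_max_exp=23, hash_mask_bits=21):
    if chunk_size < (1 << chunk_min_exp):    # never cut below the minimum chunk size
        return False
    if chunk_size >= (1 << chunk_max_exp):   # always cut once the maximum size is reached
        return True
    # "if the last n bits of the rolling hash are 0, a chunk is cut":
    return (window_hash & ((1 << hash_mask_bits) - 1)) == 0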

View File

@@ -60,10 +60,10 @@ HASH_WINDOW_SIZE = 0xfff # 4095B
HASH_MASK_BITS = 21 # results in ~2MiB chunks statistically
# defaults, use --chunker-params to override
CHUNKER_PARAMS = (CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE)
CHUNKER_PARAMS = ('buzhash', CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE)
# chunker params for the items metadata stream, finer granularity
ITEMS_CHUNKER_PARAMS = (15, 19, 17, HASH_WINDOW_SIZE)
ITEMS_CHUNKER_PARAMS = ('buzhash', 15, 19, 17, HASH_WINDOW_SIZE)
# operating mode of the files cache (for fast skipping of unchanged files)
DEFAULT_FILES_CACHE_MODE_UI = 'ctime,size,inode'
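
For orientation (worked example, not new behaviour), the exponents in these tuples translate to the following byte sizes; the concrete defaults match the test expectations at the bottom of this commit.

ALGO, CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE = ('buzhash', 19, 23, 21, 4095)
assert 1 << CHUNK_MIN_EXP == 512 * 1024          # minimum chunk size: 512 KiB
assert 1 << CHUNK_MAX_EXP == 8 * 1024 * 1024     # maximum chunk size: 8 MiB
assert 1 << HASH_MASK_BITS == 2 * 1024 * 1024    # ~2 MiB chunks statistically
# ITEMS_CHUNKER_PARAMS = ('buzhash', 15, 19, 17, 4095) -> 32 KiB min, 512 KiB max, ~128 KiB statistically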

View File

@@ -27,7 +27,7 @@ def check_extension_modules():
from .. import platform, compress, item, chunker, hashindex
if hashindex.API_VERSION != '1.1_07':
raise ExtensionModuleError
if chunker.API_VERSION != '1.1_01':
if chunker.API_VERSION != '1.1_02':
raise ExtensionModuleError
if compress.API_VERSION != '1.1_06':
raise ExtensionModuleError
@@ -35,5 +35,5 @@ def check_extension_modules():
raise ExtensionModuleError
if platform.API_VERSION != platform.OS_API_VERSION or platform.API_VERSION != '1.2_03':
raise ExtensionModuleError
if item.API_VERSION != '1.1_03':
if item.API_VERSION != '1.1_04':
raise ExtensionModuleError

View File

@@ -108,12 +108,20 @@ def timestamp(s):
def ChunkerParams(s):
if s.strip().lower() == "default":
params = s.strip().split(',')
count = len(params)
if count == 0:
raise ValueError('no chunker params given')
algo = params[0].lower()
if algo == 'default' and count == 1: # default
return CHUNKER_PARAMS
chunk_min, chunk_max, chunk_mask, window_size = s.split(',')
if int(chunk_max) > 23:
raise ValueError('max. chunk size exponent must not be more than 23 (2^23 = 8MiB max. chunk size)')
return int(chunk_min), int(chunk_max), int(chunk_mask), int(window_size)
# this must stay last as it deals with old-style compat mode (no algorithm, 4 params, buzhash):
if algo == 'buzhash' and count == 5 or count == 4: # [buzhash, ]chunk_min, chunk_max, chunk_mask, window_size
chunk_min, chunk_max, chunk_mask, window_size = [int(p) for p in params[count - 4:]]
if chunk_max > 23:
raise ValueError('max. chunk size exponent must not be more than 23 (2^23 = 8MiB max. chunk size)')
return 'buzhash', chunk_min, chunk_max, chunk_mask, window_size
raise ValueError('invalid chunker params')
def FilesCacheMode(s):
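
Two details of the parser above are easy to misread (illustration only, not part of the commit). The condition algo == 'buzhash' and count == 5 or count == 4 groups as (algo == 'buzhash' and count == 5) or (count == 4), which is what lets the old four-value form through no matter what ends up in params[0]. And the params[count - 4:] slice selects the last four fields in both spellings:

for raw in ('buzhash,19,23,21,4095', '19,23,21,4095'):   # new-style and old-style input
    params = raw.split(',')
    count = len(params)
    assert [int(p) for p in params[count - 4:]] == [19, 23, 21, 4095]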

View File

@@ -12,7 +12,7 @@ cdef extern from "_item.c":
object _optr_to_object(object bytes)
API_VERSION = '1.1_03'
API_VERSION = '1.1_04'
class PropDict:
@@ -325,6 +325,18 @@ class Key(PropDict):
tam_required = PropDict._make_property('tam_required', bool)
def tuple_encode(t):
"""encode a tuple that might contain str items"""
# we have str, but want to give bytes to msgpack.pack
return tuple(safe_encode(e) if isinstance(e, str) else e for e in t)
def tuple_decode(t):
"""decode a tuple that might contain bytes items"""
# we get bytes objects from msgpack.unpack, but want str
return tuple(safe_decode(e) if isinstance(e, bytes) else e for e in t)
class ArchiveItem(PropDict):
"""
ArchiveItem abstraction that deals with validation and the low-level details internally:
@@ -353,7 +365,7 @@ class ArchiveItem(PropDict):
time = PropDict._make_property('time', str, 'surrogate-escaped str', encode=safe_encode, decode=safe_decode)
time_end = PropDict._make_property('time_end', str, 'surrogate-escaped str', encode=safe_encode, decode=safe_decode)
comment = PropDict._make_property('comment', str, 'surrogate-escaped str', encode=safe_encode, decode=safe_decode)
chunker_params = PropDict._make_property('chunker_params', tuple)
chunker_params = PropDict._make_property('chunker_params', tuple, 'chunker-params tuple', encode=tuple_encode, decode=tuple_decode)
recreate_source_id = PropDict._make_property('recreate_source_id', bytes)
recreate_cmdline = PropDict._make_property('recreate_cmdline', list) # list of s-e-str
recreate_args = PropDict._make_property('recreate_args', list) # list of s-e-str
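
A minimal round-trip sketch of what tuple_encode/tuple_decode are for (not part of the commit): chunker_params now mixes str and int, so the str element has to become bytes for msgpack and come back as str afterwards. The sketch uses the plain msgpack-python package directly, whereas borg goes through its own msgpack wrapper, and it approximates safe_encode/safe_decode with utf-8 plus surrogateescape.

import msgpack

def tuple_encode(t):
    # str -> bytes before packing (stand-in for safe_encode)
    return tuple(e.encode('utf-8', 'surrogateescape') if isinstance(e, str) else e for e in t)

def tuple_decode(t):
    # bytes -> str after unpacking (stand-in for safe_decode)
    return tuple(e.decode('utf-8', 'surrogateescape') if isinstance(e, bytes) else e for e in t)

chunker_params = ('buzhash', 19, 23, 21, 4095)
packed = msgpack.packb(tuple_encode(chunker_params))
assert tuple_decode(msgpack.unpackb(packed)) == chunker_params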

View File

@@ -1,6 +1,6 @@
from io import BytesIO
from ..chunker import Chunker, buzhash, buzhash_update
from ..chunker import Chunker, get_chunker, buzhash, buzhash_update
from ..constants import * # NOQA
from . import BaseTestCase
@@ -41,5 +41,6 @@ class ChunkerTestCase(BaseTestCase):
self.input = self.input[:-1]
return self.input[:1]
reconstructed = b''.join(Chunker(0, *CHUNKER_PARAMS).chunkify(SmallReadFile()))
chunker = get_chunker(*CHUNKER_PARAMS, seed=0)
reconstructed = b''.join(chunker.chunkify(SmallReadFile()))
assert reconstructed == b'a' * 20

View File

@@ -309,10 +309,14 @@ class FormatTimedeltaTestCase(BaseTestCase):
def test_chunkerparams():
assert ChunkerParams('19,23,21,4095') == (19, 23, 21, 4095)
assert ChunkerParams('10,23,16,4095') == (10, 23, 16, 4095)
assert ChunkerParams('default') == ('buzhash', 19, 23, 21, 4095)
assert ChunkerParams('19,23,21,4095') == ('buzhash', 19, 23, 21, 4095)
assert ChunkerParams('buzhash,19,23,21,4095') == ('buzhash', 19, 23, 21, 4095)
assert ChunkerParams('10,23,16,4095') == ('buzhash', 10, 23, 16, 4095)
with pytest.raises(ValueError):
ChunkerParams('19,24,21,4095')
with pytest.raises(ValueError):
ChunkerParams('crap,1,2,3,4')
class MakePathSafeTestCase(BaseTestCase):