mirror of https://github.com/borgbackup/borg.git
prepare to support multiple chunkers
This commit is contained in:
parent
de55d763a4
commit
c4ffbd2a17
|
@ -19,7 +19,7 @@ from .logger import create_logger
|
|||
logger = create_logger()
|
||||
|
||||
from . import xattr
|
||||
from .chunker import Chunker
|
||||
from .chunker import get_chunker, max_chunk_size
|
||||
from .cache import ChunkListEntry
|
||||
from .crypto.key import key_factory
|
||||
from .compress import Compressor, CompressionSpec
|
||||
|
@ -242,7 +242,7 @@ class ChunkBuffer:
|
|||
self.packer = msgpack.Packer()
|
||||
self.chunks = []
|
||||
self.key = key
|
||||
self.chunker = Chunker(self.key.chunk_seed, *chunker_params)
|
||||
self.chunker = get_chunker(*chunker_params, seed=self.key.chunk_seed)
|
||||
|
||||
def add(self, item):
|
||||
self.buffer.write(self.packer.pack(item.as_dict()))
|
||||
|
@ -610,7 +610,7 @@ Utilization of max. archive size: {csize_max:.0%}
|
|||
if hardlink_set:
|
||||
return
|
||||
if sparse and self.zeros is None:
|
||||
self.zeros = b'\0' * (1 << self.chunker_params[1])
|
||||
self.zeros = b'\0' * max_chunk_size(*self.chunker_params)
|
||||
with backup_io('open'):
|
||||
fd = open(path, 'wb')
|
||||
with fd:
|
||||
|
@ -1058,7 +1058,7 @@ class FilesystemObjectProcessors:
|
|||
self.hard_links = {}
|
||||
self.stats = Statistics() # threading: done by cache (including progress)
|
||||
self.cwd = os.getcwd()
|
||||
self.chunker = Chunker(key.chunk_seed, *chunker_params)
|
||||
self.chunker = get_chunker(*chunker_params, seed=key.chunk_seed)
|
||||
|
||||
@contextmanager
|
||||
def create_helper(self, path, st, status=None, hardlinkable=True):
|
||||
|
@ -1920,6 +1920,9 @@ class ArchiveRecreater:
|
|||
target = self.create_target_archive(target_name)
|
||||
# If the archives use the same chunker params, then don't rechunkify
|
||||
source_chunker_params = tuple(archive.metadata.get('chunker_params', []))
|
||||
if len(source_chunker_params) == 4 and isinstance(source_chunker_params[0], int):
|
||||
# this is a borg < 1.2 chunker_params tuple, no chunker algo specified, but we only had buzhash:
|
||||
source_chunker_params = ('buzhash', ) + source_chunker_params
|
||||
target.recreate_rechunkify = self.rechunkify and source_chunker_params != target.chunker_params
|
||||
if target.recreate_rechunkify:
|
||||
logger.debug('Rechunking archive from %s to %s', source_chunker_params or '(unknown)', target.chunker_params)
|
||||
|
@ -1927,7 +1930,7 @@ class ArchiveRecreater:
|
|||
cache=self.cache, key=self.key,
|
||||
add_item=target.add_item, write_checkpoint=target.write_checkpoint,
|
||||
checkpoint_interval=self.checkpoint_interval, rechunkify=target.recreate_rechunkify).process_file_chunks
|
||||
target.chunker = Chunker(self.key.chunk_seed, *target.chunker_params)
|
||||
target.chunker = get_chunker(*target.chunker_params, seed=self.key.chunk_seed)
|
||||
return target
|
||||
|
||||
def create_target_archive(self, name):
|
||||
|
|
|
@ -3151,8 +3151,8 @@ class Archiver:
|
|||
help='write checkpoint every SECONDS seconds (Default: 1800)')
|
||||
archive_group.add_argument('--chunker-params', metavar='PARAMS', dest='chunker_params',
|
||||
type=ChunkerParams, default=CHUNKER_PARAMS,
|
||||
help='specify the chunker parameters (CHUNK_MIN_EXP, CHUNK_MAX_EXP, '
|
||||
'HASH_MASK_BITS, HASH_WINDOW_SIZE). default: %d,%d,%d,%d' % CHUNKER_PARAMS)
|
||||
help='specify the chunker parameters (ALGO, CHUNK_MIN_EXP, CHUNK_MAX_EXP, '
|
||||
'HASH_MASK_BITS, HASH_WINDOW_SIZE). default: %s,%d,%d,%d,%d' % CHUNKER_PARAMS)
|
||||
archive_group.add_argument('-C', '--compression', metavar='COMPRESSION', dest='compression',
|
||||
type=CompressionSpec, default=CompressionSpec('lz4'),
|
||||
help='select compression algorithm, see the output of the '
|
||||
|
@ -3768,9 +3768,9 @@ class Archiver:
|
|||
'do not recompress.')
|
||||
archive_group.add_argument('--chunker-params', metavar='PARAMS', dest='chunker_params',
|
||||
type=ChunkerParams, default=CHUNKER_PARAMS,
|
||||
help='specify the chunker parameters (CHUNK_MIN_EXP, CHUNK_MAX_EXP, '
|
||||
help='specify the chunker parameters (ALGO, CHUNK_MIN_EXP, CHUNK_MAX_EXP, '
|
||||
'HASH_MASK_BITS, HASH_WINDOW_SIZE) or `default` to use the current defaults. '
|
||||
'default: %d,%d,%d,%d' % CHUNKER_PARAMS)
|
||||
'default: %s,%d,%d,%d,%d' % CHUNKER_PARAMS)
|
||||
|
||||
subparser.add_argument('location', metavar='REPOSITORY_OR_ARCHIVE', nargs='?', default='',
|
||||
type=location_validator(),
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
API_VERSION = '1.1_01'
|
||||
API_VERSION = '1.1_02'
|
||||
|
||||
from libc.stdlib cimport free
|
||||
|
||||
|
@ -18,6 +18,17 @@ cdef extern from "_chunker.c":
|
|||
|
||||
|
||||
cdef class Chunker:
|
||||
"""
|
||||
Content-Defined Chunker, variable chunk sizes.
|
||||
|
||||
This chunker makes quite some effort to cut mostly the same chunks for unchanged content, even if
|
||||
the content moves to a different offset inside the file. It uses the buzhash
|
||||
rolling-hash algorithm to identify the chunk cutting places by looking at the
|
||||
content inside the moving window and computing the rolling hash value over the
|
||||
window contents. If the last n bits of the rolling hash are 0, a chunk is cut.
|
||||
Additionally it obeys some more criteria, like a minimum and maximum chunk size.
|
||||
It also uses a per-repo random seed to avoid some chunk length fingerprinting attacks.
|
||||
"""
|
||||
cdef _Chunker *chunker
|
||||
|
||||
def __cinit__(self, int seed, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int hash_window_size):
|
||||
|
@ -50,6 +61,20 @@ cdef class Chunker:
|
|||
return chunker_process(self.chunker)
|
||||
|
||||
|
||||
def get_chunker(algo, *params, **kw):
    """
    Create a chunker object for the chunker algorithm *algo*.

    :param algo: name of the chunker algorithm; only 'buzhash' is supported.
    :param params: algorithm-specific positional parameters; for 'buzhash'
                   they are passed to Chunker after the seed.
    :param kw: algorithm-specific keyword parameters; 'buzhash' requires ``seed``.
    :return: a Chunker instance (for 'buzhash').
    :raises TypeError: if *algo* is unsupported or a required keyword is missing.
    """
    if algo == 'buzhash':
        try:
            seed = kw['seed']
        except KeyError:
            # report a missing keyword like Python itself does, not as a bare KeyError('seed')
            raise TypeError("buzhash chunker requires the 'seed' keyword argument") from None
        return Chunker(seed, *params)
    raise TypeError('unsupported chunker algo %r' % algo)
|
||||
|
||||
|
||||
def max_chunk_size(algo, *params):
    """
    Return the maximum chunk size (in bytes) for the given chunker params.

    :param algo: name of the chunker algorithm; only 'buzhash' is supported.
    :param params: the algorithm-specific params; for 'buzhash' the second
                   one is the max. chunk size exponent.
    :raises TypeError: if *algo* is unsupported or too few params were given.
    """
    # see also parseformat.ChunkerParams return values
    if algo == 'buzhash':
        if len(params) < 2:
            # fail with a meaningful message instead of an IndexError from params[1]
            raise TypeError('buzhash chunker params must include chunk_min_exp and chunk_max_exp')
        chunk_max_exp = params[1]
        return 1 << chunk_max_exp
    raise TypeError('unsupported chunker algo %r' % algo)
|
||||
|
||||
|
||||
def buzhash(data, unsigned long seed):
|
||||
cdef uint32_t *table
|
||||
cdef uint32_t sum
|
||||
|
|
|
@ -60,10 +60,10 @@ HASH_WINDOW_SIZE = 0xfff # 4095B
|
|||
HASH_MASK_BITS = 21 # results in ~2MiB chunks statistically
|
||||
|
||||
# defaults, use --chunker-params to override
|
||||
CHUNKER_PARAMS = (CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE)
|
||||
CHUNKER_PARAMS = ('buzhash', CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE)
|
||||
|
||||
# chunker params for the items metadata stream, finer granularity
|
||||
ITEMS_CHUNKER_PARAMS = (15, 19, 17, HASH_WINDOW_SIZE)
|
||||
ITEMS_CHUNKER_PARAMS = ('buzhash', 15, 19, 17, HASH_WINDOW_SIZE)
|
||||
|
||||
# operating mode of the files cache (for fast skipping of unchanged files)
|
||||
DEFAULT_FILES_CACHE_MODE_UI = 'ctime,size,inode'
|
||||
|
|
|
@ -27,7 +27,7 @@ def check_extension_modules():
|
|||
from .. import platform, compress, item, chunker, hashindex
|
||||
if hashindex.API_VERSION != '1.1_07':
|
||||
raise ExtensionModuleError
|
||||
if chunker.API_VERSION != '1.1_01':
|
||||
if chunker.API_VERSION != '1.1_02':
|
||||
raise ExtensionModuleError
|
||||
if compress.API_VERSION != '1.1_06':
|
||||
raise ExtensionModuleError
|
||||
|
@ -35,5 +35,5 @@ def check_extension_modules():
|
|||
raise ExtensionModuleError
|
||||
if platform.API_VERSION != platform.OS_API_VERSION or platform.API_VERSION != '1.2_03':
|
||||
raise ExtensionModuleError
|
||||
if item.API_VERSION != '1.1_03':
|
||||
if item.API_VERSION != '1.1_04':
|
||||
raise ExtensionModuleError
|
||||
|
|
|
@ -108,12 +108,20 @@ def timestamp(s):
|
|||
|
||||
|
||||
def ChunkerParams(s):
    """
    Parse a chunker params string (e.g. from --chunker-params) into a params tuple.

    Accepted forms (fields are comma separated, surrounding whitespace is ignored):

    - ``default``: returns CHUNKER_PARAMS
    - ``buzhash,CHUNK_MIN_EXP,CHUNK_MAX_EXP,HASH_MASK_BITS,HASH_WINDOW_SIZE``
    - ``CHUNK_MIN_EXP,CHUNK_MAX_EXP,HASH_MASK_BITS,HASH_WINDOW_SIZE``
      (old-style, no algorithm given, means buzhash)

    :return: CHUNKER_PARAMS for ``default``, otherwise the tuple
             ('buzhash', chunk_min, chunk_max, chunk_mask, window_size).
    :raises ValueError: if the string is empty, malformed, names another
                        algorithm or has a max. chunk size exponent > 23.
    """
    params = [p.strip() for p in s.split(',')]
    if params == ['']:
        # str.split never returns an empty list, so "nothing given" must be detected on the content
        raise ValueError('no chunker params given')
    count = len(params)
    algo = params[0].lower()
    if algo == 'default' and count == 1:  # default
        return CHUNKER_PARAMS
    # this must stay last as it deals with old-style compat mode (no algorithm, 4 params, buzhash):
    if (algo == 'buzhash' and count == 5) or count == 4:  # [buzhash, ]chunk_min, chunk_max, chunk_mask, window_size
        chunk_min, chunk_max, chunk_mask, window_size = [int(p) for p in params[count - 4:]]
        if chunk_max > 23:
            raise ValueError('max. chunk size exponent must not be more than 23 (2^23 = 8MiB max. chunk size)')
        return 'buzhash', chunk_min, chunk_max, chunk_mask, window_size
    raise ValueError('invalid chunker params')
|
||||
|
||||
|
||||
def FilesCacheMode(s):
|
||||
|
|
|
@ -12,7 +12,7 @@ cdef extern from "_item.c":
|
|||
object _optr_to_object(object bytes)
|
||||
|
||||
|
||||
API_VERSION = '1.1_03'
|
||||
API_VERSION = '1.1_04'
|
||||
|
||||
|
||||
class PropDict:
|
||||
|
@ -325,6 +325,18 @@ class Key(PropDict):
|
|||
tam_required = PropDict._make_property('tam_required', bool)
|
||||
|
||||
|
||||
def tuple_encode(t):
    """encode a tuple that might contain str items"""
    # msgpack.pack shall get bytes: encode every str element, pass anything else through as is
    encoded = []
    for element in t:
        if isinstance(element, str):
            element = safe_encode(element)
        encoded.append(element)
    return tuple(encoded)
|
||||
|
||||
|
||||
def tuple_decode(t):
    """decode a tuple that might contain bytes items"""
    # msgpack.unpack gives us bytes: decode every bytes element to str, pass anything else through as is
    decoded = []
    for element in t:
        if isinstance(element, bytes):
            element = safe_decode(element)
        decoded.append(element)
    return tuple(decoded)
|
||||
|
||||
|
||||
class ArchiveItem(PropDict):
|
||||
"""
|
||||
ArchiveItem abstraction that deals with validation and the low-level details internally:
|
||||
|
@ -353,7 +365,7 @@ class ArchiveItem(PropDict):
|
|||
time = PropDict._make_property('time', str, 'surrogate-escaped str', encode=safe_encode, decode=safe_decode)
|
||||
time_end = PropDict._make_property('time_end', str, 'surrogate-escaped str', encode=safe_encode, decode=safe_decode)
|
||||
comment = PropDict._make_property('comment', str, 'surrogate-escaped str', encode=safe_encode, decode=safe_decode)
|
||||
chunker_params = PropDict._make_property('chunker_params', tuple)
|
||||
chunker_params = PropDict._make_property('chunker_params', tuple, 'chunker-params tuple', encode=tuple_encode, decode=tuple_decode)
|
||||
recreate_source_id = PropDict._make_property('recreate_source_id', bytes)
|
||||
recreate_cmdline = PropDict._make_property('recreate_cmdline', list) # list of s-e-str
|
||||
recreate_args = PropDict._make_property('recreate_args', list) # list of s-e-str
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
from io import BytesIO
|
||||
|
||||
from ..chunker import Chunker, buzhash, buzhash_update
|
||||
from ..chunker import Chunker, get_chunker, buzhash, buzhash_update
|
||||
from ..constants import * # NOQA
|
||||
from . import BaseTestCase
|
||||
|
||||
|
@ -41,5 +41,6 @@ class ChunkerTestCase(BaseTestCase):
|
|||
self.input = self.input[:-1]
|
||||
return self.input[:1]
|
||||
|
||||
reconstructed = b''.join(Chunker(0, *CHUNKER_PARAMS).chunkify(SmallReadFile()))
|
||||
chunker = get_chunker(*CHUNKER_PARAMS, seed=0)
|
||||
reconstructed = b''.join(chunker.chunkify(SmallReadFile()))
|
||||
assert reconstructed == b'a' * 20
|
||||
|
|
|
@ -309,10 +309,14 @@ class FormatTimedeltaTestCase(BaseTestCase):
|
|||
|
||||
|
||||
def test_chunkerparams():
    """ChunkerParams accepts 'default', new-style and old-style specs and rejects invalid ones."""
    assert ChunkerParams('default') == ('buzhash', 19, 23, 21, 4095)
    # old-style spec (no algorithm given) is returned with the buzhash algorithm prepended
    assert ChunkerParams('19,23,21,4095') == ('buzhash', 19, 23, 21, 4095)
    assert ChunkerParams('buzhash,19,23,21,4095') == ('buzhash', 19, 23, 21, 4095)
    assert ChunkerParams('10,23,16,4095') == ('buzhash', 10, 23, 16, 4095)
    # max. chunk size exponent above 23 is rejected
    with pytest.raises(ValueError):
        ChunkerParams('19,24,21,4095')
    # unknown algorithm is rejected
    with pytest.raises(ValueError):
        ChunkerParams('crap,1,2,3,4')
|
||||
|
||||
|
||||
class MakePathSafeTestCase(BaseTestCase):
|
||||
|
|
Loading…
Reference in New Issue