mirror of https://github.com/borgbackup/borg.git

Merge pull request #62 from ThomasWaldmann/chunker-params

Chunker params, fixes #16

commit a487e16c16
borg/archive.py

@@ -21,12 +21,14 @@ from .helpers import parse_timestamp, Error, uid2user, user2uid, gid2group, grou
     Manifest, Statistics, decode_dict, st_mtime_ns, make_path_safe, StableDict, int_to_bigint, bigint_to_int

 ITEMS_BUFFER = 1024 * 1024
-CHUNK_MIN = 1024
-CHUNK_MAX = 10 * 1024 * 1024
-WINDOW_SIZE = 0xfff
-CHUNK_MASK = 0xffff
-
-ZEROS = b'\0' * CHUNK_MAX
+
+CHUNK_MIN_EXP = 10  # 2**10 == 1kiB
+CHUNK_MAX_EXP = 23  # 2**23 == 8MiB
+HASH_WINDOW_SIZE = 0xfff  # 4095B
+HASH_MASK_BITS = 16  # results in ~64kiB chunks statistically
+
+# defaults, use --chunker-params to override
+CHUNKER_PARAMS = (CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE)

 utime_supports_fd = os.utime in getattr(os, 'supports_fd', {})
 utime_supports_follow_symlinks = os.utime in getattr(os, 'supports_follow_symlinks', {})
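
A quick gloss of what the exponent-based defaults expand to (a minimal sketch
in plain Python; the constant names mirror the commit, the arithmetic is the
editor's illustration):

    CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE = 10, 23, 16, 0xfff

    min_size = 1 << CHUNK_MIN_EXP          # 1024 bytes == 1kiB minimum chunk size
    max_size = 1 << CHUNK_MAX_EXP          # 8388608 bytes == 8MiB maximum chunk size
    hash_mask = (1 << HASH_MASK_BITS) - 1  # 0xffff: cut when the low 16 hash bits are zero
    avg_size = 1 << HASH_MASK_BITS         # ~64kiB statistically expected chunk size

    print(min_size, max_size, hex(hash_mask), avg_size)  # 1024 8388608 0xffff 65536
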
@@ -69,12 +71,12 @@ class DownloadPipeline:
 class ChunkBuffer:
     BUFFER_SIZE = 1 * 1024 * 1024

-    def __init__(self, key):
+    def __init__(self, key, chunker_params=CHUNKER_PARAMS):
         self.buffer = BytesIO()
         self.packer = msgpack.Packer(unicode_errors='surrogateescape')
         self.chunks = []
         self.key = key
-        self.chunker = Chunker(WINDOW_SIZE, CHUNK_MASK, CHUNK_MIN, CHUNK_MAX, self.key.chunk_seed)
+        self.chunker = Chunker(self.key.chunk_seed, *chunker_params)

     def add(self, item):
         self.buffer.write(self.packer.pack(StableDict(item)))
@@ -104,8 +106,8 @@ class ChunkBuffer:

 class CacheChunkBuffer(ChunkBuffer):

-    def __init__(self, cache, key, stats):
-        super(CacheChunkBuffer, self).__init__(key)
+    def __init__(self, cache, key, stats, chunker_params=CHUNKER_PARAMS):
+        super(CacheChunkBuffer, self).__init__(key, chunker_params)
         self.cache = cache
         self.stats = stats

@@ -127,7 +129,8 @@ class Archive:

     def __init__(self, repository, key, manifest, name, cache=None, create=False,
-                 checkpoint_interval=300, numeric_owner=False, progress=False):
+                 checkpoint_interval=300, numeric_owner=False, progress=False,
+                 chunker_params=CHUNKER_PARAMS):
         self.cwd = os.getcwd()
         self.key = key
         self.repository = repository
@@ -142,8 +145,8 @@ class Archive:
         self.numeric_owner = numeric_owner
         self.pipeline = DownloadPipeline(self.repository, self.key)
         if create:
-            self.items_buffer = CacheChunkBuffer(self.cache, self.key, self.stats)
-            self.chunker = Chunker(WINDOW_SIZE, CHUNK_MASK, CHUNK_MIN, CHUNK_MAX, self.key.chunk_seed)
+            self.items_buffer = CacheChunkBuffer(self.cache, self.key, self.stats, chunker_params)
+            self.chunker = Chunker(self.key.chunk_seed, *chunker_params)
             if name in manifest.archives:
                 raise self.AlreadyExists(name)
             self.last_checkpoint = time.time()
@@ -158,6 +161,7 @@ class Archive:
                 raise self.DoesNotExist(name)
             info = self.manifest.archives[name]
             self.load(info[b'id'])
+        self.zeros = b'\0' * (1 << chunker_params[1])

     def _load_meta(self, id):
         data = self.key.decrypt(id, self.repository.get(id))
@@ -286,7 +290,7 @@ class Archive:
         with open(path, 'wb') as fd:
             ids = [c[0] for c in item[b'chunks']]
             for data in self.pipeline.fetch_many(ids, is_preloaded=True):
-                if sparse and ZEROS.startswith(data):
+                if sparse and self.zeros.startswith(data):
                     # all-zero chunk: create a hole in a sparse file
                     fd.seek(len(data), 1)
                 else:
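
The sparse-extract logic above exploits the fact that an all-zero chunk is a
prefix of a sufficiently large zero buffer: instead of writing it, the file
offset is advanced, leaving a hole. A self-contained sketch of the idea
(write_chunk and ZEROS_BUF are hypothetical names, not borg API):

    ZEROS_BUF = b'\0' * (1 << 23)  # as large as the largest possible chunk

    def write_chunk(fd, data, sparse=True):
        # an all-zero chunk is a prefix of ZEROS_BUF; seeking instead of
        # writing leaves a hole the filesystem can store sparsely
        if sparse and ZEROS_BUF.startswith(data):
            fd.seek(len(data), 1)  # whence=1: seek relative to current position
        else:
            fd.write(data)
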
borg/archiver.py

@@ -13,7 +13,7 @@ import textwrap
 import traceback

 from . import __version__
-from .archive import Archive, ArchiveChecker
+from .archive import Archive, ArchiveChecker, CHUNKER_PARAMS
 from .repository import Repository
 from .cache import Cache
 from .key import key_creator
@@ -21,7 +21,7 @@ from .helpers import Error, location_validator, format_time, format_file_size, \
     format_file_mode, ExcludePattern, exclude_path, adjust_patterns, to_localtime, timestamp, \
     get_cache_dir, get_keys_dir, format_timedelta, prune_within, prune_split, \
     Manifest, remove_surrogates, update_excludes, format_archive, check_extension_modules, Statistics, \
-    is_cachedir, bigint_to_int
+    is_cachedir, bigint_to_int, ChunkerParams
 from .remote import RepositoryServer, RemoteRepository
@@ -104,7 +104,8 @@ Type "Yes I am sure" if you understand this and want to continue.\n""")
         cache = Cache(repository, key, manifest, do_files=args.cache_files)
         archive = Archive(repository, key, manifest, args.archive.archive, cache=cache,
                           create=True, checkpoint_interval=args.checkpoint_interval,
-                          numeric_owner=args.numeric_owner, progress=args.progress)
+                          numeric_owner=args.numeric_owner, progress=args.progress,
+                          chunker_params=args.chunker_params)
         # Add cache dir to inode_skip list
         skip_inodes = set()
         try:
@@ -625,6 +626,10 @@ Type "Yes I am sure" if you understand this and want to continue.\n""")
                                metavar='yyyy-mm-ddThh:mm:ss',
                                help='manually specify the archive creation date/time (UTC). '
                                     'alternatively, give a reference file/directory.')
+        subparser.add_argument('--chunker-params', dest='chunker_params',
+                               type=ChunkerParams, default=CHUNKER_PARAMS,
+                               metavar='CHUNK_MIN_EXP,CHUNK_MAX_EXP,HASH_MASK_BITS,HASH_WINDOW_SIZE',
+                               help='specify the chunker parameters. default: %d,%d,%d,%d' % CHUNKER_PARAMS)
         subparser.add_argument('archive', metavar='ARCHIVE',
                                type=location_validator(archive=True),
                                help='archive to create')
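
For reference, the help text interpolates the defaults tuple, so with the
default CHUNKER_PARAMS the rendered option help reads (a quick check in plain
Python):

    CHUNKER_PARAMS = (10, 23, 16, 4095)
    print('specify the chunker parameters. default: %d,%d,%d,%d' % CHUNKER_PARAMS)
    # -> specify the chunker parameters. default: 10,23,16,4095
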
borg/chunker.pyx

@@ -20,8 +20,11 @@ cdef extern from "_chunker.c":
 cdef class Chunker:
     cdef _Chunker *chunker

-    def __cinit__(self, window_size, chunk_mask, min_size, max_size, seed):
-        self.chunker = chunker_init(window_size, chunk_mask, min_size, max_size, seed & 0xffffffff)
+    def __cinit__(self, seed, chunk_min_exp, chunk_max_exp, hash_mask_bits, hash_window_size):
+        min_size = 1 << chunk_min_exp
+        max_size = 1 << chunk_max_exp
+        hash_mask = (1 << hash_mask_bits) - 1
+        self.chunker = chunker_init(hash_window_size, hash_mask, min_size, max_size, seed & 0xffffffff)

     def chunkify(self, fd, fh=-1):
         """
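
The constructor now takes the seed first, then the four tuple members, so
Chunker(self.key.chunk_seed, *chunker_params) lines up with
CHUNKER_PARAMS = (CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE).
A minimal usage sketch (assuming the compiled borg.chunker extension is
importable):

    from io import BytesIO
    from borg.chunker import Chunker

    # seed=0, min chunk 2^10, max chunk 2^23, ~2^16 average, 4095-byte hash window
    chunker = Chunker(0, 10, 23, 16, 0xfff)
    chunks = [bytes(c) for c in chunker.chunkify(BytesIO(b'x' * 200000))]
    assert b''.join(chunks) == b'x' * 200000  # chunking is lossless
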
borg/helpers.py

@@ -313,6 +313,11 @@ def timestamp(s):
     raise ValueError


+def ChunkerParams(s):
+    chunk_min_exp, chunk_max_exp, hash_mask_bits, hash_window_size = s.split(',')
+    return int(chunk_min_exp), int(chunk_max_exp), int(hash_mask_bits), int(hash_window_size)
+
+
 def is_cachedir(path):
     """Determines whether the specified path is a cache directory (and
     therefore should potentially be excluded from the backup) according to
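
A quick sanity check of the parser (hedged example): the argparse type above
turns the comma-separated option string directly into the tuple that Archive
and Chunker expect:

    assert ChunkerParams('10,23,16,4095') == (10, 23, 16, 4095)
    assert ChunkerParams('19,23,21,4095') == (19, 23, 21, 4095)
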
borg/testsuite/archiver.py

@@ -12,7 +12,7 @@ import unittest
 from hashlib import sha256

 from .. import xattr
-from ..archive import Archive, ChunkBuffer, CHUNK_MAX
+from ..archive import Archive, ChunkBuffer, CHUNK_MAX_EXP
 from ..archiver import Archiver
 from ..cache import Cache
 from ..crypto import bytes_to_long, num_aes_blocks
@@ -213,7 +213,7 @@ class ArchiverTestCase(ArchiverTestCaseBase):
         sparse_support = sys.platform != 'darwin'
         filename = os.path.join(self.input_path, 'sparse')
         content = b'foobar'
-        hole_size = 5 * CHUNK_MAX  # 5 full chunker buffers
+        hole_size = 5 * (1 << CHUNK_MAX_EXP)  # 5 full chunker buffers
         with open(filename, 'wb') as fd:
             # create a file that has a hole at the beginning and end (if the
             # OS and filesystem supports sparse files)
borg/testsuite/chunker.py

@@ -1,27 +1,27 @@
 from io import BytesIO

 from ..chunker import Chunker, buzhash, buzhash_update
-from ..archive import CHUNK_MAX
+from ..archive import CHUNK_MAX_EXP
 from . import BaseTestCase


 class ChunkerTestCase(BaseTestCase):

     def test_chunkify(self):
-        data = b'0' * int(1.5 * CHUNK_MAX) + b'Y'
-        parts = [bytes(c) for c in Chunker(2, 0x3, 2, CHUNK_MAX, 0).chunkify(BytesIO(data))]
+        data = b'0' * int(1.5 * (1 << CHUNK_MAX_EXP)) + b'Y'
+        parts = [bytes(c) for c in Chunker(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(data))]
         self.assert_equal(len(parts), 2)
         self.assert_equal(b''.join(parts), data)
-        self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, CHUNK_MAX, 0).chunkify(BytesIO(b''))], [])
-        self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, CHUNK_MAX, 0).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fooba', b'rboobaz', b'fooba', b'rboobaz', b'fooba', b'rboobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, CHUNK_MAX, 1).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fo', b'obarb', b'oob', b'azf', b'oobarb', b'oob', b'azf', b'oobarb', b'oobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, CHUNK_MAX, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'ar', b'boobazfoob', b'ar', b'boobazfoob', b'ar', b'boobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 3, CHUNK_MAX, 0).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
-        self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 3, CHUNK_MAX, 1).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobar', b'boo', b'bazfo', b'obar', b'boo', b'bazfo', b'obar', b'boobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 3, CHUNK_MAX, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foo', b'barboobaz', b'foo', b'barboobaz', b'foo', b'barboobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 4, CHUNK_MAX, 0).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
-        self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 4, CHUNK_MAX, 1).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobar', b'boobazfo', b'obar', b'boobazfo', b'obar', b'boobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 4, CHUNK_MAX, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'arboobaz', b'foob', b'arboobaz', b'foob', b'arboobaz'])
+        self.assert_equal([bytes(c) for c in Chunker(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b''))], [])
+        self.assert_equal([bytes(c) for c in Chunker(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fooba', b'rboobaz', b'fooba', b'rboobaz', b'fooba', b'rboobaz'])
+        self.assert_equal([bytes(c) for c in Chunker(1, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fo', b'obarb', b'oob', b'azf', b'oobarb', b'oob', b'azf', b'oobarb', b'oobaz'])
+        self.assert_equal([bytes(c) for c in Chunker(2, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'ar', b'boobazfoob', b'ar', b'boobazfoob', b'ar', b'boobaz'])
+        self.assert_equal([bytes(c) for c in Chunker(0, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
+        self.assert_equal([bytes(c) for c in Chunker(1, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobar', b'boobazfo', b'obar', b'boobazfo', b'obar', b'boobaz'])
+        self.assert_equal([bytes(c) for c in Chunker(2, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'arboobaz', b'foob', b'arboobaz', b'foob', b'arboobaz'])
+        self.assert_equal([bytes(c) for c in Chunker(0, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
+        self.assert_equal([bytes(c) for c in Chunker(1, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarbo', b'obazfoobar', b'boobazfo', b'obarboobaz'])
+        self.assert_equal([bytes(c) for c in Chunker(2, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz', b'foobarboobaz', b'foobarboobaz'])

     def test_buzhash(self):
         self.assert_equal(buzhash(b'abcdefghijklmnop', 0), 3795437769)
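
To read the rewritten tests, decode the positional arguments of the new
signature; for example (editor's gloss, taken from the first foobarboobaz
case above, assuming the compiled borg.chunker extension is importable):

    from io import BytesIO
    from borg.chunker import Chunker

    # Chunker(0, 1, CHUNK_MAX_EXP, 2, 2) means: seed=0, min chunk 2^1 == 2 bytes,
    # max chunk 2^23, cut when the low 2 hash bits are zero (~4-byte chunks),
    # 2-byte rolling hash window.
    parts = [bytes(c) for c in Chunker(0, 1, 23, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))]
    assert parts == [b'fooba', b'rboobaz', b'fooba', b'rboobaz', b'fooba', b'rboobaz']
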
docs/misc/create_chunker-params.txt

@@ -0,0 +1,116 @@
+About borg create --chunker-params
+==================================
+
+--chunker-params CHUNK_MIN_EXP,CHUNK_MAX_EXP,HASH_MASK_BITS,HASH_WINDOW_SIZE
+
+CHUNK_MIN_EXP and CHUNK_MAX_EXP give the exponent N of the 2^N minimum and
+maximum chunk size. Required: CHUNK_MIN_EXP < CHUNK_MAX_EXP.
+
+Defaults: 10 (2^10 == 1kiB) minimum, 23 (2^23 == 8MiB) maximum.
+
+HASH_MASK_BITS is the number of least-significant bits of the rolling hash
+that need to be zero to trigger a chunk cut.
+Recommended: CHUNK_MIN_EXP + X <= HASH_MASK_BITS <= CHUNK_MAX_EXP - X, X >= 2
+(this allows the rolling hash some freedom to make its cut at a place
+determined by the window's contents rather than by the min/max chunk size).
+
+Default: 16 (statistically, chunks will be about 2^16 == 64kiB in size)
+
+HASH_WINDOW_SIZE: the size of the window used for the rolling hash computation.
+Default: 4095B
+
+
+Trying it out
+=============
+
+I backed up a VM directory to demonstrate how different chunker parameters
+influence repo size, index size / chunk count, compression, and deduplication.
+
+repo-sm: ~64kiB chunks (16 bit chunk mask), min chunk size 1kiB (2^10B)
+         (these are the attic / borg 0.23 internal defaults)
+
+repo-lg: ~1MiB chunks (20 bit chunk mask), min chunk size 64kiB (2^16B)
+
+repo-xl: 8MiB chunks (2^23B max chunk size), min chunk size 64kiB (2^16B).
+         The hash mask bits were set to 31, so the mask (almost) never triggers.
+         This degrades the rolling-hash-based dedup to a fixed-offset dedup,
+         as the cutting point is now (almost) always the end of the buffer
+         (at 2^23B == 8MiB).
+
+The repo index size is an indicator of the RAM needs of Borg.
+In this special case, the total RAM needs are about 2.1x the repo index size.
+You can see that the index of repo-sm is 16x larger than that of repo-lg,
+which corresponds to the ratio of the different target chunk sizes.
+
+Note: RAM needs were not a problem in this specific case (37GB data size).
+But just imagine you had 37TB of such data and much less than 42GB of RAM;
+then you'd definitely want the "lg" chunker params so you only need about
+2.6GB of RAM. Or even bigger chunks than shown for "lg" (see "xl").
+
+You can also see that compression works better for larger chunks, as expected.
+Deduplication works worse for larger chunks, also as expected.
+
+small chunks
+============
+
+$ borg info /extra/repo-sm::1
+
+Command line: /home/tw/w/borg-env/bin/borg create --chunker-params 10,23,16,4095 /extra/repo-sm::1 /home/tw/win
+Number of files: 3
+
+                       Original size      Compressed size    Deduplicated size
+This archive:               37.12 GB             14.81 GB             12.18 GB
+All archives:               37.12 GB             14.81 GB             12.18 GB
+
+                       Unique chunks         Total chunks
+Chunk index:                  378374               487316
+
+$ ls -l /extra/repo-sm/index*
+-rw-rw-r-- 1 tw tw 20971538 Jun 20 23:39 index.2308
+
+$ du -sk /extra/repo-sm
+11930840   /extra/repo-sm
+
+large chunks
+============
+
+$ borg info /extra/repo-lg::1
+
+Command line: /home/tw/w/borg-env/bin/borg create --chunker-params 16,23,20,4095 /extra/repo-lg::1 /home/tw/win
+Number of files: 3
+
+                       Original size      Compressed size    Deduplicated size
+This archive:               37.10 GB             14.60 GB             13.38 GB
+All archives:               37.10 GB             14.60 GB             13.38 GB
+
+                       Unique chunks         Total chunks
+Chunk index:                   25889                29349
+
+$ ls -l /extra/repo-lg/index*
+-rw-rw-r-- 1 tw tw 1310738 Jun 20 23:10 index.2264
+
+$ du -sk /extra/repo-lg
+13073928   /extra/repo-lg
+
+xl chunks
+=========
+
+$ borg info /extra/repo-xl::1
+
+Command line: /home/tw/w/borg-env/bin/borg create --chunker-params 16,23,31,4095 /extra/repo-xl::1 /home/tw/win
+Number of files: 3
+
+                       Original size      Compressed size    Deduplicated size
+This archive:               37.10 GB             14.59 GB             14.59 GB
+All archives:               37.10 GB             14.59 GB             14.59 GB
+
+                       Unique chunks         Total chunks
+Chunk index:                    4319                 4434
+
+$ ls -l /extra/repo-xl/index*
+-rw-rw-r-- 1 tw tw 327698 Jun 21 00:52 index.2011
+
+$ du -sk /extra/repo-xl/
+14253464   /extra/repo-xl/
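
The RAM discussion in "Trying it out" can be cross-checked against the index
sizes listed above (the editor's arithmetic; the 2.1x factor is the one
measured in this experiment):

    ram_factor = 2.1                         # total RAM ~= 2.1x repo index size
    sm_index, lg_index = 20971538, 1310738   # index file sizes in bytes, from ls -l

    print(sm_index / lg_index)               # ~16, matching the 16x chunk size ratio
    print(ram_factor * sm_index / 1e6)       # ~44 MB RAM for repo-sm (37 GB of data)
    print(ram_factor * lg_index / 1e6)       # ~2.8 MB RAM for repo-lg
    # at 1000x the data (37 TB) this scales to roughly 44 GB vs. 2.8 GB,
    # in line with the ~42 GB / 2.6 GB figures quoted in the text
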
docs/usage.rst

@@ -50,6 +50,9 @@ Examples
     NAME="root-`date +%Y-%m-%d`"
     $ borg create /mnt/backup::$NAME / --do-not-cross-mountpoints

+    # Back up huge files with little chunk management overhead
+    $ borg create --chunker-params 19,23,21,4095 /mnt/backup::VMs /srv/VMs
+
 .. include:: usage/extract.rst.inc