From c4ffbd2a173ec42cbf4e9c47bef2258ac36faeb8 Mon Sep 17 00:00:00 2001
From: Thomas Waldmann
Date: Sat, 5 Jan 2019 04:38:06 +0100
Subject: [PATCH 1/5] prepare to support multiple chunkers

---
 src/borg/archive.py             | 13 ++++++++-----
 src/borg/archiver.py            |  8 ++++----
 src/borg/chunker.pyx            | 27 ++++++++++++++++++++++++++-
 src/borg/constants.py           |  4 ++--
 src/borg/helpers/checks.py      |  4 ++--
 src/borg/helpers/parseformat.py | 18 +++++++++++++-----
 src/borg/item.pyx               | 16 ++++++++++++++--
 src/borg/testsuite/chunker.py   |  5 +++--
 src/borg/testsuite/helpers.py   |  8 ++++++--
 9 files changed, 78 insertions(+), 25 deletions(-)

diff --git a/src/borg/archive.py b/src/borg/archive.py
index 5581cf2fc..87ec4bc31 100644
--- a/src/borg/archive.py
+++ b/src/borg/archive.py
@@ -19,7 +19,7 @@ from .logger import create_logger
 logger = create_logger()

 from . import xattr
-from .chunker import Chunker
+from .chunker import get_chunker, max_chunk_size
 from .cache import ChunkListEntry
 from .crypto.key import key_factory
 from .compress import Compressor, CompressionSpec
@@ -242,7 +242,7 @@ class ChunkBuffer:
         self.packer = msgpack.Packer()
         self.chunks = []
         self.key = key
-        self.chunker = Chunker(self.key.chunk_seed, *chunker_params)
+        self.chunker = get_chunker(*chunker_params, seed=self.key.chunk_seed)

     def add(self, item):
         self.buffer.write(self.packer.pack(item.as_dict()))
@@ -610,7 +610,7 @@ Utilization of max. archive size: {csize_max:.0%}
         if hardlink_set:
             return
         if sparse and self.zeros is None:
-            self.zeros = b'\0' * (1 << self.chunker_params[1])
+            self.zeros = b'\0' * max_chunk_size(*self.chunker_params)
         with backup_io('open'):
             fd = open(path, 'wb')
         with fd:
@@ -1058,7 +1058,7 @@ class FilesystemObjectProcessors:
         self.hard_links = {}
         self.stats = Statistics()  # threading: done by cache (including progress)
         self.cwd = os.getcwd()
-        self.chunker = Chunker(key.chunk_seed, *chunker_params)
+        self.chunker = get_chunker(*chunker_params, seed=key.chunk_seed)

     @contextmanager
     def create_helper(self, path, st, status=None, hardlinkable=True):
@@ -1920,6 +1920,9 @@ class ArchiveRecreater:
         target = self.create_target_archive(target_name)
         # If the archives use the same chunker params, then don't rechunkify
         source_chunker_params = tuple(archive.metadata.get('chunker_params', []))
+        if len(source_chunker_params) == 4 and isinstance(source_chunker_params[0], int):
+            # this is a borg < 1.2 chunker_params tuple, no chunker algo specified, but we only had buzhash:
+            source_chunker_params = ('buzhash', ) + source_chunker_params
         target.recreate_rechunkify = self.rechunkify and source_chunker_params != target.chunker_params
         if target.recreate_rechunkify:
             logger.debug('Rechunking archive from %s to %s', source_chunker_params or '(unknown)', target.chunker_params)
@@ -1927,7 +1930,7 @@ class ArchiveRecreater:
             cache=self.cache, key=self.key,
             add_item=target.add_item, write_checkpoint=target.write_checkpoint,
             checkpoint_interval=self.checkpoint_interval, rechunkify=target.recreate_rechunkify).process_file_chunks
-        target.chunker = Chunker(self.key.chunk_seed, *target.chunker_params)
+        target.chunker = get_chunker(*target.chunker_params, seed=self.key.chunk_seed)
         return target

     def create_target_archive(self, name):
diff --git a/src/borg/archiver.py b/src/borg/archiver.py
index 395189264..207585e24 100644
--- a/src/borg/archiver.py
+++ b/src/borg/archiver.py
@@ -3151,8 +3151,8 @@ class Archiver:
                                    help='write checkpoint every SECONDS seconds (Default: 1800)')
         archive_group.add_argument('--chunker-params', metavar='PARAMS', dest='chunker_params',
                                   type=ChunkerParams, default=CHUNKER_PARAMS,
-                                   help='specify the chunker parameters (CHUNK_MIN_EXP, CHUNK_MAX_EXP, '
-                                        'HASH_MASK_BITS, HASH_WINDOW_SIZE). default: %d,%d,%d,%d' % CHUNKER_PARAMS)
+                                   help='specify the chunker parameters (ALGO, CHUNK_MIN_EXP, CHUNK_MAX_EXP, '
+                                        'HASH_MASK_BITS, HASH_WINDOW_SIZE). default: %s,%d,%d,%d,%d' % CHUNKER_PARAMS)
         archive_group.add_argument('-C', '--compression', metavar='COMPRESSION', dest='compression',
                                    type=CompressionSpec, default=CompressionSpec('lz4'),
                                    help='select compression algorithm, see the output of the '
@@ -3768,9 +3768,9 @@ class Archiver:
                                         'do not recompress.')
         archive_group.add_argument('--chunker-params', metavar='PARAMS', dest='chunker_params',
                                    type=ChunkerParams, default=CHUNKER_PARAMS,
-                                   help='specify the chunker parameters (CHUNK_MIN_EXP, CHUNK_MAX_EXP, '
+                                   help='specify the chunker parameters (ALGO, CHUNK_MIN_EXP, CHUNK_MAX_EXP, '
                                         'HASH_MASK_BITS, HASH_WINDOW_SIZE) or `default` to use the current defaults. '
-                                        'default: %d,%d,%d,%d' % CHUNKER_PARAMS)
+                                        'default: %s,%d,%d,%d,%d' % CHUNKER_PARAMS)
         subparser.add_argument('location', metavar='REPOSITORY_OR_ARCHIVE', nargs='?', default='',
                                type=location_validator(),
diff --git a/src/borg/chunker.pyx b/src/borg/chunker.pyx
index d2b44f686..5558155e1 100644
--- a/src/borg/chunker.pyx
+++ b/src/borg/chunker.pyx
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-API_VERSION = '1.1_01'
+API_VERSION = '1.1_02'

 from libc.stdlib cimport free

@@ -18,6 +18,17 @@ cdef extern from "_chunker.c":


 cdef class Chunker:
+    """
+    Content-Defined Chunker, variable chunk sizes.
+
+    This chunker makes a considerable effort to cut mostly the same-content chunks,
+    even if the content moves to a different offset inside the file. It uses the
+    buzhash rolling-hash algorithm to identify the chunk cut points by looking at
+    the content inside a moving window and computing the rolling hash value over
+    the window contents. If the last n bits of the rolling hash are 0, a chunk is cut.
+    Additionally, it obeys further criteria, like a minimum and a maximum chunk size.
+    It also uses a per-repo random seed to thwart chunk-length fingerprinting attacks.
+ """ cdef _Chunker *chunker def __cinit__(self, int seed, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int hash_window_size): @@ -50,6 +61,20 @@ cdef class Chunker: return chunker_process(self.chunker) +def get_chunker(algo, *params, **kw): + if algo == 'buzhash': + seed = kw['seed'] + return Chunker(seed, *params) + raise TypeError('unsupported chunker algo %r' % algo) + + +def max_chunk_size(algo, *params): + # see also parseformat.ChunkerParams return values + if algo == 'buzhash': + return 1 << params[1] + raise TypeError('unsupported chunker algo %r' % algo) + + def buzhash(data, unsigned long seed): cdef uint32_t *table cdef uint32_t sum diff --git a/src/borg/constants.py b/src/borg/constants.py index a2fce0954..7055a44e5 100644 --- a/src/borg/constants.py +++ b/src/borg/constants.py @@ -60,10 +60,10 @@ HASH_WINDOW_SIZE = 0xfff # 4095B HASH_MASK_BITS = 21 # results in ~2MiB chunks statistically # defaults, use --chunker-params to override -CHUNKER_PARAMS = (CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE) +CHUNKER_PARAMS = ('buzhash', CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE) # chunker params for the items metadata stream, finer granularity -ITEMS_CHUNKER_PARAMS = (15, 19, 17, HASH_WINDOW_SIZE) +ITEMS_CHUNKER_PARAMS = ('buzhash', 15, 19, 17, HASH_WINDOW_SIZE) # operating mode of the files cache (for fast skipping of unchanged files) DEFAULT_FILES_CACHE_MODE_UI = 'ctime,size,inode' diff --git a/src/borg/helpers/checks.py b/src/borg/helpers/checks.py index 2bd2883ed..f52e0ede3 100644 --- a/src/borg/helpers/checks.py +++ b/src/borg/helpers/checks.py @@ -27,7 +27,7 @@ def check_extension_modules(): from .. import platform, compress, item, chunker, hashindex if hashindex.API_VERSION != '1.1_07': raise ExtensionModuleError - if chunker.API_VERSION != '1.1_01': + if chunker.API_VERSION != '1.1_02': raise ExtensionModuleError if compress.API_VERSION != '1.1_06': raise ExtensionModuleError @@ -35,5 +35,5 @@ def check_extension_modules(): raise ExtensionModuleError if platform.API_VERSION != platform.OS_API_VERSION or platform.API_VERSION != '1.2_03': raise ExtensionModuleError - if item.API_VERSION != '1.1_03': + if item.API_VERSION != '1.1_04': raise ExtensionModuleError diff --git a/src/borg/helpers/parseformat.py b/src/borg/helpers/parseformat.py index a407763bc..f2291f683 100644 --- a/src/borg/helpers/parseformat.py +++ b/src/borg/helpers/parseformat.py @@ -108,12 +108,20 @@ def timestamp(s): def ChunkerParams(s): - if s.strip().lower() == "default": + params = s.strip().split(',') + count = len(params) + if count == 0: + raise ValueError('no chunker params given') + algo = params[0].lower() + if algo == 'default' and count == 1: # default return CHUNKER_PARAMS - chunk_min, chunk_max, chunk_mask, window_size = s.split(',') - if int(chunk_max) > 23: - raise ValueError('max. chunk size exponent must not be more than 23 (2^23 = 8MiB max. chunk size)') - return int(chunk_min), int(chunk_max), int(chunk_mask), int(window_size) + # this must stay last as it deals with old-style compat mode (no algorithm, 4 params, buzhash): + if algo == 'buzhash' and count == 5 or count == 4: # [buzhash, ]chunk_min, chunk_max, chunk_mask, window_size + chunk_min, chunk_max, chunk_mask, window_size = [int(p) for p in params[count - 4:]] + if chunk_max > 23: + raise ValueError('max. chunk size exponent must not be more than 23 (2^23 = 8MiB max. 
+        return 'buzhash', chunk_min, chunk_max, chunk_mask, window_size
+    raise ValueError('invalid chunker params')


 def FilesCacheMode(s):
diff --git a/src/borg/item.pyx b/src/borg/item.pyx
index 5e477e98f..754896114 100644
--- a/src/borg/item.pyx
+++ b/src/borg/item.pyx
@@ -12,7 +12,7 @@ cdef extern from "_item.c":
     object _optr_to_object(object bytes)


-API_VERSION = '1.1_03'
+API_VERSION = '1.1_04'


 class PropDict:
@@ -325,6 +325,18 @@ class Key(PropDict):
     tam_required = PropDict._make_property('tam_required', bool)


+def tuple_encode(t):
+    """encode a tuple that might contain str items"""
+    # we have str, but want to give bytes to msgpack.pack
+    return tuple(safe_encode(e) if isinstance(e, str) else e for e in t)
+
+
+def tuple_decode(t):
+    """decode a tuple that might contain bytes items"""
+    # we get bytes objects from msgpack.unpack, but want str
+    return tuple(safe_decode(e) if isinstance(e, bytes) else e for e in t)
+
+
 class ArchiveItem(PropDict):
     """
     ArchiveItem abstraction that deals with validation and the low-level details internally:
@@ -353,7 +365,7 @@ class ArchiveItem(PropDict):
     time = PropDict._make_property('time', str, 'surrogate-escaped str', encode=safe_encode, decode=safe_decode)
     time_end = PropDict._make_property('time_end', str, 'surrogate-escaped str', encode=safe_encode, decode=safe_decode)
     comment = PropDict._make_property('comment', str, 'surrogate-escaped str', encode=safe_encode, decode=safe_decode)
-    chunker_params = PropDict._make_property('chunker_params', tuple)
+    chunker_params = PropDict._make_property('chunker_params', tuple, 'chunker-params tuple', encode=tuple_encode, decode=tuple_decode)
     recreate_source_id = PropDict._make_property('recreate_source_id', bytes)
     recreate_cmdline = PropDict._make_property('recreate_cmdline', list)  # list of s-e-str
     recreate_args = PropDict._make_property('recreate_args', list)  # list of s-e-str
diff --git a/src/borg/testsuite/chunker.py b/src/borg/testsuite/chunker.py
index 2a14bd604..3d56fea60 100644
--- a/src/borg/testsuite/chunker.py
+++ b/src/borg/testsuite/chunker.py
@@ -1,6 +1,6 @@
 from io import BytesIO

-from ..chunker import Chunker, buzhash, buzhash_update
+from ..chunker import Chunker, get_chunker, buzhash, buzhash_update
 from ..constants import *  # NOQA
 from . import BaseTestCase
@@ -41,5 +41,6 @@ class ChunkerTestCase(BaseTestCase):
                 self.input = self.input[:-1]
                 return self.input[:1]

-        reconstructed = b''.join(Chunker(0, *CHUNKER_PARAMS).chunkify(SmallReadFile()))
+        chunker = get_chunker(*CHUNKER_PARAMS, seed=0)
+        reconstructed = b''.join(chunker.chunkify(SmallReadFile()))
         assert reconstructed == b'a' * 20
diff --git a/src/borg/testsuite/helpers.py b/src/borg/testsuite/helpers.py
index 2d329c075..400d168dd 100644
--- a/src/borg/testsuite/helpers.py
+++ b/src/borg/testsuite/helpers.py
@@ -309,10 +309,14 @@ class FormatTimedeltaTestCase(BaseTestCase):


 def test_chunkerparams():
-    assert ChunkerParams('19,23,21,4095') == (19, 23, 21, 4095)
-    assert ChunkerParams('10,23,16,4095') == (10, 23, 16, 4095)
+    assert ChunkerParams('default') == ('buzhash', 19, 23, 21, 4095)
+    assert ChunkerParams('19,23,21,4095') == ('buzhash', 19, 23, 21, 4095)
+    assert ChunkerParams('buzhash,19,23,21,4095') == ('buzhash', 19, 23, 21, 4095)
+    assert ChunkerParams('10,23,16,4095') == ('buzhash', 10, 23, 16, 4095)
     with pytest.raises(ValueError):
         ChunkerParams('19,24,21,4095')
+    with pytest.raises(ValueError):
+        ChunkerParams('crap,1,2,3,4')


 class MakePathSafeTestCase(BaseTestCase):

From 80e0b42f7d282df880e189dd9d90d93d37cb19e1 Mon Sep 17 00:00:00 2001
From: Thomas Waldmann
Date: Sat, 5 Jan 2019 04:40:25 +0100
Subject: [PATCH 2/5] add fixed blocksize chunker, fixes #1086

---
 src/borg/chunker.pyx            | 69 ++++++++++++++++++++++++++++++++-
 src/borg/helpers/checks.py      |  2 +-
 src/borg/helpers/parseformat.py |  4 ++
 src/borg/testsuite/chunker.py   | 17 +++++++-
 src/borg/testsuite/helpers.py   |  2 +
 5 files changed, 91 insertions(+), 3 deletions(-)

diff --git a/src/borg/chunker.pyx b/src/borg/chunker.pyx
index 5558155e1..51c2a90b1 100644
--- a/src/borg/chunker.pyx
+++ b/src/borg/chunker.pyx
@@ -1,6 +1,8 @@
 # -*- coding: utf-8 -*-

-API_VERSION = '1.1_02'
+API_VERSION = '1.1_03'
+
+import os

 from libc.stdlib cimport free

@@ -17,6 +19,67 @@ cdef extern from "_chunker.c":
     uint32_t c_buzhash_update "buzhash_update"(uint32_t sum, unsigned char remove, unsigned char add, size_t len, uint32_t *h)


+class ChunkerFixed:
+    """
+    Fixed blocksize Chunker, optionally supporting a header block of different size.
+
+    This is a very simple chunker for input data with known block/record sizes:
+
+    - raw disk images
+    - block devices
+    - database files with simple header + fixed-size records layout
+
+    Note: the last block of the input data may be shorter than the block size;
+    this is supported and not considered an error.
+    """
+    def __init__(self, block_size, header_size=0):
+        self.block_size = block_size
+        self.header_size = header_size
+
+    def chunkify(self, fd, fh=-1):
+        """
+        Cut a file into chunks.
+
+        :param fd: Python file object
+        :param fh: OS-level file handle (if available),
+                   defaults to -1 which means not to use OS-level fd.
+        """
+        offset = 0
+        use_fh = fh >= 0
+
+        if use_fh:
+            def read(size):
+                nonlocal offset
+                data = os.read(fh, size)
+                amount = len(data)
+                if hasattr(os, 'posix_fadvise'):
+                    # UNIX only and, in case of block sizes that are not a multiple of the
+                    # system's page size, better be used with a bug-fixed linux kernel > 4.6.0,
+                    # see comment/workaround in _chunker.c and borgbackup issue #907.
+                    os.posix_fadvise(fh, offset, amount, os.POSIX_FADV_DONTNEED)
+                offset += amount
+                return data
+        else:
+            def read(size):
+                nonlocal offset
+                data = fd.read(size)
+                amount = len(data)
+                offset += amount
+                return data
+
+        if self.header_size > 0:
+            data = read(self.header_size)
+            if data:
+                yield data
+        else:
+            data = True  # get into next while loop
+        while data:
+            data = read(self.block_size)
+            if data:
+                yield data
+        # empty data means we are at EOF and we terminate the generator.
+
+
 cdef class Chunker:
     """
     Content-Defined Chunker, variable chunk sizes.
@@ -65,6 +128,8 @@
 def get_chunker(algo, *params, **kw):
     if algo == 'buzhash':
         seed = kw['seed']
         return Chunker(seed, *params)
+    if algo == 'fixed':
+        return ChunkerFixed(*params)
     raise TypeError('unsupported chunker algo %r' % algo)

@@ -72,6 +137,8 @@
 def max_chunk_size(algo, *params):
     # see also parseformat.ChunkerParams return values
     if algo == 'buzhash':
         return 1 << params[1]
+    if algo == 'fixed':
+        return max(params[0], params[1])
     raise TypeError('unsupported chunker algo %r' % algo)

diff --git a/src/borg/helpers/checks.py b/src/borg/helpers/checks.py
index f52e0ede3..984f95f97 100644
--- a/src/borg/helpers/checks.py
+++ b/src/borg/helpers/checks.py
@@ -27,7 +27,7 @@ def check_extension_modules():
     from .. import platform, compress, item, chunker, hashindex
     if hashindex.API_VERSION != '1.1_07':
         raise ExtensionModuleError
-    if chunker.API_VERSION != '1.1_02':
+    if chunker.API_VERSION != '1.1_03':
         raise ExtensionModuleError
     if compress.API_VERSION != '1.1_06':
         raise ExtensionModuleError
diff --git a/src/borg/helpers/parseformat.py b/src/borg/helpers/parseformat.py
index f2291f683..f741bd4e9 100644
--- a/src/borg/helpers/parseformat.py
+++ b/src/borg/helpers/parseformat.py
@@ -113,6 +113,10 @@ def ChunkerParams(s):
     if count == 0:
         raise ValueError('no chunker params given')
     algo = params[0].lower()
+    if algo == 'fixed' and 2 <= count <= 3:  # fixed, block_size[, header_size]
+        block_size = int(params[1])
+        header_size = int(params[2]) if count == 3 else 0
+        return algo, block_size, header_size
     if algo == 'default' and count == 1:  # default
         return CHUNKER_PARAMS
     # this must stay last as it deals with old-style compat mode (no algorithm, 4 params, buzhash):
diff --git a/src/borg/testsuite/chunker.py b/src/borg/testsuite/chunker.py
index 3d56fea60..c49e5be03 100644
--- a/src/borg/testsuite/chunker.py
+++ b/src/borg/testsuite/chunker.py
@@ -1,6 +1,6 @@
 from io import BytesIO

-from ..chunker import Chunker, get_chunker, buzhash, buzhash_update
+from ..chunker import ChunkerFixed, Chunker, get_chunker, buzhash, buzhash_update
 from ..constants import *  # NOQA
 from . import BaseTestCase

@@ -8,6 +8,21 @@ from . import BaseTestCase
 # See borg.selftest for details. If you add/remove test methods, update SELFTEST_COUNT


+class ChunkerFixedTestCase(BaseTestCase):
+
+    def test_chunkify_just_blocks(self):
+        data = b'foobar' * 1500
+        chunker = ChunkerFixed(4096)
+        parts = [c for c in chunker.chunkify(BytesIO(data))]
+        self.assert_equal(parts, [data[0:4096], data[4096:8192], data[8192:]])
+
+    def test_chunkify_header_and_blocks(self):
+        data = b'foobar' * 1500
+        chunker = ChunkerFixed(4096, 123)
+        parts = [c for c in chunker.chunkify(BytesIO(data))]
+        self.assert_equal(parts, [data[0:123], data[123:123+4096], data[123+4096:123+8192], data[123+8192:]])
+
+
 class ChunkerTestCase(BaseTestCase):

     def test_chunkify(self):
diff --git a/src/borg/testsuite/helpers.py b/src/borg/testsuite/helpers.py
index 400d168dd..72e895ae3 100644
--- a/src/borg/testsuite/helpers.py
+++ b/src/borg/testsuite/helpers.py
@@ -315,6 +315,8 @@ def test_chunkerparams():
     assert ChunkerParams('10,23,16,4095') == ('buzhash', 10, 23, 16, 4095)
     with pytest.raises(ValueError):
         ChunkerParams('19,24,21,4095')
+    assert ChunkerParams('fixed,4096') == ('fixed', 4096, 0)
+    assert ChunkerParams('fixed,4096,200') == ('fixed', 4096, 200)
     with pytest.raises(ValueError):
         ChunkerParams('crap,1,2,3,4')

From be2c06173387afbf8eb9ef8bd27c34cac1279af0 Mon Sep 17 00:00:00 2001
From: Thomas Waldmann
Date: Sat, 5 Jan 2019 06:44:07 +0100
Subject: [PATCH 3/5] chunker params parsing: add more validation

avoiding too large chunks that the repository cannot store.

avoiding too small chunks that would create excessively many chunks
and way too much storage and management overhead.

we only disallow extreme cases, this does not mean that everything
that is allowed also makes sense in practice (and does not eat lots
of memory and storage space).
---
 src/borg/helpers/parseformat.py | 14 ++++++++++++++
 src/borg/testsuite/helpers.py   | 18 +++++++++++++++---
 2 files changed, 29 insertions(+), 3 deletions(-)

diff --git a/src/borg/helpers/parseformat.py b/src/borg/helpers/parseformat.py
index f741bd4e9..979065ac0 100644
--- a/src/borg/helpers/parseformat.py
+++ b/src/borg/helpers/parseformat.py
@@ -116,12 +116,26 @@ def ChunkerParams(s):
     if algo == 'fixed' and 2 <= count <= 3:  # fixed, block_size[, header_size]
         block_size = int(params[1])
         header_size = int(params[2]) if count == 3 else 0
+        if block_size < 64:
+            # we are only disallowing the most extreme cases of abuse here - this does NOT imply
+            # that cutting chunks of the minimum allowed size is efficient concerning storage
+            # or in-memory chunk management.
+            # choose the block (chunk) size wisely: if you have a lot of data and you cut
+            # it into very small chunks, you are asking for trouble!
+            raise ValueError('block_size must not be less than 64 Bytes')
+        if block_size > MAX_DATA_SIZE or header_size > MAX_DATA_SIZE:
+            raise ValueError('block_size and header_size must not exceed MAX_DATA_SIZE [%d]' % MAX_DATA_SIZE)
         return algo, block_size, header_size
     if algo == 'default' and count == 1:  # default
         return CHUNKER_PARAMS
     # this must stay last as it deals with old-style compat mode (no algorithm, 4 params, buzhash):
     if algo == 'buzhash' and count == 5 or count == 4:  # [buzhash, ]chunk_min, chunk_max, chunk_mask, window_size
         chunk_min, chunk_max, chunk_mask, window_size = [int(p) for p in params[count - 4:]]
+        if not (chunk_min <= chunk_mask <= chunk_max):
+            raise ValueError('required: chunk_min <= chunk_mask <= chunk_max')
+        if chunk_min < 6:
+            # see comment in 'fixed' algo check
+            raise ValueError('min. chunk size exponent must not be less than 6 (2^6 = 64B min. chunk size)')
         if chunk_max > 23:
             raise ValueError('max. chunk size exponent must not be more than 23 (2^23 = 8MiB max. chunk size)')
         return 'buzhash', chunk_min, chunk_max, chunk_mask, window_size
diff --git a/src/borg/testsuite/helpers.py b/src/borg/testsuite/helpers.py
index 72e895ae3..ae748809f 100644
--- a/src/borg/testsuite/helpers.py
+++ b/src/borg/testsuite/helpers.py
@@ -9,6 +9,7 @@ from time import sleep
 import pytest

 from .. import platform
+from ..constants import MAX_DATA_SIZE
 from ..helpers import Location
 from ..helpers import Buffer
 from ..helpers import partial_format, format_file_size, parse_file_size, format_timedelta, format_line, PlaceholderError, replace_placeholders
@@ -313,12 +314,23 @@ def test_chunkerparams():
     assert ChunkerParams('19,23,21,4095') == ('buzhash', 19, 23, 21, 4095)
     assert ChunkerParams('buzhash,19,23,21,4095') == ('buzhash', 19, 23, 21, 4095)
     assert ChunkerParams('10,23,16,4095') == ('buzhash', 10, 23, 16, 4095)
-    with pytest.raises(ValueError):
-        ChunkerParams('19,24,21,4095')
     assert ChunkerParams('fixed,4096') == ('fixed', 4096, 0)
     assert ChunkerParams('fixed,4096,200') == ('fixed', 4096, 200)
+    # invalid values checking
     with pytest.raises(ValueError):
-        ChunkerParams('crap,1,2,3,4')
+        ChunkerParams('crap,1,2,3,4')  # invalid algo
+    with pytest.raises(ValueError):
+        ChunkerParams('buzhash,5,7,6,4095')  # too small min. size
+    with pytest.raises(ValueError):
+        ChunkerParams('buzhash,19,24,21,4095')  # too big max. size
+    with pytest.raises(ValueError):
+        ChunkerParams('buzhash,23,19,21,4095')  # violates min <= mask <= max
+    with pytest.raises(ValueError):
+        ChunkerParams('fixed,63')  # too small block size
+    with pytest.raises(ValueError):
+        ChunkerParams('fixed,%d,%d' % (MAX_DATA_SIZE + 1, 4096))  # too big block size
+    with pytest.raises(ValueError):
+        ChunkerParams('fixed,%d,%d' % (4096, MAX_DATA_SIZE + 1))  # too big header size


 class MakePathSafeTestCase(BaseTestCase):

From ac0803fe0b57bb7de7031e68d79913240c7f46b2 Mon Sep 17 00:00:00 2001
From: Thomas Waldmann
Date: Wed, 13 Feb 2019 04:36:09 +0100
Subject: [PATCH 4/5] chunker algorithms: use constants to avoid typos

---
 src/borg/archive.py             | 2 +-
 src/borg/constants.py           | 8 ++++++--
 src/borg/helpers/parseformat.py | 6 +++---
 3 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/src/borg/archive.py b/src/borg/archive.py
index 87ec4bc31..ef9a83b30 100644
--- a/src/borg/archive.py
+++ b/src/borg/archive.py
@@ -1922,7 +1922,7 @@ class ArchiveRecreater:
         source_chunker_params = tuple(archive.metadata.get('chunker_params', []))
         if len(source_chunker_params) == 4 and isinstance(source_chunker_params[0], int):
             # this is a borg < 1.2 chunker_params tuple, no chunker algo specified, but we only had buzhash:
-            source_chunker_params = ('buzhash', ) + source_chunker_params
+            source_chunker_params = (CH_BUZHASH, ) + source_chunker_params
         target.recreate_rechunkify = self.rechunkify and source_chunker_params != target.chunker_params
         if target.recreate_rechunkify:
             logger.debug('Rechunking archive from %s to %s', source_chunker_params or '(unknown)', target.chunker_params)
diff --git a/src/borg/constants.py b/src/borg/constants.py
index 7055a44e5..ff8eabb8d 100644
--- a/src/borg/constants.py
+++ b/src/borg/constants.py
@@ -59,11 +59,15 @@ CHUNK_MAX_EXP = 23  # 2**23 == 8MiB
 HASH_WINDOW_SIZE = 0xfff  # 4095B
 HASH_MASK_BITS = 21  # results in ~2MiB chunks statistically

+# chunker algorithms
+CH_BUZHASH = 'buzhash'
+CH_FIXED = 'fixed'
+
 # defaults, use --chunker-params to override
-CHUNKER_PARAMS = ('buzhash', CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE)
+CHUNKER_PARAMS = (CH_BUZHASH, CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE)

 # chunker params for the items metadata stream, finer granularity
-ITEMS_CHUNKER_PARAMS = ('buzhash', 15, 19, 17, HASH_WINDOW_SIZE)
+ITEMS_CHUNKER_PARAMS = (CH_BUZHASH, 15, 19, 17, HASH_WINDOW_SIZE)

 # operating mode of the files cache (for fast skipping of unchanged files)
 DEFAULT_FILES_CACHE_MODE_UI = 'ctime,size,inode'
diff --git a/src/borg/helpers/parseformat.py b/src/borg/helpers/parseformat.py
index 979065ac0..5490dabe6 100644
--- a/src/borg/helpers/parseformat.py
+++ b/src/borg/helpers/parseformat.py
@@ -113,7 +113,7 @@ def ChunkerParams(s):
     if count == 0:
         raise ValueError('no chunker params given')
     algo = params[0].lower()
-    if algo == 'fixed' and 2 <= count <= 3:  # fixed, block_size[, header_size]
+    if algo == CH_FIXED and 2 <= count <= 3:  # fixed, block_size[, header_size]
         block_size = int(params[1])
         header_size = int(params[2]) if count == 3 else 0
         if block_size < 64:
@@ -129,7 +129,7 @@
     if algo == 'default' and count == 1:  # default
         return CHUNKER_PARAMS
     # this must stay last as it deals with old-style compat mode (no algorithm, 4 params, buzhash):
-    if algo == 'buzhash' and count == 5 or count == 4:  # [buzhash, ]chunk_min, chunk_max, chunk_mask, window_size
+    if algo == CH_BUZHASH and count == 5 or count == 4:  # [buzhash, ]chunk_min, chunk_max, chunk_mask, window_size
         chunk_min, chunk_max, chunk_mask, window_size = [int(p) for p in params[count - 4:]]
         if not (chunk_min <= chunk_mask <= chunk_max):
             raise ValueError('required: chunk_min <= chunk_mask <= chunk_max')
@@ -138,7 +138,7 @@
             raise ValueError('min. chunk size exponent must not be less than 6 (2^6 = 64B min. chunk size)')
         if chunk_max > 23:
             raise ValueError('max. chunk size exponent must not be more than 23 (2^23 = 8MiB max. chunk size)')
-        return 'buzhash', chunk_min, chunk_max, chunk_mask, window_size
+        return CH_BUZHASH, chunk_min, chunk_max, chunk_mask, window_size
     raise ValueError('invalid chunker params')

From 7f46eb99aa06d2c662c66efb7c51b7f405faba3d Mon Sep 17 00:00:00 2001
From: Thomas Waldmann
Date: Wed, 13 Feb 2019 06:30:13 +0100
Subject: [PATCH 5/5] update docs about fixed chunker and chunker algo spec needed

---
 docs/internals.rst                 |  3 +-
 docs/internals/data-structures.rst | 44 ++++++++++++++++++++++++------
 docs/internals/frontends.rst       |  1 +
 docs/internals/security.rst        | 21 ++++++++++----
 docs/usage/create.rst              |  4 +--
 docs/usage/notes.rst               | 14 ++++++++--
 6 files changed, 68 insertions(+), 19 deletions(-)

diff --git a/docs/internals.rst b/docs/internals.rst
index 786125d00..d11f0bfed 100644
--- a/docs/internals.rst
+++ b/docs/internals.rst
@@ -19,7 +19,8 @@ specified when the backup was performed.
 Deduplication is performed globally across all data in the repository
 (multiple backups and even multiple hosts), both on data and file
 metadata, using :ref:`chunks` created by the chunker using the
-Buzhash_ algorithm.
+Buzhash_ algorithm ("buzhash" chunker) or a simpler fixed blocksize
+algorithm ("fixed" chunker).

 To actually perform the repository-wide deduplication, a hash of each
 chunk is checked against the :ref:`chunks cache <cache>`, which is a
diff --git a/docs/internals/data-structures.rst b/docs/internals/data-structures.rst
index dc09fa326..40d3c7aa5 100644
--- a/docs/internals/data-structures.rst
+++ b/docs/internals/data-structures.rst
@@ -580,16 +580,43 @@ A chunk is stored as an object as well, of course.
 Chunks
 ~~~~~~

-The Borg chunker uses a rolling hash computed by the Buzhash_ algorithm.
-It triggers (chunks) when the last HASH_MASK_BITS bits of the hash are zero,
-producing chunks of 2^HASH_MASK_BITS Bytes on average.
+Borg has these chunkers:
+
+- "fixed": a simple, low cpu overhead, fixed blocksize chunker, optionally
+  supporting a header block of different size.
+- "buzhash": variable, content-defined blocksize, uses a rolling hash
+  computed by the Buzhash_ algorithm.
+
+For some more general usage hints see also ``--chunker-params``.
+
+"fixed" chunker
++++++++++++++++
+
+The fixed chunker triggers (chunks) at even-spaced offsets, e.g. every 4MiB,
+producing chunks of the same block size (the last chunk is not required to be
+full-size).
+
+Optionally, it can cut the first "header" chunk with a different size (the
+default is not to have a differently sized header chunk).
+
+``borg create --chunker-params fixed,BLOCK_SIZE[,HEADER_SIZE]``
+
+- BLOCK_SIZE: no default value, multiple of the system page size (usually 4096
+  bytes) recommended. E.g. 4194304 would cut 4MiB sized chunks.
+- HEADER_SIZE: optional, defaults to 0 (no header chunk).
+
+"buzhash" chunker
++++++++++++++++++
+
+The buzhash chunker triggers (chunks) when the last HASH_MASK_BITS bits of
+the hash are zero, producing chunks of 2^HASH_MASK_BITS Bytes on average.

 Buzhash is **only** used for cutting the chunks at places defined by the
 content, the buzhash value is **not** used as the deduplication criteria (we
 use a cryptographically strong hash/MAC over the chunk contents for this, the
 id_hash).

-``borg create --chunker-params CHUNK_MIN_EXP,CHUNK_MAX_EXP,HASH_MASK_BITS,HASH_WINDOW_SIZE``
+``borg create --chunker-params buzhash,CHUNK_MIN_EXP,CHUNK_MAX_EXP,HASH_MASK_BITS,HASH_WINDOW_SIZE``
 can be used to tune the chunker parameters, the default is:

 - CHUNK_MIN_EXP = 19 (minimum chunk size = 2^19 B = 512 kiB)
@@ -602,8 +629,6 @@
 for the repository, and stored encrypted in the keyfile. This is to prevent
 chunk size based fingerprinting attacks on your encrypted repo contents (to
 guess what files you have based on a specific set of chunk sizes).

-For some more general usage hints see also ``--chunker-params``.
-
 .. _cache:

 The cache
@@ -690,7 +715,8 @@ Indexes / Caches memory usage

 Here is the estimated memory usage of Borg - it's complicated::

-  chunk_count ~= total_file_size / 2 ^ HASH_MASK_BITS
+  chunk_size ~= 2 ^ HASH_MASK_BITS  (for buzhash chunker, BLOCK_SIZE for fixed chunker)
+  chunk_count ~= total_file_size / chunk_size

   repo_index_usage = chunk_count * 40

@@ -732,11 +758,11 @@
 For small hash tables, we start with a growth factor of 2, which comes down to

 E.g. backing up a total count of 1 Mi (IEC binary prefix i.e. 2^20) files with a total size of 1TiB.
-a) with ``create --chunker-params 10,23,16,4095`` (custom, like borg < 1.0 or attic):
+a) with ``create --chunker-params buzhash,10,23,16,4095`` (custom, like borg < 1.0 or attic):

   mem_usage  =  2.8GiB

-b) with ``create --chunker-params 19,23,21,4095`` (default):
+b) with ``create --chunker-params buzhash,19,23,21,4095`` (default):

   mem_usage  =  0.31GiB

diff --git a/docs/internals/frontends.rst b/docs/internals/frontends.rst
index 47c805173..3a5c4fd13 100644
--- a/docs/internals/frontends.rst
+++ b/docs/internals/frontends.rst
@@ -376,6 +376,7 @@ The same archive with more information (``borg info --last 1 --json``)::
             "archives": [
                 {
                     "chunker_params": [
+                        "buzhash",
                         13,
                         23,
                         16,
diff --git a/docs/internals/security.rst b/docs/internals/security.rst
index 936cf4988..4a0cc0701 100644
--- a/docs/internals/security.rst
+++ b/docs/internals/security.rst
@@ -396,16 +396,27 @@ Stored chunk sizes

 A borg repository does not hide the size of the chunks it stores (size
 information is needed to operate the repository).

-The chunks stored are the (compressed and encrypted) output of the chunker,
-chunked according to the input data, the chunker's parameters and the secret
-chunker seed (which all influence the chunk boundary positions).
+The chunks stored in the repo are the (compressed, encrypted and authenticated)
+output of the chunker. The sizes of these stored chunks are influenced by the
+compression, encryption and authentication.
+
+buzhash chunker
++++++++++++++++
+
+The buzhash chunker chunks according to the input data, the chunker's
+parameters and the secret chunker seed (which all influence the chunk boundary
+positions).

 Small files below some specific threshold (default: 512kiB) result in only one
 chunk (identical content / size as the original file), bigger files result in
 multiple chunks.

-After chunking is done, compression, encryption and authentication are applied,
-which influence the sizes of the chunks stored into the repository.
+fixed chunker
++++++++++++++
+
+This chunker yields fixed-size chunks, with optional support for a differently
+sized header chunk. The last chunk is not required to have the full block size
+and is determined by the input file size.

 Within our attack model, an attacker possessing a specific set of files which
 he assumes the victim also possesses (and backs up into the repository)
diff --git a/docs/usage/create.rst b/docs/usage/create.rst
index d221c4e65..6529eeffc 100644
--- a/docs/usage/create.rst
+++ b/docs/usage/create.rst
@@ -36,10 +36,10 @@ Examples
     # Make a big effort in fine granular deduplication (big chunk management
     # overhead, needs a lot of RAM and disk space, see formula in internals
     # docs - same parameters as borg < 1.0 or attic):
-    $ borg create --chunker-params 10,23,16,4095 /path/to/repo::small /smallstuff
+    $ borg create --chunker-params buzhash,10,23,16,4095 /path/to/repo::small /smallstuff

     # Backup a raw device (must not be active/in use/mounted at that time)
-    $ dd if=/dev/sdx bs=10M | borg create /path/to/repo::my-sdx -
+    $ dd if=/dev/sdx bs=4M | borg create --chunker-params fixed,4194304 /path/to/repo::my-sdx -

     # No compression (none)
     $ borg create --compression none /path/to/repo::arch ~
diff --git a/docs/usage/notes.rst b/docs/usage/notes.rst
index 4e190c213..ea54734da 100644
--- a/docs/usage/notes.rst
+++ b/docs/usage/notes.rst
@@ -14,16 +14,26 @@ resource usage (RAM and disk space) as the amount of resources needed is
 (also) determined by the total amount of chunks in the repository (see
 :ref:`cache-memory-usage` for details).

-``--chunker-params=10,23,16,4095`` results in a fine-grained deduplication
+``--chunker-params=buzhash,10,23,16,4095`` results in a fine-grained deduplication
 and creates a big amount of chunks and thus uses a lot of resources to manage
 them. This is good for relatively small data volumes and if the machine has a
 good amount of free RAM and disk space.

-``--chunker-params=19,23,21,4095`` (default) results in a coarse-grained
+``--chunker-params=buzhash,19,23,21,4095`` (default) results in a coarse-grained
 deduplication and creates a much smaller amount of chunks and thus uses less
 resources. This is good for relatively big data volumes and if the machine has
 a relatively low amount of free RAM and disk space.

+``--chunker-params=fixed,4194304`` results in fixed 4MiB sized block
+deduplication and is more efficient than the previous example when used
+for block devices (like disks, partitions, LVM LVs) or raw disk image files.
+
+``--chunker-params=fixed,4096,512`` results in fixed 4kiB sized blocks,
+but the first header block will only be 512B long. This might be useful to
+dedup files with 1 header + N fixed-size data blocks. Be careful not to
+produce too many chunks (e.g. by using a small block size for huge files).
+
 If you already have made some archives in a repository and you then change
 chunker params, this of course impacts deduplication as the chunks will be
 cut differently.
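
For reviewers, here is a quick usage sketch of the chunker API added by patches 1 and 2. The names (``get_chunker``, ``max_chunk_size``) come from the diffs above; the seed and parameter values are only illustrative and this assumes a build of this branch::

    from io import BytesIO

    from borg.chunker import get_chunker, max_chunk_size

    # buzhash takes the per-repo seed as a keyword argument, as in ChunkBuffer /
    # FilesystemObjectProcessors in archive.py; seed=0 is just for illustration.
    buzhash_chunker = get_chunker('buzhash', 19, 23, 21, 4095, seed=0)
    chunks = list(buzhash_chunker.chunkify(BytesIO(b'\0' * (1 << 20))))

    # the fixed chunker takes block_size[, header_size]; it has no seed concept,
    # so an extra seed keyword would simply be ignored by get_chunker()
    fixed_chunker = get_chunker('fixed', 4096, 512)

    # max_chunk_size() replaces the old `1 << chunker_params[1]` expression
    assert max_chunk_size('buzhash', 19, 23, 21, 4095) == 1 << 23  # 8 MiB
    assert max_chunk_size('fixed', 4096, 512) == 4096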
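The cutting logic of ``ChunkerFixed`` is compact enough to restate in a few lines. The following is a simplified, fd-only sketch (it drops the ``os.read``/``posix_fadvise`` fast path of the real class) that reproduces the behavior exercised by ``ChunkerFixedTestCase``::

    from io import BytesIO

    class FixedChunkerSketch:
        """simplified restatement of ChunkerFixed from patch 2, fd-only"""
        def __init__(self, block_size, header_size=0):
            self.block_size = block_size
            self.header_size = header_size

        def chunkify(self, fd):
            # the optional, differently sized header chunk comes first
            if self.header_size > 0:
                header = fd.read(self.header_size)
                if not header:
                    return  # empty input
                yield header
            # then full blocks; the last one may be shorter than block_size
            while True:
                block = fd.read(self.block_size)
                if not block:
                    return  # EOF terminates the generator
                yield block

    data = b'foobar' * 1500  # 9000 bytes, as in the testsuite
    parts = list(FixedChunkerSketch(4096, 123).chunkify(BytesIO(data)))
    assert parts == [data[:123], data[123:4219], data[4219:8315], data[8315:]]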
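Finally, the accepted ``--chunker-params`` spellings after all five patches, shown as calls to the parser (this mirrors ``test_chunkerparams`` above and assumes this branch)::

    from borg.helpers.parseformat import ChunkerParams

    # old-style 4-value form stays valid and is upgraded to the 5-tuple
    assert ChunkerParams('19,23,21,4095') == ('buzhash', 19, 23, 21, 4095)
    # explicit algorithm names
    assert ChunkerParams('buzhash,19,23,21,4095') == ('buzhash', 19, 23, 21, 4095)
    assert ChunkerParams('fixed,4096,512') == ('fixed', 4096, 512)
    # 'default' resolves to CHUNKER_PARAMS from constants.py
    assert ChunkerParams('default') == ('buzhash', 19, 23, 21, 4095)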