From 80e0b42f7d282df880e189dd9d90d93d37cb19e1 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sat, 5 Jan 2019 04:40:25 +0100 Subject: [PATCH] add fixed blocksize chunker, fixes #1086 --- src/borg/chunker.pyx | 69 ++++++++++++++++++++++++++++++++- src/borg/helpers/checks.py | 2 +- src/borg/helpers/parseformat.py | 4 ++ src/borg/testsuite/chunker.py | 17 +++++++- src/borg/testsuite/helpers.py | 2 + 5 files changed, 91 insertions(+), 3 deletions(-) diff --git a/src/borg/chunker.pyx b/src/borg/chunker.pyx index 5558155e1..51c2a90b1 100644 --- a/src/borg/chunker.pyx +++ b/src/borg/chunker.pyx @@ -1,6 +1,8 @@ # -*- coding: utf-8 -*- -API_VERSION = '1.1_02' +API_VERSION = '1.1_03' + +import os from libc.stdlib cimport free @@ -17,6 +19,67 @@ cdef extern from "_chunker.c": uint32_t c_buzhash_update "buzhash_update"(uint32_t sum, unsigned char remove, unsigned char add, size_t len, uint32_t *h) +class ChunkerFixed: + """ + Fixed blocksize Chunker, optionally supporting a header block of different size. + + This is a very simple chunker for input data with known block/record sizes: + + - raw disk images + - block devices + - database files with simple header + fixed-size records layout + + Note: the last block of the input data may be less than the block size, + this is supported and not considered to be an error. + """ + def __init__(self, block_size, header_size=0): + self.block_size = block_size + self.header_size = header_size + + def chunkify(self, fd, fh=-1): + """ + Cut a file into chunks. + + :param fd: Python file object + :param fh: OS-level file handle (if available), + defaults to -1 which means not to use OS-level fd. + """ + offset = 0 + use_fh = fh >= 0 + + if use_fh: + def read(size): + nonlocal offset + data = os.read(fh, size) + amount = len(data) + if hasattr(os, 'posix_fadvise'): + # UNIX only and, in case of block sizes that are not a multiple of the + # system's page size, better be used with a bug fixed linux kernel > 4.6.0, + # see comment/workaround in _chunker.c and borgbackup issue #907. + os.posix_fadvise(fh, offset, amount, os.POSIX_FADV_DONTNEED) + offset += amount + return data + else: + def read(size): + nonlocal offset + data = fd.read(size) + amount = len(data) + offset += amount + return data + + if self.header_size > 0: + data = read(self.header_size) + if data: + yield data + else: + data = True # get into next while loop + while data: + data = read(self.block_size) + if data: + yield data + # empty data means we are at EOF and we terminate the generator. + + cdef class Chunker: """ Content-Defined Chunker, variable chunk sizes. @@ -65,6 +128,8 @@ def get_chunker(algo, *params, **kw): if algo == 'buzhash': seed = kw['seed'] return Chunker(seed, *params) + if algo == 'fixed': + return ChunkerFixed(*params) raise TypeError('unsupported chunker algo %r' % algo) @@ -72,6 +137,8 @@ def max_chunk_size(algo, *params): # see also parseformat.ChunkerParams return values if algo == 'buzhash': return 1 << params[1] + if algo == 'fixed': + return max(params[0], params[1]) raise TypeError('unsupported chunker algo %r' % algo) diff --git a/src/borg/helpers/checks.py b/src/borg/helpers/checks.py index f52e0ede3..984f95f97 100644 --- a/src/borg/helpers/checks.py +++ b/src/borg/helpers/checks.py @@ -27,7 +27,7 @@ def check_extension_modules(): from .. import platform, compress, item, chunker, hashindex if hashindex.API_VERSION != '1.1_07': raise ExtensionModuleError - if chunker.API_VERSION != '1.1_02': + if chunker.API_VERSION != '1.1_03': raise ExtensionModuleError if compress.API_VERSION != '1.1_06': raise ExtensionModuleError diff --git a/src/borg/helpers/parseformat.py b/src/borg/helpers/parseformat.py index f2291f683..f741bd4e9 100644 --- a/src/borg/helpers/parseformat.py +++ b/src/borg/helpers/parseformat.py @@ -113,6 +113,10 @@ def ChunkerParams(s): if count == 0: raise ValueError('no chunker params given') algo = params[0].lower() + if algo == 'fixed' and 2 <= count <= 3: # fixed, block_size[, header_size] + block_size = int(params[1]) + header_size = int(params[2]) if count == 3 else 0 + return algo, block_size, header_size if algo == 'default' and count == 1: # default return CHUNKER_PARAMS # this must stay last as it deals with old-style compat mode (no algorithm, 4 params, buzhash): diff --git a/src/borg/testsuite/chunker.py b/src/borg/testsuite/chunker.py index 3d56fea60..c49e5be03 100644 --- a/src/borg/testsuite/chunker.py +++ b/src/borg/testsuite/chunker.py @@ -1,6 +1,6 @@ from io import BytesIO -from ..chunker import Chunker, get_chunker, buzhash, buzhash_update +from ..chunker import ChunkerFixed, Chunker, get_chunker, buzhash, buzhash_update from ..constants import * # NOQA from . import BaseTestCase @@ -8,6 +8,21 @@ from . import BaseTestCase # See borg.selftest for details. If you add/remove test methods, update SELFTEST_COUNT +class ChunkerFixedTestCase(BaseTestCase): + + def test_chunkify_just_blocks(self): + data = b'foobar' * 1500 + chunker = ChunkerFixed(4096) + parts = [c for c in chunker.chunkify(BytesIO(data))] + self.assert_equal(parts, [data[0:4096], data[4096:8192], data[8192:]]) + + def test_chunkify_header_and_blocks(self): + data = b'foobar' * 1500 + chunker = ChunkerFixed(4096, 123) + parts = [c for c in chunker.chunkify(BytesIO(data))] + self.assert_equal(parts, [data[0:123], data[123:123+4096], data[123+4096:123+8192], data[123+8192:]]) + + class ChunkerTestCase(BaseTestCase): def test_chunkify(self): diff --git a/src/borg/testsuite/helpers.py b/src/borg/testsuite/helpers.py index 400d168dd..72e895ae3 100644 --- a/src/borg/testsuite/helpers.py +++ b/src/borg/testsuite/helpers.py @@ -315,6 +315,8 @@ def test_chunkerparams(): assert ChunkerParams('10,23,16,4095') == ('buzhash', 10, 23, 16, 4095) with pytest.raises(ValueError): ChunkerParams('19,24,21,4095') + assert ChunkerParams('fixed,4096') == ('fixed', 4096, 0) + assert ChunkerParams('fixed,4096,200') == ('fixed', 4096, 200) with pytest.raises(ValueError): ChunkerParams('crap,1,2,3,4')