mirror of https://github.com/borgbackup/borg.git
add fixed blocksize chunker, fixes #1086
This commit is contained in:
parent
c4ffbd2a17
commit
80e0b42f7d
|
@ -1,6 +1,8 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
API_VERSION = '1.1_02'
|
||||
API_VERSION = '1.1_03'
|
||||
|
||||
import os
|
||||
|
||||
from libc.stdlib cimport free
|
||||
|
||||
|
@ -17,6 +19,67 @@ cdef extern from "_chunker.c":
|
|||
uint32_t c_buzhash_update "buzhash_update"(uint32_t sum, unsigned char remove, unsigned char add, size_t len, uint32_t *h)
|
||||
|
||||
|
||||
class ChunkerFixed:
    """
    Chunker that cuts the input into fixed-size blocks, optionally preceded
    by one header block of a different size.

    Intended for input data whose block/record layout is already known:

    - raw disk images
    - block devices
    - database files with simple header + fixed-size records layout

    Note: the final block of the input may be shorter than the configured
    block size; that is expected behaviour, not an error.
    """
    def __init__(self, block_size, header_size=0):
        self.block_size = block_size
        self.header_size = header_size

    def chunkify(self, fd, fh=-1):
        """
        Cut a file into chunks.

        :param fd: Python file object
        :param fh: OS-level file handle (if available),
                   defaults to -1 which means not to use OS-level fd.
        """
        position = 0

        def read(nbytes):
            # Read via the OS-level handle if we were given one,
            # otherwise fall back to the Python file object.
            nonlocal position
            if fh >= 0:
                chunk = os.read(fh, nbytes)
                if hasattr(os, 'posix_fadvise'):
                    # UNIX only and, in case of block sizes that are not a multiple of the
                    # system's page size, better be used with a bug fixed linux kernel > 4.6.0,
                    # see comment/workaround in _chunker.c and borgbackup issue #907.
                    os.posix_fadvise(fh, position, len(chunk), os.POSIX_FADV_DONTNEED)
            else:
                chunk = fd.read(nbytes)
            position += len(chunk)
            return chunk

        at_eof = False
        if self.header_size > 0:
            header = read(self.header_size)
            at_eof = not header
            if header:
                yield header
        while not at_eof:
            block = read(self.block_size)
            at_eof = not block
            if block:
                yield block
        # an empty read means EOF -- the generator simply terminates here.
|
||||
|
||||
|
||||
cdef class Chunker:
|
||||
"""
|
||||
Content-Defined Chunker, variable chunk sizes.
|
||||
|
@ -65,6 +128,8 @@ def get_chunker(algo, *params, **kw):
|
|||
if algo == 'buzhash':
|
||||
seed = kw['seed']
|
||||
return Chunker(seed, *params)
|
||||
if algo == 'fixed':
|
||||
return ChunkerFixed(*params)
|
||||
raise TypeError('unsupported chunker algo %r' % algo)
|
||||
|
||||
|
||||
|
@ -72,6 +137,8 @@ def max_chunk_size(algo, *params):
|
|||
# see also parseformat.ChunkerParams return values
|
||||
if algo == 'buzhash':
|
||||
return 1 << params[1]
|
||||
if algo == 'fixed':
|
||||
return max(params[0], params[1])
|
||||
raise TypeError('unsupported chunker algo %r' % algo)
|
||||
|
||||
|
||||
|
|
|
@ -27,7 +27,7 @@ def check_extension_modules():
|
|||
from .. import platform, compress, item, chunker, hashindex
|
||||
if hashindex.API_VERSION != '1.1_07':
|
||||
raise ExtensionModuleError
|
||||
if chunker.API_VERSION != '1.1_02':
|
||||
if chunker.API_VERSION != '1.1_03':
|
||||
raise ExtensionModuleError
|
||||
if compress.API_VERSION != '1.1_06':
|
||||
raise ExtensionModuleError
|
||||
|
|
|
@ -113,6 +113,10 @@ def ChunkerParams(s):
|
|||
if count == 0:
|
||||
raise ValueError('no chunker params given')
|
||||
algo = params[0].lower()
|
||||
if algo == 'fixed' and 2 <= count <= 3: # fixed, block_size[, header_size]
|
||||
block_size = int(params[1])
|
||||
header_size = int(params[2]) if count == 3 else 0
|
||||
return algo, block_size, header_size
|
||||
if algo == 'default' and count == 1: # default
|
||||
return CHUNKER_PARAMS
|
||||
# this must stay last as it deals with old-style compat mode (no algorithm, 4 params, buzhash):
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
from io import BytesIO
|
||||
|
||||
from ..chunker import Chunker, get_chunker, buzhash, buzhash_update
|
||||
from ..chunker import ChunkerFixed, Chunker, get_chunker, buzhash, buzhash_update
|
||||
from ..constants import * # NOQA
|
||||
from . import BaseTestCase
|
||||
|
||||
|
@ -8,6 +8,21 @@ from . import BaseTestCase
|
|||
# See borg.selftest for details. If you add/remove test methods, update SELFTEST_COUNT
|
||||
|
||||
|
||||
class ChunkerFixedTestCase(BaseTestCase):
    """Tests for the fixed-blocksize chunker (with and without a header block)."""

    def test_chunkify_just_blocks(self):
        # 9000 bytes of input -> two full 4096-byte blocks plus a short tail.
        payload = b'foobar' * 1500
        chunks = list(ChunkerFixed(4096).chunkify(BytesIO(payload)))
        self.assert_equal(chunks, [payload[0:4096], payload[4096:8192], payload[8192:]])

    def test_chunkify_header_and_blocks(self):
        # Same input, but a 123-byte header block is cut off first,
        # then fixed 4096-byte blocks follow.
        payload = b'foobar' * 1500
        chunks = list(ChunkerFixed(4096, 123).chunkify(BytesIO(payload)))
        self.assert_equal(chunks,
                          [payload[0:123], payload[123:123+4096],
                           payload[123+4096:123+8192], payload[123+8192:]])
|
||||
|
||||
|
||||
class ChunkerTestCase(BaseTestCase):
|
||||
|
||||
def test_chunkify(self):
|
||||
|
|
|
@ -315,6 +315,8 @@ def test_chunkerparams():
|
|||
assert ChunkerParams('10,23,16,4095') == ('buzhash', 10, 23, 16, 4095)
|
||||
with pytest.raises(ValueError):
|
||||
ChunkerParams('19,24,21,4095')
|
||||
assert ChunkerParams('fixed,4096') == ('fixed', 4096, 0)
|
||||
assert ChunkerParams('fixed,4096,200') == ('fixed', 4096, 200)
|
||||
with pytest.raises(ValueError):
|
||||
ChunkerParams('crap,1,2,3,4')
|
||||
|
||||
|
|
Loading…
Reference in New Issue