create --sparse, file map support for the "fixed" chunker, see #14

a file map can be:

- created internally inside chunkify by calling sparsemap, which uses
  SEEK_DATA / SEEK_HOLE to determine data and hole ranges inside a
  seekable sparse file.
  Usage: borg create --sparse --chunker-params=fixed,BLOCKSIZE ...
  BLOCKSIZE is the chunker blocksize here, not the filesystem blocksize!

- made by some other means and given to the chunkify function.
  this is not used yet, but in the future it could be used to read only
  the changed parts and seek over the (known) unchanged parts of a file.

sparsemap: the generated range sizes are multiples of the fs block size.
           the tests assume a 4 KiB fs block size.
This commit is contained in:
Thomas Waldmann 2020-12-11 00:34:11 +01:00
parent 5d46395ed0
commit b8bb0494f6
7 changed files with 334 additions and 47 deletions

View File

@ -596,14 +596,20 @@ The fixed chunker triggers (chunks) at even-spaced offsets, e.g. every 4MiB,
producing chunks of same block size (the last chunk is not required to be
full-size).
Optionally, it can cut the first "header" chunk with a different size (the
default is not to have a differently sized header chunk).
Optionally, it supports processing a differently sized "header" first, before
it starts to cut chunks of the desired block size.
The default is not to have a differently sized header.
``borg create --chunker-params fixed,BLOCK_SIZE[,HEADER_SIZE]``
- BLOCK_SIZE: no default value, multiple of the system page size (usually 4096
bytes) recommended. E.g.: 4194304 would cut 4MiB sized chunks.
- HEADER_SIZE: optional, defaults to 0 (no header chunk).
- HEADER_SIZE: optional, defaults to 0 (no header).
The fixed chunker also supports processing sparse files (reading only the ranges
with data and seeking over the empty hole ranges).
``borg create --sparse --chunker-params fixed,BLOCK_SIZE[,HEADER_SIZE]``
"buzhash" chunker
+++++++++++++++++

View File

@ -43,7 +43,10 @@ Examples
$ borg create --chunker-params buzhash,10,23,16,4095 /path/to/repo::small /smallstuff
# Backup a raw device (must not be active/in use/mounted at that time)
$ dd if=/dev/sdx bs=4M | borg create --chunker-params fixed,4194304 /path/to/repo::my-sdx -
$ borg create --read-special --chunker-params fixed,4194304 /path/to/repo::my-sdx /dev/sdX
# Backup a sparse disk image (must not be active/in use/mounted at that time)
$ borg create --sparse --chunker-params fixed,4194304 /path/to/repo::my-disk my-disk.raw
# No compression (none)
$ borg create --compression none /path/to/repo::arch ~

View File

@ -1172,7 +1172,7 @@ class FilesystemObjectProcessors:
def __init__(self, *, metadata_collector, cache, key,
add_item, process_file_chunks,
chunker_params, show_progress):
chunker_params, show_progress, sparse):
self.metadata_collector = metadata_collector
self.cache = cache
self.key = key
@ -1183,7 +1183,7 @@ class FilesystemObjectProcessors:
self.hard_links = {}
self.stats = Statistics() # threading: done by cache (including progress)
self.cwd = os.getcwd()
self.chunker = get_chunker(*chunker_params, seed=key.chunk_seed)
self.chunker = get_chunker(*chunker_params, seed=key.chunk_seed, sparse=sparse)
@contextmanager
def create_helper(self, path, st, status=None, hardlinkable=True):

View File

@ -653,7 +653,7 @@ class Archiver:
checkpoint_interval=args.checkpoint_interval, rechunkify=False)
fso = FilesystemObjectProcessors(metadata_collector=metadata_collector, cache=cache, key=key,
process_file_chunks=cp.process_file_chunks, add_item=archive.add_item,
chunker_params=args.chunker_params, show_progress=args.progress)
chunker_params=args.chunker_params, show_progress=args.progress, sparse=args.sparse)
create_inner(archive, cache, fso)
else:
create_inner(None, None, None)
@ -3354,6 +3354,8 @@ class Archiver:
help='deprecated, use ``--noflags`` instead')
fs_group.add_argument('--noflags', dest='noflags', action='store_true',
help='do not read and store flags (e.g. NODUMP, IMMUTABLE) into archive')
fs_group.add_argument('--sparse', dest='sparse', action='store_true',
help='detect sparse holes in input (supported only by fixed chunker)')
fs_group.add_argument('--files-cache', metavar='MODE', dest='files_cache_mode',
type=FilesCacheMode, default=DEFAULT_FILES_CACHE_MODE_UI,
help='operate files cache in MODE. default: %s' % DEFAULT_FILES_CACHE_MODE_UI)

View File

@ -2,6 +2,7 @@
API_VERSION = '1.2_01'
import errno
import os
from libc.stdlib cimport free
@ -19,65 +20,170 @@ cdef extern from "_chunker.c":
uint32_t c_buzhash_update "buzhash_update"(uint32_t sum, unsigned char remove, unsigned char add, size_t len, uint32_t *h)
def dread(offset, size, fd=None, fh=-1):
    """
    Read up to `size` bytes from the current position of the input.

    :param offset: file offset the read starts at; only used for the
                   posix_fadvise call on the OS-level handle.
    :param size: maximum number of bytes to read
    :param fd: Python file object, used when no OS-level handle is given
    :param fh: OS-level file handle, defaults to -1 which means to use fd
    :return: the bytes read (may be shorter than size, empty at EOF)
    """
    if fh < 0:
        # no OS-level handle available, go through the file object
        return fd.read(size)
    chunk = os.read(fh, size)
    if hasattr(os, 'posix_fadvise'):
        # UNIX only and, in case of block sizes that are not a multiple of the
        # system's page size, better be used with a bug fixed linux kernel > 4.6.0,
        # see comment/workaround in _chunker.c and borgbackup issue #907.
        os.posix_fadvise(fh, offset, len(chunk), os.POSIX_FADV_DONTNEED)
    return chunk
def dseek(amount, whence, fd=None, fh=-1):
    """
    Seek on the input and return the resulting position.

    :param amount: offset argument of the seek
    :param whence: one of the os.SEEK_* constants
    :param fd: Python file object, used when no OS-level handle is given
    :param fh: OS-level file handle, defaults to -1 which means to use fd
    :return: whatever os.lseek (for fh) or fd.seek (for fd) returns
    """
    if fh < 0:
        return fd.seek(amount, whence)
    return os.lseek(fh, amount, whence)
def dpos_curr_end(fd=None, fh=-1):
    """
    determine current position, file end position (== file length)

    the position is left unchanged: after probing the end via SEEK_END,
    we seek back to where we were.

    :return: (current position, end position) tuple
    """
    position = dseek(0, os.SEEK_CUR, fd, fh)
    length = dseek(0, os.SEEK_END, fd, fh)
    # undo the SEEK_END probe
    dseek(position, os.SEEK_SET, fd, fh)
    return position, length
def sparsemap(fd=None, fh=-1):
    """
    generator yielding a (start, length, is_data) tuple for each range.
    is_data is indicating data ranges (True) or hole ranges (False).

    note:
    the map is generated starting from the current seek position (it
    is not required to be 0 / to be at the start of the file) and
    work from there up to the end of the file.
    when the generator is finished, the file pointer position will be
    reset to where it was before calling this function.

    :param fd: Python file object, used when no OS-level handle is given
    :param fh: OS-level file handle, defaults to -1 which means to use fd
    :raises OSError: any seek error other than ENXIO is passed on
    """
    origin, file_len = dpos_curr_end(fd, fh)
    pos = origin
    try:
        # we alternate between looking for the end of a data range (SEEK_HOLE)
        # and looking for the end of a hole range (SEEK_DATA), data comes first.
        in_data = True
        while True:
            whence = os.SEEK_HOLE if in_data else os.SEEK_DATA
            try:
                boundary = dseek(pos, whence, fd, fh)
            except OSError as exc:
                if exc.errno != errno.ENXIO:
                    raise
                if not in_data and pos < file_len:
                    # if there is a hole at the end of a file, we can not find the file end by SEEK_DATA
                    # (because we run into ENXIO), thus we must manually deal with this case:
                    yield (pos, file_len - pos, in_data)
                break
            # we do not want to yield zero-length ranges:
            if boundary > pos:
                yield (pos, boundary - pos, in_data)
            pos = boundary
            in_data = not in_data
    finally:
        # seek to same position as before calling this function
        dseek(origin, os.SEEK_SET, fd, fh)
class ChunkerFixed:
    """
    Fixed blocksize chunker.

    This is a simple chunker for input data with data usually staying at same
    offset and / or with known block/record sizes:

    - raw disk images
    - block devices
    - database files with simple header + fixed-size records layout

    It optionally supports:

    - a header block of different size
    - using a sparsemap to only read data ranges and seek over hole ranges
      for sparse files.
    - using an externally given filemap to only read specific ranges from
      a file.

    Note: the last block of a data or hole range may be less than the block size,
          this is supported and not considered to be an error.
    """
    def __init__(self, block_size, header_size=0, sparse=False):
        """
        :param block_size: size of the chunks to cut (after the header, if any)
        :param header_size: size of the differently sized header, 0 means no header
        :param sparse: whether to try sparse input processing
        """
        self.block_size = block_size
        self.header_size = header_size
        # should borg try to do sparse input processing?
        # whether it actually can be done depends on the input file being seekable.
        self.try_sparse = sparse and hasattr(os, 'SEEK_DATA') and hasattr(os, 'SEEK_HOLE')
        # one block of zero bytes, slices of it are yielded for hole ranges
        self.zeros = memoryview(bytes(block_size))

    def chunkify(self, fd=None, fh=-1, fmap=None):
        """
        Cut a file into chunks.

        This is a generator: it yields the bytes read for data ranges and
        slices of a zero-filled memoryview for hole ranges.

        :param fd: Python file object
        :param fh: OS-level file handle (if available),
                   defaults to -1 which means not to use OS-level fd.
        :param fmap: a file map, same format as generated by sparsemap
        """
        if self.header_size > 0:
            header_map = [(0, self.header_size, True), ]
        else:
            header_map = []
        if fmap is None and self.try_sparse:
            try:
                if self.header_size > 0:
                    # the header is always processed as data, only map what follows it
                    dseek(self.header_size, os.SEEK_SET, fd, fh)
                    body_map = list(sparsemap(fd, fh))
                    dseek(0, os.SEEK_SET, fd, fh)
                else:
                    body_map = list(sparsemap(fd, fh))
            except OSError:
                # seeking did not work, fall back to non-sparse processing below.
                if self.header_size > 0:
                    # we might have seeked past the header before the failure happened,
                    # so try (best effort) to get back to the start. if the input is not
                    # seekable at all, the position did not change and this fails again.
                    try:
                        dseek(0, os.SEEK_SET, fd, fh)
                    except OSError:
                        pass
            else:
                fmap = header_map + body_map
        if fmap is None:
            # either sparse processing (building the fmap) was not tried or it failed.
            # in these cases, we just build a "fake fmap" that considers the whole file
            # as range(s) of data (no holes), so we can use the same code.
            # we build different fmaps here for the purpose of correct block alignment
            # with or without a header block (of potentially different size).
            if self.header_size > 0:
                body_map = [(self.header_size, 2 ** 62, True), ]
            else:
                body_map = [(0, 2 ** 62, True), ]
            fmap = header_map + body_map
        offset = 0
        for range_start, range_size, is_data in fmap:
            if range_start != offset:
                # this is for the case when the fmap does not cover the file completely,
                # e.g. it could be without the ranges of holes or of unchanged data.
                offset = range_start
                dseek(offset, os.SEEK_SET, fd, fh)
            while range_size:
                wanted = min(range_size, self.block_size)
                if is_data:
                    # read block from the range
                    data = dread(offset, wanted, fd, fh)
                else:  # hole
                    # seek over block from the range
                    pos = dseek(wanted, os.SEEK_CUR, fd, fh)
                    data = self.zeros[:pos - offset]  # for now, create zero-bytes here
                got = len(data)
                if got > 0:
                    offset += got
                    range_size -= got
                    yield data  # later, use a better api that tags data vs. hole
                if got < wanted:
                    # we did not get enough data, looks like EOF.
                    return
cdef class Chunker:
@ -129,7 +235,8 @@ def get_chunker(algo, *params, **kw):
seed = kw['seed']
return Chunker(seed, *params)
if algo == 'fixed':
return ChunkerFixed(*params)
sparse = kw['sparse']
return ChunkerFixed(*params, sparse=sparse)
raise TypeError('unsupported chunker algo %r' % algo)

View File

@ -22,6 +22,55 @@ class ChunkerFixedTestCase(BaseTestCase):
parts = [c for c in chunker.chunkify(BytesIO(data))]
self.assert_equal(parts, [data[0:123], data[123:123+4096], data[123+4096:123+8192], data[123+8192:]])
def test_chunkify_just_blocks_fmap_complete(self):
    """A complete, data-only fmap without header yields plain block-sized chunks."""
    data = b'foobar' * 1500
    chunker = ChunkerFixed(4096)
    # fmap entries are (start, length, is_data), the ranges must not overlap.
    fmap = [
        (0, 4096, True),
        (4096, 4096, True),
        # the length of the last range intentionally exceeds the remaining data,
        # the chunker is expected to stop at EOF.
        (8192, 99999999, True),
    ]
    parts = [c for c in chunker.chunkify(BytesIO(data), fmap=fmap)]
    self.assert_equal(parts, [data[0:4096], data[4096:8192], data[8192:]])
def test_chunkify_header_and_blocks_fmap_complete(self):
    """A complete, data-only fmap with a header yields the header, then blocks."""
    payload = b'foobar' * 1500
    header, block = 123, 4096
    complete_map = [
        (0, header, True),
        (header, block, True),
        (header + block, block, True),
        (header + 2 * block, block, True),
    ]
    chunks = list(ChunkerFixed(block, header).chunkify(BytesIO(payload), fmap=complete_map))
    expected = [
        payload[0:header],
        payload[header:header + block],
        payload[header + block:header + 2 * block],
        payload[header + 2 * block:],
    ]
    self.assert_equal(chunks, expected)
def test_chunkify_header_and_blocks_fmap_zeros(self):
    """Ranges marked as holes in the fmap come back as zero bytes."""
    header, block = 123, 4096
    payload = b'H' * header + b'_' * block + b'X' * block + b'_' * block
    map_with_holes = [
        (0, header, True),
        (header, block, False),
        (header + block, block, True),
        (header + 2 * block, block, False),
    ]
    chunks = list(ChunkerFixed(block, header).chunkify(BytesIO(payload), fmap=map_with_holes))
    # because we marked the '_' ranges as holes, we will get '\0' ranges instead!
    zero_block = b'\0' * block
    expected = [
        payload[0:header],
        zero_block,
        payload[header + block:header + 2 * block],
        zero_block,
    ]
    self.assert_equal(chunks, expected)
def test_chunkify_header_and_blocks_fmap_partial(self):
    """Ranges that are missing from the fmap are not yielded at all."""
    header, block = 123, 4096
    payload = b'H' * header + b'_' * block + b'X' * block + b'_' * block
    # only the header and the 'X' block are listed, both '_' ranges
    # (at header and at header + 2 * block) are left out on purpose.
    partial_map = [
        (0, header, True),
        (header + block, block, True),
    ]
    chunks = list(ChunkerFixed(block, header).chunkify(BytesIO(payload), fmap=partial_map))
    # because we left out the '_' ranges from the fmap, we will not get them at all!
    expected = [
        payload[0:header],
        payload[header + block:header + 2 * block],
    ]
    self.assert_equal(chunks, expected)
class ChunkerTestCase(BaseTestCase):

View File

@ -0,0 +1,120 @@
from io import BytesIO
import os
import pytest
from ..chunker import ChunkerFixed, sparsemap
from ..constants import * # NOQA
BS = 4096  # fs block size
# some sparse files. X = content blocks, _ = sparse blocks.
# each map is a list of (start, size, is_data) tuples, one per range:
# is_data is True for ranges with content and False for holes.
# X__XXX____
map_sparse1 = [
    (0 * BS, 1 * BS, True),
    (1 * BS, 2 * BS, False),
    (3 * BS, 3 * BS, True),
    (6 * BS, 4 * BS, False),
]
# _XX___XXXX
map_sparse2 = [
    (0 * BS, 1 * BS, False),
    (1 * BS, 2 * BS, True),
    (3 * BS, 3 * BS, False),
    (6 * BS, 4 * BS, True),
]
# XXX (only content, no holes)
map_notsparse = [(0 * BS, 3 * BS, True), ]
# ___ (only a hole, no content)
map_onlysparse = [(0 * BS, 3 * BS, False), ]
def make_sparsefile(fname, sparsemap, header_size=0):
    """
    Create file `fname` laid out as described by `sparsemap`.

    An optional header of `header_size` b'H' bytes is written first. Then,
    for each (start, size, is_data) range, either `size` b'X' bytes are
    written (data) or we seek forward by `size` bytes without writing (hole).
    The start value of a range is not used, ranges are laid out one after
    another. Finally, the file is truncated to the summed-up size.
    """
    expected_size = 0
    with open(fname, 'wb') as out:
        if header_size:
            out.write(b'H' * header_size)
            expected_size = header_size
        for _start, length, has_data in sparsemap:
            if not has_data:
                # skip over the hole without writing anything
                out.seek(length, os.SEEK_CUR)
            else:
                out.write(b'X' * length)
            expected_size += length
        # a hole at the end was only seeked over, so set the file length here
        out.truncate(expected_size)
    assert os.path.getsize(fname) == expected_size
def make_content(sparsemap, header_size=0):
    """
    Build the bytes one expects to read from a file made by make_sparsefile.

    An optional header of `header_size` b'H' bytes comes first. Then, each
    (start, size, is_data) range contributes `size` b'X' bytes (data) or
    `size` zero bytes (hole).

    :return: the content as bytes
    """
    expected_len = 0
    with BytesIO() as buf:
        if header_size:
            buf.write(b'H' * header_size)
            expected_len = header_size
        for _start, length, has_data in sparsemap:
            filler = b'X' if has_data else b'\0'
            buf.write(filler * length)
            expected_len += length
        content = buf.getvalue()
    assert len(content) == expected_len
    return content
# sparsemap needs SEEK_DATA / SEEK_HOLE, skip (instead of failing with an
# AttributeError) on platforms where the os module does not offer them.
@pytest.mark.skipif(not (hasattr(os, 'SEEK_DATA') and hasattr(os, 'SEEK_HOLE')),
                    reason='os module does not support SEEK_DATA / SEEK_HOLE')
@pytest.mark.parametrize("fname, sparse_map", [
    ('sparse1', map_sparse1),
    ('sparse2', map_sparse2),
    ('onlysparse', map_onlysparse),
    ('notsparse', map_notsparse),
])
def test_sparsemap(tmpdir, fname, sparse_map):
    """sparsemap must find the ranges the file was made from (via fh and via fd)."""

    def get_sparsemap_fh(fname):
        # use an OS-level file handle
        fh = os.open(fname, flags=os.O_RDONLY)
        try:
            return list(sparsemap(fh=fh))
        finally:
            os.close(fh)

    def get_sparsemap_fd(fname):
        # use a Python file object
        with open(fname, 'rb') as fd:
            return list(sparsemap(fd=fd))

    fn = str(tmpdir / fname)
    make_sparsefile(fn, sparse_map)
    assert get_sparsemap_fh(fn) == sparse_map
    assert get_sparsemap_fd(fn) == sparse_map
@pytest.mark.parametrize("fname, sparse_map, header_size, sparse", [
    ('sparse1', map_sparse1, 0, False),
    ('sparse1', map_sparse1, 0, True),
    ('sparse1', map_sparse1, BS, False),
    ('sparse1', map_sparse1, BS, True),
    ('sparse2', map_sparse2, 0, False),
    ('sparse2', map_sparse2, 0, True),
    ('sparse2', map_sparse2, BS, False),
    ('sparse2', map_sparse2, BS, True),
    ('onlysparse', map_onlysparse, 0, False),
    ('onlysparse', map_onlysparse, 0, True),
    ('onlysparse', map_onlysparse, BS, False),
    ('onlysparse', map_onlysparse, BS, True),
    ('notsparse', map_notsparse, 0, False),
    ('notsparse', map_notsparse, 0, True),
    ('notsparse', map_notsparse, BS, False),
    ('notsparse', map_notsparse, BS, True),
])
def test_chunkify_sparse(tmpdir, fname, sparse_map, header_size, sparse):
    """The joined chunks must equal the expected file content, sparse or not."""

    def get_chunks(fname, sparse, header_size):
        chunker = ChunkerFixed(4096, header_size=header_size, sparse=sparse)
        with open(fname, 'rb') as fd:
            return b''.join([c for c in chunker.chunkify(fd)])

    fn = str(tmpdir / fname)
    make_sparsefile(fn, sparse_map, header_size=header_size)
    # this comparison must be asserted, otherwise the test can never fail
    assert get_chunks(fn, sparse=sparse, header_size=header_size) == make_content(sparse_map, header_size=header_size)