diff --git a/docs/internals/data-structures.rst b/docs/internals/data-structures.rst
index caaf75810..7d00ac322 100644
--- a/docs/internals/data-structures.rst
+++ b/docs/internals/data-structures.rst
@@ -596,14 +596,20 @@
 The fixed chunker triggers (chunks) at even-spaced offsets, e.g. every 4MiB,
 producing chunks of same block size (the last chunk is not required to be
 full-size).
 
-Optionally, it can cut the first "header" chunk with a different size (the
-default is not to have a differently sized header chunk).
+Optionally, it supports processing a differently sized "header" first, before
+it starts to cut chunks of the desired block size.
+The default is not to have a differently sized header.
 
 ``borg create --chunker-params fixed,BLOCK_SIZE[,HEADER_SIZE]``
 
 - BLOCK_SIZE: no default value, multiple of the system page size (usually
   4096 bytes) recommended. E.g.: 4194304 would cut 4MiB sized chunks.
-- HEADER_SIZE: optional, defaults to 0 (no header chunk).
+- HEADER_SIZE: optional, defaults to 0 (no header).
+
+The fixed chunker also supports processing sparse files (reading only the ranges
+with data and seeking over the empty hole ranges).
+
+``borg create --sparse --chunker-params fixed,BLOCK_SIZE[,HEADER_SIZE]``
 
 "buzhash" chunker
 +++++++++++++++++
diff --git a/docs/usage/create.rst b/docs/usage/create.rst
index d8e00df21..4b3966a47 100644
--- a/docs/usage/create.rst
+++ b/docs/usage/create.rst
@@ -43,7 +43,10 @@ Examples
     $ borg create --chunker-params buzhash,10,23,16,4095 /path/to/repo::small /smallstuff
 
     # Backup a raw device (must not be active/in use/mounted at that time)
-    $ dd if=/dev/sdx bs=4M | borg create --chunker-params fixed,4194304 /path/to/repo::my-sdx -
+    $ borg create --read-special --chunker-params fixed,4194304 /path/to/repo::my-sdx /dev/sdX
+
+    # Backup a sparse disk image (must not be active/in use/mounted at that time)
+    $ borg create --sparse --chunker-params fixed,4194304 /path/to/repo::my-disk my-disk.raw
 
     # No compression (none)
     $ borg create --compression none /path/to/repo::arch ~
diff --git a/src/borg/archive.py b/src/borg/archive.py
index 708d78b18..1555536d5 100644
--- a/src/borg/archive.py
+++ b/src/borg/archive.py
@@ -1172,7 +1172,7 @@ class FilesystemObjectProcessors:
 
     def __init__(self, *, metadata_collector, cache, key,
                  add_item, process_file_chunks,
-                 chunker_params, show_progress):
+                 chunker_params, show_progress, sparse):
         self.metadata_collector = metadata_collector
         self.cache = cache
         self.key = key
@@ -1183,7 +1183,7 @@ class FilesystemObjectProcessors:
         self.hard_links = {}
         self.stats = Statistics()  # threading: done by cache (including progress)
         self.cwd = os.getcwd()
-        self.chunker = get_chunker(*chunker_params, seed=key.chunk_seed)
+        self.chunker = get_chunker(*chunker_params, seed=key.chunk_seed, sparse=sparse)
 
     @contextmanager
     def create_helper(self, path, st, status=None, hardlinkable=True):
diff --git a/src/borg/archiver.py b/src/borg/archiver.py
index c9704a949..651dcf3b4 100644
--- a/src/borg/archiver.py
+++ b/src/borg/archiver.py
@@ -653,7 +653,7 @@ class Archiver:
                                               checkpoint_interval=args.checkpoint_interval, rechunkify=False)
                 fso = FilesystemObjectProcessors(metadata_collector=metadata_collector, cache=cache, key=key,
                                                  process_file_chunks=cp.process_file_chunks, add_item=archive.add_item,
-                                                 chunker_params=args.chunker_params, show_progress=args.progress)
+                                                 chunker_params=args.chunker_params, show_progress=args.progress, sparse=args.sparse)
                 create_inner(archive, cache, fso)
             else:
                 create_inner(None, None, None)
@@ -3354,6 +3354,8 @@ class Archiver:
                                help='deprecated, use ``--noflags`` instead')
         fs_group.add_argument('--noflags', dest='noflags', action='store_true',
                               help='do not read and store flags (e.g. NODUMP, IMMUTABLE) into archive')
+        fs_group.add_argument('--sparse', dest='sparse', action='store_true',
+                              help='detect sparse holes in input (supported only by fixed chunker)')
         fs_group.add_argument('--files-cache', metavar='MODE', dest='files_cache_mode', type=FilesCacheMode,
                               default=DEFAULT_FILES_CACHE_MODE_UI,
                               help='operate files cache in MODE. default: %s' % DEFAULT_FILES_CACHE_MODE_UI)
diff --git a/src/borg/chunker.pyx b/src/borg/chunker.pyx
index 68f9c010e..3c2e02ff8 100644
--- a/src/borg/chunker.pyx
+++ b/src/borg/chunker.pyx
@@ -2,6 +2,7 @@
 
 API_VERSION = '1.2_01'
 
+import errno
 import os
 
 from libc.stdlib cimport free
@@ -19,65 +20,170 @@ cdef extern from "_chunker.c":
     uint32_t c_buzhash_update "buzhash_update"(uint32_t sum, unsigned char remove, unsigned char add, size_t len, uint32_t *h)
 
 
+def dread(offset, size, fd=None, fh=-1):
+    """read size bytes from the current position (== offset), using os-level fh (if fh >= 0), else fd"""
+    use_fh = fh >= 0
+    if use_fh:
+        data = os.read(fh, size)
+        if hasattr(os, 'posix_fadvise'):
+            # UNIX only and, in case of block sizes that are not a multiple of the
+            # system's page size, better be used with a bug fixed linux kernel > 4.6.0,
+            # see comment/workaround in _chunker.c and borgbackup issue #907.
+            os.posix_fadvise(fh, offset, len(data), os.POSIX_FADV_DONTNEED)
+        return data
+    else:
+        return fd.read(size)
+
+
+def dseek(amount, whence, fd=None, fh=-1):
+    """seek by amount relative to whence, using os-level fh (if fh >= 0), else fd; returns the new position"""
+    use_fh = fh >= 0
+    if use_fh:
+        return os.lseek(fh, amount, whence)
+    else:
+        return fd.seek(amount, whence)
+
+
+def dpos_curr_end(fd=None, fh=-1):
+    """
+    determine current position, file end position (== file length)
+    """
+    curr = dseek(0, os.SEEK_CUR, fd, fh)
+    end = dseek(0, os.SEEK_END, fd, fh)
+    dseek(curr, os.SEEK_SET, fd, fh)
+    return curr, end
+
+
+def sparsemap(fd=None, fh=-1):
+    """
+    generator yielding a (start, length, is_data) tuple for each range.
+    is_data indicates whether the range contains data (True) or is a hole (False).
+
+    note:
+    the map is generated starting from the current seek position (it
+    is not required to be 0 / to be at the start of the file) and
+    works from there up to the end of the file.
+    when the generator is finished, the file pointer position will be
+    reset to where it was before calling this function.
+    """
+    curr, file_len = dpos_curr_end(fd, fh)
+    start = curr
+    try:
+        whence = os.SEEK_HOLE
+        while True:
+            is_data = whence == os.SEEK_HOLE  # True: range with data, False: range is a hole
+            try:
+                end = dseek(start, whence, fd, fh)
+            except OSError as e:
+                if e.errno == errno.ENXIO:
+                    if not is_data and start < file_len:
+                        # if there is a hole at the end of a file, we cannot find the file end by SEEK_DATA
+                        # (because we run into ENXIO), thus we must manually deal with this case:
+                        end = file_len
+                        yield (start, end - start, is_data)
+                    break
+                else:
+                    raise
+            # we do not want to yield zero-length ranges with start == end:
+            if end > start:
+                yield (start, end - start, is_data)
+            start = end
+            whence = os.SEEK_DATA if is_data else os.SEEK_HOLE
+    finally:
+        # seek to same position as before calling this function
+        dseek(curr, os.SEEK_SET, fd, fh)
+
+
 class ChunkerFixed:
     """
-    Fixed blocksize Chunker, optionally supporting a header block of different size.
-
-    This is a very simple chunker for input data with known block/record sizes:
+    This is a simple chunker for input data with data usually staying at the
+    same offset and/or with known block/record sizes:
 
     - raw disk images
     - block devices
     - database files with simple header + fixed-size records layout
 
-    Note: the last block of the input data may be less than the block size,
+    It optionally supports:
+
+    - a header block of different size
+    - using a sparsemap to read only the data ranges and seek over the
+      hole ranges of sparse files.
+    - using an externally given filemap to read only specific ranges from
+      a file.
+
+    Note: the last block of a data or hole range may be less than the block size;
          this is supported and not considered to be an error.
    """
-    def __init__(self, block_size, header_size=0):
+    def __init__(self, block_size, header_size=0, sparse=False):
         self.block_size = block_size
         self.header_size = header_size
+        # should borg try to do sparse input processing?
+        # whether it actually can be done depends on the input file being seekable.
+        self.try_sparse = sparse and hasattr(os, 'SEEK_DATA') and hasattr(os, 'SEEK_HOLE')
+        self.zeros = memoryview(bytes(block_size))
 
-    def chunkify(self, fd, fh=-1):
+    def chunkify(self, fd=None, fh=-1, fmap=None):
         """
         Cut a file into chunks.
 
         :param fd: Python file object
         :param fh: OS-level file handle (if available),
                    defaults to -1 which means not to use OS-level fd.
+        :param fmap: a file map, same format as generated by sparsemap
         """
+        if fmap is None:
+            if self.try_sparse:
+                try:
+                    if self.header_size > 0:
+                        header_map = [(0, self.header_size, True), ]
+                        dseek(self.header_size, os.SEEK_SET, fd, fh)
+                        body_map = list(sparsemap(fd, fh))
+                        dseek(0, os.SEEK_SET, fd, fh)
+                    else:
+                        header_map = []
+                        body_map = list(sparsemap(fd, fh))
+                except OSError:
+                    # seeking did not work, fall through to the non-sparse fmap built below
+                    pass
+                else:
+                    fmap = header_map + body_map
+
+        if fmap is None:
+            # either sparse processing (building the fmap) was not tried or it failed.
+            # in these cases, we just build a "fake fmap" that considers the whole file
+            # as range(s) of data (no holes), so we can use the same code.
+            # we build different fmaps here for the purpose of correct block alignment
+            # with or without a header block (of potentially different size).
+            if self.header_size > 0:
+                header_map = [(0, self.header_size, True), ]
+                body_map = [(self.header_size, 2 ** 62, True), ]
+            else:
+                header_map = []
+                body_map = [(0, 2 ** 62, True), ]
+            fmap = header_map + body_map
+
         offset = 0
-        use_fh = fh >= 0
-
-        if use_fh:
-            def read(size):
-                nonlocal offset
-                data = os.read(fh, size)
-                amount = len(data)
-                if hasattr(os, 'posix_fadvise'):
-                    # UNIX only and, in case of block sizes that are not a multiple of the
-                    # system's page size, better be used with a bug fixed linux kernel > 4.6.0,
-                    # see comment/workaround in _chunker.c and borgbackup issue #907.
-                    os.posix_fadvise(fh, offset, amount, os.POSIX_FADV_DONTNEED)
-                offset += amount
-                return data
-        else:
-            def read(size):
-                nonlocal offset
-                data = fd.read(size)
-                amount = len(data)
-                offset += amount
-                return data
-
-        if self.header_size > 0:
-            data = read(self.header_size)
-            if data:
-                yield data
-        else:
-            data = True  # get into next while loop
-        while data:
-            data = read(self.block_size)
-            if data:
-                yield data
-        # empty data means we are at EOF and we terminate the generator.
+        for range_start, range_size, is_data in fmap:
+            if range_start != offset:
+                # this is for the case when the fmap does not cover the file completely,
+                # e.g. it could be without the ranges of holes or of unchanged data.
+                offset = range_start
+                dseek(offset, os.SEEK_SET, fd, fh)
+            while range_size:
+                wanted = min(range_size, self.block_size)
+                if is_data:
+                    # read block from the range
+                    data = dread(offset, wanted, fd, fh)
+                else:  # hole
+                    # seek over block from the range
+                    pos = dseek(wanted, os.SEEK_CUR, fd, fh)
+                    data = self.zeros[:pos - offset]  # for now, create zero-bytes here
+                got = len(data)
+                if got > 0:
+                    offset += got
+                    range_size -= got
+                    yield data  # later, use a better api that tags data vs. hole
+                if got < wanted:
+                    # we did not get enough data, looks like EOF.
+                    return
 
 
 cdef class Chunker:
@@ -129,7 +235,8 @@ def get_chunker(algo, *params, **kw):
         seed = kw['seed']
         return Chunker(seed, *params)
     if algo == 'fixed':
-        return ChunkerFixed(*params)
+        sparse = kw['sparse']
+        return ChunkerFixed(*params, sparse=sparse)
     raise TypeError('unsupported chunker algo %r' % algo)
diff --git a/src/borg/testsuite/chunker.py b/src/borg/testsuite/chunker.py
index c49e5be03..df79441b6 100644
--- a/src/borg/testsuite/chunker.py
+++ b/src/borg/testsuite/chunker.py
@@ -22,6 +22,55 @@ class ChunkerFixedTestCase(BaseTestCase):
         parts = [c for c in chunker.chunkify(BytesIO(data))]
         self.assert_equal(parts, [data[0:123], data[123:123+4096], data[123+4096:123+8192], data[123+8192:]])
 
+    def test_chunkify_just_blocks_fmap_complete(self):
+        data = b'foobar' * 1500
+        chunker = ChunkerFixed(4096)
+        fmap = [
+            (0, 4096, True),
+            (4096, 8192, True),
+            (8192, 99999999, True),
+        ]
+        parts = [c for c in chunker.chunkify(BytesIO(data), fmap=fmap)]
+        self.assert_equal(parts, [data[0:4096], data[4096:8192], data[8192:]])
+
+    def test_chunkify_header_and_blocks_fmap_complete(self):
+        data = b'foobar' * 1500
+        chunker = ChunkerFixed(4096, 123)
+        fmap = [
+            (0, 123, True),
+            (123, 4096, True),
+            (123+4096, 4096, True),
+            (123+8192, 4096, True),
+        ]
+        parts = [c for c in chunker.chunkify(BytesIO(data), fmap=fmap)]
+        self.assert_equal(parts, [data[0:123], data[123:123+4096], data[123+4096:123+8192], data[123+8192:]])
+
+    def test_chunkify_header_and_blocks_fmap_zeros(self):
+        data = b'H' * 123 + b'_' * 4096 + b'X' * 4096 + b'_' * 4096
+        chunker = ChunkerFixed(4096, 123)
+        fmap = [
+            (0, 123, True),
+            (123, 4096, False),
+            (123+4096, 4096, True),
+            (123+8192, 4096, False),
+        ]
+        parts = [c for c in chunker.chunkify(BytesIO(data), fmap=fmap)]
+        # because we marked the '_' ranges as holes, we will get '\0' ranges instead!
+        self.assert_equal(parts, [data[0:123], b'\0' * 4096, data[123+4096:123+8192], b'\0' * 4096])
+
+    def test_chunkify_header_and_blocks_fmap_partial(self):
+        data = b'H' * 123 + b'_' * 4096 + b'X' * 4096 + b'_' * 4096
+        chunker = ChunkerFixed(4096, 123)
+        fmap = [
+            (0, 123, True),
+            # (123, 4096, False),
+            (123+4096, 4096, True),
+            # (123+8192, 4096, False),
+        ]
+        parts = [c for c in chunker.chunkify(BytesIO(data), fmap=fmap)]
+        # because we left out the '_' ranges from the fmap, we will not get them at all!
+        self.assert_equal(parts, [data[0:123], data[123+4096:123+8192]])
+
 
 class ChunkerTestCase(BaseTestCase):
diff --git a/src/borg/testsuite/chunker_pytest.py b/src/borg/testsuite/chunker_pytest.py
new file mode 100644
index 000000000..89e120535
--- /dev/null
+++ b/src/borg/testsuite/chunker_pytest.py
@@ -0,0 +1,120 @@
+from io import BytesIO
+import os
+
+import pytest
+
+from ..chunker import ChunkerFixed, sparsemap
+from ..constants import *  # NOQA
+
+BS = 4096  # fs block size
+
+# some sparse files. X = content blocks, _ = sparse blocks.
+# X__XXX____
+map_sparse1 = [
+    (0 * BS, 1 * BS, True),
+    (1 * BS, 2 * BS, False),
+    (3 * BS, 3 * BS, True),
+    (6 * BS, 4 * BS, False),
+]
+
+# _XX___XXXX
+map_sparse2 = [
+    (0 * BS, 1 * BS, False),
+    (1 * BS, 2 * BS, True),
+    (3 * BS, 3 * BS, False),
+    (6 * BS, 4 * BS, True),
+]
+
+# XXX
+map_notsparse = [(0 * BS, 3 * BS, True), ]
+
+# ___
+map_onlysparse = [(0 * BS, 3 * BS, False), ]
+
+
+def make_sparsefile(fname, sparsemap, header_size=0):
+    with open(fname, 'wb') as fd:
+        total = 0
+        if header_size:
+            fd.write(b'H' * header_size)
+            total += header_size
+        for offset, size, is_data in sparsemap:
+            if is_data:
+                fd.write(b'X' * size)
+            else:
+                fd.seek(size, os.SEEK_CUR)
+            total += size
+        fd.truncate(total)
+    assert os.path.getsize(fname) == total
+
+
+def make_content(sparsemap, header_size=0):
+    with BytesIO() as fd:
+        total = 0
+        if header_size:
+            fd.write(b'H' * header_size)
+            total += header_size
+        for offset, size, is_data in sparsemap:
+            if is_data:
+                fd.write(b'X' * size)
+            else:
+                fd.write(b'\0' * size)
+            total += size
+        content = fd.getvalue()
+    assert len(content) == total
+    return content
+
+
+@pytest.mark.parametrize("fname, sparse_map", [
+    ('sparse1', map_sparse1),
+    ('sparse2', map_sparse2),
+    ('onlysparse', map_onlysparse),
+    ('notsparse', map_notsparse),
+])
+def test_sparsemap(tmpdir, fname, sparse_map):
+
+    def get_sparsemap_fh(fname):
+        fh = os.open(fname, flags=os.O_RDONLY)
+        try:
+            return list(sparsemap(fh=fh))
+        finally:
+            os.close(fh)
+
+    def get_sparsemap_fd(fname):
+        with open(fname, 'rb') as fd:
+            return list(sparsemap(fd=fd))
+
+    fn = str(tmpdir / fname)
+    make_sparsefile(fn, sparse_map)
+    assert get_sparsemap_fh(fn) == sparse_map
+    assert get_sparsemap_fd(fn) == sparse_map
+
+
+@pytest.mark.parametrize("fname, sparse_map, header_size, sparse", [
+    ('sparse1', map_sparse1, 0, False),
+    ('sparse1', map_sparse1, 0, True),
+    ('sparse1', map_sparse1, BS, False),
+    ('sparse1', map_sparse1, BS, True),
+    ('sparse2', map_sparse2, 0, False),
+    ('sparse2', map_sparse2, 0, True),
+    ('sparse2', map_sparse2, BS, False),
+    ('sparse2', map_sparse2, BS, True),
+    ('onlysparse', map_onlysparse, 0, False),
+    ('onlysparse', map_onlysparse, 0, True),
+    ('onlysparse', map_onlysparse, BS, False),
+    ('onlysparse', map_onlysparse, BS, True),
+    ('notsparse', map_notsparse, 0, False),
+    ('notsparse', map_notsparse, 0, True),
+    ('notsparse', map_notsparse, BS, False),
+    ('notsparse', map_notsparse, BS, True),
+])
+def test_chunkify_sparse(tmpdir, fname, sparse_map, header_size, sparse):
+
+    def get_chunks(fname, sparse, header_size):
+        chunker = ChunkerFixed(4096, header_size=header_size, sparse=sparse)
+        with open(fname, 'rb') as fd:
+            return b''.join([c for c in chunker.chunkify(fd)])
+
+    fn = str(tmpdir / fname)
+    make_sparsefile(fn, sparse_map, header_size=header_size)
+    assert get_chunks(fn, sparse=sparse, header_size=header_size) == make_content(sparse_map, header_size=header_size)
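
The new code paths are easy to exercise by hand. The snippet below is an illustrative sketch, not part of the patch: it assumes borg from this branch is importable, uses a hypothetical scratch file name ``demo.raw``, and needs a filesystem that reports holes via ``SEEK_HOLE``/``SEEK_DATA`` (e.g. ext4 or xfs)::

    import os
    from borg.chunker import ChunkerFixed, sparsemap

    # build a small sparse file: one data block, a two-block hole, one data block.
    # whether the hole is actually stored sparsely depends on the filesystem.
    with open('demo.raw', 'wb') as f:
        f.write(b'X' * 4096)
        f.seek(2 * 4096, os.SEEK_CUR)
        f.write(b'Y' * 4096)

    with open('demo.raw', 'rb') as f:
        # expected map: data / hole / data, e.g.
        # [(0, 4096, True), (4096, 8192, False), (12288, 4096, True)]
        print(list(sparsemap(fd=f)))
        f.seek(0)
        chunker = ChunkerFixed(4096, sparse=True)
        for chunk in chunker.chunkify(fd=f):
            # hole blocks come back as all-zero chunks, without being read
            print(len(chunk), bytes(chunk[:1]))

As the "later, use a better api that tags data vs. hole" comment in ``chunkify`` notes, hole ranges are currently delivered as zero-filled blocks, so consumers cannot yet distinguish a real hole from stored zeros.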