Merge pull request #5561 from ThomasWaldmann/sparse-file-support

sparse map / file map support for fixed size chunker
This commit is contained in:
TW 2020-12-28 20:21:50 +01:00 committed by GitHub
commit 2851a84003
8 changed files with 361 additions and 48 deletions

View File

@ -596,14 +596,20 @@ The fixed chunker triggers (chunks) at even-spaced offsets, e.g. every 4MiB,
producing chunks of the same block size (the last chunk is not required to be
full-size).
Optionally, it supports processing a differently sized "header" first, before
it starts to cut chunks of the desired block size.
The default is not to have a differently sized header.
``borg create --chunker-params fixed,BLOCK_SIZE[,HEADER_SIZE]``
- BLOCK_SIZE: no default value; a multiple of the system page size (usually 4096
bytes) is recommended. E.g.: 4194304 would cut 4MiB sized chunks.
- HEADER_SIZE: optional, defaults to 0 (no header).
The fixed chunker also supports processing sparse files (reading only the ranges
with data and seeking over the empty hole ranges).
``borg create --sparse --chunker-params fixed,BLOCK_SIZE[,HEADER_SIZE]``
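For example, assuming a raw disk image that starts with a 512 byte header
followed by fixed-size records, ``borg create --sparse --chunker-params
fixed,4194304,512 /path/to/repo::img /path/to/image.raw`` would cut one
512 byte header chunk first and then 4MiB content chunks, seeking over any
hole ranges.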
"buzhash" chunker
+++++++++++++++++

View File

@ -43,7 +43,10 @@ Examples
$ borg create --chunker-params buzhash,10,23,16,4095 /path/to/repo::small /smallstuff
# Backup a raw device (must not be active/in use/mounted at that time)
$ borg create --read-special --chunker-params fixed,4194304 /path/to/repo::my-sdx /dev/sdX
# Backup a sparse disk image (must not be active/in use/mounted at that time)
$ borg create --sparse --chunker-params fixed,4194304 /path/to/repo::my-disk my-disk.raw
# No compression (none)
$ borg create --compression none /path/to/repo::arch ~

View File

@ -1172,7 +1172,7 @@ class FilesystemObjectProcessors:
def __init__(self, *, metadata_collector, cache, key,
add_item, process_file_chunks,
chunker_params, show_progress, sparse):
self.metadata_collector = metadata_collector
self.cache = cache
self.key = key
@ -1183,7 +1183,7 @@ class FilesystemObjectProcessors:
self.hard_links = {}
self.stats = Statistics() # threading: done by cache (including progress)
self.cwd = os.getcwd()
self.chunker = get_chunker(*chunker_params, seed=key.chunk_seed, sparse=sparse)
@contextmanager
def create_helper(self, path, st, status=None, hardlinkable=True):

View File

@ -653,7 +653,7 @@ class Archiver:
checkpoint_interval=args.checkpoint_interval, rechunkify=False)
fso = FilesystemObjectProcessors(metadata_collector=metadata_collector, cache=cache, key=key,
process_file_chunks=cp.process_file_chunks, add_item=archive.add_item,
chunker_params=args.chunker_params, show_progress=args.progress, sparse=args.sparse)
create_inner(archive, cache, fso)
else:
create_inner(None, None, None)
@ -3354,6 +3354,8 @@ class Archiver:
help='deprecated, use ``--noflags`` instead')
fs_group.add_argument('--noflags', dest='noflags', action='store_true',
help='do not read and store flags (e.g. NODUMP, IMMUTABLE) into archive')
fs_group.add_argument('--sparse', dest='sparse', action='store_true',
help='detect sparse holes in input (supported only by fixed chunker)')
fs_group.add_argument('--files-cache', metavar='MODE', dest='files_cache_mode',
type=FilesCacheMode, default=DEFAULT_FILES_CACHE_MODE_UI,
help='operate files cache in MODE. default: %s' % DEFAULT_FILES_CACHE_MODE_UI)

View File

@ -2,6 +2,7 @@
API_VERSION = '1.2_01'
import errno
import os
from libc.stdlib cimport free
@ -19,65 +20,176 @@ cdef extern from "_chunker.c":
uint32_t c_buzhash_update "buzhash_update"(uint32_t sum, unsigned char remove, unsigned char add, size_t len, uint32_t *h)
# this will be True if Python's seek implementation supports data/holes seeking.
# this does not imply that it will actually work on the filesystem,
# because the FS also needs to support this.
has_seek_hole = hasattr(os, 'SEEK_DATA') and hasattr(os, 'SEEK_HOLE')
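# Editor's note (illustrative, not part of this commit): has_seek_hole only
# tells us the Python API is available; whether hole seeking actually works
# must be probed per filesystem, roughly as the pytest module added below does:
#
#     with open(fn, 'rb') as f:
#         try:
#             f.seek(0, os.SEEK_HOLE)
#             f.seek(0, os.SEEK_DATA)
#         except OSError:
#             pass  # this filesystem does not support hole seeking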
def dread(offset, size, fd=None, fh=-1):
"""read up to size bytes from the current position; offset is only used for
the posix_fadvise hint. uses the OS-level file handle fh if given (>= 0),
otherwise the Python file object fd."""
use_fh = fh >= 0
if use_fh:
data = os.read(fh, size)
if hasattr(os, 'posix_fadvise'):
# UNIX only; with block sizes that are not a multiple of the system's
# page size, this is best used with a bug-fixed Linux kernel > 4.6.0,
# see comment/workaround in _chunker.c and borgbackup issue #907.
os.posix_fadvise(fh, offset, len(data), os.POSIX_FADV_DONTNEED)
return data
else:
return fd.read(size)
def dseek(amount, whence, fd=None, fh=-1):
"""seek by amount (interpreted per whence) and return the new absolute
position; uses the OS-level file handle fh if given (>= 0), else fd."""
use_fh = fh >= 0
if use_fh:
return os.lseek(fh, amount, whence)
else:
return fd.seek(amount, whence)
def dpos_curr_end(fd=None, fh=-1):
"""
determine current position, file end position (== file length)
"""
curr = dseek(0, os.SEEK_CUR, fd, fh)
end = dseek(0, os.SEEK_END, fd, fh)
dseek(curr, os.SEEK_SET, fd, fh)
return curr, end
def sparsemap(fd=None, fh=-1):
"""
generator yielding a (start, length, is_data) tuple for each range.
is_data indicates whether the range contains data (True) or is a hole (False).
note:
the map is generated starting from the current seek position (it
is not required to be 0 / to be at the start of the file) and
works from there up to the end of the file.
when the generator is finished, the file pointer position will be
reset to where it was before calling this function.
"""
curr, file_len = dpos_curr_end(fd, fh)
start = curr
try:
whence = os.SEEK_HOLE
while True:
is_data = whence == os.SEEK_HOLE # True: range with data, False: range is a hole
try:
end = dseek(start, whence, fd, fh)
except OSError as e:
if e.errno == errno.ENXIO:
if not is_data and start < file_len:
# if there is a hole at the end of a file, we cannot find the file end via SEEK_DATA
# (we run into ENXIO), thus we must handle this case manually:
end = file_len
yield (start, end - start, is_data)
break
else:
raise
# we do not want to yield zero-length ranges with start == end:
if end > start:
yield (start, end - start, is_data)
start = end
whence = os.SEEK_DATA if is_data else os.SEEK_HOLE
finally:
# seek to same position as before calling this function
dseek(curr, os.SEEK_SET, fd, fh)
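# Editor's sketch (illustrative, not part of this commit): assuming a file
# laid out as one 4 KiB data block followed by one 4 KiB hole on a filesystem
# with working hole seeking, this helper returns
# [(0, 4096, True), (4096, 4096, False)], as the pytest cases added below verify.
def _sparsemap_example(path):
    with open(path, 'rb') as fd:
        return list(sparsemap(fd=fd))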
class ChunkerFixed:
"""
Fixed blocksize Chunker, optionally supporting a header block of different size.
This is a simple chunker for input data with data usually staying at the same
offset and/or with known block/record sizes:
- raw disk images
- block devices
- database files with simple header + fixed-size records layout
It optionally supports:
- a header block of different size
- using a sparsemap to only read data ranges and seek over hole ranges
for sparse files.
- using an externally given filemap to only read specific ranges from
a file.
Note: the last block of a data or hole range may be less than the block size,
this is supported and not considered to be an error.
"""
def __init__(self, block_size, header_size=0, sparse=False):
self.block_size = block_size
self.header_size = header_size
# should borg try to do sparse input processing?
# whether it actually can be done depends on the input file being seekable.
self.try_sparse = sparse and has_seek_hole
self.zeros = memoryview(bytes(block_size))
def chunkify(self, fd=None, fh=-1, fmap=None):
"""
Cut a file into chunks.
:param fd: Python file object
:param fh: OS-level file handle (if available),
defaults to -1 which means not to use OS-level fd.
:param fmap: a file map, same format as generated by sparsemap
"""
if fmap is None:
if self.try_sparse:
try:
if self.header_size > 0:
header_map = [(0, self.header_size, True), ]
dseek(self.header_size, os.SEEK_SET, fd, fh)
body_map = list(sparsemap(fd, fh))
dseek(0, os.SEEK_SET, fd, fh)
else:
header_map = []
body_map = list(sparsemap(fd, fh))
except OSError as err:
# seeking did not work
pass
else:
fmap = header_map + body_map
if fmap is None:
# either sparse processing (building the fmap) was not tried or it failed.
# in these cases, we just build a "fake fmap" that considers the whole file
# as range(s) of data (no holes), so we can use the same code.
# we build different fmaps here for the purpose of correct block alignment
# with or without a header block (of potentially different size).
if self.header_size > 0:
header_map = [(0, self.header_size, True), ]
body_map = [(self.header_size, 2 ** 62, True), ]
else:
header_map = []
body_map = [(0, 2 ** 62, True), ]
fmap = header_map + body_map
offset = 0
for range_start, range_size, is_data in fmap:
if range_start != offset:
# this is for the case when the fmap does not cover the file completely,
# e.g. it might omit hole ranges or ranges of unchanged data.
offset = range_start
dseek(offset, os.SEEK_SET, fd, fh)
while range_size:
wanted = min(range_size, self.block_size)
if is_data:
# read block from the range
data = dread(offset, wanted, fd, fh)
else: # hole
# seek over block from the range
pos = dseek(wanted, os.SEEK_CUR, fd, fh)
data = self.zeros[:pos - offset] # for now, create zero-bytes here
got = len(data)
if got > 0:
offset += got
range_size -= got
yield data # later, use a better api that tags data vs. hole
if got < wanted:
# we did not get enough data, looks like EOF.
return
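# Editor's sketch (illustrative, not part of this commit): driving chunkify()
# with an explicit fmap; the hole range (4096, 4096, False) is never read from
# the stream but yielded as zero bytes, mirroring the fmap tests added below.
def _chunkify_fmap_example(data):
    from io import BytesIO
    chunker = ChunkerFixed(4096)
    fmap = [(0, 4096, True), (4096, 4096, False)]
    # for 8 KiB of input this yields [data[0:4096], b'\0' * 4096]
    return [bytes(chunk) for chunk in chunker.chunkify(BytesIO(data), fmap=fmap)]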
cdef class Chunker:
@ -129,7 +241,8 @@ def get_chunker(algo, *params, **kw):
seed = kw['seed']
return Chunker(seed, *params)
if algo == 'fixed':
sparse = kw['sparse']
return ChunkerFixed(*params, sparse=sparse)
raise TypeError('unsupported chunker algo %r' % algo)
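# Editor's sketch (illustrative, not part of this commit): how the archiver
# wiring above ends up calling this for --chunker-params fixed,4194304 --sparse.
def _get_chunker_example():
    return get_chunker('fixed', 4194304, sparse=True)  # ChunkerFixed, 4 MiB blocks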

View File

@ -32,6 +32,7 @@ from .. import xattr, helpers, platform
from ..archive import Archive, ChunkBuffer
from ..archiver import Archiver, parse_storage_quota, PURE_PYTHON_MSGPACK_WARNING
from ..cache import Cache, LocalCache
from ..chunker import has_seek_hole
from ..constants import * # NOQA
from ..crypto.low_level import bytes_to_long, num_cipher_blocks
from ..crypto.key import KeyfileKeyBase, RepoKey, KeyfileKey, Passphrase, TAMRequiredError
@ -563,7 +564,7 @@ class ArchiverTestCase(ArchiverTestCaseBase):
sparse = True
if sparse and hasattr(st, 'st_blocks') and st.st_blocks * 512 >= st.st_size:
sparse = False
if sparse and has_seek_hole:
with open(fn, 'rb') as fd:
# only check if the first hole is as expected, because the 2nd hole check
# is problematic on xfs due to its "dynamic speculative EOF preallocation

View File

@ -22,6 +22,55 @@ class ChunkerFixedTestCase(BaseTestCase):
parts = [c for c in chunker.chunkify(BytesIO(data))]
self.assert_equal(parts, [data[0:123], data[123:123+4096], data[123+4096:123+8192], data[123+8192:]])
def test_chunkify_just_blocks_fmap_complete(self):
data = b'foobar' * 1500
chunker = ChunkerFixed(4096)
fmap = [
(0, 4096, True),
(4096, 8192, True),
(8192, 99999999, True),
]
parts = [c for c in chunker.chunkify(BytesIO(data), fmap=fmap)]
self.assert_equal(parts, [data[0:4096], data[4096:8192], data[8192:]])
def test_chunkify_header_and_blocks_fmap_complete(self):
data = b'foobar' * 1500
chunker = ChunkerFixed(4096, 123)
fmap = [
(0, 123, True),
(123, 4096, True),
(123+4096, 4096, True),
(123+8192, 4096, True),
]
parts = [c for c in chunker.chunkify(BytesIO(data), fmap=fmap)]
self.assert_equal(parts, [data[0:123], data[123:123+4096], data[123+4096:123+8192], data[123+8192:]])
def test_chunkify_header_and_blocks_fmap_zeros(self):
data = b'H' * 123 + b'_' * 4096 + b'X' * 4096 + b'_' * 4096
chunker = ChunkerFixed(4096, 123)
fmap = [
(0, 123, True),
(123, 4096, False),
(123+4096, 4096, True),
(123+8192, 4096, False),
]
parts = [c for c in chunker.chunkify(BytesIO(data), fmap=fmap)]
# because we marked the '_' ranges as holes, we will get '\0' ranges instead!
self.assert_equal(parts, [data[0:123], b'\0' * 4096, data[123+4096:123+8192], b'\0' * 4096])
def test_chunkify_header_and_blocks_fmap_partial(self):
data = b'H' * 123 + b'_' * 4096 + b'X' * 4096 + b'_' * 4096
chunker = ChunkerFixed(4096, 123)
fmap = [
(0, 123, True),
# (123, 4096, False),
(123+4096, 4096, True),
# (123+8192, 4096, False),
]
parts = [c for c in chunker.chunkify(BytesIO(data), fmap=fmap)]
# because we left out the '_' ranges from the fmap, we will not get them at all!
self.assert_equal(parts, [data[0:123], data[123+4096:123+8192]])
class ChunkerTestCase(BaseTestCase):

View File

@ -0,0 +1,139 @@
from io import BytesIO
import os
import tempfile
import pytest
from ..chunker import ChunkerFixed, sparsemap, has_seek_hole
from ..constants import * # NOQA
BS = 4096 # fs block size
# some sparse files. X = content blocks, _ = sparse blocks.
# X__XXX____
map_sparse1 = [
(0 * BS, 1 * BS, True),
(1 * BS, 2 * BS, False),
(3 * BS, 3 * BS, True),
(6 * BS, 4 * BS, False),
]
# _XX___XXXX
map_sparse2 = [
(0 * BS, 1 * BS, False),
(1 * BS, 2 * BS, True),
(3 * BS, 3 * BS, False),
(6 * BS, 4 * BS, True),
]
# XXX
map_notsparse = [(0 * BS, 3 * BS, True), ]
# ___
map_onlysparse = [(0 * BS, 3 * BS, False), ]
def make_sparsefile(fname, sparsemap, header_size=0):
with open(fname, 'wb') as fd:
total = 0
if header_size:
fd.write(b'H' * header_size)
total += header_size
for offset, size, is_data in sparsemap:
if is_data:
fd.write(b'X' * size)
else:
fd.seek(size, os.SEEK_CUR)
total += size
fd.truncate(total)
assert os.path.getsize(fname) == total
def make_content(sparsemap, header_size=0):
with BytesIO() as fd:
total = 0
if header_size:
fd.write(b'H' * header_size)
total += header_size
for offset, size, is_data in sparsemap:
if is_data:
fd.write(b'X' * size)
else:
fd.write(b'\0' * size)
total += size
content = fd.getvalue()
assert len(content) == total
return content
def fs_supports_sparse():
if not has_seek_hole:
return False
with tempfile.TemporaryDirectory() as tmpdir:
fn = os.path.join(tmpdir, 'test_sparse')
make_sparsefile(fn, [(0, BS, False), (BS, BS, True)])
with open(fn, 'rb') as f:
try:
offset_hole = f.seek(0, os.SEEK_HOLE)
offset_data = f.seek(0, os.SEEK_DATA)
except OSError:
# no sparse support if these seeks do not work
return False
return offset_hole == 0 and offset_data == BS
@pytest.mark.skipif(not fs_supports_sparse(), reason='fs does not support sparse files')
@pytest.mark.parametrize("fname, sparse_map", [
('sparse1', map_sparse1),
('sparse2', map_sparse2),
('onlysparse', map_onlysparse),
('notsparse', map_notsparse),
])
def test_sparsemap(tmpdir, fname, sparse_map):
def get_sparsemap_fh(fname):
fh = os.open(fname, flags=os.O_RDONLY)
try:
return list(sparsemap(fh=fh))
finally:
os.close(fh)
def get_sparsemap_fd(fname):
with open(fname, 'rb') as fd:
return list(sparsemap(fd=fd))
fn = str(tmpdir / fname)
make_sparsefile(fn, sparse_map)
assert get_sparsemap_fh(fn) == sparse_map
assert get_sparsemap_fd(fn) == sparse_map
@pytest.mark.skipif(not fs_supports_sparse(), reason='fs does not support sparse files')
@pytest.mark.parametrize("fname, sparse_map, header_size, sparse", [
('sparse1', map_sparse1, 0, False),
('sparse1', map_sparse1, 0, True),
('sparse1', map_sparse1, BS, False),
('sparse1', map_sparse1, BS, True),
('sparse2', map_sparse2, 0, False),
('sparse2', map_sparse2, 0, True),
('sparse2', map_sparse2, BS, False),
('sparse2', map_sparse2, BS, True),
('onlysparse', map_onlysparse, 0, False),
('onlysparse', map_onlysparse, 0, True),
('onlysparse', map_onlysparse, BS, False),
('onlysparse', map_onlysparse, BS, True),
('notsparse', map_notsparse, 0, False),
('notsparse', map_notsparse, 0, True),
('notsparse', map_notsparse, BS, False),
('notsparse', map_notsparse, BS, True),
])
def test_chunkify_sparse(tmpdir, fname, sparse_map, header_size, sparse):
def get_chunks(fname, sparse, header_size):
chunker = ChunkerFixed(4096, header_size=header_size, sparse=sparse)
with open(fname, 'rb') as fd:
return b''.join([c for c in chunker.chunkify(fd)])
fn = str(tmpdir / fname)
make_sparsefile(fn, sparse_map, header_size=header_size)
assert get_chunks(fn, sparse=sparse, header_size=header_size) == make_content(sparse_map, header_size=header_size)