From 6d0f9a52eb0527f35a36b10a2bfa9e2b9c180fcd Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Tue, 15 Dec 2020 03:28:48 +0100 Subject: [PATCH] detect all-zero chunks, avoid hashing them comparing zeros is quicker than hashing them. the comparison should fail quickly inside non-zero data. --- src/borg/archive.py | 5 +++-- src/borg/chunker.pyx | 24 +++++++++++++++++------- src/borg/testsuite/chunker.py | 2 +- 3 files changed, 21 insertions(+), 10 deletions(-) diff --git a/src/borg/archive.py b/src/borg/archive.py index 194814687..cc5c33fb9 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -1143,7 +1143,7 @@ class ChunksProcessor: if allocation == CH_DATA: data = chunk.data chunk_id = self.key.id_hash(data) - elif allocation == CH_HOLE: + elif allocation in (CH_HOLE, CH_ALLOC): size = chunk.meta['size'] data = self.zeros[:size] try: @@ -2002,7 +2002,8 @@ class ArchiveRecreater: target.process_file_chunks(item, self.cache, target.stats, self.progress, chunk_iterator, chunk_processor) def chunk_processor(self, target, chunk): - # as this is recreate (we do not read from the fs), we never have holes here + # as this is recreate (we do not read from the fs), we never have CH_HOLE here, + # but we need to add support for CH_ALLOC - TODO! assert chunk.meta['allocation'] == CH_DATA data = chunk.data chunk_id = self.key.id_hash(data) diff --git a/src/borg/chunker.pyx b/src/borg/chunker.pyx index 0b6f66546..6bf64784c 100644 --- a/src/borg/chunker.pyx +++ b/src/borg/chunker.pyx @@ -6,7 +6,7 @@ import errno import os from collections import namedtuple -from .constants import CH_DATA, CH_HOLE +from .constants import CH_DATA, CH_ALLOC, CH_HOLE from libc.stdlib cimport free @@ -35,12 +35,16 @@ _Chunk.__doc__ = """\ meta is always a dictionary, data depends on allocation. - on disk data: - meta = {'allocation' = CH_DATA, 'size' = size_of_data } + data chunk read from a DATA range of a file (not from a sparse hole): + meta = {'allocation' = CH_DATA, 'size' = size_of_chunk } data = read_data [bytes or memoryview] - hole in a sparse file: - meta = {'allocation' = CH_HOLE, 'size' = size_of_hole } + all-zero chunk read from a DATA range of a file (not from a sparse hole, but detected to be all-zero): + meta = {'allocation' = CH_ALLOC, 'size' = size_of_chunk } + data = None + + all-zero chunk from a HOLE range of a file (from a sparse hole): + meta = {'allocation' = CH_HOLE, 'size' = size_of_chunk } data = None """ @@ -201,15 +205,21 @@ class ChunkerFixed: # read block from the range data = dread(offset, wanted, fd, fh) got = len(data) + if data == self.zeros[:got]: + data = None + is_zero = True + else: + is_zero = False else: # hole # seek over block from the range pos = dseek(wanted, os.SEEK_CUR, fd, fh) - data = None got = pos - offset + data = None + is_zero = True if got > 0: offset += got range_size -= got - yield Chunk(data, size=got, allocation=CH_DATA if is_data else CH_HOLE) + yield Chunk(data, size=got, allocation=(CH_ALLOC if is_zero else CH_DATA) if is_data else CH_HOLE) if got < wanted: # we did not get enough data, looks like EOF. return diff --git a/src/borg/testsuite/chunker.py b/src/borg/testsuite/chunker.py index 7a0db7d36..1b275978c 100644 --- a/src/borg/testsuite/chunker.py +++ b/src/borg/testsuite/chunker.py @@ -15,7 +15,7 @@ def cf(chunks): if chunk.meta['allocation'] == CH_DATA: assert len(chunk.data) == chunk.meta['size'] return bytes(chunk.data) # make sure we have bytes, not memoryview - if chunk.meta['allocation'] == CH_HOLE: + if chunk.meta['allocation'] in (CH_HOLE, CH_ALLOC): assert chunk.data is None return chunk.meta['size'] assert False, "unexpected allocation value"