detect all-zero chunks, avoid hashing them

Comparing a chunk against zeros is quicker than hashing it.
For non-zero data, the comparison should fail quickly.
This commit is contained in:
Thomas Waldmann 2020-12-15 03:28:48 +01:00
parent 52bd55b29a
commit 6d0f9a52eb
3 changed files with 21 additions and 10 deletions

View File

@ -1143,7 +1143,7 @@ class ChunksProcessor:
if allocation == CH_DATA:
data = chunk.data
chunk_id = self.key.id_hash(data)
elif allocation == CH_HOLE:
elif allocation in (CH_HOLE, CH_ALLOC):
size = chunk.meta['size']
data = self.zeros[:size]
try:
@ -2002,7 +2002,8 @@ class ArchiveRecreater:
target.process_file_chunks(item, self.cache, target.stats, self.progress, chunk_iterator, chunk_processor)
def chunk_processor(self, target, chunk):
# as this is recreate (we do not read from the fs), we never have holes here
# as this is recreate (we do not read from the fs), we never have CH_HOLE here,
# but we need to add support for CH_ALLOC - TODO!
assert chunk.meta['allocation'] == CH_DATA
data = chunk.data
chunk_id = self.key.id_hash(data)

View File

@ -6,7 +6,7 @@ import errno
import os
from collections import namedtuple
from .constants import CH_DATA, CH_HOLE
from .constants import CH_DATA, CH_ALLOC, CH_HOLE
from libc.stdlib cimport free
@ -35,12 +35,16 @@ _Chunk.__doc__ = """\
meta is always a dictionary, data depends on allocation.
on disk data:
meta = {'allocation' = CH_DATA, 'size' = size_of_data }
data chunk read from a DATA range of a file (not from a sparse hole):
meta = {'allocation' = CH_DATA, 'size' = size_of_chunk }
data = read_data [bytes or memoryview]
hole in a sparse file:
meta = {'allocation' = CH_HOLE, 'size' = size_of_hole }
all-zero chunk read from a DATA range of a file (not from a sparse hole, but detected to be all-zero):
meta = {'allocation' = CH_ALLOC, 'size' = size_of_chunk }
data = None
all-zero chunk from a HOLE range of a file (from a sparse hole):
meta = {'allocation' = CH_HOLE, 'size' = size_of_chunk }
data = None
"""
@ -201,15 +205,21 @@ class ChunkerFixed:
# read block from the range
data = dread(offset, wanted, fd, fh)
got = len(data)
if data == self.zeros[:got]:
data = None
is_zero = True
else:
is_zero = False
else: # hole
# seek over block from the range
pos = dseek(wanted, os.SEEK_CUR, fd, fh)
data = None
got = pos - offset
data = None
is_zero = True
if got > 0:
offset += got
range_size -= got
yield Chunk(data, size=got, allocation=CH_DATA if is_data else CH_HOLE)
yield Chunk(data, size=got, allocation=(CH_ALLOC if is_zero else CH_DATA) if is_data else CH_HOLE)
if got < wanted:
# we did not get enough data, looks like EOF.
return

View File

@ -15,7 +15,7 @@ def cf(chunks):
if chunk.meta['allocation'] == CH_DATA:
assert len(chunk.data) == chunk.meta['size']
return bytes(chunk.data) # make sure we have bytes, not memoryview
if chunk.meta['allocation'] == CH_HOLE:
if chunk.meta['allocation'] in (CH_HOLE, CH_ALLOC):
assert chunk.data is None
return chunk.meta['size']
assert False, "unexpected allocation value"