refactor new zero chunk handling to be reusable

This commit is contained in:
Thomas Waldmann 2021-01-08 18:38:35 +01:00
parent 6d0f9a52eb
commit 9fd284ce1a
2 changed files with 31 additions and 19 deletions

View File

@ -19,7 +19,7 @@ from .logger import create_logger
logger = create_logger() logger = create_logger()
from . import xattr from . import xattr
from .chunker import get_chunker, max_chunk_size, Chunk from .chunker import get_chunker, max_chunk_size, Chunk, chunk_to_id_data
from .cache import ChunkListEntry from .cache import ChunkListEntry
from .crypto.key import key_factory from .crypto.key import key_factory
from .compress import Compressor, CompressionSpec from .compress import Compressor, CompressionSpec
@ -43,7 +43,6 @@ from .helpers import msgpack
from .helpers import sig_int from .helpers import sig_int
from .patterns import PathPrefixPattern, FnmatchPattern, IECommand from .patterns import PathPrefixPattern, FnmatchPattern, IECommand
from .item import Item, ArchiveItem, ItemDiff from .item import Item, ArchiveItem, ItemDiff
from .lrucache import LRUCache
from .platform import acl_get, acl_set, set_flags, get_flags, swidth, hostname from .platform import acl_get, acl_set, set_flags, get_flags, swidth, hostname
from .remote import cache_if_remote from .remote import cache_if_remote
from .repository import Repository, LIST_SCAN_LIMIT from .repository import Repository, LIST_SCAN_LIMIT
@ -1105,8 +1104,6 @@ class ChunksProcessor:
self.checkpoint_interval = checkpoint_interval self.checkpoint_interval = checkpoint_interval
self.last_checkpoint = time.monotonic() self.last_checkpoint = time.monotonic()
self.rechunkify = rechunkify self.rechunkify = rechunkify
self.zero_chunk_ids = LRUCache(10, dispose=lambda _: None) # length of all-zero chunk -> chunk_id
self.zeros = memoryview(bytes(MAX_DATA_SIZE))
def write_part_file(self, item, from_chunk, number): def write_part_file(self, item, from_chunk, number):
item = Item(internal_dict=item.as_dict()) item = Item(internal_dict=item.as_dict())
@ -1139,20 +1136,7 @@ class ChunksProcessor:
def process_file_chunks(self, item, cache, stats, show_progress, chunk_iter, chunk_processor=None): def process_file_chunks(self, item, cache, stats, show_progress, chunk_iter, chunk_processor=None):
if not chunk_processor: if not chunk_processor:
def chunk_processor(chunk): def chunk_processor(chunk):
allocation = chunk.meta['allocation'] chunk_id, data = chunk_to_id_data(chunk, self.key.id_hash)
if allocation == CH_DATA:
data = chunk.data
chunk_id = self.key.id_hash(data)
elif allocation in (CH_HOLE, CH_ALLOC):
size = chunk.meta['size']
data = self.zeros[:size]
try:
chunk_id = self.zero_chunk_ids[size]
except KeyError:
chunk_id = self.key.id_hash(data)
self.zero_chunk_ids[size] = chunk_id
else:
raise ValueError('unexpected allocation type')
chunk_entry = cache.add_chunk(chunk_id, data, stats, wait=False) chunk_entry = cache.add_chunk(chunk_id, data, stats, wait=False)
self.cache.repository.async_response(wait=False) self.cache.repository.async_response(wait=False)
return chunk_entry return chunk_entry

View File

@ -6,7 +6,8 @@ import errno
import os import os
from collections import namedtuple from collections import namedtuple
from .constants import CH_DATA, CH_ALLOC, CH_HOLE from .constants import CH_DATA, CH_ALLOC, CH_HOLE, MAX_DATA_SIZE
from .lrucache import LRUCache
from libc.stdlib cimport free from libc.stdlib cimport free
@ -52,6 +53,33 @@ def Chunk(data, **meta):
return _Chunk(meta, data) return _Chunk(meta, data)
# shared all-zero buffer; sparse (hole / allocated-but-zero) chunks are served
# as zero-copy memoryview slices of this, so no per-chunk allocation is needed.
zeros = bytes(MAX_DATA_SIZE)
# remember a few recently used all-zero chunk hashes in this mapping.
# (hash_func, chunk_length) -> chunk_hash
# we play safe and have the hash_func in the mapping key, in case we
# have different hash_funcs within the same borg run.
zero_chunk_ids = LRUCache(10, dispose=lambda _: None)
def chunk_to_id_data(chunk, id_hash):
    """Return (chunk_id, data) for a chunker-produced *chunk*.

    CH_DATA chunks carry a real payload, which is hashed directly.
    Sparse chunks (CH_HOLE / CH_ALLOC) are all-zero, so their payload is a
    zero-copy view into the shared module-level ``zeros`` buffer and the hash
    is looked up in / stored into the ``zero_chunk_ids`` LRU cache, keyed by
    (hash_func, length) so that repeated zero runs of the same size are not
    re-hashed.

    Raises ValueError for any other allocation type.
    """
    allocation = chunk.meta['allocation']
    if allocation == CH_DATA:
        payload = chunk.data
        return id_hash(payload), payload
    if allocation not in (CH_HOLE, CH_ALLOC):
        raise ValueError('unexpected allocation type')
    length = chunk.meta['size']
    assert length <= len(zeros)
    payload = memoryview(zeros)[:length]
    cache_key = (id_hash, length)
    try:
        chunk_id = zero_chunk_ids[cache_key]
    except KeyError:
        chunk_id = id_hash(payload)
        zero_chunk_ids[cache_key] = chunk_id
    return chunk_id, payload
def dread(offset, size, fd=None, fh=-1): def dread(offset, size, fd=None, fh=-1):
use_fh = fh >= 0 use_fh = fh >= 0
if use_fh: if use_fh: