From 9fd284ce1a9c310571049aa1d7ad0a6fa89b8a26 Mon Sep 17 00:00:00 2001
From: Thomas Waldmann
Date: Fri, 8 Jan 2021 18:38:35 +0100
Subject: [PATCH] refactor new zero chunk handling to be reusable

---
 src/borg/archive.py  | 20 ++------------------
 src/borg/chunker.pyx | 30 +++++++++++++++++++++++++++++-
 2 files changed, 31 insertions(+), 19 deletions(-)

diff --git a/src/borg/archive.py b/src/borg/archive.py
index cc5c33fb9..eff10b4bf 100644
--- a/src/borg/archive.py
+++ b/src/borg/archive.py
@@ -19,7 +19,7 @@ from .logger import create_logger
 logger = create_logger()
 
 from . import xattr
-from .chunker import get_chunker, max_chunk_size, Chunk
+from .chunker import get_chunker, max_chunk_size, Chunk, chunk_to_id_data
 from .cache import ChunkListEntry
 from .crypto.key import key_factory
 from .compress import Compressor, CompressionSpec
@@ -43,7 +43,6 @@ from .helpers import msgpack
 from .helpers import sig_int
 from .patterns import PathPrefixPattern, FnmatchPattern, IECommand
 from .item import Item, ArchiveItem, ItemDiff
-from .lrucache import LRUCache
 from .platform import acl_get, acl_set, set_flags, get_flags, swidth, hostname
 from .remote import cache_if_remote
 from .repository import Repository, LIST_SCAN_LIMIT
@@ -1105,8 +1104,6 @@ class ChunksProcessor:
         self.checkpoint_interval = checkpoint_interval
         self.last_checkpoint = time.monotonic()
         self.rechunkify = rechunkify
-        self.zero_chunk_ids = LRUCache(10, dispose=lambda _: None)  # length of all-zero chunk -> chunk_id
-        self.zeros = memoryview(bytes(MAX_DATA_SIZE))
 
     def write_part_file(self, item, from_chunk, number):
         item = Item(internal_dict=item.as_dict())
@@ -1139,20 +1136,7 @@ class ChunksProcessor:
     def process_file_chunks(self, item, cache, stats, show_progress, chunk_iter, chunk_processor=None):
         if not chunk_processor:
             def chunk_processor(chunk):
-                allocation = chunk.meta['allocation']
-                if allocation == CH_DATA:
-                    data = chunk.data
-                    chunk_id = self.key.id_hash(data)
-                elif allocation in (CH_HOLE, CH_ALLOC):
-                    size = chunk.meta['size']
-                    data = self.zeros[:size]
-                    try:
-                        chunk_id = self.zero_chunk_ids[size]
-                    except KeyError:
-                        chunk_id = self.key.id_hash(data)
-                        self.zero_chunk_ids[size] = chunk_id
-                else:
-                    raise ValueError('unexpected allocation type')
+                chunk_id, data = chunk_to_id_data(chunk, self.key.id_hash)
                 chunk_entry = cache.add_chunk(chunk_id, data, stats, wait=False)
                 self.cache.repository.async_response(wait=False)
                 return chunk_entry
diff --git a/src/borg/chunker.pyx b/src/borg/chunker.pyx
index 6bf64784c..1fd316a03 100644
--- a/src/borg/chunker.pyx
+++ b/src/borg/chunker.pyx
@@ -6,7 +6,8 @@ import errno
 import os
 from collections import namedtuple
 
-from .constants import CH_DATA, CH_ALLOC, CH_HOLE
+from .constants import CH_DATA, CH_ALLOC, CH_HOLE, MAX_DATA_SIZE
+from .lrucache import LRUCache
 
 from libc.stdlib cimport free
 
@@ -52,6 +53,33 @@ def Chunk(data, **meta):
     return _Chunk(meta, data)
 
 
+zeros = bytes(MAX_DATA_SIZE)
+
+# remember a few recently used all-zero chunk hashes in this mapping.
+# (hash_func, chunk_length) -> chunk_hash
+# we play safe and have the hash_func in the mapping key, in case we
+# have different hash_funcs within the same borg run.
+zero_chunk_ids = LRUCache(10, dispose=lambda _: None)
+
+def chunk_to_id_data(chunk, id_hash):
+    allocation = chunk.meta['allocation']
+    if allocation == CH_DATA:
+        data = chunk.data
+        chunk_id = id_hash(data)
+    elif allocation in (CH_HOLE, CH_ALLOC):
+        size = chunk.meta['size']
+        assert size <= len(zeros)
+        data = memoryview(zeros)[:size]
+        try:
+            chunk_id = zero_chunk_ids[(id_hash, size)]
+        except KeyError:
+            chunk_id = id_hash(data)
+            zero_chunk_ids[(id_hash, size)] = chunk_id
+    else:
+        raise ValueError('unexpected allocation type')
+    return chunk_id, data
+
+
 def dread(offset, size, fd=None, fh=-1):
     use_fh = fh >= 0
     if use_fh: