From a27f585eaabd94f1ed76bc82416696326af128b8 Mon Sep 17 00:00:00 2001 From: Marian Beermann Date: Fri, 31 Mar 2017 12:02:30 +0200 Subject: [PATCH] refactor CompressionDecider2 into a meta Compressor --- src/borg/archive.py | 9 ++-- src/borg/archiver.py | 7 +++- src/borg/compress.pyx | 75 ++++++++++++++++++++++++++++++++++ src/borg/helpers.py | 75 +++------------------------------- src/borg/key.py | 15 +++---- src/borg/testsuite/compress.py | 47 ++++++++++++++++++++- src/borg/testsuite/helpers.py | 32 +-------------- 7 files changed, 142 insertions(+), 118 deletions(-) diff --git a/src/borg/archive.py b/src/borg/archive.py index 5ed60570a..392efce22 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -21,7 +21,7 @@ logger = create_logger() from . import xattr from .cache import ChunkListEntry from .chunker import Chunker -from .compress import Compressor +from .compress import Compressor, CompressionSpec from .constants import * # NOQA from .hashindex import ChunkIndex, ChunkIndexEntry from .helpers import Manifest @@ -36,7 +36,7 @@ from .helpers import bin_to_hex from .helpers import safe_ns from .helpers import ellipsis_truncate, ProgressIndicatorPercent, log_multi from .helpers import PathPrefixPattern, FnmatchPattern -from .helpers import CompressionDecider1, CompressionDecider2, CompressionSpec +from .helpers import CompressionDecider1 from .item import Item, ArchiveItem from .key import key_factory from .platform import acl_get, acl_set, set_flags, get_flags, swidth @@ -312,7 +312,6 @@ class Archive: self.chunker = Chunker(self.key.chunk_seed, *chunker_params) self.compression_decider1 = CompressionDecider1(compression or CompressionSpec('none'), compression_files or []) - key.compression_decider2 = CompressionDecider2(compression or CompressionSpec('none')) if name in manifest.archives: raise self.AlreadyExists(name) self.last_checkpoint = time.monotonic() @@ -1585,7 +1584,6 @@ class ArchiveRecreater: self.seen_chunks = set() self.compression_decider1 = CompressionDecider1(compression or CompressionSpec('none'), compression_files or []) - key.compression_decider2 = CompressionDecider2(compression or CompressionSpec('none')) self.dry_run = dry_run self.stats = stats @@ -1663,12 +1661,11 @@ class ArchiveRecreater: if chunk_id in self.seen_chunks: return self.cache.chunk_incref(chunk_id, target.stats) chunk = Chunk(data, compress=compress) - compression_spec, chunk = self.key.compression_decider2.decide(chunk) overwrite = self.recompress if self.recompress and not self.always_recompress and chunk_id in self.cache.chunks: # Check if this chunk is already compressed the way we want it old_chunk = self.key.decrypt(None, self.repository.get(chunk_id), decompress=False) - if Compressor.detect(old_chunk.data).name == compression_spec.name: + if Compressor.detect(old_chunk.data).name == compress.name: # Stored chunk has the same compression we wanted overwrite = False chunk_entry = self.cache.add_chunk(chunk_id, chunk, target.stats, overwrite=overwrite, wait=False) diff --git a/src/borg/archiver.py b/src/borg/archiver.py index e241002eb..43aa24c1b 100644 --- a/src/borg/archiver.py +++ b/src/borg/archiver.py @@ -34,10 +34,11 @@ from .archive import Archive, ArchiveChecker, ArchiveRecreater, Statistics, is_s from .archive import BackupOSError, backup_io from .cache import Cache from .constants import * # NOQA +from .compress import CompressionSpec from .crc32 import crc32 from .helpers import EXIT_SUCCESS, EXIT_WARNING, EXIT_ERROR from .helpers import Error, NoManifestError, set_ec -from .helpers import location_validator, archivename_validator, ChunkerParams, CompressionSpec, ComprSpec +from .helpers import location_validator, archivename_validator, ChunkerParams from .helpers import PrefixSpec, SortBySpec, HUMAN_SORT_KEYS from .helpers import BaseFormatter, ItemFormatter, ArchiveFormatter from .helpers import format_time, format_timedelta, format_file_size, format_archive @@ -107,6 +108,8 @@ def with_repository(fake=False, invert_fake=False, create=False, lock=True, excl with repository: if manifest or cache: kwargs['manifest'], kwargs['key'] = Manifest.load(repository) + if args.__dict__.get('compression'): + kwargs['key'].compressor = args.compression.compressor if cache: with Cache(repository, kwargs['key'], kwargs['manifest'], do_files=getattr(args, 'cache_files', False), @@ -2411,7 +2414,7 @@ class Archiver: help='specify the chunker parameters (CHUNK_MIN_EXP, CHUNK_MAX_EXP, ' 'HASH_MASK_BITS, HASH_WINDOW_SIZE). default: %d,%d,%d,%d' % CHUNKER_PARAMS) archive_group.add_argument('-C', '--compression', dest='compression', - type=CompressionSpec, default=ComprSpec(name='lz4', spec=None), metavar='COMPRESSION', + type=CompressionSpec, default=CompressionSpec('lz4'), metavar='COMPRESSION', help='select compression algorithm, see the output of the ' '"borg help compression" command for details.') archive_group.add_argument('--compression-from', dest='compression_files', diff --git a/src/borg/compress.pyx b/src/borg/compress.pyx index 786e19fd1..2da2389e4 100644 --- a/src/borg/compress.pyx +++ b/src/borg/compress.pyx @@ -1,9 +1,12 @@ import zlib +from collections import namedtuple + try: import lzma except ImportError: lzma = None +from .logger import create_logger from .helpers import Buffer, DecompressionError API_VERSION = '1.1_02' @@ -179,12 +182,50 @@ class ZLIB(CompressorBase): raise DecompressionError(str(e)) from None +class Auto(CompressorBase): + """ + Meta-Compressor that decides which compression to use based on LZ4's ratio. + + As a meta-Compressor the actual compression is deferred to other Compressors, + therefore this Compressor has no ID, no detect() and no decompress(). + """ + + ID = None + name = 'auto' + + logger = create_logger('borg.debug.file-compression') + + def __init__(self, compressor): + super().__init__() + self.compressor = compressor + self.lz4 = get_compressor('lz4') + self.none = get_compressor('none') + + def compress(self, data): + lz4_data = self.lz4.compress(data) + if len(lz4_data) < 0.97 * len(data): + return self.compressor.compress(data) + elif len(lz4_data) < len(data): + return lz4_data + else: + return self.none.compress(data) + + def decompress(self, data): + raise NotImplementedError + + def detect(cls, data): + raise NotImplementedError + + +# Maps valid compressor names to their class COMPRESSOR_TABLE = { CNONE.name: CNONE, LZ4.name: LZ4, ZLIB.name: ZLIB, LZMA.name: LZMA, + Auto.name: Auto, } +# List of possible compression types. Does not include Auto, since it is a meta-Compressor. COMPRESSOR_LIST = [LZ4, CNONE, ZLIB, LZMA, ] # check fast stuff first def get_compressor(name, **kwargs): @@ -216,3 +257,37 @@ class Compressor: return cls else: raise ValueError('No decompressor for this data found: %r.', data[:2]) + + +ComprSpec = namedtuple('ComprSpec', ('name', 'spec', 'compressor')) + + +def CompressionSpec(s): + values = s.split(',') + count = len(values) + if count < 1: + raise ValueError + # --compression algo[,level] + name = values[0] + if name == 'none': + return ComprSpec(name=name, spec=None, compressor=CNONE()) + elif name == 'lz4': + return ComprSpec(name=name, spec=None, compressor=LZ4()) + if name in ('zlib', 'lzma', ): + if count < 2: + level = 6 # default compression level in py stdlib + elif count == 2: + level = int(values[1]) + if not 0 <= level <= 9: + raise ValueError + else: + raise ValueError + return ComprSpec(name=name, spec=level, compressor=get_compressor(name, level=level)) + if name == 'auto': + if 2 <= count <= 3: + compression = ','.join(values[1:]) + else: + raise ValueError + inner = CompressionSpec(compression) + return ComprSpec(name=name, spec=inner, compressor=Auto(inner.compressor)) + raise ValueError diff --git a/src/borg/helpers.py b/src/borg/helpers.py index 2e343e4e7..685d9f43f 100644 --- a/src/borg/helpers.py +++ b/src/borg/helpers.py @@ -726,37 +726,6 @@ def ChunkerParams(s): return int(chunk_min), int(chunk_max), int(chunk_mask), int(window_size) -ComprSpec = namedtuple('ComprSpec', ('name', 'spec')) - - -def CompressionSpec(s): - values = s.split(',') - count = len(values) - if count < 1: - raise ValueError - # --compression algo[,level] - name = values[0] - if name in ('none', 'lz4', ): - return ComprSpec(name=name, spec=None) - if name in ('zlib', 'lzma', ): - if count < 2: - level = 6 # default compression level in py stdlib - elif count == 2: - level = int(values[1]) - if not 0 <= level <= 9: - raise ValueError - else: - raise ValueError - return ComprSpec(name=name, spec=level) - if name == 'auto': - if 2 <= count <= 3: - compression = ','.join(values[1:]) - else: - raise ValueError - return ComprSpec(name=name, spec=CompressionSpec(compression)) - raise ValueError - - def dir_is_cachedir(path): """Determines whether the specified path is a cache directory (and therefore should potentially be excluded from the backup) according to @@ -2136,11 +2105,12 @@ class CompressionDecider1: :param compression_files: list of compression config files (e.g. from --compression-from) or a list of other line iterators """ - self.compression = compression + from .compress import CompressionSpec + self.compressor = compression.compressor if not compression_files: self.matcher = None else: - self.matcher = PatternMatcher(fallback=compression) + self.matcher = PatternMatcher(fallback=compression.compressor) for file in compression_files: try: for line in clean_lines(file): @@ -2148,7 +2118,7 @@ class CompressionDecider1: compr_spec, fn_pattern = line.split(':', 1) except: continue - self.matcher.add([parse_pattern(fn_pattern)], CompressionSpec(compr_spec)) + self.matcher.add([parse_pattern(fn_pattern)], CompressionSpec(compr_spec).compressor) finally: if hasattr(file, 'close'): file.close() @@ -2156,42 +2126,7 @@ class CompressionDecider1: def decide(self, path): if self.matcher is not None: return self.matcher.match(path) - return self.compression - - -class CompressionDecider2: - logger = create_logger('borg.debug.file-compression') - - def __init__(self, compression): - self.compression = compression - - def decide(self, chunk): - # nothing fancy here yet: we either use what the metadata says or the default - # later, we can decide based on the chunk data also. - # if we compress the data here to decide, we can even update the chunk data - # and modify the metadata as desired. - compr_spec = chunk.meta.get('compress', self.compression) - if compr_spec.name == 'auto': - # we did not decide yet, use heuristic: - compr_spec, chunk = self.heuristic_lz4(compr_spec, chunk) - return compr_spec, chunk - - def heuristic_lz4(self, compr_args, chunk): - from .compress import get_compressor - meta, data = chunk - lz4 = get_compressor('lz4') - cdata = lz4.compress(data) - data_len = len(data) - cdata_len = len(cdata) - if cdata_len < 0.97 * data_len: - compr_spec = compr_args.spec - else: - # uncompressible - we could have a special "uncompressible compressor" - # that marks such data as uncompressible via compression-type metadata. - compr_spec = CompressionSpec('none') - self.logger.debug("len(data) == %d, len(lz4(data)) == %d, ratio == %.3f, choosing %s", data_len, cdata_len, cdata_len/data_len, compr_spec) - meta['compress'] = compr_spec - return compr_spec, Chunk(data, **meta) + return self.compressor class ErrorIgnoringTextIOWrapper(io.TextIOWrapper): diff --git a/src/borg/key.py b/src/borg/key.py index 3d3cfc53e..d3bf22b15 100644 --- a/src/borg/key.py +++ b/src/borg/key.py @@ -13,14 +13,13 @@ from .logger import create_logger logger = create_logger() from .constants import * # NOQA -from .compress import Compressor, get_compressor +from .compress import Compressor from .crypto import AES, bytes_to_long, bytes_to_int, num_aes_blocks, hmac_sha256, blake2b_256, hkdf_hmac_sha512 from .helpers import Chunk, StableDict from .helpers import Error, IntegrityError from .helpers import yes from .helpers import get_keys_dir, get_security_dir from .helpers import bin_to_hex -from .helpers import CompressionDecider2, CompressionSpec from .item import Key, EncryptedKey from .platform import SaveFile from .nonces import NonceManager @@ -143,8 +142,8 @@ class KeyBase: self.TYPE_STR = bytes([self.TYPE]) self.repository = repository self.target = None # key location file path / repo obj - self.compression_decider2 = CompressionDecider2(CompressionSpec('none')) self.compressor = Compressor('none') # for decompression + self.decompress = self.compressor.decompress self.tam_required = True def id_hash(self, data): @@ -152,10 +151,8 @@ class KeyBase: """ def compress(self, chunk): - compr_args, chunk = self.compression_decider2.decide(chunk) - compressor = Compressor(name=compr_args.name, level=compr_args.spec) meta, data = chunk - data = compressor.compress(data) + data = meta.get('compress', self.compressor).compress(data) return Chunk(data, **meta) def encrypt(self, chunk): @@ -268,7 +265,7 @@ class PlaintextKey(KeyBase): payload = memoryview(data)[1:] if not decompress: return Chunk(payload) - data = self.compressor.decompress(payload) + data = self.decompress(payload) self.assert_id(id, data) return Chunk(data) @@ -362,7 +359,7 @@ class AESKeyBase(KeyBase): payload = self.dec_cipher.decrypt(data_view[41:]) if not decompress: return Chunk(payload) - data = self.compressor.decompress(payload) + data = self.decompress(payload) self.assert_id(id, data) return Chunk(data) @@ -757,7 +754,7 @@ class AuthenticatedKey(ID_BLAKE2b_256, RepoKey): payload = memoryview(data)[1:] if not decompress: return Chunk(payload) - data = self.compressor.decompress(payload) + data = self.decompress(payload) self.assert_id(id, data) return Chunk(data) diff --git a/src/borg/testsuite/compress.py b/src/borg/testsuite/compress.py index ff9d42713..9bcf595c1 100644 --- a/src/borg/testsuite/compress.py +++ b/src/borg/testsuite/compress.py @@ -7,7 +7,7 @@ except ImportError: import pytest -from ..compress import get_compressor, Compressor, CNONE, ZLIB, LZ4 +from ..compress import get_compressor, Compressor, CompressionSpec, ComprSpec, CNONE, ZLIB, LZ4, LZMA, Auto buffer = bytes(2**16) @@ -107,3 +107,48 @@ def test_compressor(): for params in params_list: c = Compressor(**params) assert data == c.decompress(c.compress(data)) + + +def test_auto(): + compressor = CompressionSpec('auto,zlib,9').compressor + + compressed = compressor.compress(bytes(500)) + assert Compressor.detect(compressed) == ZLIB + + compressed = compressor.compress(b'\x00\xb8\xa3\xa2-O\xe1i\xb6\x12\x03\xc21\xf3\x8a\xf78\\\x01\xa5b\x07\x95\xbeE\xf8\xa3\x9ahm\xb1~') + assert Compressor.detect(compressed) == CNONE + + +def test_compression_specs(): + with pytest.raises(ValueError): + CompressionSpec('') + + assert isinstance(CompressionSpec('none').compressor, CNONE) + assert isinstance(CompressionSpec('lz4').compressor, LZ4) + + zlib = CompressionSpec('zlib').compressor + assert isinstance(zlib, ZLIB) + assert zlib.level == 6 + zlib = CompressionSpec('zlib,0').compressor + assert isinstance(zlib, ZLIB) + assert zlib.level == 0 + zlib = CompressionSpec('zlib,9').compressor + assert isinstance(zlib, ZLIB) + assert zlib.level == 9 + with pytest.raises(ValueError): + CompressionSpec('zlib,9,invalid') + + lzma = CompressionSpec('lzma').compressor + assert isinstance(lzma, LZMA) + assert lzma.level == 6 + lzma = CompressionSpec('lzma,0').compressor + assert isinstance(lzma, LZMA) + assert lzma.level == 0 + lzma = CompressionSpec('lzma,9').compressor + assert isinstance(lzma, LZMA) + assert lzma.level == 9 + + with pytest.raises(ValueError): + CompressionSpec('lzma,9,invalid') + with pytest.raises(ValueError): + CompressionSpec('invalid') diff --git a/src/borg/testsuite/helpers.py b/src/borg/testsuite/helpers.py index 19c5e9c51..b905a18d5 100644 --- a/src/borg/testsuite/helpers.py +++ b/src/borg/testsuite/helpers.py @@ -12,6 +12,7 @@ import msgpack import msgpack.fallback from .. import platform +from ..compress import CompressionSpec from ..helpers import Location from ..helpers import Buffer from ..helpers import partial_format, format_file_size, parse_file_size, format_timedelta, format_line, PlaceholderError, replace_placeholders @@ -24,7 +25,7 @@ from ..helpers import StableDict, int_to_bigint, bigint_to_int, bin_to_hex from ..helpers import parse_timestamp, ChunkIteratorFileWrapper, ChunkerParams, Chunk from ..helpers import ProgressIndicatorPercent, ProgressIndicatorEndless from ..helpers import load_exclude_file, load_pattern_file -from ..helpers import CompressionSpec, ComprSpec, CompressionDecider1, CompressionDecider2 +from ..helpers import CompressionDecider1 from ..helpers import parse_pattern, PatternMatcher from ..helpers import PathFullPattern, PathPrefixPattern, FnmatchPattern, ShellPattern, RegexPattern from ..helpers import swidth_slice @@ -698,25 +699,6 @@ def test_pattern_matcher(): assert PatternMatcher(fallback="hey!").fallback == "hey!" -def test_compression_specs(): - with pytest.raises(ValueError): - CompressionSpec('') - assert CompressionSpec('none') == ComprSpec(name='none', spec=None) - assert CompressionSpec('lz4') == ComprSpec(name='lz4', spec=None) - assert CompressionSpec('zlib') == ComprSpec(name='zlib', spec=6) - assert CompressionSpec('zlib,0') == ComprSpec(name='zlib', spec=0) - assert CompressionSpec('zlib,9') == ComprSpec(name='zlib', spec=9) - with pytest.raises(ValueError): - CompressionSpec('zlib,9,invalid') - assert CompressionSpec('lzma') == ComprSpec(name='lzma', spec=6) - assert CompressionSpec('lzma,0') == ComprSpec(name='lzma', spec=0) - assert CompressionSpec('lzma,9') == ComprSpec(name='lzma', spec=9) - with pytest.raises(ValueError): - CompressionSpec('lzma,9,invalid') - with pytest.raises(ValueError): - CompressionSpec('invalid') - - def test_chunkerparams(): assert ChunkerParams('19,23,21,4095') == (19, 23, 21, 4095) assert ChunkerParams('10,23,16,4095') == (10, 23, 16, 4095) @@ -1242,16 +1224,6 @@ none:*.zip assert cd.decide('test').name == 'zlib' # no match in conf, use default -def test_compression_decider2(): - default = CompressionSpec('zlib') - - cd = CompressionDecider2(default) - compr_spec, chunk = cd.decide(Chunk(None)) - assert compr_spec.name == 'zlib' - compr_spec, chunk = cd.decide(Chunk(None, compress=CompressionSpec('lzma'))) - assert compr_spec.name == 'lzma' - - def test_format_line(): data = dict(foo='bar baz') assert format_line('', data) == ''