refactor CompressionDecider2 into a meta Compressor

Marian Beermann 2017-03-31 12:02:30 +02:00
parent d79da81d22
commit a27f585eaa
7 changed files with 142 additions and 118 deletions
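At a glance (a hedged sketch using only names introduced in the diff below): CompressionSpec() now returns a ComprSpec namedtuple that already carries a usable compressor instance, the 'auto' mode becomes the Auto meta-compressor in the compress module, and the key object gets a plain .compressor attribute instead of a CompressionDecider2.

from borg.compress import CompressionSpec

spec = CompressionSpec('zlib,9')
spec.name, spec.spec              # ('zlib', 9)
type(spec.compressor).__name__    # 'ZLIB' - ready to use, level already applied

spec = CompressionSpec('auto,lzma,6')
type(spec.compressor).__name__    # 'Auto' - wraps LZMA(level=6), decides per chunk via an LZ4 trial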

View File

@ -21,7 +21,7 @@ logger = create_logger()
from . import xattr
from .cache import ChunkListEntry
from .chunker import Chunker
from .compress import Compressor
from .compress import Compressor, CompressionSpec
from .constants import * # NOQA
from .hashindex import ChunkIndex, ChunkIndexEntry
from .helpers import Manifest
@ -36,7 +36,7 @@ from .helpers import bin_to_hex
from .helpers import safe_ns
from .helpers import ellipsis_truncate, ProgressIndicatorPercent, log_multi
from .helpers import PathPrefixPattern, FnmatchPattern
from .helpers import CompressionDecider1, CompressionDecider2, CompressionSpec
from .helpers import CompressionDecider1
from .item import Item, ArchiveItem
from .key import key_factory
from .platform import acl_get, acl_set, set_flags, get_flags, swidth
@ -312,7 +312,6 @@ class Archive:
self.chunker = Chunker(self.key.chunk_seed, *chunker_params)
self.compression_decider1 = CompressionDecider1(compression or CompressionSpec('none'),
compression_files or [])
key.compression_decider2 = CompressionDecider2(compression or CompressionSpec('none'))
if name in manifest.archives:
raise self.AlreadyExists(name)
self.last_checkpoint = time.monotonic()
@ -1585,7 +1584,6 @@ class ArchiveRecreater:
self.seen_chunks = set()
self.compression_decider1 = CompressionDecider1(compression or CompressionSpec('none'),
compression_files or [])
key.compression_decider2 = CompressionDecider2(compression or CompressionSpec('none'))
self.dry_run = dry_run
self.stats = stats
@ -1663,12 +1661,11 @@ class ArchiveRecreater:
if chunk_id in self.seen_chunks:
return self.cache.chunk_incref(chunk_id, target.stats)
chunk = Chunk(data, compress=compress)
compression_spec, chunk = self.key.compression_decider2.decide(chunk)
overwrite = self.recompress
if self.recompress and not self.always_recompress and chunk_id in self.cache.chunks:
# Check if this chunk is already compressed the way we want it
old_chunk = self.key.decrypt(None, self.repository.get(chunk_id), decompress=False)
if Compressor.detect(old_chunk.data).name == compression_spec.name:
if Compressor.detect(old_chunk.data).name == compress.name:
# Stored chunk has the same compression we wanted
overwrite = False
chunk_entry = self.cache.add_chunk(chunk_id, chunk, target.stats, overwrite=overwrite, wait=False)
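For context, a hedged sketch of the recompression check above: Compressor.detect() inspects the stored, still-compressed chunk data and returns the compressor class that produced it, so recreate only rewrites a chunk when the stored algorithm differs from the one now requested.

from borg.compress import Compressor, CompressionSpec

compress = CompressionSpec('zlib,9').compressor                   # what --compression asked for
old_cdata = Compressor('lz4').compress(b'previously stored chunk payload')
overwrite = Compressor.detect(old_cdata).name != compress.name    # True here: stored lz4 != wanted zlib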

View File

@ -34,10 +34,11 @@ from .archive import Archive, ArchiveChecker, ArchiveRecreater, Statistics, is_s
from .archive import BackupOSError, backup_io
from .cache import Cache
from .constants import * # NOQA
from .compress import CompressionSpec
from .crc32 import crc32
from .helpers import EXIT_SUCCESS, EXIT_WARNING, EXIT_ERROR
from .helpers import Error, NoManifestError, set_ec
from .helpers import location_validator, archivename_validator, ChunkerParams, CompressionSpec, ComprSpec
from .helpers import location_validator, archivename_validator, ChunkerParams
from .helpers import PrefixSpec, SortBySpec, HUMAN_SORT_KEYS
from .helpers import BaseFormatter, ItemFormatter, ArchiveFormatter
from .helpers import format_time, format_timedelta, format_file_size, format_archive
@ -107,6 +108,8 @@ def with_repository(fake=False, invert_fake=False, create=False, lock=True, excl
with repository:
if manifest or cache:
kwargs['manifest'], kwargs['key'] = Manifest.load(repository)
if args.__dict__.get('compression'):
kwargs['key'].compressor = args.compression.compressor
if cache:
with Cache(repository, kwargs['key'], kwargs['manifest'],
do_files=getattr(args, 'cache_files', False),
@ -2411,7 +2414,7 @@ class Archiver:
help='specify the chunker parameters (CHUNK_MIN_EXP, CHUNK_MAX_EXP, '
'HASH_MASK_BITS, HASH_WINDOW_SIZE). default: %d,%d,%d,%d' % CHUNKER_PARAMS)
archive_group.add_argument('-C', '--compression', dest='compression',
type=CompressionSpec, default=ComprSpec(name='lz4', spec=None), metavar='COMPRESSION',
type=CompressionSpec, default=CompressionSpec('lz4'), metavar='COMPRESSION',
help='select compression algorithm, see the output of the '
'"borg help compression" command for details.')
archive_group.add_argument('--compression-from', dest='compression_files',
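A small self-contained sketch of the wiring above (the parser here is standalone and hypothetical; the type=CompressionSpec / .compressor mechanics are the ones in the diff): the CLI parses -C into a ComprSpec, and with_repository() hands its compressor to the key.

import argparse
from borg.compress import CompressionSpec

parser = argparse.ArgumentParser()
parser.add_argument('-C', '--compression', dest='compression',
                    type=CompressionSpec, default=CompressionSpec('lz4'))
args = parser.parse_args(['-C', 'auto,zlib,9'])

args.compression.name           # 'auto'
args.compression.compressor     # the Auto instance that with_repository() assigns to key.compressor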

View File

@ -1,9 +1,12 @@
import zlib
from collections import namedtuple
try:
import lzma
except ImportError:
lzma = None
from .logger import create_logger
from .helpers import Buffer, DecompressionError
API_VERSION = '1.1_02'
@ -179,12 +182,50 @@ class ZLIB(CompressorBase):
raise DecompressionError(str(e)) from None
class Auto(CompressorBase):
"""
Meta-Compressor that decides which compression to use based on LZ4's ratio.
As a meta-Compressor the actual compression is deferred to other Compressors,
therefore this Compressor has no ID, no detect() and no decompress().
"""
ID = None
name = 'auto'
logger = create_logger('borg.debug.file-compression')
def __init__(self, compressor):
super().__init__()
self.compressor = compressor
self.lz4 = get_compressor('lz4')
self.none = get_compressor('none')
def compress(self, data):
lz4_data = self.lz4.compress(data)
if len(lz4_data) < 0.97 * len(data):
return self.compressor.compress(data)
elif len(lz4_data) < len(data):
return lz4_data
else:
return self.none.compress(data)
def decompress(self, data):
raise NotImplementedError
def detect(cls, data):
raise NotImplementedError
# Maps valid compressor names to their class
COMPRESSOR_TABLE = {
CNONE.name: CNONE,
LZ4.name: LZ4,
ZLIB.name: ZLIB,
LZMA.name: LZMA,
Auto.name: Auto,
}
# List of possible compression types. Does not include Auto, since it is a meta-Compressor.
COMPRESSOR_LIST = [LZ4, CNONE, ZLIB, LZMA, ] # check fast stuff first
def get_compressor(name, **kwargs):
@ -216,3 +257,37 @@ class Compressor:
return cls
else:
raise ValueError('No decompressor for this data found: %r.', data[:2])
ComprSpec = namedtuple('ComprSpec', ('name', 'spec', 'compressor'))
def CompressionSpec(s):
values = s.split(',')
count = len(values)
if count < 1:
raise ValueError
# --compression algo[,level]
name = values[0]
if name == 'none':
return ComprSpec(name=name, spec=None, compressor=CNONE())
elif name == 'lz4':
return ComprSpec(name=name, spec=None, compressor=LZ4())
if name in ('zlib', 'lzma', ):
if count < 2:
level = 6 # default compression level in py stdlib
elif count == 2:
level = int(values[1])
if not 0 <= level <= 9:
raise ValueError
else:
raise ValueError
return ComprSpec(name=name, spec=level, compressor=get_compressor(name, level=level))
if name == 'auto':
if 2 <= count <= 3:
compression = ','.join(values[1:])
else:
raise ValueError
inner = CompressionSpec(compression)
return ComprSpec(name=name, spec=inner, compressor=Auto(inner.compressor))
raise ValueError
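To make the auto thresholds concrete, a hedged usage sketch mirroring Auto.compress() above: an LZ4 trial result below 97% of the original size hands the data to the configured compressor, a result between 97% and 100% keeps the cheap LZ4 output, and anything not smaller than the input is stored via 'none'.

import os
from borg.compress import Compressor, CompressionSpec, ZLIB, CNONE

auto = CompressionSpec('auto,zlib,9').compressor     # Auto wrapping ZLIB(level=9)

cdata = auto.compress(bytes(2**16))                  # zeros: LZ4 ratio far below 0.97 -> zlib,9
assert Compressor.detect(cdata) == ZLIB

cdata = auto.compress(os.urandom(2**16))             # random bytes: LZ4 cannot shrink them -> 'none'
assert Compressor.detect(cdata) == CNONE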

View File

@ -726,37 +726,6 @@ def ChunkerParams(s):
return int(chunk_min), int(chunk_max), int(chunk_mask), int(window_size)
ComprSpec = namedtuple('ComprSpec', ('name', 'spec'))
def CompressionSpec(s):
values = s.split(',')
count = len(values)
if count < 1:
raise ValueError
# --compression algo[,level]
name = values[0]
if name in ('none', 'lz4', ):
return ComprSpec(name=name, spec=None)
if name in ('zlib', 'lzma', ):
if count < 2:
level = 6 # default compression level in py stdlib
elif count == 2:
level = int(values[1])
if not 0 <= level <= 9:
raise ValueError
else:
raise ValueError
return ComprSpec(name=name, spec=level)
if name == 'auto':
if 2 <= count <= 3:
compression = ','.join(values[1:])
else:
raise ValueError
return ComprSpec(name=name, spec=CompressionSpec(compression))
raise ValueError
def dir_is_cachedir(path):
"""Determines whether the specified path is a cache directory (and
therefore should potentially be excluded from the backup) according to
@ -2136,11 +2105,12 @@ class CompressionDecider1:
:param compression_files: list of compression config files (e.g. from --compression-from) or
a list of other line iterators
"""
self.compression = compression
from .compress import CompressionSpec
self.compressor = compression.compressor
if not compression_files:
self.matcher = None
else:
self.matcher = PatternMatcher(fallback=compression)
self.matcher = PatternMatcher(fallback=compression.compressor)
for file in compression_files:
try:
for line in clean_lines(file):
@ -2148,7 +2118,7 @@ class CompressionDecider1:
compr_spec, fn_pattern = line.split(':', 1)
except:
continue
self.matcher.add([parse_pattern(fn_pattern)], CompressionSpec(compr_spec))
self.matcher.add([parse_pattern(fn_pattern)], CompressionSpec(compr_spec).compressor)
finally:
if hasattr(file, 'close'):
file.close()
@ -2156,42 +2126,7 @@ class CompressionDecider1:
def decide(self, path):
if self.matcher is not None:
return self.matcher.match(path)
return self.compression
class CompressionDecider2:
logger = create_logger('borg.debug.file-compression')
def __init__(self, compression):
self.compression = compression
def decide(self, chunk):
# nothing fancy here yet: we either use what the metadata says or the default
# later, we can decide based on the chunk data also.
# if we compress the data here to decide, we can even update the chunk data
# and modify the metadata as desired.
compr_spec = chunk.meta.get('compress', self.compression)
if compr_spec.name == 'auto':
# we did not decide yet, use heuristic:
compr_spec, chunk = self.heuristic_lz4(compr_spec, chunk)
return compr_spec, chunk
def heuristic_lz4(self, compr_args, chunk):
from .compress import get_compressor
meta, data = chunk
lz4 = get_compressor('lz4')
cdata = lz4.compress(data)
data_len = len(data)
cdata_len = len(cdata)
if cdata_len < 0.97 * data_len:
compr_spec = compr_args.spec
else:
# uncompressible - we could have a special "uncompressible compressor"
# that marks such data as uncompressible via compression-type metadata.
compr_spec = CompressionSpec('none')
self.logger.debug("len(data) == %d, len(lz4(data)) == %d, ratio == %.3f, choosing %s", data_len, cdata_len, cdata_len/data_len, compr_spec)
meta['compress'] = compr_spec
return compr_spec, Chunk(data, **meta)
return self.compressor
class ErrorIgnoringTextIOWrapper(io.TextIOWrapper):
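CompressionDecider1 still matches paths against a --compression-from style config, but after this change both the per-pattern values and the fallback are compressor instances rather than specs. A minimal sketch (the config lines are made up; the spec:pattern format is the one parsed above):

from borg.compress import CompressionSpec
from borg.helpers import CompressionDecider1

conf = """\
none:*.zip
zlib,9:*.txt
""".splitlines()

cd = CompressionDecider1(CompressionSpec('lz4'), [conf])
cd.decide('test.zip').name     # 'none'  - matched the first pattern
cd.decide('notes.txt').name    # 'zlib'  - a ZLIB(level=9) instance
cd.decide('test').name         # 'lz4'   - no match, fall back to the default compressor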

View File

@ -13,14 +13,13 @@ from .logger import create_logger
logger = create_logger()
from .constants import * # NOQA
from .compress import Compressor, get_compressor
from .compress import Compressor
from .crypto import AES, bytes_to_long, bytes_to_int, num_aes_blocks, hmac_sha256, blake2b_256, hkdf_hmac_sha512
from .helpers import Chunk, StableDict
from .helpers import Error, IntegrityError
from .helpers import yes
from .helpers import get_keys_dir, get_security_dir
from .helpers import bin_to_hex
from .helpers import CompressionDecider2, CompressionSpec
from .item import Key, EncryptedKey
from .platform import SaveFile
from .nonces import NonceManager
@ -143,8 +142,8 @@ class KeyBase:
self.TYPE_STR = bytes([self.TYPE])
self.repository = repository
self.target = None # key location file path / repo obj
self.compression_decider2 = CompressionDecider2(CompressionSpec('none'))
self.compressor = Compressor('none') # for decompression
self.decompress = self.compressor.decompress
self.tam_required = True
def id_hash(self, data):
@ -152,10 +151,8 @@ class KeyBase:
"""
def compress(self, chunk):
compr_args, chunk = self.compression_decider2.decide(chunk)
compressor = Compressor(name=compr_args.name, level=compr_args.spec)
meta, data = chunk
data = compressor.compress(data)
data = meta.get('compress', self.compressor).compress(data)
return Chunk(data, **meta)
def encrypt(self, chunk):
@ -268,7 +265,7 @@ class PlaintextKey(KeyBase):
payload = memoryview(data)[1:]
if not decompress:
return Chunk(payload)
data = self.compressor.decompress(payload)
data = self.decompress(payload)
self.assert_id(id, data)
return Chunk(data)
@ -362,7 +359,7 @@ class AESKeyBase(KeyBase):
payload = self.dec_cipher.decrypt(data_view[41:])
if not decompress:
return Chunk(payload)
data = self.compressor.decompress(payload)
data = self.decompress(payload)
self.assert_id(id, data)
return Chunk(data)
@ -757,7 +754,7 @@ class AuthenticatedKey(ID_BLAKE2b_256, RepoKey):
payload = memoryview(data)[1:]
if not decompress:
return Chunk(payload)
data = self.compressor.decompress(payload)
data = self.decompress(payload)
self.assert_id(id, data)
return Chunk(data)
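The per-chunk override that used to go through CompressionDecider2 now lives in the chunk metadata: meta.get('compress', self.compressor) uses a compressor attached to the chunk, if any, and the key's default otherwise. A hedged sketch (Chunk semantics as used in the archive.py hunk above):

from borg.compress import CompressionSpec
from borg.helpers import Chunk

plain = Chunk(b'payload')                                                 # no override -> key.compressor is used
forced = Chunk(b'payload', compress=CompressionSpec('none').compressor)   # per-chunk override
# KeyBase.compress() then runs: meta.get('compress', self.compressor).compress(data)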

View File

@ -7,7 +7,7 @@ except ImportError:
import pytest
from ..compress import get_compressor, Compressor, CNONE, ZLIB, LZ4
from ..compress import get_compressor, Compressor, CompressionSpec, ComprSpec, CNONE, ZLIB, LZ4, LZMA, Auto
buffer = bytes(2**16)
@ -107,3 +107,48 @@ def test_compressor():
for params in params_list:
c = Compressor(**params)
assert data == c.decompress(c.compress(data))
def test_auto():
compressor = CompressionSpec('auto,zlib,9').compressor
compressed = compressor.compress(bytes(500))
assert Compressor.detect(compressed) == ZLIB
compressed = compressor.compress(b'\x00\xb8\xa3\xa2-O\xe1i\xb6\x12\x03\xc21\xf3\x8a\xf78\\\x01\xa5b\x07\x95\xbeE\xf8\xa3\x9ahm\xb1~')
assert Compressor.detect(compressed) == CNONE
def test_compression_specs():
with pytest.raises(ValueError):
CompressionSpec('')
assert isinstance(CompressionSpec('none').compressor, CNONE)
assert isinstance(CompressionSpec('lz4').compressor, LZ4)
zlib = CompressionSpec('zlib').compressor
assert isinstance(zlib, ZLIB)
assert zlib.level == 6
zlib = CompressionSpec('zlib,0').compressor
assert isinstance(zlib, ZLIB)
assert zlib.level == 0
zlib = CompressionSpec('zlib,9').compressor
assert isinstance(zlib, ZLIB)
assert zlib.level == 9
with pytest.raises(ValueError):
CompressionSpec('zlib,9,invalid')
lzma = CompressionSpec('lzma').compressor
assert isinstance(lzma, LZMA)
assert lzma.level == 6
lzma = CompressionSpec('lzma,0').compressor
assert isinstance(lzma, LZMA)
assert lzma.level == 0
lzma = CompressionSpec('lzma,9').compressor
assert isinstance(lzma, LZMA)
assert lzma.level == 9
with pytest.raises(ValueError):
CompressionSpec('lzma,9,invalid')
with pytest.raises(ValueError):
CompressionSpec('invalid')

View File

@ -12,6 +12,7 @@ import msgpack
import msgpack.fallback
from .. import platform
from ..compress import CompressionSpec
from ..helpers import Location
from ..helpers import Buffer
from ..helpers import partial_format, format_file_size, parse_file_size, format_timedelta, format_line, PlaceholderError, replace_placeholders
@ -24,7 +25,7 @@ from ..helpers import StableDict, int_to_bigint, bigint_to_int, bin_to_hex
from ..helpers import parse_timestamp, ChunkIteratorFileWrapper, ChunkerParams, Chunk
from ..helpers import ProgressIndicatorPercent, ProgressIndicatorEndless
from ..helpers import load_exclude_file, load_pattern_file
from ..helpers import CompressionSpec, ComprSpec, CompressionDecider1, CompressionDecider2
from ..helpers import CompressionDecider1
from ..helpers import parse_pattern, PatternMatcher
from ..helpers import PathFullPattern, PathPrefixPattern, FnmatchPattern, ShellPattern, RegexPattern
from ..helpers import swidth_slice
@ -698,25 +699,6 @@ def test_pattern_matcher():
assert PatternMatcher(fallback="hey!").fallback == "hey!"
def test_compression_specs():
with pytest.raises(ValueError):
CompressionSpec('')
assert CompressionSpec('none') == ComprSpec(name='none', spec=None)
assert CompressionSpec('lz4') == ComprSpec(name='lz4', spec=None)
assert CompressionSpec('zlib') == ComprSpec(name='zlib', spec=6)
assert CompressionSpec('zlib,0') == ComprSpec(name='zlib', spec=0)
assert CompressionSpec('zlib,9') == ComprSpec(name='zlib', spec=9)
with pytest.raises(ValueError):
CompressionSpec('zlib,9,invalid')
assert CompressionSpec('lzma') == ComprSpec(name='lzma', spec=6)
assert CompressionSpec('lzma,0') == ComprSpec(name='lzma', spec=0)
assert CompressionSpec('lzma,9') == ComprSpec(name='lzma', spec=9)
with pytest.raises(ValueError):
CompressionSpec('lzma,9,invalid')
with pytest.raises(ValueError):
CompressionSpec('invalid')
def test_chunkerparams():
assert ChunkerParams('19,23,21,4095') == (19, 23, 21, 4095)
assert ChunkerParams('10,23,16,4095') == (10, 23, 16, 4095)
@ -1242,16 +1224,6 @@ none:*.zip
assert cd.decide('test').name == 'zlib' # no match in conf, use default
def test_compression_decider2():
default = CompressionSpec('zlib')
cd = CompressionDecider2(default)
compr_spec, chunk = cd.decide(Chunk(None))
assert compr_spec.name == 'zlib'
compr_spec, chunk = cd.decide(Chunk(None, compress=CompressionSpec('lzma')))
assert compr_spec.name == 'lzma'
def test_format_line():
data = dict(foo='bar baz')
assert format_line('', data) == ''