mirror of https://github.com/borgbackup/borg.git
refactor CompressionDecider2 into a meta Compressor
This commit is contained in:
parent
d79da81d22
commit
a27f585eaa
|
@ -21,7 +21,7 @@ logger = create_logger()
|
|||
from . import xattr
|
||||
from .cache import ChunkListEntry
|
||||
from .chunker import Chunker
|
||||
from .compress import Compressor
|
||||
from .compress import Compressor, CompressionSpec
|
||||
from .constants import * # NOQA
|
||||
from .hashindex import ChunkIndex, ChunkIndexEntry
|
||||
from .helpers import Manifest
|
||||
|
@ -36,7 +36,7 @@ from .helpers import bin_to_hex
|
|||
from .helpers import safe_ns
|
||||
from .helpers import ellipsis_truncate, ProgressIndicatorPercent, log_multi
|
||||
from .helpers import PathPrefixPattern, FnmatchPattern
|
||||
from .helpers import CompressionDecider1, CompressionDecider2, CompressionSpec
|
||||
from .helpers import CompressionDecider1
|
||||
from .item import Item, ArchiveItem
|
||||
from .key import key_factory
|
||||
from .platform import acl_get, acl_set, set_flags, get_flags, swidth
|
||||
|
@ -312,7 +312,6 @@ class Archive:
|
|||
self.chunker = Chunker(self.key.chunk_seed, *chunker_params)
|
||||
self.compression_decider1 = CompressionDecider1(compression or CompressionSpec('none'),
|
||||
compression_files or [])
|
||||
key.compression_decider2 = CompressionDecider2(compression or CompressionSpec('none'))
|
||||
if name in manifest.archives:
|
||||
raise self.AlreadyExists(name)
|
||||
self.last_checkpoint = time.monotonic()
|
||||
|
@ -1585,7 +1584,6 @@ class ArchiveRecreater:
|
|||
self.seen_chunks = set()
|
||||
self.compression_decider1 = CompressionDecider1(compression or CompressionSpec('none'),
|
||||
compression_files or [])
|
||||
key.compression_decider2 = CompressionDecider2(compression or CompressionSpec('none'))
|
||||
|
||||
self.dry_run = dry_run
|
||||
self.stats = stats
|
||||
|
@ -1663,12 +1661,11 @@ class ArchiveRecreater:
|
|||
if chunk_id in self.seen_chunks:
|
||||
return self.cache.chunk_incref(chunk_id, target.stats)
|
||||
chunk = Chunk(data, compress=compress)
|
||||
compression_spec, chunk = self.key.compression_decider2.decide(chunk)
|
||||
overwrite = self.recompress
|
||||
if self.recompress and not self.always_recompress and chunk_id in self.cache.chunks:
|
||||
# Check if this chunk is already compressed the way we want it
|
||||
old_chunk = self.key.decrypt(None, self.repository.get(chunk_id), decompress=False)
|
||||
if Compressor.detect(old_chunk.data).name == compression_spec.name:
|
||||
if Compressor.detect(old_chunk.data).name == compress.name:
|
||||
# Stored chunk has the same compression we wanted
|
||||
overwrite = False
|
||||
chunk_entry = self.cache.add_chunk(chunk_id, chunk, target.stats, overwrite=overwrite, wait=False)
|
||||
|
|
|
@ -34,10 +34,11 @@ from .archive import Archive, ArchiveChecker, ArchiveRecreater, Statistics, is_s
|
|||
from .archive import BackupOSError, backup_io
|
||||
from .cache import Cache
|
||||
from .constants import * # NOQA
|
||||
from .compress import CompressionSpec
|
||||
from .crc32 import crc32
|
||||
from .helpers import EXIT_SUCCESS, EXIT_WARNING, EXIT_ERROR
|
||||
from .helpers import Error, NoManifestError, set_ec
|
||||
from .helpers import location_validator, archivename_validator, ChunkerParams, CompressionSpec, ComprSpec
|
||||
from .helpers import location_validator, archivename_validator, ChunkerParams
|
||||
from .helpers import PrefixSpec, SortBySpec, HUMAN_SORT_KEYS
|
||||
from .helpers import BaseFormatter, ItemFormatter, ArchiveFormatter
|
||||
from .helpers import format_time, format_timedelta, format_file_size, format_archive
|
||||
|
@ -107,6 +108,8 @@ def with_repository(fake=False, invert_fake=False, create=False, lock=True, excl
|
|||
with repository:
|
||||
if manifest or cache:
|
||||
kwargs['manifest'], kwargs['key'] = Manifest.load(repository)
|
||||
if args.__dict__.get('compression'):
|
||||
kwargs['key'].compressor = args.compression.compressor
|
||||
if cache:
|
||||
with Cache(repository, kwargs['key'], kwargs['manifest'],
|
||||
do_files=getattr(args, 'cache_files', False),
|
||||
|
@ -2411,7 +2414,7 @@ class Archiver:
|
|||
help='specify the chunker parameters (CHUNK_MIN_EXP, CHUNK_MAX_EXP, '
|
||||
'HASH_MASK_BITS, HASH_WINDOW_SIZE). default: %d,%d,%d,%d' % CHUNKER_PARAMS)
|
||||
archive_group.add_argument('-C', '--compression', dest='compression',
|
||||
type=CompressionSpec, default=ComprSpec(name='lz4', spec=None), metavar='COMPRESSION',
|
||||
type=CompressionSpec, default=CompressionSpec('lz4'), metavar='COMPRESSION',
|
||||
help='select compression algorithm, see the output of the '
|
||||
'"borg help compression" command for details.')
|
||||
archive_group.add_argument('--compression-from', dest='compression_files',
|
||||
|
|
|
@ -1,9 +1,12 @@
|
|||
import zlib
|
||||
from collections import namedtuple
|
||||
|
||||
try:
|
||||
import lzma
|
||||
except ImportError:
|
||||
lzma = None
|
||||
|
||||
from .logger import create_logger
|
||||
from .helpers import Buffer, DecompressionError
|
||||
|
||||
API_VERSION = '1.1_02'
|
||||
|
@ -179,12 +182,50 @@ class ZLIB(CompressorBase):
|
|||
raise DecompressionError(str(e)) from None
|
||||
|
||||
|
||||
class Auto(CompressorBase):
    """
    Meta-Compressor that decides which compression to use based on LZ4's ratio.

    As a meta-Compressor the actual compression is deferred to other Compressors,
    therefore this Compressor has no ID, no detect() and no decompress().
    """

    ID = None
    name = 'auto'

    logger = create_logger('borg.debug.file-compression')

    def __init__(self, compressor):
        """
        :param compressor: compressor instance that compress() delegates to when the
                           LZ4 trial run shrinks the data sufficiently
        """
        super().__init__()
        self.compressor = compressor
        self.lz4 = get_compressor('lz4')
        self.none = get_compressor('none')

    def compress(self, data):
        """
        Compress *data* with a compressor chosen from an LZ4 trial run.

        - LZ4 output smaller than 97% of the input: use the configured compressor
        - LZ4 output smaller than the input at all: keep the LZ4 output
        - otherwise: use the 'none' compressor

        :param data: the data to compress
        :return: whatever the selected compressor's compress() returns
        """
        lz4_data = self.lz4.compress(data)
        data_len = len(data)
        lz4_len = len(lz4_data)
        if lz4_len < 0.97 * data_len:
            return self.compressor.compress(data)
        if lz4_len < data_len:
            # the trial result is already a usable (if small) win, do not waste it
            return lz4_data
        return self.none.compress(data)

    def decompress(self, data):
        """Not supported: the selected compressor produced the data, not Auto."""
        raise NotImplementedError

    @classmethod
    def detect(cls, data):
        """Not supported: Auto has no ID of its own that could be detected."""
        # The first parameter is named ``cls``, so this is meant to be a classmethod;
        # without the decorator, Auto.detect(data) failed with a TypeError
        # (missing argument) instead of the intended NotImplementedError.
        raise NotImplementedError
|
||||
|
||||
|
||||
# Maps valid compressor names to their class
COMPRESSOR_TABLE = {
    compressor_cls.name: compressor_cls
    for compressor_cls in (CNONE, LZ4, ZLIB, LZMA, Auto)
}
# List of possible compression types. Does not include Auto, since it is a meta-Compressor.
COMPRESSOR_LIST = [LZ4, CNONE, ZLIB, LZMA]  # check fast stuff first
|
||||
|
||||
def get_compressor(name, **kwargs):
|
||||
|
@ -216,3 +257,37 @@ class Compressor:
|
|||
return cls
|
||||
else:
|
||||
raise ValueError('No decompressor for this data found: %r.', data[:2])
|
||||
|
||||
|
||||
ComprSpec = namedtuple('ComprSpec', ('name', 'spec', 'compressor'))


def CompressionSpec(s):
    """
    Parse a compression specification string into a ComprSpec.

    Forms handled by this function:

    - ``none`` / ``lz4``: no parameters are evaluated
    - ``zlib[,LEVEL]`` / ``lzma[,LEVEL]``: LEVEL is an integer in 0..9, default 6
    - ``auto,ALGO[,LEVEL]``: the compressor for ``ALGO[,LEVEL]`` wrapped in Auto

    :param s: the specification string, e.g. ``'zlib,9'`` or ``'auto,lzma,6'``
    :return: ComprSpec(name, spec, compressor); spec is None (none/lz4), the level
             (zlib/lzma) or the inner ComprSpec (auto)
    :raises ValueError: if the specification is not valid
    """
    # note: str.split() always returns at least one element, so there is no
    # "empty list" case to handle - an empty string yields name == ''.
    values = s.split(',')
    count = len(values)
    # --compression algo[,level]
    name = values[0]
    if name == 'none':
        return ComprSpec(name=name, spec=None, compressor=CNONE())
    if name == 'lz4':
        return ComprSpec(name=name, spec=None, compressor=LZ4())
    if name in ('zlib', 'lzma', ):
        if count > 2:
            raise ValueError('too many parameters in compression spec: %r' % s)
        if count < 2:
            level = 6  # default compression level in py stdlib
        else:
            level = int(values[1])  # raises ValueError itself if not an integer
            if not 0 <= level <= 9:
                raise ValueError('compression level must be in 0..9: %r' % s)
        return ComprSpec(name=name, spec=level, compressor=get_compressor(name, level=level))
    if name == 'auto':
        if not 2 <= count <= 3:
            raise ValueError('auto needs an inner compression spec (auto,ALGO[,LEVEL]): %r' % s)
        inner = CompressionSpec(','.join(values[1:]))
        return ComprSpec(name=name, spec=inner, compressor=Auto(inner.compressor))
    raise ValueError('unknown compression algorithm in spec: %r' % s)
|
||||
|
|
|
@ -726,37 +726,6 @@ def ChunkerParams(s):
|
|||
return int(chunk_min), int(chunk_max), int(chunk_mask), int(window_size)
|
||||
|
||||
|
||||
ComprSpec = namedtuple('ComprSpec', ('name', 'spec'))


def CompressionSpec(s):
    """
    Parse a compression specification string into a ComprSpec.

    Forms handled by this function:

    - ``none`` / ``lz4``: no parameters are evaluated, spec is None
    - ``zlib[,LEVEL]`` / ``lzma[,LEVEL]``: LEVEL is an integer in 0..9, default 6
    - ``auto,ALGO[,LEVEL]``: spec is the ComprSpec parsed from ``ALGO[,LEVEL]``

    :param s: the specification string, e.g. ``'zlib,9'`` or ``'auto,lzma,6'``
    :return: ComprSpec(name, spec)
    :raises ValueError: if the specification is not valid
    """
    # note: str.split() always returns at least one element, so there is no
    # "empty list" case to handle - an empty string yields name == ''.
    values = s.split(',')
    count = len(values)
    # --compression algo[,level]
    name = values[0]
    if name in ('none', 'lz4', ):
        return ComprSpec(name=name, spec=None)
    if name in ('zlib', 'lzma', ):
        if count > 2:
            raise ValueError('too many parameters in compression spec: %r' % s)
        if count < 2:
            level = 6  # default compression level in py stdlib
        else:
            level = int(values[1])  # raises ValueError itself if not an integer
            if not 0 <= level <= 9:
                raise ValueError('compression level must be in 0..9: %r' % s)
        return ComprSpec(name=name, spec=level)
    if name == 'auto':
        if not 2 <= count <= 3:
            raise ValueError('auto needs an inner compression spec (auto,ALGO[,LEVEL]): %r' % s)
        return ComprSpec(name=name, spec=CompressionSpec(','.join(values[1:])))
    raise ValueError('unknown compression algorithm in spec: %r' % s)
|
||||
|
||||
|
||||
def dir_is_cachedir(path):
|
||||
"""Determines whether the specified path is a cache directory (and
|
||||
therefore should potentially be excluded from the backup) according to
|
||||
|
@ -2136,11 +2105,12 @@ class CompressionDecider1:
|
|||
:param compression_files: list of compression config files (e.g. from --compression-from) or
|
||||
a list of other line iterators
|
||||
"""
|
||||
self.compression = compression
|
||||
from .compress import CompressionSpec
|
||||
self.compressor = compression.compressor
|
||||
if not compression_files:
|
||||
self.matcher = None
|
||||
else:
|
||||
self.matcher = PatternMatcher(fallback=compression)
|
||||
self.matcher = PatternMatcher(fallback=compression.compressor)
|
||||
for file in compression_files:
|
||||
try:
|
||||
for line in clean_lines(file):
|
||||
|
@ -2148,7 +2118,7 @@ class CompressionDecider1:
|
|||
compr_spec, fn_pattern = line.split(':', 1)
|
||||
except:
|
||||
continue
|
||||
self.matcher.add([parse_pattern(fn_pattern)], CompressionSpec(compr_spec))
|
||||
self.matcher.add([parse_pattern(fn_pattern)], CompressionSpec(compr_spec).compressor)
|
||||
finally:
|
||||
if hasattr(file, 'close'):
|
||||
file.close()
|
||||
|
@ -2156,42 +2126,7 @@ class CompressionDecider1:
|
|||
def decide(self, path):
|
||||
if self.matcher is not None:
|
||||
return self.matcher.match(path)
|
||||
return self.compression
|
||||
|
||||
|
||||
class CompressionDecider2:
    """
    Per-chunk compression decision.

    decide() picks the compression spec from the chunk metadata (falling back to the
    default given to the constructor) and resolves an 'auto' spec via an LZ4 trial run.
    """

    logger = create_logger('borg.debug.file-compression')

    def __init__(self, compression):
        """
        :param compression: default compression spec, used for chunks whose metadata
                            has no 'compress' entry
        """
        self.compression = compression

    def decide(self, chunk):
        """
        Determine the compression spec to use for *chunk*.

        :param chunk: chunk whose ``meta`` may carry a 'compress' spec
        :return: tuple (compr_spec, chunk); the chunk is a new one if the heuristic ran
        """
        # nothing fancy here yet: we either use what the metadata says or the default
        # later, we can decide based on the chunk data also.
        # if we compress the data here to decide, we can even update the chunk data
        # and modify the metadata as desired.
        compr_spec = chunk.meta.get('compress', self.compression)
        if compr_spec.name == 'auto':
            # we did not decide yet, use heuristic:
            compr_spec, chunk = self.heuristic_lz4(compr_spec, chunk)
        return compr_spec, chunk

    def heuristic_lz4(self, compr_args, chunk):
        """
        Resolve an 'auto' spec by compressing the chunk data with LZ4.

        If the LZ4 output is smaller than 97% of the input, the spec wrapped by
        *compr_args* is chosen, otherwise 'none'.

        :param compr_args: the 'auto' spec; its ``spec`` attribute is the inner spec
        :param chunk: the chunk, unpacked as (meta, data)
        :return: tuple (compr_spec, chunk) with the decision stored in the chunk's
                 'compress' metadata
        """
        from .compress import get_compressor
        meta, data = chunk
        lz4 = get_compressor('lz4')
        cdata = lz4.compress(data)
        data_len = len(data)
        cdata_len = len(cdata)
        if cdata_len < 0.97 * data_len:
            compr_spec = compr_args.spec
        else:
            # uncompressible - we could have a special "uncompressible compressor"
            # that marks such data as uncompressible via compression-type metadata.
            compr_spec = CompressionSpec('none')
        # the ratio is only needed for logging; do not let empty data crash the
        # decision with a ZeroDivisionError.
        ratio = cdata_len / data_len if data_len else float('inf')
        self.logger.debug("len(data) == %d, len(lz4(data)) == %d, ratio == %.3f, choosing %s", data_len, cdata_len, ratio, compr_spec)
        meta['compress'] = compr_spec
        return compr_spec, Chunk(data, **meta)
|
||||
return self.compressor
|
||||
|
||||
|
||||
class ErrorIgnoringTextIOWrapper(io.TextIOWrapper):
|
||||
|
|
|
@ -13,14 +13,13 @@ from .logger import create_logger
|
|||
logger = create_logger()
|
||||
|
||||
from .constants import * # NOQA
|
||||
from .compress import Compressor, get_compressor
|
||||
from .compress import Compressor
|
||||
from .crypto import AES, bytes_to_long, bytes_to_int, num_aes_blocks, hmac_sha256, blake2b_256, hkdf_hmac_sha512
|
||||
from .helpers import Chunk, StableDict
|
||||
from .helpers import Error, IntegrityError
|
||||
from .helpers import yes
|
||||
from .helpers import get_keys_dir, get_security_dir
|
||||
from .helpers import bin_to_hex
|
||||
from .helpers import CompressionDecider2, CompressionSpec
|
||||
from .item import Key, EncryptedKey
|
||||
from .platform import SaveFile
|
||||
from .nonces import NonceManager
|
||||
|
@ -143,8 +142,8 @@ class KeyBase:
|
|||
self.TYPE_STR = bytes([self.TYPE])
|
||||
self.repository = repository
|
||||
self.target = None # key location file path / repo obj
|
||||
self.compression_decider2 = CompressionDecider2(CompressionSpec('none'))
|
||||
self.compressor = Compressor('none') # for decompression
|
||||
self.decompress = self.compressor.decompress
|
||||
self.tam_required = True
|
||||
|
||||
def id_hash(self, data):
|
||||
|
@ -152,10 +151,8 @@ class KeyBase:
|
|||
"""
|
||||
|
||||
def compress(self, chunk):
|
||||
compr_args, chunk = self.compression_decider2.decide(chunk)
|
||||
compressor = Compressor(name=compr_args.name, level=compr_args.spec)
|
||||
meta, data = chunk
|
||||
data = compressor.compress(data)
|
||||
data = meta.get('compress', self.compressor).compress(data)
|
||||
return Chunk(data, **meta)
|
||||
|
||||
def encrypt(self, chunk):
|
||||
|
@ -268,7 +265,7 @@ class PlaintextKey(KeyBase):
|
|||
payload = memoryview(data)[1:]
|
||||
if not decompress:
|
||||
return Chunk(payload)
|
||||
data = self.compressor.decompress(payload)
|
||||
data = self.decompress(payload)
|
||||
self.assert_id(id, data)
|
||||
return Chunk(data)
|
||||
|
||||
|
@ -362,7 +359,7 @@ class AESKeyBase(KeyBase):
|
|||
payload = self.dec_cipher.decrypt(data_view[41:])
|
||||
if not decompress:
|
||||
return Chunk(payload)
|
||||
data = self.compressor.decompress(payload)
|
||||
data = self.decompress(payload)
|
||||
self.assert_id(id, data)
|
||||
return Chunk(data)
|
||||
|
||||
|
@ -757,7 +754,7 @@ class AuthenticatedKey(ID_BLAKE2b_256, RepoKey):
|
|||
payload = memoryview(data)[1:]
|
||||
if not decompress:
|
||||
return Chunk(payload)
|
||||
data = self.compressor.decompress(payload)
|
||||
data = self.decompress(payload)
|
||||
self.assert_id(id, data)
|
||||
return Chunk(data)
|
||||
|
||||
|
|
|
@ -7,7 +7,7 @@ except ImportError:
|
|||
|
||||
import pytest
|
||||
|
||||
from ..compress import get_compressor, Compressor, CNONE, ZLIB, LZ4
|
||||
from ..compress import get_compressor, Compressor, CompressionSpec, ComprSpec, CNONE, ZLIB, LZ4, LZMA, Auto
|
||||
|
||||
|
||||
buffer = bytes(2**16)
|
||||
|
@ -107,3 +107,48 @@ def test_compressor():
|
|||
for params in params_list:
|
||||
c = Compressor(**params)
|
||||
assert data == c.decompress(c.compress(data))
|
||||
|
||||
|
||||
def test_auto():
    """Check which compressor the 'auto,zlib,9' meta-compressor ends up using."""
    auto = CompressionSpec('auto,zlib,9').compressor

    # 500 zero bytes: expected to be handed to the configured zlib compressor
    zeros = bytes(500)
    assert Compressor.detect(auto.compress(zeros)) == ZLIB

    # short sample for which the 'none' compressor is expected to be chosen
    sample = b'\x00\xb8\xa3\xa2-O\xe1i\xb6\x12\x03\xc21\xf3\x8a\xf78\\\x01\xa5b\x07\x95\xbeE\xf8\xa3\x9ahm\xb1~'
    assert Compressor.detect(auto.compress(sample)) == CNONE
|
||||
|
||||
|
||||
def test_compression_specs():
    """CompressionSpec must build the right compressor (and level) or raise ValueError."""
    with pytest.raises(ValueError):
        CompressionSpec('')

    assert isinstance(CompressionSpec('none').compressor, CNONE)
    assert isinstance(CompressionSpec('lz4').compressor, LZ4)

    # zlib and lzma share the same "algo[,level]" syntax and the same default level
    for algo, expected_cls in (('zlib', ZLIB), ('lzma', LZMA)):
        for spec, expected_level in ((algo, 6), (algo + ',0', 0), (algo + ',9', 9)):
            compressor = CompressionSpec(spec).compressor
            assert isinstance(compressor, expected_cls)
            assert compressor.level == expected_level
        with pytest.raises(ValueError):
            CompressionSpec(algo + ',9,invalid')

    with pytest.raises(ValueError):
        CompressionSpec('invalid')
|
||||
|
|
|
@ -12,6 +12,7 @@ import msgpack
|
|||
import msgpack.fallback
|
||||
|
||||
from .. import platform
|
||||
from ..compress import CompressionSpec
|
||||
from ..helpers import Location
|
||||
from ..helpers import Buffer
|
||||
from ..helpers import partial_format, format_file_size, parse_file_size, format_timedelta, format_line, PlaceholderError, replace_placeholders
|
||||
|
@ -24,7 +25,7 @@ from ..helpers import StableDict, int_to_bigint, bigint_to_int, bin_to_hex
|
|||
from ..helpers import parse_timestamp, ChunkIteratorFileWrapper, ChunkerParams, Chunk
|
||||
from ..helpers import ProgressIndicatorPercent, ProgressIndicatorEndless
|
||||
from ..helpers import load_exclude_file, load_pattern_file
|
||||
from ..helpers import CompressionSpec, ComprSpec, CompressionDecider1, CompressionDecider2
|
||||
from ..helpers import CompressionDecider1
|
||||
from ..helpers import parse_pattern, PatternMatcher
|
||||
from ..helpers import PathFullPattern, PathPrefixPattern, FnmatchPattern, ShellPattern, RegexPattern
|
||||
from ..helpers import swidth_slice
|
||||
|
@ -698,25 +699,6 @@ def test_pattern_matcher():
|
|||
assert PatternMatcher(fallback="hey!").fallback == "hey!"
|
||||
|
||||
|
||||
def test_compression_specs():
    """CompressionSpec must return the expected ComprSpec or raise ValueError."""
    with pytest.raises(ValueError):
        CompressionSpec('')

    for algo in ('none', 'lz4'):
        assert CompressionSpec(algo) == ComprSpec(name=algo, spec=None)

    # zlib and lzma share the same "algo[,level]" syntax and the same default level
    for algo in ('zlib', 'lzma'):
        for spec, expected_level in ((algo, 6), (algo + ',0', 0), (algo + ',9', 9)):
            assert CompressionSpec(spec) == ComprSpec(name=algo, spec=expected_level)
        with pytest.raises(ValueError):
            CompressionSpec(algo + ',9,invalid')

    with pytest.raises(ValueError):
        CompressionSpec('invalid')
|
||||
|
||||
|
||||
def test_chunkerparams():
|
||||
assert ChunkerParams('19,23,21,4095') == (19, 23, 21, 4095)
|
||||
assert ChunkerParams('10,23,16,4095') == (10, 23, 16, 4095)
|
||||
|
@ -1242,16 +1224,6 @@ none:*.zip
|
|||
assert cd.decide('test').name == 'zlib' # no match in conf, use default
|
||||
|
||||
|
||||
def test_compression_decider2():
    """decide() must use the chunk's own 'compress' spec, else the decider's default."""
    decider = CompressionDecider2(CompressionSpec('zlib'))

    # chunk without a 'compress' entry: the default given to the decider applies
    decided, _ = decider.decide(Chunk(None))
    assert decided.name == 'zlib'

    # chunk carrying its own spec: that one takes precedence
    decided, _ = decider.decide(Chunk(None, compress=CompressionSpec('lzma')))
    assert decided.name == 'lzma'
|
||||
|
||||
|
||||
def test_format_line():
|
||||
data = dict(foo='bar baz')
|
||||
assert format_line('', data) == ''
|
||||
|
|
Loading…
Reference in New Issue