From 75b3e786ed1b28662095e8b6a96fd9b198532f6f Mon Sep 17 00:00:00 2001
From: Thomas Waldmann
Date: Mon, 2 May 2016 21:50:59 +0200
Subject: [PATCH] implement compression heuristics based on lz4-compressibility, fixes #1006

also: add some tests that invoke all supported compression algorithms
---
Reviewer notes (free-form text after the three-dash separator; it is neither
part of the commit message nor of any hunk):

What the patch does
  * CompressionSpec(): accepts 'auto,C[,L]' and returns
    dict(name='auto', spec=CompressionSpec('C[,L]')). When count is not 2 or 3
    it raises ValueError (count and values are computed outside this hunk,
    presumably from the comma-separated parts of the argument).
  * decide(): when the effective spec is named 'auto', it delegates to
    heuristic_lz4() before returning.
  * heuristic_lz4(): compresses the chunk data with lz4 once, only to compare
    lengths. If the lz4 output is shorter than the input, the nested spec is
    merged into compr_args; otherwise CompressionSpec('none') is merged in.
    The lz4 output itself is not returned; the chunk is rebuilt from the same
    data and meta.
  * Tests: _get_sizes() writes a file of `size` bytes (b'X' * size when
    compressible, os.urandom(size) otherwise), archives 'input' with
    '-C <compression>' and returns (size, csize) parsed from the 'list' output.

Points to confirm
  * 'auto,auto,zlib' appears to pass the count check, which would make the
    nested spec itself 'auto'. decide() applies the heuristic only once, so it
    would then return name == 'auto'. Confirm that callers cope with this, or
    reject nested 'auto' in CompressionSpec().
  * compr_args.update(compr_spec) leaves the 'spec' key in the returned dict.
    Confirm that consumers of compr_args tolerate the extra key.
  * 'assert csize >= size' directly before 'assert csize == size + 3' is
    implied by the second assertion.
  * 'assert csize == 35' presumably depends on the exact zlib output. Verify
    that it is stable across zlib versions.

 borg/archiver.py           |  4 +++
 borg/helpers.py            | 27 +++++++++++++++++-
 borg/testsuite/archiver.py | 58 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 88 insertions(+), 1 deletion(-)

diff --git a/borg/archiver.py b/borg/archiver.py
index 949d8fbf7..e99f8e8d6 100644
--- a/borg/archiver.py
+++ b/borg/archiver.py
@@ -1359,6 +1359,8 @@ def build_parser(self, args=None, prog=None):
                                type=CompressionSpec, default=dict(name='none'), metavar='COMPRESSION',
                                help='select compression algorithm (and level):\n'
                                     'none == no compression (default),\n'
+                                    'auto,C[,L] == built-in heuristic decides between none or C[,L] - with C[,L]\n'
+                                    ' being any valid compression algorithm (and optional level),\n'
                                     'lz4 == lz4,\n'
                                     'zlib == zlib (default level 6),\n'
                                     'zlib,0 .. zlib,9 == zlib (with level 0..9),\n'
@@ -1828,6 +1830,8 @@ def build_parser(self, args=None, prog=None):
                                type=CompressionSpec, default=None, metavar='COMPRESSION',
                                help='select compression algorithm (and level):\n'
                                     'none == no compression (default),\n'
+                                    'auto,C[,L] == built-in heuristic decides between none or C[,L] - with C[,L]\n'
+                                    ' being any valid compression algorithm (and optional level),\n'
                                     'lz4 == lz4,\n'
                                     'zlib == zlib (default level 6),\n'
                                     'zlib,0 .. zlib,9 == zlib (with level 0..9),\n'
diff --git a/borg/helpers.py b/borg/helpers.py
index 395e078e0..999da93c5 100644
--- a/borg/helpers.py
+++ b/borg/helpers.py
@@ -31,7 +31,7 @@
 from . import chunker
 from .constants import *  # NOQA
 from . import crypto
-from .compress import COMPR_BUFFER
+from .compress import COMPR_BUFFER, get_compressor
 from . import shellpattern
 import msgpack
 import msgpack.fallback
@@ -530,6 +530,12 @@ def CompressionSpec(s):
         else:
             raise ValueError
         return dict(name=name, level=level)
+    if name == 'auto':
+        if 2 <= count <= 3:
+            compression = ','.join(values[1:])
+        else:
+            raise ValueError
+        return dict(name=name, spec=CompressionSpec(compression))
     raise ValueError
 
 
@@ -1497,4 +1503,23 @@ def decide(self, chunk):
         compr_spec = chunk.meta.get('compress', self.compression)
         compr_args = dict(buffer=COMPR_BUFFER)
         compr_args.update(compr_spec)
+        if compr_args['name'] == 'auto':
+            # we did not decide yet, use heuristic:
+            compr_args, chunk = self.heuristic_lz4(compr_args, chunk)
         return compr_args, chunk
+
+    def heuristic_lz4(self, compr_args, chunk):
+        meta, data = chunk
+        lz4 = get_compressor('lz4', buffer=compr_args['buffer'])
+        cdata = lz4.compress(data)
+        data_len = len(data)
+        cdata_len = len(cdata)
+        if cdata_len < data_len:
+            compr_spec = compr_args['spec']
+        else:
+            # uncompressible - we could have a special "uncompressible compressor"
+            # that marks such data as uncompressible via compression-type metadata.
+            compr_spec = CompressionSpec('none')
+        compr_args.update(compr_spec)
+        logger.debug("len(data) == %d, len(lz4(data)) == %d, choosing %s", data_len, cdata_len, compr_spec)
+        return compr_args, Chunk(data, **meta)
diff --git a/borg/testsuite/archiver.py b/borg/testsuite/archiver.py
index 73632ee95..f5d0d0305 100644
--- a/borg/testsuite/archiver.py
+++ b/borg/testsuite/archiver.py
@@ -1089,6 +1089,64 @@ def test_list_size(self):
         size, csize, path = output.split("\n")[1].split(" ")
         assert int(csize) < int(size)
 
+    def _get_sizes(self, compression, compressible, size=10000):
+        if compressible:
+            contents = b'X' * size
+        else:
+            contents = os.urandom(size)
+        self.create_regular_file('file', contents=contents)
+        self.cmd('init', '--encryption=none', self.repository_location)
+        archive = self.repository_location + '::test'
+        self.cmd('create', '-C', compression, archive, 'input')
+        output = self.cmd('list', '--format', '{size} {csize} {path}{NL}', archive)
+        size, csize, path = output.split("\n")[1].split(" ")
+        return int(size), int(csize)
+
+    def test_compression_none_compressible(self):
+        size, csize = self._get_sizes('none', compressible=True)
+        assert csize >= size
+        assert csize == size + 3
+
+    def test_compression_none_uncompressible(self):
+        size, csize = self._get_sizes('none', compressible=False)
+        assert csize >= size
+        assert csize == size + 3
+
+    def test_compression_zlib_compressible(self):
+        size, csize = self._get_sizes('zlib', compressible=True)
+        assert csize < size * 0.1
+        assert csize == 35
+
+    def test_compression_zlib_uncompressible(self):
+        size, csize = self._get_sizes('zlib', compressible=False)
+        assert csize >= size
+
+    def test_compression_auto_compressible(self):
+        size, csize = self._get_sizes('auto,zlib', compressible=True)
+        assert csize < size * 0.1
+        assert csize == 35  # same as compression 'zlib'
+
+    def test_compression_auto_uncompressible(self):
+        size, csize = self._get_sizes('auto,zlib', compressible=False)
+        assert csize >= size
+        assert csize == size + 3  # same as compression 'none'
+
+    def test_compression_lz4_compressible(self):
+        size, csize = self._get_sizes('lz4', compressible=True)
+        assert csize < size * 0.1
+
+    def test_compression_lz4_uncompressible(self):
+        size, csize = self._get_sizes('lz4', compressible=False)
+        assert csize >= size
+
+    def test_compression_lzma_compressible(self):
+        size, csize = self._get_sizes('lzma', compressible=True)
+        assert csize < size * 0.1
+
+    def test_compression_lzma_uncompressible(self):
+        size, csize = self._get_sizes('lzma', compressible=False)
+        assert csize >= size
+
     def test_break_lock(self):
         self.cmd('init', self.repository_location)
         self.cmd('break-lock', self.repository_location)