1
0
Fork 0
mirror of https://github.com/borgbackup/borg.git synced 2024-12-26 01:37:20 +00:00

implement compression heuristics based on lz4-compressibility, fixes #1006

also: add some tests that invoke all supported compression algorithms
This commit is contained in:
Thomas Waldmann 2016-05-02 21:50:59 +02:00
parent 5ec627bc9b
commit 75b3e786ed
3 changed files with 88 additions and 1 deletions

View file

@ -1359,6 +1359,8 @@ def build_parser(self, args=None, prog=None):
type=CompressionSpec, default=dict(name='none'), metavar='COMPRESSION',
help='select compression algorithm (and level):\n'
'none == no compression (default),\n'
'auto,C[,L] == built-in heuristic decides between none or C[,L] - with C[,L]\n'
' being any valid compression algorithm (and optional level),\n'
'lz4 == lz4,\n'
'zlib == zlib (default level 6),\n'
'zlib,0 .. zlib,9 == zlib (with level 0..9),\n'
@ -1828,6 +1830,8 @@ def build_parser(self, args=None, prog=None):
type=CompressionSpec, default=None, metavar='COMPRESSION',
help='select compression algorithm (and level):\n'
'none == no compression (default),\n'
'auto,C[,L] == built-in heuristic decides between none or C[,L] - with C[,L]\n'
' being any valid compression algorithm (and optional level),\n'
'lz4 == lz4,\n'
'zlib == zlib (default level 6),\n'
'zlib,0 .. zlib,9 == zlib (with level 0..9),\n'

View file

@ -31,7 +31,7 @@
from . import chunker
from .constants import * # NOQA
from . import crypto
from .compress import COMPR_BUFFER
from .compress import COMPR_BUFFER, get_compressor
from . import shellpattern
import msgpack
import msgpack.fallback
@ -530,6 +530,12 @@ def CompressionSpec(s):
else:
raise ValueError
return dict(name=name, level=level)
if name == 'auto':
if 2 <= count <= 3:
compression = ','.join(values[1:])
else:
raise ValueError
return dict(name=name, spec=CompressionSpec(compression))
raise ValueError
@ -1497,4 +1503,23 @@ def decide(self, chunk):
compr_spec = chunk.meta.get('compress', self.compression)
compr_args = dict(buffer=COMPR_BUFFER)
compr_args.update(compr_spec)
if compr_args['name'] == 'auto':
# we did not decide yet, use heuristic:
compr_args, chunk = self.heuristic_lz4(compr_args, chunk)
return compr_args, chunk
def heuristic_lz4(self, compr_args, chunk):
    """Resolve an 'auto' compression request by probing the chunk with lz4.

    The chunk data is compressed once with lz4, only to compare lengths.
    If the lz4 output is shorter than the input, the spec stored under
    compr_args['spec'] is selected; otherwise the 'none' spec is selected.
    The selected spec is merged into compr_args in place.

    Returns a (compr_args, chunk) tuple: the updated compr_args dict and a
    Chunk rebuilt from the original data and metadata.
    """
    meta, data = chunk
    probe = get_compressor('lz4', buffer=compr_args['buffer'])
    probe_len = len(probe.compress(data))
    data_len = len(data)
    if probe_len >= data_len:
        # lz4 did not shrink the data, so treat it as uncompressible.
        # A dedicated "uncompressible compressor" that records this fact in
        # the compression-type metadata would be a possible alternative.
        chosen_spec = CompressionSpec('none')
    else:
        chosen_spec = compr_args['spec']
    compr_args.update(chosen_spec)
    logger.debug("len(data) == %d, len(lz4(data)) == %d, choosing %s", data_len, probe_len, chosen_spec)
    return compr_args, Chunk(data, **meta)

View file

@ -1089,6 +1089,64 @@ def test_list_size(self):
size, csize, path = output.split("\n")[1].split(" ")
assert int(csize) < int(size)
def _get_sizes(self, compression, compressible, size=10000):
    """Archive one generated file with the given compression and return its listed sizes.

    The file content is `size` bytes: a run of b'X' when `compressible` is
    true, otherwise bytes from os.urandom(). A repository is initialised
    without encryption, an archive named 'test' is created from 'input'
    with `-C compression`, and the archive is listed with a
    '{size} {csize} {path}' format.

    Returns (size, csize) as ints, parsed from the second line of the
    list output (presumably the entry of the generated file).
    """
    payload = b'X' * size if compressible else os.urandom(size)
    self.create_regular_file('file', contents=payload)
    self.cmd('init', '--encryption=none', self.repository_location)
    target = self.repository_location + '::test'
    self.cmd('create', '-C', compression, target, 'input')
    listing = self.cmd('list', '--format', '{size} {csize} {path}{NL}', target)
    second_line = listing.split("\n")[1]
    # exactly three space-separated fields are expected on that line
    original_size, stored_size, _path = second_line.split(" ")
    return int(original_size), int(stored_size)
def test_compression_none_compressible(self):
    """With 'none', compressible data is stored at its size plus 3 bytes."""
    original_size, stored_size = self._get_sizes('none', compressible=True)
    assert stored_size >= original_size
    assert stored_size == original_size + 3
def test_compression_none_uncompressible(self):
    """With 'none', random data is stored at its size plus 3 bytes."""
    original_size, stored_size = self._get_sizes('none', compressible=False)
    assert stored_size >= original_size
    assert stored_size == original_size + 3
def test_compression_zlib_compressible(self):
    """With 'zlib', compressible data shrinks below 10% of its size, to 35 bytes."""
    original_size, stored_size = self._get_sizes('zlib', compressible=True)
    assert stored_size < original_size * 0.1
    assert stored_size == 35
def test_compression_zlib_uncompressible(self):
    """With 'zlib', random data does not end up smaller than its original size."""
    original_size, stored_size = self._get_sizes('zlib', compressible=False)
    assert stored_size >= original_size
def test_compression_auto_compressible(self):
    """With 'auto,zlib', compressible data gets the same result as plain 'zlib'."""
    original_size, stored_size = self._get_sizes('auto,zlib', compressible=True)
    assert stored_size < original_size * 0.1
    assert stored_size == 35  # same as compression 'zlib'
def test_compression_auto_uncompressible(self):
    """With 'auto,zlib', random data gets the same result as plain 'none'."""
    original_size, stored_size = self._get_sizes('auto,zlib', compressible=False)
    assert stored_size >= original_size
    assert stored_size == original_size + 3  # same as compression 'none'
def test_compression_lz4_compressible(self):
    """With 'lz4', compressible data shrinks below 10% of its original size."""
    original_size, stored_size = self._get_sizes('lz4', compressible=True)
    assert stored_size < original_size * 0.1
def test_compression_lz4_uncompressible(self):
    """With 'lz4', random data does not end up smaller than its original size."""
    original_size, stored_size = self._get_sizes('lz4', compressible=False)
    assert stored_size >= original_size
def test_compression_lzma_compressible(self):
    """With 'lzma', compressible data shrinks below 10% of its original size."""
    original_size, stored_size = self._get_sizes('lzma', compressible=True)
    assert stored_size < original_size * 0.1
def test_compression_lzma_uncompressible(self):
    """With 'lzma', random data does not end up smaller than its original size."""
    original_size, stored_size = self._get_sizes('lzma', compressible=False)
    assert stored_size >= original_size
def test_break_lock(self):
self.cmd('init', self.repository_location)
self.cmd('break-lock', self.repository_location)