mirror of
https://github.com/borgbackup/borg.git
synced 2024-12-26 01:37:20 +00:00
implement compression heuristics based on lz4-compressibility, fixes #1006
also: add some tests that invoke all supported compression algorithms
This commit is contained in:
parent
5ec627bc9b
commit
75b3e786ed
3 changed files with 88 additions and 1 deletions
|
@ -1359,6 +1359,8 @@ def build_parser(self, args=None, prog=None):
|
|||
type=CompressionSpec, default=dict(name='none'), metavar='COMPRESSION',
|
||||
help='select compression algorithm (and level):\n'
|
||||
'none == no compression (default),\n'
|
||||
'auto,C[,L] == built-in heuristic decides between none or C[,L] - with C[,L]\n'
|
||||
' being any valid compression algorithm (and optional level),\n'
|
||||
'lz4 == lz4,\n'
|
||||
'zlib == zlib (default level 6),\n'
|
||||
'zlib,0 .. zlib,9 == zlib (with level 0..9),\n'
|
||||
|
@ -1828,6 +1830,8 @@ def build_parser(self, args=None, prog=None):
|
|||
type=CompressionSpec, default=None, metavar='COMPRESSION',
|
||||
help='select compression algorithm (and level):\n'
|
||||
'none == no compression (default),\n'
|
||||
'auto,C[,L] == built-in heuristic decides between none or C[,L] - with C[,L]\n'
|
||||
' being any valid compression algorithm (and optional level),\n'
|
||||
'lz4 == lz4,\n'
|
||||
'zlib == zlib (default level 6),\n'
|
||||
'zlib,0 .. zlib,9 == zlib (with level 0..9),\n'
|
||||
|
|
|
@ -31,7 +31,7 @@
|
|||
from . import chunker
|
||||
from .constants import * # NOQA
|
||||
from . import crypto
|
||||
from .compress import COMPR_BUFFER
|
||||
from .compress import COMPR_BUFFER, get_compressor
|
||||
from . import shellpattern
|
||||
import msgpack
|
||||
import msgpack.fallback
|
||||
|
@ -530,6 +530,12 @@ def CompressionSpec(s):
|
|||
else:
|
||||
raise ValueError
|
||||
return dict(name=name, level=level)
|
||||
if name == 'auto':
|
||||
if 2 <= count <= 3:
|
||||
compression = ','.join(values[1:])
|
||||
else:
|
||||
raise ValueError
|
||||
return dict(name=name, spec=CompressionSpec(compression))
|
||||
raise ValueError
|
||||
|
||||
|
||||
|
@ -1497,4 +1503,23 @@ def decide(self, chunk):
|
|||
compr_spec = chunk.meta.get('compress', self.compression)
|
||||
compr_args = dict(buffer=COMPR_BUFFER)
|
||||
compr_args.update(compr_spec)
|
||||
if compr_args['name'] == 'auto':
|
||||
# we did not decide yet, use heuristic:
|
||||
compr_args, chunk = self.heuristic_lz4(compr_args, chunk)
|
||||
return compr_args, chunk
|
||||
|
||||
def heuristic_lz4(self, compr_args, chunk):
    """Resolve an 'auto' compression request by trial-compressing with lz4.

    The chunk's data is compressed once with lz4. If the lz4 output is
    strictly smaller than the input, the spec stored under
    ``compr_args['spec']`` is selected; otherwise the 'none' spec is used.

    :param compr_args: dict of compressor arguments; must contain the keys
        'buffer' and 'spec'. It is updated in place with the chosen spec.
    :param chunk: object that unpacks into ``(meta, data)``.
    :return: tuple of the (updated) ``compr_args`` and a new ``Chunk``
        built from the unchanged data and the original metadata.
    """
    meta, data = chunk
    # the lz4 result is only used as a compressibility probe, it is not returned
    probe = get_compressor('lz4', buffer=compr_args['buffer']).compress(data)
    data_len, cdata_len = len(data), len(probe)
    # Data that lz4 cannot shrink is treated as uncompressible. A dedicated
    # "uncompressible compressor" that records this in the compression-type
    # metadata would be a possible alternative to falling back to 'none'.
    compr_spec = compr_args['spec'] if cdata_len < data_len else CompressionSpec('none')
    compr_args.update(compr_spec)
    logger.debug("len(data) == %d, len(lz4(data)) == %d, choosing %s", data_len, cdata_len, compr_spec)
    return compr_args, Chunk(data, **meta)
|
||||
|
|
|
@ -1089,6 +1089,64 @@ def test_list_size(self):
|
|||
size, csize, path = output.split("\n")[1].split(" ")
|
||||
assert int(csize) < int(size)
|
||||
|
||||
def _get_sizes(self, compression, compressible, size=10000):
    """Archive one generated file and return its listed sizes.

    :param compression: value passed to ``create -C``.
    :param compressible: if true, the file holds ``size`` repetitions of
        ``b'X'``; otherwise ``size`` bytes from ``os.urandom``.
    :param size: length of the generated file contents in bytes.
    :return: ``(size, csize)`` as integers, parsed from the second line of
        the ``list`` command output.
    """
    contents = b'X' * size if compressible else os.urandom(size)
    self.create_regular_file('file', contents=contents)
    self.cmd('init', '--encryption=none', self.repository_location)
    archive = self.repository_location + '::test'
    self.cmd('create', '-C', compression, archive, 'input')
    listing = self.cmd('list', '--format', '{size} {csize} {path}{NL}', archive)
    # the second output line is parsed; it must consist of exactly three space-separated fields
    second_line = listing.split("\n")[1]
    listed_size, listed_csize, _path = second_line.split(" ")
    return int(listed_size), int(listed_csize)
|
||||
|
||||
def test_compression_none_compressible(self):
    """With 'none', compressible data is stored exactly 3 bytes larger than its size."""
    plain_size, stored_size = self._get_sizes('none', compressible=True)
    assert stored_size >= plain_size
    assert stored_size == plain_size + 3
|
||||
|
||||
def test_compression_none_uncompressible(self):
    """With 'none', random data is stored exactly 3 bytes larger than its size."""
    plain_size, stored_size = self._get_sizes('none', compressible=False)
    assert stored_size >= plain_size
    assert stored_size == plain_size + 3
|
||||
|
||||
def test_compression_zlib_compressible(self):
    """With 'zlib', compressible data shrinks below 10% of its size, to exactly 35 bytes."""
    plain_size, stored_size = self._get_sizes('zlib', compressible=True)
    assert stored_size < plain_size * 0.1
    assert stored_size == 35
|
||||
|
||||
def test_compression_zlib_uncompressible(self):
    """With 'zlib', random data is not stored smaller than its original size."""
    plain_size, stored_size = self._get_sizes('zlib', compressible=False)
    assert stored_size >= plain_size
|
||||
|
||||
def test_compression_auto_compressible(self):
    """With 'auto,zlib', compressible data ends up with the same stored size as plain 'zlib'."""
    plain_size, stored_size = self._get_sizes('auto,zlib', compressible=True)
    assert stored_size < plain_size * 0.1
    assert stored_size == 35  # same as compression 'zlib'
|
||||
|
||||
def test_compression_auto_uncompressible(self):
    """With 'auto,zlib', random data ends up with the same stored size as 'none'."""
    plain_size, stored_size = self._get_sizes('auto,zlib', compressible=False)
    assert stored_size >= plain_size
    assert stored_size == plain_size + 3  # same as compression 'none'
|
||||
|
||||
def test_compression_lz4_compressible(self):
    """With 'lz4', compressible data shrinks below 10% of its original size."""
    plain_size, stored_size = self._get_sizes('lz4', compressible=True)
    assert stored_size < plain_size * 0.1
|
||||
|
||||
def test_compression_lz4_uncompressible(self):
    """With 'lz4', random data is not stored smaller than its original size."""
    plain_size, stored_size = self._get_sizes('lz4', compressible=False)
    assert stored_size >= plain_size
|
||||
|
||||
def test_compression_lzma_compressible(self):
    """With 'lzma', compressible data shrinks below 10% of its original size."""
    plain_size, stored_size = self._get_sizes('lzma', compressible=True)
    assert stored_size < plain_size * 0.1
|
||||
|
||||
def test_compression_lzma_uncompressible(self):
    """With 'lzma', random data is not stored smaller than its original size."""
    plain_size, stored_size = self._get_sizes('lzma', compressible=False)
    assert stored_size >= plain_size
|
||||
|
||||
def test_break_lock(self):
|
||||
self.cmd('init', self.repository_location)
|
||||
self.cmd('break-lock', self.repository_location)
|
||||
|
|
Loading…
Reference in a new issue