only store compressed data if the result actually is smaller

- add DecidingCompressor class providing generic decide(),
  decide_compress() and compress() suited to work using a typical
  private _decide() implementation that eventually generates compressed
  data as a side-effect of the actual decision process
- the new decide_compress() method returns both the actually used
  compressor (which may be NONE) and the compressed data to allow
  the auto compressor to use that information instead of having to
  figure out whether the LZ4 compressor decided to use NONE or LZ4
  compression
- derive LZ4, LZMA and ZSTD classes from DecidingCompressor and
  have them fall back to NONE if the compressed data turns out bigger
  than the original data
- leave ZLIB as is, as it does not use an ID header and compatibility
  being the only reason to use it instead of above algorithms anyway
- adapt testsuite to cope with incompressible test data not being
  compressed
- add tests to verify that incompressible data is not compressed to
  larger size compressed data
This commit is contained in:
Elmar Hoffmann 2019-07-14 11:39:58 +02:00
parent cf8dac55de
commit d1e730355d
5 changed files with 135 additions and 46 deletions

View File

@ -72,7 +72,8 @@ cdef class CompressorBase:
using Auto compression it needs to determine the _actual_ target compression of a chunk
in order to detect whether it should be recompressed.
For all Compressors that are not Auto this always returns *self*.
Any compressor may return a compressor other than *self*, like e.g. the CNONE compressor,
and should actually do so if *data* would become larger on compression.
"""
return self
@ -95,6 +96,50 @@ cdef class CompressorBase:
# strip ID bytes
return data[2:]
cdef class DecidingCompressor(CompressorBase):
"""
base class for (de)compression classes that (based on an internal _decide
method) decide whether and how to compress data.
"""
name = 'decidebaseclass'
def __init__(self, **kwargs):
super().__init__(**kwargs)
def _decide(self, data):
"""
Decides what to do with *data*. Returns (compressor, compressed_data).
*compressed_data* can be the result of *data* being processed by *compressor*,
if that is generated as a side-effect of the decision process, or None otherwise.
This private method allows for more efficient implementation of compress()
and decide_compress() making use of *compressed_data*, if already generated.
"""
raise NotImplementedError
def decide(self, data):
return self._decide(data)[0]
def decide_compress(self, data):
"""
Decides what to do with *data* and handle accordingly. Returns (compressor, compressed_data).
*compressed_data* is the result of *data* being processed by *compressor*.
"""
compressor, compressed_data = self._decide(data)
if compressed_data is None:
compressed_data = compressor.compress(data)
if compressor is self:
# call super class to add ID bytes
return self, super().compress(compressed_data)
return compressor, compressed_data
def compress(self, data):
return self.decide_compress(data)[1]
class CNONE(CompressorBase):
"""
@ -113,7 +158,7 @@ class CNONE(CompressorBase):
return data
class LZ4(CompressorBase):
class LZ4(DecidingCompressor):
"""
raw LZ4 compression / decompression (liblz4).
@ -128,7 +173,12 @@ class LZ4(CompressorBase):
def __init__(self, **kwargs):
pass
def compress(self, idata):
def _decide(self, idata):
"""
Decides what to do with *data*. Returns (compressor, lz4_data).
*lz4_data* is the LZ4 result if *compressor* is LZ4 as well, otherwise it is None.
"""
if not isinstance(idata, bytes):
idata = bytes(idata) # code below does not work with memoryview
cdef int isize = len(idata)
@ -142,7 +192,11 @@ class LZ4(CompressorBase):
osize = LZ4_compress_default(source, dest, isize, osize)
if not osize:
raise Exception('lz4 compress failed')
return super().compress(dest[:osize])
# only compress if the result actually is smaller
if osize < isize:
return self, dest[:osize]
else:
return NONE_COMPRESSOR, None
def decompress(self, idata):
if not isinstance(idata, bytes):
@ -174,7 +228,7 @@ class LZ4(CompressorBase):
return dest[:rsize]
class LZMA(CompressorBase):
class LZMA(DecidingCompressor):
"""
lzma compression / decompression
"""
@ -187,10 +241,18 @@ class LZMA(CompressorBase):
if lzma is None:
raise ValueError('No lzma support found.')
def compress(self, data):
def _decide(self, data):
"""
Decides what to do with *data*. Returns (compressor, lzma_data).
*lzma_data* is the LZMA result if *compressor* is LZMA as well, otherwise it is None.
"""
# we do not need integrity checks in lzma, we do that already
data = lzma.compress(data, preset=self.level, check=lzma.CHECK_NONE)
return super().compress(data)
lzma_data = lzma.compress(data, preset=self.level, check=lzma.CHECK_NONE)
if len(lzma_data) < len(data):
return self, lzma_data
else:
return NONE_COMPRESSOR, None
def decompress(self, data):
data = super().decompress(data)
@ -200,7 +262,7 @@ class LZMA(CompressorBase):
raise DecompressionError(str(e)) from None
class ZSTD(CompressorBase):
class ZSTD(DecidingCompressor):
"""zstd compression / decompression (pypi: zstandard, gh: python-zstandard)"""
# This is a NOT THREAD SAFE implementation.
# Only ONE python context must to be created at a time.
@ -212,7 +274,12 @@ class ZSTD(CompressorBase):
super().__init__(**kwargs)
self.level = level
def compress(self, idata):
def _decide(self, idata):
"""
Decides what to do with *data*. Returns (compressor, zstd_data).
*zstd_data* is the ZSTD result if *compressor* is ZSTD as well, otherwise it is None.
"""
if not isinstance(idata, bytes):
idata = bytes(idata) # code below does not work with memoryview
cdef int isize = len(idata)
@ -227,7 +294,11 @@ class ZSTD(CompressorBase):
osize = ZSTD_compress(dest, osize, source, isize, level)
if ZSTD_isError(osize):
raise Exception('zstd compress failed: %s' % ZSTD_getErrorName(osize))
return super().compress(dest[:osize])
# only compress if the result actually is smaller
if osize < isize:
return self, dest[:osize]
else:
return NONE_COMPRESSOR, None
def decompress(self, idata):
if not isinstance(idata, bytes):
@ -306,44 +377,54 @@ class Auto(CompressorBase):
def _decide(self, data):
"""
Decides what to do with *data*. Returns (compressor, lz4_data).
Decides what to do with *data*. Returns (compressor, compressed_data).
*lz4_data* is the LZ4 result if *compressor* is LZ4 as well, otherwise it is None.
*compressor* is the compressor that is decided to be best suited to compress
*data*, *compressed_data* is the result of *data* being compressed by a
compressor, which may or may not be *compressor*!
There are three possible outcomes of the decision process:
* *data* compresses well enough for trying the more expensive compressor
set on instantiation to make sense.
In this case, (expensive_compressor_class, lz4_compressed_data) is
returned.
* *data* compresses only slightly using the LZ4 compressor, thus trying
the more expensive compressor for potentially little gain does not
make sense.
In this case, (LZ4_COMPRESSOR, lz4_compressed_data) is returned.
* *data* does not compress at all using LZ4, in this case
(NONE_COMPRESSOR, none_compressed_data) is returned.
Note: While it makes no sense, the expensive compressor may well be set
to the LZ4 compressor.
"""
lz4_data = LZ4_COMPRESSOR.compress(data)
# lz4_data includes the compression type header, while data does not yet
ratio = len(lz4_data) / (len(data) + 2)
compressor, compressed_data = LZ4_COMPRESSOR.decide_compress(data)
# compressed_data includes the compression type header, while data does not yet
ratio = len(compressed_data) / (len(data) + 2)
if ratio < 0.97:
return self.compressor, lz4_data
elif ratio < 1:
return LZ4_COMPRESSOR, lz4_data
return self.compressor, compressed_data
else:
return NONE_COMPRESSOR, None
return compressor, compressed_data
def decide(self, data):
return self._decide(data)[0]
def compress(self, data):
compressor, lz4_data = self._decide(data)
if compressor is LZ4_COMPRESSOR:
compressor, cheap_compressed_data = self._decide(data)
if compressor in (LZ4_COMPRESSOR, NONE_COMPRESSOR):
# we know that trying to compress with expensive compressor is likely pointless,
# but lz4 managed to at least squeeze the data a bit.
return lz4_data
if compressor is NONE_COMPRESSOR:
# we know that trying to compress with expensive compressor is likely pointless
# and also lz4 did not manage to squeeze the data (not even a bit).
uncompressed_data = compressor.compress(data)
return uncompressed_data
# so we fallback to return the cheap compressed data.
return cheap_compressed_data
# if we get here, the decider decided to try the expensive compressor.
# we also know that lz4_data is smaller than uncompressed data.
exp_compressed_data = compressor.compress(data)
ratio = len(exp_compressed_data) / len(lz4_data)
# we also know that the compressed data returned by the decider is lz4 compressed.
expensive_compressed_data = compressor.compress(data)
ratio = len(expensive_compressed_data) / len(cheap_compressed_data)
if ratio < 0.99:
# the expensive compressor managed to squeeze the data significantly better than lz4.
return exp_compressed_data
return expensive_compressed_data
else:
# otherwise let's just store the lz4 data, which decompresses extremely fast.
return lz4_data
return cheap_compressed_data
def decompress(self, data):
raise NotImplementedError

View File

@ -16,7 +16,7 @@ import traceback
from subprocess import Popen, PIPE
from . import __version__
from .compress import LZ4
from .compress import Compressor
from .constants import * # NOQA
from .helpers import Error, IntegrityError
from .helpers import bin_to_hex
@ -1215,7 +1215,7 @@ def cache_if_remote(repository, *, decrypted_cache=False, pack=None, unpack=None
key = decrypted_cache
# 32 bit csize, 64 bit (8 byte) xxh64
cache_struct = struct.Struct('=I8s')
compressor = LZ4()
compressor = Compressor('lz4')
def pack(data):
csize, decrypted = data

View File

@ -2140,7 +2140,7 @@ class ArchiverTestCase(ArchiverTestCaseBase):
def test_compression_lz4_uncompressible(self):
size, csize = self._get_sizes('lz4', compressible=False)
assert csize >= size
assert csize == size + 3 # same as compression 'none'
def test_compression_lzma_compressible(self):
size, csize = self._get_sizes('lzma', compressible=True)
@ -2148,7 +2148,15 @@ class ArchiverTestCase(ArchiverTestCaseBase):
def test_compression_lzma_uncompressible(self):
size, csize = self._get_sizes('lzma', compressible=False)
assert csize >= size
assert csize == size + 3 # same as compression 'none'
def test_compression_zstd_compressible(self):
size, csize = self._get_sizes('zstd', compressible=True)
assert csize < size * 0.1
def test_compression_zstd_uncompressible(self):
size, csize = self._get_sizes('zstd', compressible=False)
assert csize == size + 3 # same as compression 'none'
def test_change_passphrase(self):
self.cmd('init', '--encryption=repokey', self.repository_location)

View File

@ -43,11 +43,13 @@ def test_lz4():
assert data == Compressor(**params).decompress(cdata) # autodetect
def test_lz4_buffer_allocation():
def test_lz4_buffer_allocation(monkeypatch):
# disable fallback to no compression on incompressible data
monkeypatch.setattr(LZ4, 'decide', lambda always_compress: LZ4)
# test with a rather huge data object to see if buffer allocation / resizing works
data = os.urandom(5 * 2**20) * 10 # 50MiB badly compressible data
assert len(data) == 50 * 2**20
c = get_compressor(name='lz4')
c = Compressor('lz4')
cdata = c.compress(data)
assert len(cdata) > len(data)
assert data == c.decompress(cdata)

View File

@ -264,9 +264,8 @@ class TestKey:
assert len(key.id_key) == 32
plaintext = b'123456789'
authenticated = key.encrypt(plaintext)
# 0x07 is the key TYPE, 0x0100 identifies LZ4 compression, 0x90 is part of LZ4 and means that an uncompressed
# block of length nine follows (the plaintext).
assert authenticated == b'\x07\x01\x00\x90' + plaintext
# 0x07 is the key TYPE, \x0000 identifies no compression.
assert authenticated == b'\x07\x00\x00' + plaintext
def test_blake2_authenticated_encrypt(self, monkeypatch):
monkeypatch.setenv('BORG_PASSPHRASE', 'test')
@ -275,9 +274,8 @@ class TestKey:
assert len(key.id_key) == 128
plaintext = b'123456789'
authenticated = key.encrypt(plaintext)
# 0x06 is the key TYPE, 0x0100 identifies LZ4 compression, 0x90 is part of LZ4 and means that an uncompressed
# block of length nine follows (the plaintext).
assert authenticated == b'\x06\x01\x00\x90' + plaintext
# 0x06 is the key TYPE, 0x0000 identifies no compression.
assert authenticated == b'\x06\x00\x00' + plaintext
class TestPassphrase: