only store compressed data if the result actually is smaller

- add DecidingCompressor class providing generic decide(), decide_compress() and compress() suited to work using a typical private _decide() implementation that eventually generates compressed data as a side-effect of the actual decision process - the new decide_compress() method returns both the actually used compressor (which may be NONE) and the compressed data to allow the auto compressor to use that information instead of having to figure out whether the LZ4 compressor decided to use NONE or LZ4 compression - derive LZ4, LZMA and ZSTD classes from DecidingCompressor and have them fall back to NONE if the compressed data turns out bigger than the original data - leave ZLIB as is, as it does not use an ID header and compatibility being the only reason to use it instead of above algorithms anyway - adapt testsuite to cope with incompressible test data not being compressed - add tests to verify that incompressible data is not compressed to larger size compressed data
2019-07-14 11:39:58 +02:00 · 2019-07-14 11:39:58 +02:00 · d1e730355d
parent cf8dac55de
commit d1e730355d
5 changed files with 135 additions and 46 deletions
--- a/src/borg/compress.pyx
+++ b/src/borg/compress.pyx
@ -72,7 +72,8 @@ cdef class CompressorBase:
        using Auto compression it needs to determine the _actual_ target compression of a chunk
        in order to detect whether it should be recompressed.

-        For all Compressors that are not Auto this always returns *self*.
+        Any compressor may return a compressor other than *self*, like e.g. the CNONE compressor,
+        and should actually do so if *data* would become larger on compression.
        """
        return self

@ -95,6 +96,50 @@ cdef class CompressorBase:
        # strip ID bytes
        return data[2:]

+cdef class DecidingCompressor(CompressorBase):
+    """
+    base class for (de)compression classes that (based on an internal _decide
+    method) decide whether and how to compress data.
+    """
+    name = 'decidebaseclass'
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def _decide(self, data):
+        """
+        Decides what to do with *data*. Returns (compressor, compressed_data).
+
+        *compressed_data* can be the result of *data* being processed by *compressor*,
+        if that is generated as a side-effect of the decision process, or None otherwise.
+
+        This private method allows for more efficient implementation of compress()
+        and decide_compress() making use of *compressed_data*, if already generated.
+        """
+        raise NotImplementedError
+
+    def decide(self, data):
+        return self._decide(data)[0]
+
+    def decide_compress(self, data):
+        """
+        Decides what to do with *data* and handle accordingly. Returns (compressor, compressed_data).
+
+        *compressed_data* is the result of *data* being processed by *compressor*.
+        """
+        compressor, compressed_data = self._decide(data)
+
+        if compressed_data is None:
+            compressed_data = compressor.compress(data)
+
+        if compressor is self:
+            # call super class to add ID bytes
+            return self, super().compress(compressed_data)
+
+        return compressor, compressed_data
+
+    def compress(self, data):
+        return self.decide_compress(data)[1]

 class CNONE(CompressorBase):
    """
@ -113,7 +158,7 @@ class CNONE(CompressorBase):
        return data


-class LZ4(CompressorBase):
+class LZ4(DecidingCompressor):
    """
    raw LZ4 compression / decompression (liblz4).

@ -128,7 +173,12 @@ class LZ4(CompressorBase):
    def __init__(self, **kwargs):
        pass

-    def compress(self, idata):
+    def _decide(self, idata):
+        """
+        Decides what to do with *data*. Returns (compressor, lz4_data).
+
+        *lz4_data* is the LZ4 result if *compressor* is LZ4 as well, otherwise it is None.
+        """
        if not isinstance(idata, bytes):
            idata = bytes(idata)  # code below does not work with memoryview
        cdef int isize = len(idata)
@ -142,7 +192,11 @@ class LZ4(CompressorBase):
            osize = LZ4_compress_default(source, dest, isize, osize)
        if not osize:
            raise Exception('lz4 compress failed')
-        return super().compress(dest[:osize])
+        # only compress if the result actually is smaller
+        if osize < isize:
+            return self, dest[:osize]
+        else:
+            return NONE_COMPRESSOR, None

    def decompress(self, idata):
        if not isinstance(idata, bytes):
@ -174,7 +228,7 @@ class LZ4(CompressorBase):
        return dest[:rsize]


-class LZMA(CompressorBase):
+class LZMA(DecidingCompressor):
    """
    lzma compression / decompression
    """
@ -187,10 +241,18 @@ class LZMA(CompressorBase):
        if lzma is None:
            raise ValueError('No lzma support found.')

-    def compress(self, data):
+    def _decide(self, data):
+        """
+        Decides what to do with *data*. Returns (compressor, lzma_data).
+
+        *lzma_data* is the LZMA result if *compressor* is LZMA as well, otherwise it is None.
+        """
        # we do not need integrity checks in lzma, we do that already
-        data = lzma.compress(data, preset=self.level, check=lzma.CHECK_NONE)
-        return super().compress(data)
+        lzma_data = lzma.compress(data, preset=self.level, check=lzma.CHECK_NONE)
+        if len(lzma_data) < len(data):
+            return self, lzma_data
+        else:
+            return NONE_COMPRESSOR, None

    def decompress(self, data):
        data = super().decompress(data)
@ -200,7 +262,7 @@ class LZMA(CompressorBase):
            raise DecompressionError(str(e)) from None


-class ZSTD(CompressorBase):
+class ZSTD(DecidingCompressor):
    """zstd compression / decompression (pypi: zstandard, gh: python-zstandard)"""
    # This is a NOT THREAD SAFE implementation.
    # Only ONE python context must to be created at a time.
@ -212,7 +274,12 @@ class ZSTD(CompressorBase):
        super().__init__(**kwargs)
        self.level = level

-    def compress(self, idata):
+    def _decide(self, idata):
+        """
+        Decides what to do with *data*. Returns (compressor, zstd_data).
+
+        *zstd_data* is the ZSTD result if *compressor* is ZSTD as well, otherwise it is None.
+        """
        if not isinstance(idata, bytes):
            idata = bytes(idata)  # code below does not work with memoryview
        cdef int isize = len(idata)
@ -227,7 +294,11 @@ class ZSTD(CompressorBase):
            osize = ZSTD_compress(dest, osize, source, isize, level)
        if ZSTD_isError(osize):
            raise Exception('zstd compress failed: %s' % ZSTD_getErrorName(osize))
-        return super().compress(dest[:osize])
+        # only compress if the result actually is smaller
+        if osize < isize:
+            return self, dest[:osize]
+        else:
+            return NONE_COMPRESSOR, None

    def decompress(self, idata):
        if not isinstance(idata, bytes):
@ -306,44 +377,54 @@ class Auto(CompressorBase):

    def _decide(self, data):
        """
-        Decides what to do with *data*. Returns (compressor, lz4_data).
+        Decides what to do with *data*. Returns (compressor, compressed_data).

-        *lz4_data* is the LZ4 result if *compressor* is LZ4 as well, otherwise it is None.
+        *compressor* is the compressor that is decided to be best suited to compress
+        *data*, *compressed_data* is the result of *data* being compressed by a
+        compressor, which may or may not be *compressor*!
+
+        There are three possible outcomes of the decision process:
+        * *data* compresses well enough for trying the more expensive compressor
+          set on instantiation to make sense.
+          In this case, (expensive_compressor_class, lz4_compressed_data) is
+          returned.
+        * *data* compresses only slightly using the LZ4 compressor, thus trying
+          the more expensive compressor for potentially little gain does not
+          make sense.
+          In this case, (LZ4_COMPRESSOR, lz4_compressed_data) is returned.
+        * *data* does not compress at all using LZ4, in this case
+          (NONE_COMPRESSOR, none_compressed_data) is returned.
+
+        Note: While it makes no sense, the expensive compressor may well be set
+        to the LZ4 compressor.
        """
-        lz4_data = LZ4_COMPRESSOR.compress(data)
-        # lz4_data includes the compression type header, while data does not yet
-        ratio = len(lz4_data) / (len(data) + 2)
+        compressor, compressed_data = LZ4_COMPRESSOR.decide_compress(data)
+        # compressed_data includes the compression type header, while data does not yet
+        ratio = len(compressed_data) / (len(data) + 2)
        if ratio < 0.97:
-            return self.compressor, lz4_data
-        elif ratio < 1:
-            return LZ4_COMPRESSOR, lz4_data
+            return self.compressor, compressed_data
        else:
-            return NONE_COMPRESSOR, None
+            return compressor, compressed_data

    def decide(self, data):
        return self._decide(data)[0]

    def compress(self, data):
-        compressor, lz4_data = self._decide(data)
-        if compressor is LZ4_COMPRESSOR:
+        compressor, cheap_compressed_data = self._decide(data)
+        if compressor in (LZ4_COMPRESSOR, NONE_COMPRESSOR):
            # we know that trying to compress with expensive compressor is likely pointless,
-            # but lz4 managed to at least squeeze the data a bit.
-            return lz4_data
-        if compressor is NONE_COMPRESSOR:
-            # we know that trying to compress with expensive compressor is likely pointless
-            # and also lz4 did not manage to squeeze the data (not even a bit).
-            uncompressed_data = compressor.compress(data)
-            return uncompressed_data
+            # so we fallback to return the cheap compressed data.
+            return cheap_compressed_data
        # if we get here, the decider decided to try the expensive compressor.
-        # we also know that lz4_data is smaller than uncompressed data.
-        exp_compressed_data = compressor.compress(data)
-        ratio = len(exp_compressed_data) / len(lz4_data)
+        # we also know that the compressed data returned by the decider is lz4 compressed.
+        expensive_compressed_data = compressor.compress(data)
+        ratio = len(expensive_compressed_data) / len(cheap_compressed_data)
        if ratio < 0.99:
            # the expensive compressor managed to squeeze the data significantly better than lz4.
-            return exp_compressed_data
+            return expensive_compressed_data
        else:
            # otherwise let's just store the lz4 data, which decompresses extremely fast.
-            return lz4_data
+            return cheap_compressed_data

    def decompress(self, data):
        raise NotImplementedError
--- a/src/borg/remote.py
+++ b/src/borg/remote.py
@ -16,7 +16,7 @@ import traceback
 from subprocess import Popen, PIPE

 from . import __version__
-from .compress import LZ4
+from .compress import Compressor
 from .constants import *  # NOQA
 from .helpers import Error, IntegrityError
 from .helpers import bin_to_hex
@ -1215,7 +1215,7 @@ def cache_if_remote(repository, *, decrypted_cache=False, pack=None, unpack=None
        key = decrypted_cache
        # 32 bit csize, 64 bit (8 byte) xxh64
        cache_struct = struct.Struct('=I8s')
-        compressor = LZ4()
+        compressor = Compressor('lz4')

        def pack(data):
            csize, decrypted = data
--- a/src/borg/testsuite/archiver.py
+++ b/src/borg/testsuite/archiver.py
@ -2140,7 +2140,7 @@ class ArchiverTestCase(ArchiverTestCaseBase):

    def test_compression_lz4_uncompressible(self):
        size, csize = self._get_sizes('lz4', compressible=False)
-        assert csize >= size
+        assert csize == size + 3  # same as compression 'none'

    def test_compression_lzma_compressible(self):
        size, csize = self._get_sizes('lzma', compressible=True)
@ -2148,7 +2148,15 @@ class ArchiverTestCase(ArchiverTestCaseBase):

    def test_compression_lzma_uncompressible(self):
        size, csize = self._get_sizes('lzma', compressible=False)
-        assert csize >= size
+        assert csize == size + 3  # same as compression 'none'
+
+    def test_compression_zstd_compressible(self):
+        size, csize = self._get_sizes('zstd', compressible=True)
+        assert csize < size * 0.1
+
+    def test_compression_zstd_uncompressible(self):
+        size, csize = self._get_sizes('zstd', compressible=False)
+        assert csize == size + 3  # same as compression 'none'

    def test_change_passphrase(self):
        self.cmd('init', '--encryption=repokey', self.repository_location)
--- a/src/borg/testsuite/compress.py
+++ b/src/borg/testsuite/compress.py
@ -43,11 +43,13 @@ def test_lz4():
    assert data == Compressor(**params).decompress(cdata)  # autodetect


-def test_lz4_buffer_allocation():
+def test_lz4_buffer_allocation(monkeypatch):
+    # disable fallback to no compression on incompressible data
+    monkeypatch.setattr(LZ4, 'decide', lambda always_compress: LZ4)
    # test with a rather huge data object to see if buffer allocation / resizing works
    data = os.urandom(5 * 2**20) * 10  # 50MiB badly compressible data
    assert len(data) == 50 * 2**20
-    c = get_compressor(name='lz4')
+    c = Compressor('lz4')
    cdata = c.compress(data)
    assert len(cdata) > len(data)
    assert data == c.decompress(cdata)
--- a/src/borg/testsuite/key.py
+++ b/src/borg/testsuite/key.py
@ -264,9 +264,8 @@ class TestKey:
        assert len(key.id_key) == 32
        plaintext = b'123456789'
        authenticated = key.encrypt(plaintext)
-        # 0x07 is the key TYPE, 0x0100 identifies LZ4 compression, 0x90 is part of LZ4 and means that an uncompressed
-        # block of length nine follows (the plaintext).
-        assert authenticated == b'\x07\x01\x00\x90' + plaintext
+        # 0x07 is the key TYPE, \x0000 identifies no compression.
+        assert authenticated == b'\x07\x00\x00' + plaintext

    def test_blake2_authenticated_encrypt(self, monkeypatch):
        monkeypatch.setenv('BORG_PASSPHRASE', 'test')
@ -275,9 +274,8 @@ class TestKey:
        assert len(key.id_key) == 128
        plaintext = b'123456789'
        authenticated = key.encrypt(plaintext)
-        # 0x06 is the key TYPE, 0x0100 identifies LZ4 compression, 0x90 is part of LZ4 and means that an uncompressed
-        # block of length nine follows (the plaintext).
-        assert authenticated == b'\x06\x01\x00\x90' + plaintext
+        # 0x06 is the key TYPE, 0x0000 identifies no compression.
+        assert authenticated == b'\x06\x00\x00' + plaintext


 class TestPassphrase: