compression: use the 2 bytes for type and level, fixes #6698

Adapt borg transfer: transferred chunks are set to compression level "unknown" (0xFF).
2022-05-17 22:54:12 +02:00 · 2022-05-17 22:54:12 +02:00 · 6584a92c81
parent 32a3601e4a
commit 6584a92c81
3 changed files with 42 additions and 29 deletions
--- a/src/borg/archiver.py
+++ b/src/borg/archiver.py
@ -379,8 +379,13 @@ class Archiver:
            return new_item

        def upgrade_compressed_chunk(chunk):
+            level = b'\xFF'  # FF means unknown compression level
            if ZLIB_legacy.detect(chunk):
-                chunk = ZLIB.ID + chunk  # get rid of the attic legacy: prepend separate type bytes for zlib
+                ctype = ZLIB.ID
+                chunk = ctype + level + chunk  # get rid of the attic legacy: prepend separate type/level bytes
+            else:
+                ctype = chunk[0:1]
+                chunk = ctype + level + chunk[2:]  # keep type same, but set level
            return chunk

        dry_run = args.dry_run
--- a/src/borg/compress.pyx
+++ b/src/borg/compress.pyx
@ -56,16 +56,21 @@ cdef class CompressorBase:
    also handles compression format auto detection and
    adding/stripping the ID header (which enable auto detection).
    """
-    ID = b'\xFF\xFF'  # reserved and not used
-                      # overwrite with a unique 2-bytes bytestring in child classes
+    ID = b'\xFF'  # reserved and not used
+                  # overwrite with a unique 1-byte bytestring in child classes
    name = 'baseclass'

    @classmethod
    def detect(cls, data):
        return data.startswith(cls.ID)

-    def __init__(self, **kwargs):
-        pass
+    def __init__(self, level=255, **kwargs):
+        assert 0 <= level <= 255
+        if self.ID is not None:
+            self.id_level = self.ID + bytes((level, ))  # level 255 means "unknown level"
+            assert len(self.id_level) == 2
+        else:
+            self.id_level = None

    def decide(self, data):
        """
@ -85,8 +90,8 @@ cdef class CompressorBase:
        Compress *data* (bytes) and return bytes result. Prepend the ID bytes of this compressor,
        which is needed so that the correct decompressor can be used for decompression.
        """
-        # add ID bytes
-        return self.ID + data
+        # add id_level bytes
+        return self.id_level + data

    def decompress(self, data):
        """
@ -96,7 +101,7 @@ cdef class CompressorBase:
        Only handles input generated by _this_ Compressor - for a general purpose
        decompression method see *Compressor.decompress*.
        """
-        # strip ID bytes
+        # strip id_level bytes
        return data[2:]

 cdef class DecidingCompressor(CompressorBase):
@ -106,8 +111,8 @@ cdef class DecidingCompressor(CompressorBase):
    """
    name = 'decidebaseclass'

-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
+    def __init__(self, level=255, **kwargs):
+        super().__init__(level=level, **kwargs)

    def _decide(self, data):
        """
@ -148,9 +153,12 @@ class CNONE(CompressorBase):
    """
    none - no compression, just pass through data
    """
-    ID = b'\x00\x00'
+    ID = b'\x00'
    name = 'none'

+    def __init__(self, level=255, **kwargs):
+        super().__init__(level=level, **kwargs)  # no defined levels for CNONE, so just say "unknown"
+
    def compress(self, data):
        return super().compress(data)

@ -170,11 +178,11 @@ class LZ4(DecidingCompressor):
        - wrapper releases CPython's GIL to support multithreaded code
        - uses safe lz4 methods that never go beyond the end of the output buffer
    """
-    ID = b'\x01\x00'
+    ID = b'\x01'
    name = 'lz4'

-    def __init__(self, **kwargs):
-        pass
+    def __init__(self, level=255, **kwargs):
+        super().__init__(level=level, **kwargs)  # no defined levels for LZ4, so just say "unknown"

    def _decide(self, idata):
        """
@ -235,11 +243,11 @@ class LZMA(DecidingCompressor):
    """
    lzma compression / decompression
    """
-    ID = b'\x02\x00'
+    ID = b'\x02'
    name = 'lzma'

    def __init__(self, level=6, **kwargs):
-        super().__init__(**kwargs)
+        super().__init__(level=level, **kwargs)
        self.level = level
        if lzma is None:
            raise ValueError('No lzma support found.')
@ -270,11 +278,11 @@ class ZSTD(DecidingCompressor):
    # This is a NOT THREAD SAFE implementation.
    # Only ONE python context must be created at a time.
    # It should work flawlessly as long as borg will call ONLY ONE compression job at time.
-    ID = b'\x03\x00'
+    ID = b'\x03'
    name = 'zstd'

    def __init__(self, level=3, **kwargs):
-        super().__init__(**kwargs)
+        super().__init__(level=level, **kwargs)
        self.level = level

    def _decide(self, idata):
@ -335,11 +343,11 @@ class ZLIB(DecidingCompressor):
    """
    zlib compression / decompression (python stdlib)
    """
-    ID = b'\x05\x00'
+    ID = b'\x05'
    name = 'zlib'

    def __init__(self, level=6, **kwargs):
-        super().__init__(**kwargs)
+        super().__init__(level=level, **kwargs)
        self.level = level

    def _decide(self, data):
@ -373,8 +381,8 @@ class ZLIB_legacy(CompressorBase):
          Newer borg uses the ZLIB class that has separate ID bytes (as all the other
          compressors) and does not need this hack.
    """
-    ID = b'\x08\x00'  # not used here, see detect()
-    # avoid all 0x.8.. IDs elsewhere!
+    ID = b'\x08'  # not used here, see detect()
+    # avoid all 0x.8 IDs elsewhere!
    name = 'zlib_legacy'

    @classmethod
@ -386,7 +394,7 @@ class ZLIB_legacy(CompressorBase):
        return check_ok and is_deflate

    def __init__(self, level=6, **kwargs):
-        super().__init__(**kwargs)
+        super().__init__(level=level, **kwargs)
        self.level = level

    def compress(self, data):
@ -478,14 +486,14 @@ class ObfuscateSize(CompressorBase):
    """
    Meta-Compressor that obfuscates the compressed data size.
    """
-    ID = b'\x04\x00'
+    ID = b'\x04'
    name = 'obfuscate'

    header_fmt = Struct('>I')
    header_len = len(header_fmt.pack(0))

    def __init__(self, level=None, compressor=None):
-        super().__init__()
+        super().__init__(level=level)  # data will be encrypted, so we can tell the level
        self.compressor = compressor
        if level is None:
            pass  # decompression
--- a/src/borg/testsuite/key.py
+++ b/src/borg/testsuite/key.py
@ -256,8 +256,8 @@ class TestKey:
        plaintext = b'123456789'
        id = key.id_hash(plaintext)
        authenticated = key.encrypt(id, plaintext)
-        # 0x07 is the key TYPE, \x0000 identifies no compression.
-        assert authenticated == b'\x07\x00\x00' + plaintext
+        # 0x07 is the key TYPE, 0x00ff identifies no compression / unknown level.
+        assert authenticated == b'\x07\x00\xff' + plaintext

    def test_blake2_authenticated_encrypt(self, monkeypatch):
        monkeypatch.setenv('BORG_PASSPHRASE', 'test')
@ -267,8 +267,8 @@ class TestKey:
        plaintext = b'123456789'
        id = key.id_hash(plaintext)
        authenticated = key.encrypt(id, plaintext)
-        # 0x06 is the key TYPE, 0x0000 identifies no compression.
-        assert authenticated == b'\x06\x00\x00' + plaintext
+        # 0x06 is the key TYPE, 0x00ff identifies no compression / unknown level.
+        assert authenticated == b'\x06\x00\xff' + plaintext


 class TestTAM: