compress: add tests, zlib and null compression, ID header and autodetection

2025-02-24 15:12:00 +00:00 · 2015-08-02 01:21:41 +02:00 · 2015-08-02 01:21:41 +02:00 · 746984c33b
commit 746984c33b
parent 27de1b0a43
2 changed files with 207 additions and 37 deletions
--- a/borg/compress.pyx
+++ b/borg/compress.pyx
@ -1,63 +1,91 @@
-"""
-A thin liblz4 wrapper for raw LZ4 compression / decompression.
-
-Features:
-    - lz4 is super fast
-    - wrapper releases CPython's GIL to support multithreaded code
-    - helper buffer only allocated once at instance creation and then reused
-
-But beware:
-    - this is not very generic, you MUST know the maximum uncompressed input
-      data size you will feed into the compressor / get from the decompressor!
-    - you must not do method calls to the same LZ4 instance from different
-      threads at the same time - create one LZ4 instance per thread!
-    - compress returns raw compressed data without adding any frame metadata
-      (like checksums, magics, length of data, etc.)
-    - decompress expects such raw compressed data as input
-"""
+import zlib

 from libc.stdlib cimport malloc, free


 cdef extern from "lz4.h":
-    int LZ4_compressBound(int inputSize)
-    int LZ4_compress(const char* source, char* dest, int inputSize) nogil
+    int LZ4_compress_limitedOutput(const char* source, char* dest, int inputSize, int maxOutputSize) nogil
    int LZ4_decompress_safe(const char* source, char* dest, int inputSize, int maxOutputSize) nogil


-cdef class LZ4:
+cdef class CompressorBase:
+    """
+    base class for all (de)compression classes,
+    also handles compression format auto detection and
+    adding/stripping the ID header (which enables auto detection).
+
+    the base implementation does not compress anything: compress() only
+    prepends the class ID and decompress() only drops the first 2 bytes.
+    """
+    ID = b'\xFF\xFF'  # reserved and not used
+                      # overwrite with a unique 2-bytes bytestring in child classes
+    name = 'baseclass'
+
+    @classmethod
+    def detect(cls, data):
+        """
+        return True if data starts with the ID bytes of this class.
+        """
+        return data.startswith(cls.ID)
+
+    def __init__(self, **kwargs):
+        """
+        accept (and ignore) arbitrary keyword arguments.
+        """
+        pass
+
+    def compress(self, data):
+        """
+        return data with the ID bytes of this class prepended.
+        """
+        # add ID bytes
+        return self.ID + data
+
+    def decompress(self, data):
+        """
+        return data without its first 2 bytes (the ID header).
+
+        the header is not verified here, it is just cut off.
+        """
+        # strip ID bytes
+        return data[2:]
+
+
+class CNULL(CompressorBase):
+    """
+    "null" compressor: data is stored 1:1, only the ID header is added.
+    """
+    name = 'null'
+    ID = b'\x00\x00'
+    # compress() / decompress() are inherited unchanged from the base class
+
+
+cdef class LZ4(CompressorBase):
+    """
+    raw LZ4 compression / decompression (liblz4).
+
+    Features:
+        - lz4 is super fast
+        - wrapper releases CPython's GIL to support multithreaded code
+        - buffer given by caller, avoiding frequent reallocation and buffer duplication
+        - uses safe lz4 methods that never go beyond the end of the output buffer
+
+    But beware:
+        - this is not very generic, the given buffer MUST be large enough to
+          handle all compression or decompression output (or it will fail).
+        - you must not do method calls to the same LZ4 instance from different
+          threads at the same time - create one LZ4 instance per thread!
+    """
+    ID = b'\x01\x00'
+    name = 'lz4'
+
    cdef char *buffer  # helper buffer for (de)compression output
    cdef int bufsize  # size of this buffer
-    cdef int max_isize  # maximum compressor input size safe for this bufsize

-    def __cinit__(self, int max_isize):
-        self.max_isize = max_isize
-        # compute worst case bufsize for not compressible data:
-        self.bufsize = LZ4_compressBound(max_isize)
-        self.buffer = <char *>malloc(self.bufsize)
-        if not self.buffer:
-            raise MemoryError
-
-    def __dealloc__(self):
-        free(self.buffer)
+    def __cinit__(self, **kwargs):
+        buffer = kwargs['buffer']
+        self.buffer = buffer
+        self.bufsize = len(buffer)

    def compress(self, idata):
        cdef int isize = len(idata)
-        if isize > self.max_isize:
-            raise Exception('lz4 buffer might be too small, increase max_isize!')
-        cdef int osize
+        cdef int osize = self.bufsize
        cdef char *source = idata
        cdef char *dest = self.buffer
        with nogil:
-            osize = LZ4_compress(source, dest, isize)
+            osize = LZ4_compress_limitedOutput(source, dest, isize, osize)
        if not osize:
            raise Exception('lz4 compress failed')
-        return dest[:osize]
+        return super().compress(dest[:osize])

    def decompress(self, idata):
+        idata = super().decompress(idata)
        cdef int isize = len(idata)
        cdef int osize = self.bufsize
-        cdef char *source = idata  # <-- does not work for memoryview idata, wants bytes
+        cdef char *source = idata
        cdef char *dest = self.buffer
        with nogil:
            osize = LZ4_decompress_safe(source, dest, isize, osize)
@ -65,3 +93,64 @@ cdef class LZ4:
            # malformed input data, buffer too small, ...
            raise Exception('lz4 decompress failed')
        return dest[:osize]
+
+
+class ZLIB(CompressorBase):
+    """
+    zlib compression / decompression (python stdlib)
+
+    for compatibility, the zlib stream is stored as is: compress() adds no
+    ID header and decompress() expects none. detection looks at the first
+    2 bytes of the zlib stream itself instead (see detect()).
+    """
+    ID = b'\x08\x00'  # not used here, see detect()
+                      # avoid all 0x.8.. IDs elsewhere!
+    name = 'zlib'
+
+    @classmethod
+    def detect(cls, data):
+        """
+        return True if the first 2 bytes of data look like a zlib header.
+
+        matches misc. patterns 0x.8.. used by zlib: the low 4 bits of the
+        first byte must be 8 and the first two bytes, read as a big-endian
+        16 bit number, must be a multiple of 31.
+        data shorter than 2 bytes is reported as "not zlib".
+        """
+        # too short to carry both header bytes - unpacking below would fail
+        if len(data) < 2:
+            return False
+        cmf, flg = data[:2]
+        is_deflate = (cmf & 0x0f) == 8
+        check_ok = (cmf * 256 + flg) % 31 == 0
+        return check_ok and is_deflate
+
+    def __init__(self, level=6, **kwargs):
+        """
+        level is passed to zlib.compress() as the compression level,
+        all other keyword arguments go to the base class.
+        """
+        super().__init__(**kwargs)
+        self.level = level
+
+    def compress(self, data):
+        """
+        return data compressed by zlib at self.level (no ID header added).
+        """
+        # note: for compatibility no super call, do not add ID bytes
+        return zlib.compress(data, self.level)
+
+    def decompress(self, data):
+        """
+        return the zlib-decompressed data (no ID header stripped).
+        """
+        # note: for compatibility no super call, do not strip ID bytes
+        return zlib.decompress(data)
+
+
+# lookup of compressor classes by their name attribute
+COMPRESSOR_TABLE = {cls.name: cls for cls in (CNULL, LZ4, ZLIB)}
+# order in which detect() of the classes is tried - check fast stuff first
+COMPRESSOR_LIST = [LZ4, CNULL, ZLIB]
+
+def get_compressor(name, **kwargs):
+    """
+    create an instance of the compressor class registered as name in
+    COMPRESSOR_TABLE, passing kwargs on to its constructor.
+
+    an unknown name raises KeyError.
+    """
+    return COMPRESSOR_TABLE[name](**kwargs)
+
+
+class Compressor:
+    """
+    compresses using a compressor with given name and parameters
+    decompresses everything we can handle (autodetect)
+    """
+    def __init__(self, name='zlib', **kwargs):
+        """
+        name selects the class used for compress() (see get_compressor()),
+        kwargs are given to it and also to any class used by decompress().
+        """
+        self.params = kwargs
+        self.compressor = get_compressor(name, **self.params)
+
+    def compress(self, data):
+        """
+        return data compressed by the configured compressor.
+        """
+        return self.compressor.compress(data)
+
+    def decompress(self, data):
+        """
+        return data decompressed by the first class in COMPRESSOR_LIST
+        whose detect() accepts it.
+
+        raises ValueError if no class accepts the data.
+        """
+        for cls in COMPRESSOR_LIST:
+            if cls.detect(data):
+                return cls(**self.params).decompress(data)
+        # nothing matched; format the first 2 bytes into the message
+        # (wrapped in a tuple so the % operator always sees one argument)
+        raise ValueError('No decompressor for this data found: %r.' % (data[:2],))
--- a/borg/testsuite/compress.py
+++ b/borg/testsuite/compress.py
@ -0,0 +1,81 @@
+import zlib
+
+import pytest
+
+from ..compress import get_compressor, Compressor, CNULL, ZLIB, LZ4
+
+
+# 64 KiB of zero bytes, handed to the compressors as their "buffer" argument
+buffer = bytes(1 << 16)
+# small, repetitive sample input
+data = b'fooooooooobaaaaaaaar'
+# Compressor arguments used by the autodetection checks
+params = {'name': 'zlib', 'level': 6, 'buffer': buffer}
+
+
+def test_get_compressor():
+    """each known name gives an instance of its class, unknown names fail."""
+    expected = [
+        ('null', {}, CNULL),
+        ('lz4', {'buffer': buffer}, LZ4),
+        ('zlib', {}, ZLIB),
+    ]
+    for name, kwargs, cls in expected:
+        assert isinstance(get_compressor(name=name, **kwargs), cls)
+    with pytest.raises(KeyError):
+        get_compressor(name='foobar')
+
+
+def test_cnull():
+    """null compression stores the data 1:1 plus a header and roundtrips."""
+    null = get_compressor(name='null')
+    stored = null.compress(data)
+    assert len(stored) > len(data)
+    # it's not compressed and just in there 1:1
+    assert data in stored
+    assert data == null.decompress(stored)
+    # autodetect
+    assert data == Compressor(**params).decompress(stored)
+
+
+def test_lz4():
+    """lz4 shrinks the sample data and roundtrips, also via autodetection."""
+    lz4 = get_compressor(name='lz4', buffer=buffer)
+    packed = lz4.compress(data)
+    assert len(packed) < len(data)
+    assert data == lz4.decompress(packed)
+    # autodetect
+    assert data == Compressor(**params).decompress(packed)
+
+
+def test_zlib():
+    """zlib shrinks the sample data and roundtrips, also via autodetection."""
+    zl = get_compressor(name='zlib')
+    packed = zl.compress(data)
+    assert len(packed) < len(data)
+    assert data == zl.decompress(packed)
+    # autodetect
+    assert data == Compressor(**params).decompress(packed)
+
+
+def test_autodetect_invalid():
+    """data that no compressor class detects is rejected with ValueError."""
+    for garbage in (b'\xff\xfftotalcrap', b'\x08\x00notreallyzlib'):
+        with pytest.raises(ValueError):
+            Compressor(**params).decompress(garbage)
+
+
+def test_zlib_compat():
+    """
+    for compatibility reasons, we do not add an extra header for zlib,
+    nor do we expect one when decompressing / autodetecting
+    """
+    for level in range(10):
+        zl = get_compressor(name='zlib', level=level)
+        ours = zl.compress(data)
+        stdlib = zlib.compress(data, level)
+        assert ours == stdlib
+        roundtrip = zl.decompress(stdlib)
+        assert data == roundtrip
+        roundtrip = Compressor(**params).decompress(stdlib)
+        assert data == roundtrip
+
+
+def test_compressor():
+    """compress + autodetecting decompress roundtrips for every setup."""
+    setups = [
+        {'name': 'null', 'buffer': buffer},
+        {'name': 'lz4', 'buffer': buffer},
+        {'name': 'zlib', 'level': 0, 'buffer': buffer},
+        {'name': 'zlib', 'level': 6, 'buffer': buffer},
+        {'name': 'zlib', 'level': 9, 'buffer': buffer},
+    ]
+    for setup in setups:
+        c = Compressor(**setup)
+        assert data == c.decompress(c.compress(data))
+
+