From 746984c33b6349b051e84c58b667469ffcd903a3 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sun, 2 Aug 2015 01:21:41 +0200 Subject: [PATCH] compress: add tests, zlib and null compression, ID header and autodetection --- borg/compress.pyx | 163 ++++++++++++++++++++++++++++--------- borg/testsuite/compress.py | 81 ++++++++++++++++++ 2 files changed, 207 insertions(+), 37 deletions(-) create mode 100644 borg/testsuite/compress.py diff --git a/borg/compress.pyx b/borg/compress.pyx index 5bd5fdfcb..1ff00305f 100644 --- a/borg/compress.pyx +++ b/borg/compress.pyx @@ -1,63 +1,91 @@ -""" -A thin liblz4 wrapper for raw LZ4 compression / decompression. - -Features: - - lz4 is super fast - - wrapper releases CPython's GIL to support multithreaded code - - helper buffer only allocated once at instance creation and then reused - -But beware: - - this is not very generic, you MUST know the maximum uncompressed input - data size you will feed into the compressor / get from the decompressor! - - you must not do method calls to the same LZ4 instance from different - threads at the same time - create one LZ4 instance per thread! - - compress returns raw compressed data without adding any frame metadata - (like checksums, magics, length of data, etc.) - - decompress expects such raw compressed data as input -""" +import zlib from libc.stdlib cimport malloc, free cdef extern from "lz4.h": - int LZ4_compressBound(int inputSize) - int LZ4_compress(const char* source, char* dest, int inputSize) nogil + int LZ4_compress_limitedOutput(const char* source, char* dest, int inputSize, int maxOutputSize) nogil int LZ4_decompress_safe(const char* source, char* dest, int inputSize, int maxOutputSize) nogil -cdef class LZ4: +cdef class CompressorBase: + """ + base class for all (de)compression classes, + also handles compression format auto detection and + adding/stripping the ID header (which enable auto detection). + """ + ID = b'\xFF\xFF' # reserved and not used + # overwrite with a unique 2-bytes bytestring in child classes + name = 'baseclass' + + @classmethod + def detect(cls, data): + return data.startswith(cls.ID) + + def __init__(self, **kwargs): + pass + + def compress(self, data): + # add ID bytes + return self.ID + data + + def decompress(self, data): + # strip ID bytes + return data[2:] + + +class CNULL(CompressorBase): + """ + null compression, just pass through data + """ + ID = b'\x00\x00' + name = 'null' + # base class does all we need + + +cdef class LZ4(CompressorBase): + """ + raw LZ4 compression / decompression (liblz4). + + Features: + - lz4 is super fast + - wrapper releases CPython's GIL to support multithreaded code + - buffer given by caller, avoiding frequent reallocation and buffer duplication + - uses safe lz4 methods that never go beyond the end of the output buffer + + But beware: + - this is not very generic, the given buffer MUST be large enough to + handle all compression or decompression output (or it will fail). + - you must not do method calls to the same LZ4 instance from different + threads at the same time - create one LZ4 instance per thread! + """ + ID = b'\x01\x00' + name = 'lz4' + cdef char *buffer # helper buffer for (de)compression output cdef int bufsize # size of this buffer - cdef int max_isize # maximum compressor input size safe for this bufsize - def __cinit__(self, int max_isize): - self.max_isize = max_isize - # compute worst case bufsize for not compressible data: - self.bufsize = LZ4_compressBound(max_isize) - self.buffer = malloc(self.bufsize) - if not self.buffer: - raise MemoryError - - def __dealloc__(self): - free(self.buffer) + def __cinit__(self, **kwargs): + buffer = kwargs['buffer'] + self.buffer = buffer + self.bufsize = len(buffer) def compress(self, idata): cdef int isize = len(idata) - if isize > self.max_isize: - raise Exception('lz4 buffer might be too small, increase max_isize!') - cdef int osize + cdef int osize = self.bufsize cdef char *source = idata cdef char *dest = self.buffer with nogil: - osize = LZ4_compress(source, dest, isize) + osize = LZ4_compress_limitedOutput(source, dest, isize, osize) if not osize: raise Exception('lz4 compress failed') - return dest[:osize] + return super().compress(dest[:osize]) def decompress(self, idata): + idata = super().decompress(idata) cdef int isize = len(idata) cdef int osize = self.bufsize - cdef char *source = idata # <-- does not work for memoryview idata, wants bytes + cdef char *source = idata cdef char *dest = self.buffer with nogil: osize = LZ4_decompress_safe(source, dest, isize, osize) @@ -65,3 +93,64 @@ cdef class LZ4: # malformed input data, buffer too small, ... raise Exception('lz4 decompress failed') return dest[:osize] + + +class ZLIB(CompressorBase): + """ + zlib compression / decompression (python stdlib) + """ + ID = b'\x08\x00' # not used here, see detect() + # avoid all 0x.8.. IDs elsewhere! + name = 'zlib' + + @classmethod + def detect(cls, data): + # matches misc. patterns 0x.8.. used by zlib + cmf, flg = data[:2] + is_deflate = cmf & 0x0f == 8 + check_ok = (cmf * 256 + flg) % 31 == 0 + return check_ok and is_deflate + + def __init__(self, level=6, **kwargs): + super().__init__(**kwargs) + self.level = level + + def compress(self, data): + # note: for compatibility no super call, do not add ID bytes + return zlib.compress(data, self.level) + + def decompress(self, data): + # note: for compatibility no super call, do not strip ID bytes + return zlib.decompress(data) + + +COMPRESSOR_TABLE = { + CNULL.name: CNULL, + LZ4.name: LZ4, + ZLIB.name: ZLIB, +} +COMPRESSOR_LIST = [LZ4, CNULL, ZLIB, ] # check fast stuff first + +def get_compressor(name, **kwargs): + cls = COMPRESSOR_TABLE[name] + return cls(**kwargs) + + +class Compressor: + """ + compresses using a compressor with given name and parameters + decompresses everything we can handle (autodetect) + """ + def __init__(self, name='zlib', **kwargs): + self.params = kwargs + self.compressor = get_compressor(name, **self.params) + + def compress(self, data): + return self.compressor.compress(data) + + def decompress(self, data): + for cls in COMPRESSOR_LIST: + if cls.detect(data): + return cls(**self.params).decompress(data) + else: + raise ValueError('No decompressor for this data found: %r.', data[:2]) diff --git a/borg/testsuite/compress.py b/borg/testsuite/compress.py new file mode 100644 index 000000000..441214e7b --- /dev/null +++ b/borg/testsuite/compress.py @@ -0,0 +1,81 @@ +import zlib + +import pytest + +from ..compress import get_compressor, Compressor, CNULL, ZLIB, LZ4 + + +buffer = bytes(2**16) +data = b'fooooooooobaaaaaaaar' +params = dict(name='zlib', level=6, buffer=buffer) + + +def test_get_compressor(): + c = get_compressor(name='null') + assert isinstance(c, CNULL) + c = get_compressor(name='lz4', buffer=buffer) + assert isinstance(c, LZ4) + c = get_compressor(name='zlib') + assert isinstance(c, ZLIB) + with pytest.raises(KeyError): + get_compressor(name='foobar') + + +def test_cnull(): + c = get_compressor(name='null') + cdata = c.compress(data) + assert len(cdata) > len(data) + assert data in cdata # it's not compressed and just in there 1:1 + assert data == c.decompress(cdata) + assert data == Compressor(**params).decompress(cdata) # autodetect + + +def test_lz4(): + c = get_compressor(name='lz4', buffer=buffer) + cdata = c.compress(data) + assert len(cdata) < len(data) + assert data == c.decompress(cdata) + assert data == Compressor(**params).decompress(cdata) # autodetect + + +def test_zlib(): + c = get_compressor(name='zlib') + cdata = c.compress(data) + assert len(cdata) < len(data) + assert data == c.decompress(cdata) + assert data == Compressor(**params).decompress(cdata) # autodetect + + +def test_autodetect_invalid(): + with pytest.raises(ValueError): + Compressor(**params).decompress(b'\xff\xfftotalcrap') + with pytest.raises(ValueError): + Compressor(**params).decompress(b'\x08\x00notreallyzlib') + + +def test_zlib_compat(): + # for compatibility reasons, we do not add an extra header for zlib, + # nor do we expect one when decompressing / autodetecting + for level in range(10): + c = get_compressor(name='zlib', level=level) + cdata1 = c.compress(data) + cdata2 = zlib.compress(data, level) + assert cdata1 == cdata2 + data2 = c.decompress(cdata2) + assert data == data2 + data2 = Compressor(**params).decompress(cdata2) + assert data == data2 + + +def test_compressor(): + for params in [ + dict(name='null', buffer=buffer), + dict(name='lz4', buffer=buffer), + dict(name='zlib', level=0, buffer=buffer), + dict(name='zlib', level=6, buffer=buffer), + dict(name='zlib', level=9, buffer=buffer), + ]: + c = Compressor(**params) + assert data == c.decompress(c.compress(data)) + +