mirror of
https://github.com/borgbackup/borg.git
synced 2025-02-24 15:12:00 +00:00
compress: add tests, zlib and null compression, ID header and autodetection
This commit is contained in:
parent
27de1b0a43
commit
746984c33b
2 changed files with 207 additions and 37 deletions
|
@ -1,63 +1,91 @@
|
|||
"""
|
||||
A thin liblz4 wrapper for raw LZ4 compression / decompression.
|
||||
|
||||
Features:
|
||||
- lz4 is super fast
|
||||
- wrapper releases CPython's GIL to support multithreaded code
|
||||
- helper buffer only allocated once at instance creation and then reused
|
||||
|
||||
But beware:
|
||||
- this is not very generic, you MUST know the maximum uncompressed input
|
||||
data size you will feed into the compressor / get from the decompressor!
|
||||
- you must not do method calls to the same LZ4 instance from different
|
||||
threads at the same time - create one LZ4 instance per thread!
|
||||
- compress returns raw compressed data without adding any frame metadata
|
||||
(like checksums, magics, length of data, etc.)
|
||||
- decompress expects such raw compressed data as input
|
||||
"""
|
||||
import zlib
|
||||
|
||||
from libc.stdlib cimport malloc, free
|
||||
|
||||
|
||||
cdef extern from "lz4.h":
|
||||
int LZ4_compressBound(int inputSize)
|
||||
int LZ4_compress(const char* source, char* dest, int inputSize) nogil
|
||||
int LZ4_compress_limitedOutput(const char* source, char* dest, int inputSize, int maxOutputSize) nogil
|
||||
int LZ4_decompress_safe(const char* source, char* dest, int inputSize, int maxOutputSize) nogil
|
||||
|
||||
|
||||
cdef class LZ4:
|
||||
cdef class CompressorBase:
    """
    Common ancestor of all (de)compression classes.

    Implements the ID header handling that makes compression format auto
    detection possible: compress() puts the class ID in front of the data,
    decompress() removes the first two bytes again and detect() recognizes
    data by its leading ID bytes.
    """
    # reserved placeholder value, not used by a real compressor -
    # child classes overwrite it with their own unique 2-bytes bytestring
    ID = b'\xFF\xFF'
    name = 'baseclass'

    @classmethod
    def detect(cls, data):
        """Return whether data starts with the ID bytes of this class."""
        return data.startswith(cls.ID)

    def __init__(self, **kwargs):
        """Accept arbitrary keyword parameters; none of them is used here."""
        pass

    def compress(self, data):
        """Return data with the ID bytes of this class put in front of it."""
        header = self.ID
        return header + data

    def decompress(self, data):
        """Return data without its first two bytes (the ID header)."""
        payload = data[2:]
        return payload
|
||||
|
||||
|
||||
class CNULL(CompressorBase):
    """
    "null" compression: the data is passed through as it is.

    Only ID and name are defined here, all behaviour is inherited from
    CompressorBase.
    """
    ID = b'\x00\x00'
    name = 'null'
|
||||
|
||||
|
||||
cdef class LZ4(CompressorBase):
|
||||
"""
|
||||
raw LZ4 compression / decompression (liblz4).
|
||||
|
||||
Features:
|
||||
- lz4 is super fast
|
||||
- wrapper releases CPython's GIL to support multithreaded code
|
||||
- buffer given by caller, avoiding frequent reallocation and buffer duplication
|
||||
- uses safe lz4 methods that never go beyond the end of the output buffer
|
||||
|
||||
But beware:
|
||||
- this is not very generic, the given buffer MUST be large enough to
|
||||
handle all compression or decompression output (or it will fail).
|
||||
- you must not do method calls to the same LZ4 instance from different
|
||||
threads at the same time - create one LZ4 instance per thread!
|
||||
"""
|
||||
ID = b'\x01\x00'
|
||||
name = 'lz4'
|
||||
|
||||
cdef char *buffer # helper buffer for (de)compression output
|
||||
cdef int bufsize # size of this buffer
|
||||
cdef int max_isize # maximum compressor input size safe for this bufsize
|
||||
|
||||
def __cinit__(self, int max_isize):
|
||||
self.max_isize = max_isize
|
||||
# compute worst case bufsize for not compressible data:
|
||||
self.bufsize = LZ4_compressBound(max_isize)
|
||||
self.buffer = <char *>malloc(self.bufsize)
|
||||
if not self.buffer:
|
||||
raise MemoryError
|
||||
|
||||
def __dealloc__(self):
|
||||
free(self.buffer)
|
||||
def __cinit__(self, **kwargs):
|
||||
buffer = kwargs['buffer']
|
||||
self.buffer = buffer
|
||||
self.bufsize = len(buffer)
|
||||
|
||||
def compress(self, idata):
|
||||
cdef int isize = len(idata)
|
||||
if isize > self.max_isize:
|
||||
raise Exception('lz4 buffer might be too small, increase max_isize!')
|
||||
cdef int osize
|
||||
cdef int osize = self.bufsize
|
||||
cdef char *source = idata
|
||||
cdef char *dest = self.buffer
|
||||
with nogil:
|
||||
osize = LZ4_compress(source, dest, isize)
|
||||
osize = LZ4_compress_limitedOutput(source, dest, isize, osize)
|
||||
if not osize:
|
||||
raise Exception('lz4 compress failed')
|
||||
return dest[:osize]
|
||||
return super().compress(dest[:osize])
|
||||
|
||||
def decompress(self, idata):
|
||||
idata = super().decompress(idata)
|
||||
cdef int isize = len(idata)
|
||||
cdef int osize = self.bufsize
|
||||
cdef char *source = idata # <-- does not work for memoryview idata, wants bytes
|
||||
cdef char *source = idata
|
||||
cdef char *dest = self.buffer
|
||||
with nogil:
|
||||
osize = LZ4_decompress_safe(source, dest, isize, osize)
|
||||
|
@ -65,3 +93,64 @@ cdef class LZ4:
|
|||
# malformed input data, buffer too small, ...
|
||||
raise Exception('lz4 decompress failed')
|
||||
return dest[:osize]
|
||||
|
||||
|
||||
class ZLIB(CompressorBase):
    """
    zlib compression / decompression (python stdlib)

    For compatibility, compress() / decompress() do not call the base class,
    so no ID header is added to or stripped from the zlib data. Detection
    looks at the first two bytes of the zlib stream itself instead.
    """
    ID = b'\x08\x00'  # not used here, see detect()
    # avoid all 0x.8.. IDs elsewhere!
    name = 'zlib'

    @classmethod
    def detect(cls, data):
        """
        Return True if the first two bytes of data look like a zlib header.

        The low nibble of the first byte must be 8 (deflate) and both bytes,
        read as one big-endian 16 bit number, must be a multiple of 31.
        Data shorter than two bytes can not carry such a header, so False is
        returned for it (instead of failing while unpacking the two bytes).
        """
        # matches misc. patterns 0x.8.. used by zlib
        if len(data) < 2:
            return False
        cmf, flg = data[:2]
        is_deflate = (cmf & 0x0f) == 8
        check_ok = (cmf * 256 + flg) % 31 == 0
        return check_ok and is_deflate

    def __init__(self, level=6, **kwargs):
        """
        :param level: compression level handed to zlib.compress()
        :param kwargs: passed on to the base class constructor
        """
        super().__init__(**kwargs)
        self.level = level

    def compress(self, data):
        """Return data compressed by zlib.compress() at the configured level."""
        # note: for compatibility no super call, do not add ID bytes
        return zlib.compress(data, self.level)

    def decompress(self, data):
        """Return the result of zlib.decompress() for data."""
        # note: for compatibility no super call, do not strip ID bytes
        return zlib.decompress(data)
|
||||
|
||||
|
||||
# maps the name attribute of each compressor class to the class itself
COMPRESSOR_TABLE = {cls.name: cls for cls in (CNULL, LZ4, ZLIB)}
COMPRESSOR_LIST = [LZ4, CNULL, ZLIB]  # check fast stuff first
|
||||
|
||||
def get_compressor(name, **kwargs):
    """
    Create an instance of the compressor class registered under name.

    :param name: key into COMPRESSOR_TABLE; an unknown name raises KeyError
    :param kwargs: handed to the constructor of the selected class
    :return: the freshly created compressor instance
    """
    return COMPRESSOR_TABLE[name](**kwargs)
|
||||
|
||||
|
||||
class Compressor:
    """
    compresses using a compressor with given name and parameters
    decompresses everything we can handle (autodetect)
    """
    def __init__(self, name='zlib', **kwargs):
        """
        :param name: name of the compressor used by compress()
        :param kwargs: parameters for the compressor; they are kept and also
                       given to whatever class is instantiated by decompress()
        """
        self.params = kwargs
        self.compressor = get_compressor(name, **self.params)

    def compress(self, data):
        """Return data compressed by the compressor chosen at creation time."""
        return self.compressor.compress(data)

    def decompress(self, data):
        """
        Decompress data with the first class of COMPRESSOR_LIST detecting it.

        :raises ValueError: if no class in COMPRESSOR_LIST detects the data
        """
        for cls in COMPRESSOR_LIST:
            if cls.detect(data):
                return cls(**self.params).decompress(data)
        # interpolate the leading bytes into the message (a 1-tuple is used so
        # the value is always treated as the single %r argument)
        raise ValueError('No decompressor for this data found: %r.' % (data[:2],))
|
||||
|
|
81
borg/testsuite/compress.py
Normal file
81
borg/testsuite/compress.py
Normal file
|
@ -0,0 +1,81 @@
|
|||
import zlib
|
||||
|
||||
import pytest
|
||||
|
||||
from ..compress import get_compressor, Compressor, CNULL, ZLIB, LZ4
|
||||
|
||||
|
||||
# fixtures shared by the tests in this module
buffer = bytes(1 << 16)  # 65536 zero bytes
data = b'fooooooooobaaaaaaaar'
params = {'name': 'zlib', 'level': 6, 'buffer': buffer}
|
||||
|
||||
|
||||
def test_get_compressor():
    """get_compressor() returns the class matching the name, KeyError otherwise."""
    null_compressor = get_compressor(name='null')
    assert isinstance(null_compressor, CNULL)

    lz4_compressor = get_compressor(name='lz4', buffer=buffer)
    assert isinstance(lz4_compressor, LZ4)

    zlib_compressor = get_compressor(name='zlib')
    assert isinstance(zlib_compressor, ZLIB)

    # a name that is not registered
    with pytest.raises(KeyError):
        get_compressor(name='foobar')
|
||||
|
||||
|
||||
def test_cnull():
    """null compression only adds a header and round-trips the data."""
    compressor = get_compressor(name='null')
    compressed = compressor.compress(data)
    assert len(compressed) > len(data)
    assert data in compressed  # it's not compressed and just in there 1:1
    assert data == compressor.decompress(compressed)
    autodetecting = Compressor(**params)
    assert data == autodetecting.decompress(compressed)  # autodetect
|
||||
|
||||
|
||||
def test_lz4():
    """lz4 output is smaller than the input and round-trips the data."""
    compressor = get_compressor(name='lz4', buffer=buffer)
    compressed = compressor.compress(data)
    assert len(compressed) < len(data)
    assert data == compressor.decompress(compressed)
    autodetecting = Compressor(**params)
    assert data == autodetecting.decompress(compressed)  # autodetect
|
||||
|
||||
|
||||
def test_zlib():
    """zlib output is smaller than the input and round-trips the data."""
    compressor = get_compressor(name='zlib')
    compressed = compressor.compress(data)
    assert len(compressed) < len(data)
    assert data == compressor.decompress(compressed)
    autodetecting = Compressor(**params)
    assert data == autodetecting.decompress(compressed)  # autodetect
|
||||
|
||||
|
||||
def test_autodetect_invalid():
    """Data that no compressor class detects makes decompress() raise ValueError."""
    undetectable = (
        b'\xff\xfftotalcrap',
        b'\x08\x00notreallyzlib',
    )
    for blob in undetectable:
        with pytest.raises(ValueError):
            Compressor(**params).decompress(blob)
|
||||
|
||||
|
||||
def test_zlib_compat():
    """Our zlib compressor produces / accepts exactly what stdlib zlib does."""
    # for compatibility reasons, we do not add an extra header for zlib,
    # nor do we expect one when decompressing / autodetecting
    for level in range(10):
        compressor = get_compressor(name='zlib', level=level)
        ours = compressor.compress(data)
        stdlib = zlib.compress(data, level)
        assert ours == stdlib
        roundtrip = compressor.decompress(stdlib)
        assert data == roundtrip
        roundtrip = Compressor(**params).decompress(stdlib)
        assert data == roundtrip
|
||||
|
||||
|
||||
def test_compressor():
    """Compressor round-trips the data for each tested name / level combination."""
    configurations = [
        dict(name='null', buffer=buffer),
        dict(name='lz4', buffer=buffer),
        dict(name='zlib', level=0, buffer=buffer),
        dict(name='zlib', level=6, buffer=buffer),
        dict(name='zlib', level=9, buffer=buffer),
    ]
    for config in configurations:
        compressor = Compressor(**config)
        compressed = compressor.compress(data)
        assert data == compressor.decompress(compressed)
|
||||
|
||||
|
Loading…
Reference in a new issue