borg/borg/compress.pyx

200 lines
6.0 KiB
Cython

import zlib
try:
import lzma
except ImportError:
lzma = None
cdef extern from "lz4.h":
int LZ4_compress_limitedOutput(const char* source, char* dest, int inputSize, int maxOutputSize) nogil
int LZ4_decompress_safe(const char* source, char* dest, int inputSize, int maxOutputSize) nogil
cdef class CompressorBase:
"""
base class for all (de)compression classes,
also handles compression format auto detection and
adding/stripping the ID header (which enable auto detection).
"""
ID = b'\xFF\xFF' # reserved and not used
# overwrite with a unique 2-bytes bytestring in child classes
name = 'baseclass'
@classmethod
def detect(cls, data):
return data.startswith(cls.ID)
def __init__(self, **kwargs):
pass
def compress(self, data):
# add ID bytes
return self.ID + data
def decompress(self, data):
# strip ID bytes
return data[2:]
class CNONE(CompressorBase):
"""
none - no compression, just pass through data
"""
ID = b'\x00\x00'
name = 'none'
def compress(self, data):
return super().compress(data)
def decompress(self, data):
data = super().decompress(data)
if not isinstance(data, bytes):
data = bytes(data)
return data
cdef class LZ4(CompressorBase):
"""
raw LZ4 compression / decompression (liblz4).
Features:
- lz4 is super fast
- wrapper releases CPython's GIL to support multithreaded code
- buffer given by caller, avoiding frequent reallocation and buffer duplication
- uses safe lz4 methods that never go beyond the end of the output buffer
But beware:
- this is not very generic, the given buffer MUST be large enough to
handle all compression or decompression output (or it will fail).
- you must not do method calls to the same LZ4 instance from different
threads at the same time - create one LZ4 instance per thread!
"""
ID = b'\x01\x00'
name = 'lz4'
cdef char *buffer # helper buffer for (de)compression output
cdef int bufsize # size of this buffer
def __cinit__(self, **kwargs):
buffer = kwargs['buffer']
self.buffer = buffer
self.bufsize = len(buffer)
def compress(self, idata):
if not isinstance(idata, bytes):
idata = bytes(idata) # code below does not work with memoryview
cdef int isize = len(idata)
cdef int osize = self.bufsize
cdef char *source = idata
cdef char *dest = self.buffer
with nogil:
osize = LZ4_compress_limitedOutput(source, dest, isize, osize)
if not osize:
raise Exception('lz4 compress failed')
return super().compress(dest[:osize])
def decompress(self, idata):
if not isinstance(idata, bytes):
idata = bytes(idata) # code below does not work with memoryview
idata = super().decompress(idata)
cdef int isize = len(idata)
cdef int osize = self.bufsize
cdef char *source = idata
cdef char *dest = self.buffer
with nogil:
osize = LZ4_decompress_safe(source, dest, isize, osize)
if osize < 0:
# malformed input data, buffer too small, ...
raise Exception('lz4 decompress failed')
return dest[:osize]
class LZMA(CompressorBase):
"""
lzma compression / decompression
"""
ID = b'\x02\x00'
name = 'lzma'
def __init__(self, level=6, **kwargs):
super().__init__(**kwargs)
self.level = level
if lzma is None:
raise ValueError('No lzma support found.')
def compress(self, data):
# we do not need integrity checks in lzma, we do that already
data = lzma.compress(data, preset=self.level, check=lzma.CHECK_NONE)
return super().compress(data)
def decompress(self, data):
data = super().decompress(data)
return lzma.decompress(data)
class ZLIB(CompressorBase):
"""
zlib compression / decompression (python stdlib)
"""
ID = b'\x08\x00' # not used here, see detect()
# avoid all 0x.8.. IDs elsewhere!
name = 'zlib'
@classmethod
def detect(cls, data):
# matches misc. patterns 0x.8.. used by zlib
cmf, flg = data[:2]
is_deflate = cmf & 0x0f == 8
check_ok = (cmf * 256 + flg) % 31 == 0
return check_ok and is_deflate
def __init__(self, level=6, **kwargs):
super().__init__(**kwargs)
self.level = level
def compress(self, data):
# note: for compatibility no super call, do not add ID bytes
return zlib.compress(data, self.level)
def decompress(self, data):
# note: for compatibility no super call, do not strip ID bytes
return zlib.decompress(data)
COMPRESSOR_TABLE = {
CNONE.name: CNONE,
LZ4.name: LZ4,
ZLIB.name: ZLIB,
LZMA.name: LZMA,
}
COMPRESSOR_LIST = [LZ4, CNONE, ZLIB, LZMA, ] # check fast stuff first
def get_compressor(name, **kwargs):
cls = COMPRESSOR_TABLE[name]
return cls(**kwargs)
class Compressor:
"""
compresses using a compressor with given name and parameters
decompresses everything we can handle (autodetect)
"""
def __init__(self, name='null', **kwargs):
self.params = kwargs
self.compressor = get_compressor(name, **self.params)
def compress(self, data):
return self.compressor.compress(data)
def decompress(self, data):
hdr = bytes(data[:2]) # detect() does not work with memoryview
for cls in COMPRESSOR_LIST:
if cls.detect(hdr):
return cls(**self.params).decompress(data)
else:
raise ValueError('No decompressor for this data found: %r.', data[:2])
# a buffer used for (de)compression result, which can be slightly bigger
# than the chunk buffer in the worst (incompressible data) case, add 10%:
COMPR_BUFFER = bytes(int(1.1 * 2 ** 23)) # CHUNK_MAX_EXP == 23