borg/borg/compress.pyx

203 lines
5.8 KiB
Cython
Raw Normal View History

import zlib
try:
import lzma
except ImportError:
lzma = None
2015-08-01 13:07:54 +00:00
2016-08-12 19:10:46 +00:00
from .helpers import Buffer
2015-08-01 13:07:54 +00:00
cdef extern from "lz4.h":
int LZ4_compress_limitedOutput(const char* source, char* dest, int inputSize, int maxOutputSize) nogil
2015-08-01 13:07:54 +00:00
int LZ4_decompress_safe(const char* source, char* dest, int inputSize, int maxOutputSize) nogil
int LZ4_compressBound(int inputSize) nogil
2015-08-01 13:07:54 +00:00
2016-08-12 19:10:46 +00:00
# Module-level scratch buffer. LZ4.compress() and LZ4.decompress() call
# buffer.get(size) to obtain the destination memory handed to liblz4.
# NOTE(review): this one object is shared by every LZ4 instance in the
# module - confirm Buffer is safe for the way callers use threads.
buffer = Buffer(bytearray, size=0)
cdef class CompressorBase:
    """
    Common ancestor of all (de)compression classes.

    Takes care of format auto-detection by means of a 2-byte ID header and
    of putting that header in front of / removing it from the payload.
    """
    # placeholder value, reserved and not used;
    # every child class replaces it with its own unique 2-byte bytestring
    ID = b'\xFF\xFF'
    name = 'baseclass'

    @classmethod
    def detect(cls, data):
        """Return True if *data* begins with the ID bytes of this class."""
        header = cls.ID
        return data.startswith(header)

    def __init__(self, **kwargs):
        """Accept (and ignore) arbitrary keyword parameters."""
        pass

    def compress(self, data):
        """Return *data* with the ID bytes prepended."""
        tagged = self.ID + data
        return tagged

    def decompress(self, data):
        """Return *data* with the first 2 (ID) bytes removed."""
        payload = data[2:]
        return payload
class CNONE(CompressorBase):
    """
    none - pass-through "compression": the data itself is left untouched,
    only the ID header is added / removed
    """
    ID = b'\x00\x00'
    name = 'none'

    def compress(self, data):
        # nothing to do besides tagging the data with the ID bytes
        return super().compress(data)

    def decompress(self, data):
        payload = super().decompress(data)
        # always hand back a real bytes object, whatever type came in
        return payload if isinstance(payload, bytes) else bytes(payload)
class LZ4(CompressorBase):
    """
    raw LZ4 compression / decompression (liblz4).

    Features:
        - lz4 is super fast
        - the GIL is released around the liblz4 calls to support
          multithreaded code
        - only the bounded lz4 functions are used, which never go beyond
          the end of the output buffer
    """
    ID = b'\x01\x00'
    name = 'lz4'

    def __init__(self, **kwargs):
        # no parameters of its own; keyword arguments are accepted and ignored
        pass

    def compress(self, idata):
        """
        Compress *idata* with LZ4 and return it with the ID bytes prepended.

        Raises Exception('lz4 compress failed') if liblz4 reports 0 output bytes.
        """
        if not isinstance(idata, bytes):
            idata = bytes(idata)  # code below does not work with memoryview
        cdef int in_size = len(idata)
        cdef int out_size
        cdef char *src = idata
        cdef char *dst
        # size the destination via LZ4_compressBound for this input length
        out_size = LZ4_compressBound(in_size)
        scratch = buffer.get(out_size)
        dst = <char *> scratch  # dst points into scratch
        with nogil:
            out_size = LZ4_compress_limitedOutput(src, dst, in_size, out_size)
        if out_size == 0:
            raise Exception('lz4 compress failed')
        return super().compress(dst[:out_size])

    def decompress(self, idata):
        """
        Strip the ID bytes from *idata*, LZ4-decompress the rest, return bytes.

        The output buffer is enlarged by a factor of 1.5 and the call retried
        for as long as liblz4 reports an error; once the buffer size exceeds
        2**30 bytes, Exception('lz4 decompress failed') is raised instead.
        """
        if not isinstance(idata, bytes):
            idata = bytes(idata)  # code below does not work with memoryview
        idata = super().decompress(idata)
        cdef int in_size = len(idata)
        cdef int out_size
        cdef int result
        cdef char *src = idata
        cdef char *dst
        # a bit more than 8MB is enough for the usual data sizes yielded by the chunker.
        # start bigger if in_size * 3 already exceeds that, to avoid having to resize often.
        out_size = max(int(1.1 * 2**23), in_size * 3)
        while True:
            scratch = buffer.get(out_size)
            dst = <char *> scratch  # dst points into scratch
            with nogil:
                result = LZ4_decompress_safe(src, dst, in_size, out_size)
            if result >= 0:
                # success: result is the number of bytes written to dst
                return dst[:result]
            if out_size > 2 ** 30:
                # this is insane, get out of here
                raise Exception('lz4 decompress failed')
            # likely the buffer was too small, retry with a bigger one
            out_size = int(1.5 * out_size)
class LZMA(CompressorBase):
    """
    lzma compression / decompression (via the lzma module, if importable)
    """
    ID = b'\x02\x00'
    name = 'lzma'

    def __init__(self, level=6, **kwargs):
        """
        :param level: lzma preset passed to lzma.compress()
        :raises ValueError: if the lzma module could not be imported
        """
        super().__init__(**kwargs)
        if lzma is None:
            raise ValueError('No lzma support found.')
        self.level = level

    def compress(self, data):
        # lzma's own integrity check is switched off (CHECK_NONE);
        # per the original author's note, integrity is already checked elsewhere
        packed = lzma.compress(data, preset=self.level, check=lzma.CHECK_NONE)
        return super().compress(packed)

    def decompress(self, data):
        # strip the ID bytes first, then let lzma unpack the rest
        return lzma.decompress(super().decompress(data))
class ZLIB(CompressorBase):
    """
    zlib compression / decompression (python stdlib)

    Unlike the other compressors, no ID bytes are added or stripped: the
    first two bytes of the zlib stream itself are used for detection.
    """
    ID = b'\x08\x00'  # not used here, see detect()
    # avoid all 0x.8.. IDs elsewhere!
    name = 'zlib'

    @classmethod
    def detect(cls, data):
        """
        Return True if the first two bytes of *data* look like a zlib header.

        *data* must yield ints when indexed/iterated (e.g. bytes).
        Input shorter than two bytes is never a match.
        """
        if len(data) < 2:
            # too short to carry a header; without this guard the unpacking
            # below raises an unrelated "not enough values" ValueError
            return False
        # matches misc. patterns 0x.8.. used by zlib
        cmf, flg = data[:2]
        is_deflate = (cmf & 0x0f) == 8
        check_ok = (cmf * 256 + flg) % 31 == 0
        return check_ok and is_deflate

    def __init__(self, level=6, **kwargs):
        """
        :param level: compression level passed to zlib.compress()
        """
        super().__init__(**kwargs)
        self.level = level

    def compress(self, data):
        # note: for compatibility no super call, do not add ID bytes
        return zlib.compress(data, self.level)

    def decompress(self, data):
        # note: for compatibility no super call, do not strip ID bytes
        return zlib.decompress(data)
# lookup table: compressor name -> compressor class
COMPRESSOR_TABLE = {cls.name: cls for cls in (CNONE, LZ4, ZLIB, LZMA)}
# order in which the classes are tried for detection: check fast stuff first
COMPRESSOR_LIST = [LZ4, CNONE, ZLIB, LZMA]
def get_compressor(name, **kwargs):
    """
    Look up the compressor class registered under *name* in COMPRESSOR_TABLE
    and return an instance of it, created with **kwargs.

    An unknown *name* raises KeyError (from the table lookup).
    """
    return COMPRESSOR_TABLE[name](**kwargs)
class Compressor:
    """
    compresses using a compressor with given name and parameters
    decompresses everything we can handle (autodetect)
    """
    def __init__(self, name='none', **kwargs):
        """
        :param name: key into COMPRESSOR_TABLE selecting the compressor used
                     by compress(); defaults to 'none' (the former default
                     'null' is not a registered name and raised KeyError)
        :param kwargs: parameters passed on to the compressor class, and also
                       to whichever class decompress() instantiates
        """
        self.params = kwargs
        self.compressor = get_compressor(name, **self.params)

    def compress(self, data):
        """Compress *data* with the compressor chosen at construction time."""
        return self.compressor.compress(data)

    def decompress(self, data):
        """
        Decompress *data* with the first class in COMPRESSOR_LIST whose
        detect() accepts the first two bytes of *data*.

        :raises ValueError: if no class accepts the data
        """
        hdr = bytes(data[:2])  # detect() does not work with memoryview
        for cls in COMPRESSOR_LIST:
            if cls.detect(hdr):
                return cls(**self.params).decompress(data)
        # actually interpolate the header into the message (it used to be
        # passed as a second exception argument, leaving '%r' unformatted)
        raise ValueError('No decompressor for this data found: %r.' % (hdr,))