2017-03-31 11:39:54 +00:00
|
|
|
"""
|
|
|
|
borg.compress
|
|
|
|
=============
|
|
|
|
|
|
|
|
Compression is applied to chunks after ID hashing (so the ID is a direct function of the
|
|
|
|
plain chunk, compression is irrelevant to it), and of course before encryption.
|
|
|
|
|
|
|
|
Borg has a flexible scheme for deciding which compression to use for chunks.
|
|
|
|
|
|
|
|
First, there is a global default set by the --compression command line option,
|
|
|
|
which sets the .compressor attribute on the Key.
|
|
|
|
|
|
|
|
For chunks that emanate from files CompressionDecider1 may set a specific
|
|
|
|
Compressor based on patterns (this is the --compression-from option). This is stored
|
|
|
|
as a Compressor instance in the "compress" key in the Chunk's meta dictionary.
|
|
|
|
|
|
|
|
When compressing either the Compressor specified in the Chunk's meta dictionary
|
|
|
|
is used, or the default Compressor of the key.
|
|
|
|
|
|
|
|
The "auto" mode (e.g. --compression auto,lzma,4) is implemented as a meta Compressor,
|
|
|
|
meaning that Auto acts like a Compressor, but defers actual work to others (namely
|
|
|
|
LZ4 as a heuristic whether compression is worth it, and the specified Compressor
|
|
|
|
for the actual compression).
|
|
|
|
"""
|
|
|
|
|
2015-08-01 23:21:41 +00:00
|
|
|
import zlib
|
2017-03-31 10:02:30 +00:00
|
|
|
from collections import namedtuple
|
|
|
|
|
2015-08-02 22:31:33 +00:00
|
|
|
try:
|
|
|
|
import lzma
|
|
|
|
except ImportError:
|
|
|
|
lzma = None
|
2015-08-01 13:07:54 +00:00
|
|
|
|
2017-03-31 10:02:30 +00:00
|
|
|
from .logger import create_logger
|
2017-03-03 23:01:02 +00:00
|
|
|
from .helpers import Buffer, DecompressionError
|
2016-08-12 19:10:46 +00:00
|
|
|
|
2017-03-03 23:01:02 +00:00
|
|
|
# Version tag of this module's API.
# NOTE(review): presumably compared by the importing code to detect a stale compiled
# extension - confirm against the callers, nothing in this file reads it.
API_VERSION = '1.1_02'
|
2016-07-31 21:09:57 +00:00
|
|
|
|
2015-08-01 13:07:54 +00:00
|
|
|
# C functions from liblz4 used by the LZ4 class in this module. All of them are
# declared ``nogil`` so they may be called inside a ``with nogil:`` section.
cdef extern from "lz4.h":
    # used by LZ4.compress() to size the output buffer before compressing
    int LZ4_compressBound(int inputSize) nogil
    # both calls take the capacity of ``dest`` as maxOutputSize
    int LZ4_compress_limitedOutput(const char* source, char* dest, int inputSize, int maxOutputSize) nogil
    int LZ4_decompress_safe(const char* source, char* dest, int inputSize, int maxOutputSize) nogil
|
2015-08-01 13:07:54 +00:00
|
|
|
|
|
|
|
|
2016-08-12 19:10:46 +00:00
|
|
|
# Module-level scratch buffer. LZ4.compress() and LZ4.decompress() request the
# size they need via buffer.get(size) and write the lz4 output into it.
buffer = Buffer(bytearray, size=0)
|
2016-08-09 15:05:24 +00:00
|
|
|
|
|
|
|
|
2015-08-01 23:21:41 +00:00
|
|
|
cdef class CompressorBase:
    """
    Common base class of all (de)compression classes.

    It takes care of the 2-byte ID header: compress() prepends it,
    decompress() strips it again and detect() checks for it (the header
    is what enables compression format auto detection).
    """
    # child classes overwrite ID with a unique 2-bytes bytestring
    ID = b'\xFF\xFF'  # reserved and not used
    name = 'baseclass'

    @classmethod
    def detect(cls, data):
        """Return True if *data* starts with the ID bytes of this class."""
        return data.startswith(cls.ID)

    def __init__(self, **kwargs):
        """Accept (and ignore) arbitrary keyword arguments."""
        pass

    def compress(self, data):
        """Return *data* with the ID bytes of this compressor prepended."""
        header = self.ID
        return header + data

    def decompress(self, data):
        """Return *data* without its first two bytes (the ID header)."""
        payload = data[2:]
        return payload
|
|
|
|
|
|
|
|
|
2015-08-14 21:00:04 +00:00
|
|
|
class CNONE(CompressorBase):
    """
    none - no compression; the data is passed through, only the ID header is added / removed
    """
    ID = b'\x00\x00'
    name = 'none'

    def compress(self, data):
        """Return *data* with just the ID bytes prepended (done by the base class)."""
        return super().compress(data)

    def decompress(self, data):
        """Strip the ID bytes and return the remaining payload as a bytes object."""
        payload = super().decompress(data)
        # callers get real bytes even if e.g. a memoryview was passed in
        return payload if isinstance(payload, bytes) else bytes(payload)
|
2015-08-01 23:21:41 +00:00
|
|
|
|
|
|
|
|
2016-08-08 22:33:12 +00:00
|
|
|
class LZ4(CompressorBase):
    """
    raw LZ4 compression / decompression (liblz4).

    Features:
        - lz4 is super fast
        - wrapper releases CPython's GIL to support multithreaded code
        - uses safe lz4 methods that never go beyond the end of the output buffer

    Both methods write the lz4 output into the module-level ``buffer`` and
    return a slice of it as the result.
    """
    ID = b'\x01\x00'
    name = 'lz4'

    def __init__(self, **kwargs):
        # keyword arguments are accepted but ignored
        pass

    def compress(self, idata):
        """
        Compress *idata* with LZ4 and return it with the ID bytes prepended.

        Raises Exception if the lz4 call returns 0.
        """
        if not isinstance(idata, bytes):
            idata = bytes(idata)  # code below does not work with memoryview
        cdef int isize = len(idata)
        cdef int osize
        cdef char *source = idata
        cdef char *dest
        # ask lz4 how big the output buffer has to be for isize input bytes
        osize = LZ4_compressBound(isize)
        buf = buffer.get(osize)
        dest = <char *> buf
        # the GIL is released while lz4 does the work
        with nogil:
            osize = LZ4_compress_limitedOutput(source, dest, isize, osize)
        if not osize:
            raise Exception('lz4 compress failed')
        # the slice covers only the osize bytes lz4 actually produced
        return super().compress(dest[:osize])

    def decompress(self, idata):
        """
        Strip the ID bytes from *idata* and return the LZ4-decompressed payload.

        The output buffer size is a guess; while lz4 reports failure, the
        size is multiplied by 1.5 and the call is retried.

        Raises DecompressionError if getting the buffer raises MemoryError, or
        if lz4 still fails with a buffer bigger than 2 ** 27 bytes.
        """
        if not isinstance(idata, bytes):
            idata = bytes(idata)  # code below does not work with memoryview
        idata = super().decompress(idata)
        cdef int isize = len(idata)
        cdef int osize
        cdef int rsize
        cdef char *source = idata
        cdef char *dest
        # a bit more than 8MB is enough for the usual data sizes yielded by the chunker.
        # allocate more if isize * 3 is already bigger, to avoid having to resize often.
        osize = max(int(1.1 * 2**23), isize * 3)
        while True:
            try:
                buf = buffer.get(osize)
            except MemoryError:
                raise DecompressionError('MemoryError')
            dest = <char *> buf
            # the GIL is released while lz4 does the work
            with nogil:
                rsize = LZ4_decompress_safe(source, dest, isize, osize)
            # a non-negative result is treated as success (number of bytes to return)
            if rsize >= 0:
                break
            if osize > 2 ** 27:  # 128MiB (should be enough, considering max. repo obj size and very good compression)
                # this is insane, get out of here
                raise DecompressionError('lz4 decompress failed')
            # likely the buffer was too small, get a bigger one:
            osize = int(1.5 * osize)
        return dest[:rsize]
|
2015-08-01 23:21:41 +00:00
|
|
|
|
|
|
|
|
2015-08-02 22:31:33 +00:00
|
|
|
class LZMA(CompressorBase):
    """
    lzma compression / decompression, using the ``lzma`` module imported at the top of this file
    """
    ID = b'\x02\x00'
    name = 'lzma'

    def __init__(self, level=6, **kwargs):
        """
        Remember the compression *level* (passed to lzma as ``preset``).

        Raises ValueError if the lzma module could not be imported.
        """
        super().__init__(**kwargs)
        self.level = level
        if lzma is None:
            raise ValueError('No lzma support found.')

    def compress(self, data):
        """Compress *data* with lzma and return it with the ID bytes prepended."""
        # lzma's own integrity check is switched off, we do that already
        packed = lzma.compress(data, preset=self.level, check=lzma.CHECK_NONE)
        return super().compress(packed)

    def decompress(self, data):
        """
        Strip the ID bytes from *data* and return the lzma-decompressed payload.

        Raises DecompressionError (with the lzma error text) if lzma fails.
        """
        payload = super().decompress(data)
        try:
            plain = lzma.decompress(payload)
        except lzma.LZMAError as exc:
            raise DecompressionError(str(exc)) from None
        return plain
|
2015-08-02 22:31:33 +00:00
|
|
|
|
|
|
|
|
2015-08-01 23:21:41 +00:00
|
|
|
class ZLIB(CompressorBase):
    """
    zlib compression / decompression (python stdlib)

    For compatibility, zlib data carries no ID header: compress() and
    decompress() do not call the base class, and detect() recognizes the
    data by looking at the first two bytes of the zlib stream itself.
    """
    ID = b'\x08\x00'  # not used here, see detect()
    # avoid all 0x.8.. IDs elsewhere!
    name = 'zlib'

    @classmethod
    def detect(cls, data):
        """
        Return True if the first two bytes of *data* look like a zlib header.

        Data shorter than two bytes can never match and yields False.
        """
        # without this guard, the unpacking below raises an unrelated
        # ValueError for empty / 1-byte input instead of reporting "no match".
        if len(data) < 2:
            return False
        # matches misc. patterns 0x.8.. used by zlib
        cmf, flg = data[:2]
        is_deflate = cmf & 0x0f == 8
        check_ok = (cmf * 256 + flg) % 31 == 0
        return check_ok and is_deflate

    def __init__(self, level=6, **kwargs):
        """Remember the compression *level* that compress() hands to zlib."""
        super().__init__(**kwargs)
        self.level = level

    def compress(self, data):
        """Return *data* compressed by zlib at the configured level."""
        # note: for compatibility no super call, do not add ID bytes
        return zlib.compress(data, self.level)

    def decompress(self, data):
        """
        Return the zlib-decompressed *data*.

        Raises DecompressionError (with the zlib error text) if zlib fails.
        """
        # note: for compatibility no super call, do not strip ID bytes
        try:
            return zlib.decompress(data)
        except zlib.error as e:
            raise DecompressionError(str(e)) from None
|
2015-08-01 23:21:41 +00:00
|
|
|
|
|
|
|
|
2017-03-31 10:02:30 +00:00
|
|
|
class Auto(CompressorBase):
    """
    Meta-Compressor that decides which compression to use based on LZ4's ratio.

    As a meta-Compressor the actual compression is deferred to other Compressors,
    therefore this Compressor has no ID, no detect() and no decompress().
    """

    ID = None
    name = 'auto'

    logger = create_logger('borg.debug.file-compression')

    def __init__(self, compressor):
        """
        :param compressor: the Compressor instance used for data that LZ4
                           manages to shrink well enough (see compress()).
        """
        super().__init__()
        self.compressor = compressor
        self.lz4 = get_compressor('lz4')
        self.none = get_compressor('none')

    def compress(self, data):
        """
        Compress *data* with whatever the LZ4 trial run suggests.

        The length of the LZ4 result (as returned by the lz4 compressor, i.e.
        including its ID bytes) is compared against the length of *data*:

        - less than 97%: use the configured compressor,
        - less than 100%: return the LZ4 result itself,
        - otherwise: store the data uncompressed ('none').
        """
        lz4_data = self.lz4.compress(data)
        if len(lz4_data) < 0.97 * len(data):
            return self.compressor.compress(data)
        elif len(lz4_data) < len(data):
            return lz4_data
        else:
            return self.none.compress(data)

    def decompress(self, data):
        """Not supported: data is always written in the format of another Compressor."""
        raise NotImplementedError

    # must be a classmethod like the base class' detect(); as a plain method,
    # Auto.detect(data) would fail with a TypeError about a missing argument
    # instead of raising NotImplementedError.
    @classmethod
    def detect(cls, data):
        """Not supported: this meta-Compressor has no ID of its own."""
        raise NotImplementedError
|
|
|
|
|
|
|
|
|
|
|
|
# Maps valid compressor names to their class; get_compressor() looks names up here.
COMPRESSOR_TABLE = {
    CNONE.name: CNONE,
    LZ4.name: LZ4,
    ZLIB.name: ZLIB,
    LZMA.name: LZMA,
    Auto.name: Auto,
}
# List of possible compression types. Does not include Auto, since it is a meta-Compressor.
# Compressor.detect() walks this list in order and returns the first class whose detect() matches.
COMPRESSOR_LIST = [LZ4, CNONE, ZLIB, LZMA, ]  # check fast stuff first
|
2015-08-01 23:21:41 +00:00
|
|
|
|
|
|
|
def get_compressor(name, **kwargs):
    """
    Return a new instance of the compressor class registered under *name*.

    The class is looked up in COMPRESSOR_TABLE (KeyError for an unknown
    name); *kwargs* are passed on to its constructor.
    """
    return COMPRESSOR_TABLE[name](**kwargs)
|
|
|
|
|
|
|
|
|
|
|
|
class Compressor:
    """
    compresses using a compressor with given name and parameters
    decompresses everything we can handle (autodetect)
    """
    def __init__(self, name='none', **kwargs):
        """
        :param name: key into COMPRESSOR_TABLE selecting the compressor used by compress().
                     The default is 'none'; the old default 'null' is not a
                     registered name and always failed with a KeyError.
        :param kwargs: parameters for the compressor class; they are also passed to
                       whatever class decompress() instantiates.
        """
        self.params = kwargs
        self.compressor = get_compressor(name, **self.params)

    def compress(self, data):
        """Compress *data* with the compressor selected at construction time."""
        return self.compressor.compress(data)

    def decompress(self, data):
        """
        Decompress *data* with a compressor class chosen by detect().

        Raises ValueError if no class recognizes the data.
        """
        compressor_cls = self.detect(data)
        return compressor_cls(**self.params).decompress(data)

    @staticmethod
    def detect(data):
        """
        Return the first class of COMPRESSOR_LIST whose detect() accepts the
        first two bytes of *data*.

        Raises ValueError if none of them does.
        """
        hdr = bytes(data[:2])  # detect() does not work with memoryview
        for cls in COMPRESSOR_LIST:
            if cls.detect(hdr):
                return cls
        # the header is formatted into the message here; passing it as a second
        # argument to ValueError would leave the '%r' placeholder unexpanded.
        raise ValueError('No decompressor for this data found: %r.' % hdr)
|
2017-03-31 10:02:30 +00:00
|
|
|
|
|
|
|
|
|
|
|
# Result type of CompressionSpec():
#   name       - the algorithm name given first in the spec string
#   spec       - None for 'none' / 'lz4', the integer level for 'zlib' / 'lzma',
#                the nested ComprSpec for 'auto'
#   compressor - the ready-to-use compressor instance
ComprSpec = namedtuple('ComprSpec', ('name', 'spec', 'compressor'))
|
|
|
|
|
|
|
|
|
|
|
|
def CompressionSpec(s):
    """
    Parse a compression spec string and return a ComprSpec.

    Accepted forms, as implemented below:

    - ``none`` and ``lz4`` (any further comma separated values are not looked at),
    - ``zlib[,level]`` and ``lzma[,level]`` with an integer level from 0 to 9 (default 6),
    - ``auto,<spec>``, where <spec> is one or two further comma separated values
      that are parsed by a recursive call.

    Raises ValueError for everything else.
    """
    parts = s.split(',')
    n_parts = len(parts)
    if n_parts < 1:
        raise ValueError
    # --compression algo[,level]
    algo = parts[0]
    if algo == 'none':
        return ComprSpec(name=algo, spec=None, compressor=CNONE())
    if algo == 'lz4':
        return ComprSpec(name=algo, spec=None, compressor=LZ4())
    if algo in ('zlib', 'lzma', ):
        if n_parts > 2:
            raise ValueError
        if n_parts == 2:
            level = int(parts[1])
            if level < 0 or level > 9:
                raise ValueError
        else:
            level = 6  # default compression level in py stdlib
        return ComprSpec(name=algo, spec=level, compressor=get_compressor(algo, level=level))
    if algo == 'auto':
        if n_parts < 2 or n_parts > 3:
            raise ValueError
        # everything after 'auto' is the spec of the wrapped compressor
        inner = CompressionSpec(','.join(parts[1:]))
        return ComprSpec(name=algo, spec=inner, compressor=Auto(inner.compressor))
    raise ValueError
|