auto compression: make sure expensive compression is actually better

If the expensive compressor's output is not significantly smaller than the
lz4 output (less than 1% smaller), we just store the lz4-compressed data
(which we have already computed anyway), because that at least decompresses
super fast.
This commit is contained in:
Thomas Waldmann 2017-10-03 21:11:43 +02:00
parent 35c042b97b
commit 011e0fd3fa
2 changed files with 29 additions and 8 deletions

View File

@ -246,7 +246,7 @@ class Auto(CompressorBase):
lz4_data = self.lz4.compress(data) lz4_data = self.lz4.compress(data)
ratio = len(lz4_data) / len(data) ratio = len(lz4_data) / len(data)
if ratio < 0.97: if ratio < 0.97:
return self.compressor, None return self.compressor, lz4_data
elif ratio < 1: elif ratio < 1:
return self.lz4, lz4_data return self.lz4, lz4_data
else: else:
@ -257,9 +257,24 @@ class Auto(CompressorBase):
def compress(self, data): def compress(self, data):
compressor, lz4_data = self._decide(data) compressor, lz4_data = self._decide(data)
if lz4_data is None: if compressor is self.lz4:
return compressor.compress(data) # we know that trying to compress with expensive compressor is likely pointless,
# but lz4 managed to at least squeeze the data a bit.
return lz4_data
if compressor is self.none:
# we know that trying to compress with expensive compressor is likely pointless
# and also lz4 did not manage to squeeze the data (not even a bit).
uncompressed_data = compressor.compress(data)
return uncompressed_data
# if we get here, the decider decided to try the expensive compressor.
# we also know that lz4_data is smaller than uncompressed data.
exp_compressed_data = compressor.compress(data)
ratio = len(exp_compressed_data) / len(lz4_data)
if ratio < 0.99:
# the expensive compressor managed to squeeze the data significantly better than lz4.
return exp_compressed_data
else: else:
# otherwise let's just store the lz4 data, which decompresses extremely fast.
return lz4_data return lz4_data
def decompress(self, data): def decompress(self, data):

View File

@ -110,12 +110,18 @@ def test_compressor():
def test_auto(): def test_auto():
compressor = CompressionSpec('auto,zlib,9').compressor compressor_auto_zlib = CompressionSpec('auto,zlib,9').compressor
compressor_lz4 = CompressionSpec('lz4').compressor
compressor_zlib = CompressionSpec('zlib,9').compressor
data = bytes(500)
compressed_auto_zlib = compressor_auto_zlib.compress(data)
compressed_lz4 = compressor_lz4.compress(data)
compressed_zlib = compressor_zlib.compress(data)
ratio = len(compressed_zlib) / len(compressed_lz4)
assert Compressor.detect(compressed_auto_zlib) == ZLIB if ratio < 0.99 else LZ4
compressed = compressor.compress(bytes(500)) data = b'\x00\xb8\xa3\xa2-O\xe1i\xb6\x12\x03\xc21\xf3\x8a\xf78\\\x01\xa5b\x07\x95\xbeE\xf8\xa3\x9ahm\xb1~'
assert Compressor.detect(compressed) == ZLIB compressed = compressor_auto_zlib.compress(data)
compressed = compressor.compress(b'\x00\xb8\xa3\xa2-O\xe1i\xb6\x12\x03\xc21\xf3\x8a\xf78\\\x01\xa5b\x07\x95\xbeE\xf8\xa3\x9ahm\xb1~')
assert Compressor.detect(compressed) == CNONE assert Compressor.detect(compressed) == CNONE