ObfuscateSize compressor

This commit is contained in:
Thomas Waldmann 2020-11-16 22:51:55 +01:00
parent a2487fcdcc
commit b45874bead
6 changed files with 192 additions and 4 deletions

View File

@@ -59,6 +59,10 @@ Main features
All data can be protected using 256-bit AES encryption; data integrity and
authenticity are verified using HMAC-SHA256. Data is encrypted client-side.
**Obfuscation**
Optionally, borg can actively obfuscate data properties such as the size of
files / chunks to make fingerprinting attacks more difficult.
**Compression**
All data can be optionally compressed:

View File

@@ -426,6 +426,27 @@ he assumes that the victim also possesses (and backs up into the repository)
could try a brute-force fingerprinting attack based on the chunk sizes in the
repository to prove this assumption.
To make this more difficult, borg has an ``obfuscate`` pseudo compressor that
takes the output of the normal compression step and tries to obfuscate the size
of that output. Of course, it can only **add** to the size, not reduce it. Thus,
the optional use of this mechanism comes at a cost: it will make your repository
larger (from a few percent larger [cheap] to ridiculously larger [expensive],
depending on the algorithm/params you wisely choose).
The output of the compressed-size obfuscation step is then encrypted and
authenticated, as usual. Of course, using this obfuscation would not make any
sense without encryption. Thus, the additional data added by the obfuscator is
just 0x00 bytes, which is good enough because after encryption it will look
random anyway.
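As a rough sketch (simplified from the ``ObfuscateSize`` code added by this
commit; the function name and the ``max_pad`` parameter are illustrative only),
the obfuscated chunk layout looks like this::

    import random
    import struct

    def obfuscate_sketch(inner_compressed, max_pad=1024):
        # [2-byte obfuscator ID][4-byte big-endian length of the inner-compressed data]
        # [inner-compressed data, incl. its own compressor ID][N zero bytes of padding]
        header = struct.pack('>I', len(inner_compressed))
        padding = bytes(random.randrange(max_pad))  # zero bytes, random amount
        return b'\x04\x00' + header + inner_compressed + padding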
To summarize, the following measures make size-based fingerprinting difficult:
- user-selectable chunker algorithm (and parametrization)
- for the buzhash chunker: a secret, random per-repo chunker seed
- user-selectable compression algorithm (and level)
- optional ``obfuscate`` pseudo compressor with different choices of algorithm
  and parameters (see the example below)
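For example, the obfuscator is selected via the compression spec on the command
line; these invocations mirror the help-text examples added by this commit::

    borg create --compression obfuscate,3,none REPO::ARCHIVE data
    borg create --compression obfuscate,3,auto,zstd,10 REPO::ARCHIVE data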
Stored chunk proximity
----------------------

View File

@@ -2392,6 +2392,32 @@ class Archiver:
For compressible data, it uses the given C[,L] compression - with C[,L]
being any valid compression specifier.
obfuscate,SPEC,C[,L]
Use compressed-size obfuscation to make fingerprinting attacks based on
the observable stored chunk size more difficult.
Note:
- you must combine this with encryption, or it won't make any sense.
- your repository will be larger, of course.
The SPEC value determines how the size obfuscation works:
Relative random reciprocal size variation:
The size is increased by a randomly chosen factor, relative to the compressed
data size; smaller factors are used more often than larger ones (see the worked
example below).
1: factor 0.01 .. 100.0
2: factor 0.1 .. 1000.0
3: factor 1.0 .. 10000.0
4: factor 10.0 .. 100000.0
5: factor 100.0 .. 1000000.0
6: factor 1000.0 .. 10000000.0
Add a randomly sized padding up to the given size:
110: 1kiB
...
120: 1MiB
...
123: 8MiB (max.)
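As a rough illustration (derived from the obfuscator code added in this commit,
not part of the original help text)::

    SPEC 2: factor = 0.001 * 10**2 = 0.1 and f = factor / r with r in 0.0001 .. 1.0,
            so f ranges over 0.1 .. 1000.0
    a 64 kiB compressed chunk thus gains about 6.4 kiB .. 62 MiB of zero padding
    (always capped so the total stays below MAX_DATA_SIZE)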
Examples::
borg create --compression lz4 REPO::ARCHIVE data
@@ -2400,7 +2426,10 @@ class Archiver:
borg create --compression zlib REPO::ARCHIVE data
borg create --compression zlib,1 REPO::ARCHIVE data
borg create --compression auto,lzma,6 REPO::ARCHIVE data
borg create --compression auto,lzma ...\n\n''')
borg create --compression auto,lzma ...
borg create --compression obfuscate,3,none ...
borg create --compression obfuscate,3,auto,zstd,10 ...
borg create --compression obfuscate,2,zstd,6 ...\n\n''')
def do_help(self, parser, commands, args):
if not args.topic:

View File

@@ -15,6 +15,8 @@ which compressor has been used to compress the data and dispatch to the correct
decompressor.
"""
import random
from struct import Struct
import zlib
try:
@@ -23,9 +25,10 @@ except ImportError:
lzma = None
from .constants import MAX_DATA_SIZE
from .helpers import Buffer, DecompressionError
API_VERSION = '1.2_01'
API_VERSION = '1.2_02'
cdef extern from "algorithms/lz4-libselect.h":
int LZ4_compress_default(const char* source, char* dest, int inputSize, int maxOutputSize) nogil
@@ -433,6 +436,69 @@ class Auto(CompressorBase):
raise NotImplementedError
class ObfuscateSize(CompressorBase):
"""
Meta-Compressor that obfuscates the compressed data size.
"""
ID = b'\x04\x00'
name = 'obfuscate'
header_fmt = Struct('>I')
header_len = len(header_fmt.pack(0))
def __init__(self, level=None, compressor=None):
super().__init__()
self.compressor = compressor
if level is None:
pass # decompression
elif 1 <= level <= 6:
self._obfuscate = self._relative_random_reciprocal_obfuscate
self.factor = 0.001 * 10 ** level
self.min_r = 0.0001
elif 110 <= level <= 123:
self._obfuscate = self._random_padding_obfuscate
self.max_padding_size = 2 ** (level - 100) # 1kiB .. 8MiB
def _obfuscate(self, compr_size):
# implementations need to return the size of the obfuscation data
# that the caller shall add.
raise NotImplementedError
def _relative_random_reciprocal_obfuscate(self, compr_size):
# effect for SPEC 1:
# f = 0.01 .. 0.1 for r in 1.0 .. 0.1 == in 90% of cases
# f = 0.1 .. 1.0 for r in 0.1 .. 0.01 == in 9% of cases
# f = 1.0 .. 10.0 for r in 0.01 .. 0.001 = in 0.9% of cases
# f = 10.0 .. 100.0 for r in 0.001 .. 0.0001 == in 0.09% of cases
r = max(self.min_r, random.random()) # 0..1, but don't get too close to 0
f = self.factor / r
return int(compr_size * f)
def _random_padding_obfuscate(self, compr_size):
return int(self.max_padding_size * random.random())
def compress(self, data):
compressed_data = self.compressor.compress(data) # compress data
compr_size = len(compressed_data)
header = self.header_fmt.pack(compr_size)
addtl_size = self._obfuscate(compr_size)
addtl_size = max(0, addtl_size) # we can only make it longer, not shorter!
addtl_size = min(MAX_DATA_SIZE - 1024 - compr_size, addtl_size) # stay away from MAX_DATA_SIZE
trailer = bytes(addtl_size)
obfuscated_data = b''.join([header, compressed_data, trailer])
return super().compress(obfuscated_data) # add ID header
def decompress(self, data):
if not isinstance(data, memoryview):
data = memoryview(data)
obfuscated_data = super().decompress(data) # remove obfuscator ID header
compr_size = self.header_fmt.unpack(obfuscated_data[0:self.header_len])[0]
compressed_data = obfuscated_data[self.header_len:self.header_len+compr_size]
if self.compressor is None:
self.compressor = Compressor.detect(compressed_data)()
return self.compressor.decompress(compressed_data) # decompress data
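# Illustrative usage sketch (not part of this commit), mirroring test_obfuscate()
# further below; CompressionSpec and get_compressor are the helpers defined in
# this module:
#
#   comp = CompressionSpec('obfuscate,1,zstd,3').compressor
#   blob = comp.compress(b'some chunk data')  # obfuscator ID + 4-byte length + zstd data + zero padding
#   assert get_compressor('obfuscate').decompress(blob) == b'some chunk data'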
# Maps valid compressor names to their class
COMPRESSOR_TABLE = {
CNONE.name: CNONE,
@@ -441,9 +507,10 @@ COMPRESSOR_TABLE = {
LZMA.name: LZMA,
Auto.name: Auto,
ZSTD.name: ZSTD,
ObfuscateSize.name: ObfuscateSize,
}
# List of possible compression types. Does not include Auto, since it is a meta-Compressor.
COMPRESSOR_LIST = [LZ4, ZSTD, CNONE, ZLIB, LZMA, ] # check fast stuff first
COMPRESSOR_LIST = [LZ4, ZSTD, CNONE, ZLIB, LZMA, ObfuscateSize, ] # check fast stuff first
def get_compressor(name, **kwargs):
cls = COMPRESSOR_TABLE[name]
@@ -515,6 +582,16 @@ class CompressionSpec:
else:
raise ValueError
self.inner = CompressionSpec(compression)
elif self.name == 'obfuscate':
if 3 <= count <= 5:
level = int(values[1])
if not ((1 <= level <= 6) or (110 <= level <= 123)):
raise ValueError
self.level = level
compression = ','.join(values[2:])
else:
raise ValueError
self.inner = CompressionSpec(compression)
else:
raise ValueError
@@ -526,3 +603,5 @@ class CompressionSpec:
return get_compressor(self.name, level=self.level)
elif self.name == 'auto':
return get_compressor(self.name, compressor=self.inner.compressor)
elif self.name == 'obfuscate':
return get_compressor(self.name, level=self.level, compressor=self.inner.compressor)

View File

@@ -29,7 +29,7 @@ def check_extension_modules():
raise ExtensionModuleError
if chunker.API_VERSION != '1.2_01':
raise ExtensionModuleError
if compress.API_VERSION != '1.2_01':
if compress.API_VERSION != '1.2_02':
raise ExtensionModuleError
if borg.crypto.low_level.API_VERSION != '1.2_01':
raise ExtensionModuleError

View File

@@ -140,6 +140,61 @@ def test_auto():
assert Compressor.detect(compressed) == CNONE
def test_obfuscate():
compressor = CompressionSpec('obfuscate,1,none').compressor
data = bytes(10000)
compressed = compressor.compress(data)
# 2 id bytes for the inner compression, 2 id bytes for the obfuscator, 4 length bytes
assert len(data) + 8 <= len(compressed) <= len(data) * 101 + 8
# compressing 100 times the same data should give at least 50 different result sizes
assert len(set(len(compressor.compress(data)) for i in range(100))) > 50
cs = CompressionSpec('obfuscate,2,lz4')
assert isinstance(cs.inner.compressor, LZ4)
compressor = cs.compressor
data = bytes(10000)
compressed = compressor.compress(data)
# 2 id bytes for the inner compression, 2 id bytes for the obfuscator, 4 length bytes
min_compress, max_compress = 0.2, 0.001 # estimate compression factor outer boundaries
assert max_compress * len(data) + 8 <= len(compressed) <= min_compress * len(data) * 1001 + 8
# compressing 100 times the same data should give multiple different result sizes
assert len(set(len(compressor.compress(data)) for i in range(100))) > 10
cs = CompressionSpec('obfuscate,6,zstd,3')
assert isinstance(cs.inner.compressor, ZSTD)
compressor = cs.compressor
data = bytes(10000)
compressed = compressor.compress(data)
# 2 id bytes for the inner compression, 2 id bytes for the obfuscator, 4 length bytes
min_compress, max_compress = 0.2, 0.001 # estimate compression factor outer boundaries
assert max_compress * len(data) + 8 <= len(compressed) <= min_compress * len(data) * 10000001 + 8
# compressing 100 times the same data should give multiple different result sizes
assert len(set(len(compressor.compress(data)) for i in range(100))) > 90
cs = CompressionSpec('obfuscate,2,auto,zstd,10')
assert isinstance(cs.inner.compressor, Auto)
compressor = cs.compressor
data = bytes(10000)
compressed = compressor.compress(data)
# 2 id bytes for the inner compression, 2 id bytes for the obfuscator, 4 length bytes
min_compress, max_compress = 0.2, 0.001 # estimate compression factor outer boundaries
assert max_compress * len(data) + 8 <= len(compressed) <= min_compress * len(data) * 1001 + 8
# compressing 100 times the same data should give multiple different result sizes
assert len(set(len(compressor.compress(data)) for i in range(100))) > 10
cs = CompressionSpec('obfuscate,110,none')
assert isinstance(cs.inner.compressor, CNONE)
compressor = cs.compressor
data = bytes(1000)
compressed = compressor.compress(data)
# data + 2 id bytes for CNONE + 2 id bytes for the obfuscator + 4 length bytes, plus up to 1023 bytes of padding
assert 1000 + 8 <= len(compressed) <= 1000 + 8 + 1023
data = bytes(1100)
compressed = compressor.compress(data)
# data + 2 id bytes for CNONE + 2 id bytes for the obfuscator + 4 length bytes, plus up to 1023 bytes of padding
assert 1100 + 8 <= len(compressed) <= 1100 + 8 + 1023
def test_compression_specs():
with pytest.raises(ValueError):
CompressionSpec('')