From b45874bead2feff18e31c04524e0bc216ca21018 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Mon, 16 Nov 2020 22:51:55 +0100 Subject: [PATCH] ObfuscateSize compressor --- README.rst | 4 ++ docs/internals/security.rst | 21 +++++++++ src/borg/archiver.py | 31 ++++++++++++- src/borg/compress.pyx | 83 +++++++++++++++++++++++++++++++++- src/borg/helpers/checks.py | 2 +- src/borg/testsuite/compress.py | 55 ++++++++++++++++++++++ 6 files changed, 192 insertions(+), 4 deletions(-) diff --git a/README.rst b/README.rst index 0203a55c4..ba2db75cc 100644 --- a/README.rst +++ b/README.rst @@ -59,6 +59,10 @@ Main features All data can be protected using 256-bit AES encryption, data integrity and authenticity is verified using HMAC-SHA256. Data is encrypted clientside. +**Obfuscation** + Optionally, borg can actively obfuscate e.g. the size of files / chunks to + make fingerprinting attacks more difficult. + **Compression** All data can be optionally compressed: diff --git a/docs/internals/security.rst b/docs/internals/security.rst index 8d210d169..72688bc45 100644 --- a/docs/internals/security.rst +++ b/docs/internals/security.rst @@ -426,6 +426,27 @@ he assumes that the victim also possesses (and backups into the repository) could try a brute force fingerprinting attack based on the chunk sizes in the repository to prove his assumption. +To make this more difficult, borg has an ``obfuscate`` pseudo compressor, that +will take the output of the normal compression step and tries to obfuscate +the size of that output. Of course, it can only **add** to the size, not reduce +it. Thus, the optional usage of this mechanism comes at a cost: it will make +your repository larger (ranging from a few percent larger [cheap] to ridiculously +larger [expensive], depending on the algorithm/params you wisely choose). + +The output of the compressed-size obfuscation step will then be encrypted and +authenticated, as usual. 
Of course, using that obfuscation would not make any +sense without encryption. Thus, the additional data added by the obfuscator +are just 0x00 bytes, which is good enough because after encryption it will +look like random anyway. + +To summarize, this is making size-based fingerprinting difficult: + +- user-selectable chunker algorithm (and parametrization) +- for the buzhash chunker: secret, random per-repo chunker seed +- user-selectable compression algorithm (and level) +- optional ``obfuscate`` pseudo compressor with different choices + of algorithm and parameters + Stored chunk proximity ---------------------- diff --git a/src/borg/archiver.py b/src/borg/archiver.py index 8f5d3abe2..7315aca0e 100644 --- a/src/borg/archiver.py +++ b/src/borg/archiver.py @@ -2392,6 +2392,32 @@ def do_break_lock(self, args, repository): For compressible data, it uses the given C[,L] compression - with C[,L] being any valid compression specifier. + obfuscate,SPEC,C[,L] + Use compressed-size obfuscation to make fingerprinting attacks based on + the observable stored chunk size more difficult. + Note: + - you must combine this with encryption or it won't make any sense. + - your repo size will be bigger, of course. + + The SPEC value will determine how the size obfuscation will work: + + Relative random reciprocal size variation: + Size will increase by a factor, relative to the compressed data size. + Smaller factors are often used, larger factors rarely. + 1: factor 0.01 .. 100.0 + 2: factor 0.1 .. 1000.0 + 3: factor 1.0 .. 10000.0 + 4: factor 10.0 .. 100000.0 + 5: factor 100.0 .. 1000000.0 + 6: factor 1000.0 .. 10000000.0 + + Add a randomly sized padding up to the given size: + 110: 1kiB + ... + 120: 1MiB + ... + 123: 8MiB (max.) 
class ObfuscateSize(CompressorBase):
    """
    Meta-Compressor that obfuscates the compressed data size.

    Wire format (before the 2-byte ID header added by CompressorBase):
    a 4-byte big-endian length holding the real inner-compressed size,
    then the inner-compressed data, then zero padding whose length is
    chosen by the configured obfuscation strategy. The padding is all
    0x00 bytes; it only looks random after the (mandatory) encryption.
    """
    ID = b'\x04\x00'
    name = 'obfuscate'

    header_fmt = Struct('>I')  # real compressed size, big-endian uint32
    header_len = len(header_fmt.pack(0))

    def __init__(self, level=None, compressor=None):
        """
        :param level: obfuscation SPEC: 1..6 selects relative random
            reciprocal size variation, 110..123 selects random padding
            of up to 1kiB..8MiB. None when only decompressing.
        :param compressor: inner (real) compressor instance, or None to
            auto-detect it from the data on decompression.
        """
        super().__init__()
        self.compressor = compressor
        if level is None:
            pass  # decompression only - no obfuscation params needed
        elif 1 <= level <= 6:
            self._obfuscate = self._relative_random_reciprocal_obfuscate
            self.factor = 0.001 * 10 ** level
            self.min_r = 0.0001
        elif 110 <= level <= 123:
            self._obfuscate = self._random_padding_obfuscate
            self.max_padding_size = 2 ** (level - 100)  # 1kiB .. 8MiB

    def _obfuscate(self, compr_size):
        # implementations need to return the size of obfuscation data,
        # that the caller shall add.
        # Fixed: ``raise NotImplemented`` raised the NotImplemented
        # singleton (a TypeError at runtime), not an exception class.
        raise NotImplementedError

    def _relative_random_reciprocal_obfuscate(self, compr_size):
        # effect for SPEC 1:
        # f = 0.01 .. 0.1 for r in 1.0 .. 0.1 == in 90% of cases
        # f = 0.1 .. 1.0 for r in 0.1 .. 0.01 == in 9% of cases
        # f = 1.0 .. 10.0 for r in 0.01 .. 0.001 == in 0.9% of cases
        # f = 10.0 .. 100.0 for r in 0.001 .. 0.0001 == in 0.09% of cases
        r = max(self.min_r, random.random())  # 0..1, but dont get too close to 0
        f = self.factor / r
        return int(compr_size * f)

    def _random_padding_obfuscate(self, compr_size):
        # uniformly random padding in [0, max_padding_size)
        return int(self.max_padding_size * random.random())

    def compress(self, data):
        """Compress with the inner compressor, then pad to obfuscate the size."""
        compressed_data = self.compressor.compress(data)  # compress data
        compr_size = len(compressed_data)
        header = self.header_fmt.pack(compr_size)
        addtl_size = self._obfuscate(compr_size)
        addtl_size = max(0, addtl_size)  # we can only make it longer, not shorter!
        addtl_size = min(MAX_DATA_SIZE - 1024 - compr_size, addtl_size)  # stay away from MAX_DATA_SIZE
        trailer = bytes(addtl_size)  # all-zero padding
        obfuscated_data = b''.join([header, compressed_data, trailer])
        return super().compress(obfuscated_data)  # add ID header

    def decompress(self, data):
        """Strip ID header and padding, then decompress the inner payload."""
        if not isinstance(data, memoryview):
            data = memoryview(data)
        obfuscated_data = super().decompress(data)  # remove obfuscator ID header
        compr_size = self.header_fmt.unpack(obfuscated_data[0:self.header_len])[0]
        compressed_data = obfuscated_data[self.header_len:self.header_len + compr_size]
        if self.compressor is None:
            # detect the inner compressor from its own ID header
            self.compressor = Compressor.detect(compressed_data)()
        return self.compressor.decompress(compressed_data)  # decompress data
def test_obfuscate():
    """Exercise the obfuscate pseudo compressor with both SPEC families."""
    # SPEC 1: relative random reciprocal size variation on uncompressible data
    compressor = CompressionSpec('obfuscate,1,none').compressor
    data = bytes(10000)
    result = compressor.compress(data)
    # 2 id bytes compression, 2 id bytes obfuscator. 4 length bytes
    assert len(data) + 8 <= len(result) <= len(data) * 101 + 8
    # compressing 100 times the same data should give at least 50 different result sizes
    assert len({len(compressor.compress(data)) for _ in range(100)}) > 50

    spec = CompressionSpec('obfuscate,2,lz4')
    assert isinstance(spec.inner.compressor, LZ4)
    compressor = spec.compressor
    data = bytes(10000)
    result = compressor.compress(data)
    # 2 id bytes compression, 2 id bytes obfuscator. 4 length bytes
    min_compress, max_compress = 0.2, 0.001  # estimate compression factor outer boundaries
    assert max_compress * len(data) + 8 <= len(result) <= min_compress * len(data) * 1001 + 8
    # compressing 100 times the same data should give multiple different result sizes
    assert len({len(compressor.compress(data)) for _ in range(100)}) > 10

    spec = CompressionSpec('obfuscate,6,zstd,3')
    assert isinstance(spec.inner.compressor, ZSTD)
    compressor = spec.compressor
    data = bytes(10000)
    result = compressor.compress(data)
    # 2 id bytes compression, 2 id bytes obfuscator. 4 length bytes
    min_compress, max_compress = 0.2, 0.001  # estimate compression factor outer boundaries
    assert max_compress * len(data) + 8 <= len(result) <= min_compress * len(data) * 10000001 + 8
    # compressing 100 times the same data should give multiple different result sizes
    assert len({len(compressor.compress(data)) for _ in range(100)}) > 90

    spec = CompressionSpec('obfuscate,2,auto,zstd,10')
    assert isinstance(spec.inner.compressor, Auto)
    compressor = spec.compressor
    data = bytes(10000)
    result = compressor.compress(data)
    # 2 id bytes compression, 2 id bytes obfuscator. 4 length bytes
    min_compress, max_compress = 0.2, 0.001  # estimate compression factor outer boundaries
    assert max_compress * len(data) + 8 <= len(result) <= min_compress * len(data) * 1001 + 8
    # compressing 100 times the same data should give multiple different result sizes
    assert len({len(compressor.compress(data)) for _ in range(100)}) > 10

    # SPEC 110: random padding of up to 1 kiB
    spec = CompressionSpec('obfuscate,110,none')
    assert isinstance(spec.inner.compressor, CNONE)
    compressor = spec.compressor
    data = bytes(1000)
    result = compressor.compress(data)
    # N blocks + 2 id bytes obfuscator. 4 length bytes
    assert 1000 + 6 <= len(result) <= 1000 + 6 + 1024
    data = bytes(1100)
    result = compressor.compress(data)
    # N blocks + 2 id bytes obfuscator. 4 length bytes
    assert 1100 + 6 <= len(result) <= 1100 + 6 + 1024