From b45874bead2feff18e31c04524e0bc216ca21018 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Mon, 16 Nov 2020 22:51:55 +0100 Subject: [PATCH] ObfuscateSize compressor --- README.rst | 4 ++ docs/internals/security.rst | 21 +++++++++ src/borg/archiver.py | 31 ++++++++++++- src/borg/compress.pyx | 83 +++++++++++++++++++++++++++++++++- src/borg/helpers/checks.py | 2 +- src/borg/testsuite/compress.py | 55 ++++++++++++++++++++++ 6 files changed, 192 insertions(+), 4 deletions(-) diff --git a/README.rst b/README.rst index 0203a55c4..ba2db75cc 100644 --- a/README.rst +++ b/README.rst @@ -59,6 +59,10 @@ Main features All data can be protected using 256-bit AES encryption, data integrity and authenticity is verified using HMAC-SHA256. Data is encrypted clientside. +**Obfuscation** + Optionally, borg can actively obfuscate e.g. the size of files / chunks to + make fingerprinting attacks more difficult. + **Compression** All data can be optionally compressed: diff --git a/docs/internals/security.rst b/docs/internals/security.rst index 8d210d169..72688bc45 100644 --- a/docs/internals/security.rst +++ b/docs/internals/security.rst @@ -426,6 +426,27 @@ he assumes that the victim also possesses (and backups into the repository) could try a brute force fingerprinting attack based on the chunk sizes in the repository to prove his assumption. +To make this more difficult, borg has an ``obfuscate`` pseudo compressor, that +will take the output of the normal compression step and tries to obfuscate +the size of that output. Of course, it can only **add** to the size, not reduce +it. Thus, the optional usage of this mechanism comes at a cost: it will make +your repository larger (ranging from a few percent larger [cheap] to ridiculously +larger [expensive], depending on the algorithm/params you wisely choose). + +The output of the compressed-size obfuscation step will then be encrypted and +authenticated, as usual. 
Of course, using that obfuscation would not make any +sense without encryption. Thus, the additional data added by the obfuscator +are just 0x00 bytes, which is good enough because after encryption it will +look like random anyway. + +To summarize, this is making size-based fingerprinting difficult: + +- user-selectable chunker algorithm (and parametrization) +- for the buzhash chunker: secret, random per-repo chunker seed +- user-selectable compression algorithm (and level) +- optional ``obfuscate`` pseudo compressor with different choices + of algorithm and parameters + Stored chunk proximity ---------------------- diff --git a/src/borg/archiver.py b/src/borg/archiver.py index 8f5d3abe2..7315aca0e 100644 --- a/src/borg/archiver.py +++ b/src/borg/archiver.py @@ -2392,6 +2392,32 @@ def do_break_lock(self, args, repository): For compressible data, it uses the given C[,L] compression - with C[,L] being any valid compression specifier. + obfuscate,SPEC,C[,L] + Use compressed-size obfuscation to make fingerprinting attacks based on + the observable stored chunk size more difficult. + Note: + - you must combine this with encryption or it won't make any sense. + - your repo size will be bigger, of course. + + The SPEC value will determine how the size obfuscation will work: + + Relative random reciprocal size variation: + Size will increase by a factor, relative to the compressed data size. + Smaller factors are often used, larger factors rarely. + 1: factor 0.01 .. 100.0 + 2: factor 0.1 .. 1000.0 + 3: factor 1.0 .. 10000.0 + 4: factor 10.0 .. 100000.0 + 5: factor 100.0 .. 1000000.0 + 6: factor 1000.0 .. 10000000.0 + + Add a randomly sized padding up to the given size: + 110: 1kiB + ... + 120: 1MiB + ... + 123: 8MiB (max.) 
class ObfuscateSize(CompressorBase):
    """
    Meta-Compressor that obfuscates the compressed data size.

    Wire format (before the 2-byte ID header added by CompressorBase):
    a 4-byte big-endian length holding the real inner-compressed size,
    then the inner-compressed data, then zero padding whose length is
    chosen by the configured obfuscation strategy. The padding is all
    0x00 bytes; it only looks random after the (mandatory) encryption.
    """
    ID = b'\x04\x00'
    name = 'obfuscate'

    header_fmt = Struct('>I')  # real compressed size, big-endian uint32
    header_len = len(header_fmt.pack(0))

    def __init__(self, level=None, compressor=None):
        """
        :param level: obfuscation SPEC: 1..6 selects relative random
            reciprocal size variation, 110..123 selects random padding
            of up to 1kiB..8MiB. None when only decompressing.
        :param compressor: inner (real) compressor instance, or None to
            auto-detect it from the data on decompression.
        """
        super().__init__()
        self.compressor = compressor
        if level is None:
            pass  # decompression only - no obfuscation params needed
        elif 1 <= level <= 6:
            self._obfuscate = self._relative_random_reciprocal_obfuscate
            self.factor = 0.001 * 10 ** level
            self.min_r = 0.0001
        elif 110 <= level <= 123:
            self._obfuscate = self._random_padding_obfuscate
            self.max_padding_size = 2 ** (level - 100)  # 1kiB .. 8MiB

    def _obfuscate(self, compr_size):
        # implementations need to return the size of obfuscation data,
        # that the caller shall add.
        # Fixed: ``raise NotImplemented`` raised the NotImplemented
        # singleton (a TypeError at runtime), not an exception class.
        raise NotImplementedError

    def _relative_random_reciprocal_obfuscate(self, compr_size):
        # effect for SPEC 1:
        # f = 0.01 .. 0.1 for r in 1.0 .. 0.1 == in 90% of cases
        # f = 0.1 .. 1.0 for r in 0.1 .. 0.01 == in 9% of cases
        # f = 1.0 .. 10.0 for r in 0.01 .. 0.001 == in 0.9% of cases
        # f = 10.0 .. 100.0 for r in 0.001 .. 0.0001 == in 0.09% of cases
        r = max(self.min_r, random.random())  # 0..1, but dont get too close to 0
        f = self.factor / r
        return int(compr_size * f)

    def _random_padding_obfuscate(self, compr_size):
        # uniformly random padding in [0, max_padding_size)
        return int(self.max_padding_size * random.random())

    def compress(self, data):
        """Compress with the inner compressor, then pad to obfuscate the size."""
        compressed_data = self.compressor.compress(data)  # compress data
        compr_size = len(compressed_data)
        header = self.header_fmt.pack(compr_size)
        addtl_size = self._obfuscate(compr_size)
        addtl_size = max(0, addtl_size)  # we can only make it longer, not shorter!
        addtl_size = min(MAX_DATA_SIZE - 1024 - compr_size, addtl_size)  # stay away from MAX_DATA_SIZE
        trailer = bytes(addtl_size)  # all-zero padding
        obfuscated_data = b''.join([header, compressed_data, trailer])
        return super().compress(obfuscated_data)  # add ID header

    def decompress(self, data):
        """Strip ID header and padding, then decompress the inner payload."""
        if not isinstance(data, memoryview):
            data = memoryview(data)
        obfuscated_data = super().decompress(data)  # remove obfuscator ID header
        compr_size = self.header_fmt.unpack(obfuscated_data[0:self.header_len])[0]
        compressed_data = obfuscated_data[self.header_len:self.header_len + compr_size]
        if self.compressor is None:
            # detect the inner compressor from its own ID header
            self.compressor = Compressor.detect(compressed_data)()
        return self.compressor.decompress(compressed_data)  # decompress data
def test_obfuscate():
    """Exercise the obfuscate pseudo compressor with both SPEC families."""
    # SPEC 1: relative random reciprocal size variation on uncompressible data
    compressor = CompressionSpec('obfuscate,1,none').compressor
    data = bytes(10000)
    result = compressor.compress(data)
    # 2 id bytes compression, 2 id bytes obfuscator. 4 length bytes
    assert len(data) + 8 <= len(result) <= len(data) * 101 + 8
    # compressing 100 times the same data should give at least 50 different result sizes
    assert len({len(compressor.compress(data)) for _ in range(100)}) > 50

    spec = CompressionSpec('obfuscate,2,lz4')
    assert isinstance(spec.inner.compressor, LZ4)
    compressor = spec.compressor
    data = bytes(10000)
    result = compressor.compress(data)
    # 2 id bytes compression, 2 id bytes obfuscator. 4 length bytes
    min_compress, max_compress = 0.2, 0.001  # estimate compression factor outer boundaries
    assert max_compress * len(data) + 8 <= len(result) <= min_compress * len(data) * 1001 + 8
    # compressing 100 times the same data should give multiple different result sizes
    assert len({len(compressor.compress(data)) for _ in range(100)}) > 10

    spec = CompressionSpec('obfuscate,6,zstd,3')
    assert isinstance(spec.inner.compressor, ZSTD)
    compressor = spec.compressor
    data = bytes(10000)
    result = compressor.compress(data)
    # 2 id bytes compression, 2 id bytes obfuscator. 4 length bytes
    min_compress, max_compress = 0.2, 0.001  # estimate compression factor outer boundaries
    assert max_compress * len(data) + 8 <= len(result) <= min_compress * len(data) * 10000001 + 8
    # compressing 100 times the same data should give multiple different result sizes
    assert len({len(compressor.compress(data)) for _ in range(100)}) > 90

    spec = CompressionSpec('obfuscate,2,auto,zstd,10')
    assert isinstance(spec.inner.compressor, Auto)
    compressor = spec.compressor
    data = bytes(10000)
    result = compressor.compress(data)
    # 2 id bytes compression, 2 id bytes obfuscator. 4 length bytes
    min_compress, max_compress = 0.2, 0.001  # estimate compression factor outer boundaries
    assert max_compress * len(data) + 8 <= len(result) <= min_compress * len(data) * 1001 + 8
    # compressing 100 times the same data should give multiple different result sizes
    assert len({len(compressor.compress(data)) for _ in range(100)}) > 10

    # SPEC 110: random padding of up to 1 kiB
    spec = CompressionSpec('obfuscate,110,none')
    assert isinstance(spec.inner.compressor, CNONE)
    compressor = spec.compressor
    data = bytes(1000)
    result = compressor.compress(data)
    # N blocks + 2 id bytes obfuscator. 4 length bytes
    assert 1000 + 6 <= len(result) <= 1000 + 6 + 1024
    data = bytes(1100)
    result = compressor.compress(data)
    # N blocks + 2 id bytes obfuscator. 4 length bytes
    assert 1100 + 6 <= len(result) <= 1100 + 6 + 1024