From 6c1c87f7ae8cf3235894f4cec0f40dcd16cc96ba Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sat, 6 Aug 2016 01:28:02 +0200 Subject: [PATCH 1/5] add forgotten usage help file from build_usage --- docs/usage/debug-dump-repo-objs.rst.inc | 38 +++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 docs/usage/debug-dump-repo-objs.rst.inc diff --git a/docs/usage/debug-dump-repo-objs.rst.inc b/docs/usage/debug-dump-repo-objs.rst.inc new file mode 100644 index 000000000..4fcd45ae8 --- /dev/null +++ b/docs/usage/debug-dump-repo-objs.rst.inc @@ -0,0 +1,38 @@ +.. IMPORTANT: this file is auto-generated from borg's built-in help, do not edit! + +.. _borg_debug-dump-repo-objs: + +borg debug-dump-repo-objs +------------------------- +:: + + usage: borg debug-dump-repo-objs [-h] [--critical] [--error] [--warning] + [--info] [--debug] [--lock-wait N] + [--show-rc] [--no-files-cache] [--umask M] + [--remote-path PATH] + REPOSITORY + + dump (decrypted, decompressed) repo objects + + positional arguments: + REPOSITORY repo to dump + + optional arguments: + -h, --help show this help message and exit + --critical work on log level CRITICAL + --error work on log level ERROR + --warning work on log level WARNING (default) + --info, -v, --verbose + work on log level INFO + --debug work on log level DEBUG + --lock-wait N wait for the lock, but max. N seconds (default: 1). + --show-rc show/log the return code (rc) + --no-files-cache do not load/update the file metadata cache used to + detect unchanged files + --umask M set umask to M (local and remote, default: 0077) + --remote-path PATH set remote path to executable (default: "borg") + +Description +~~~~~~~~~~~ + +This command dumps raw (but decrypted and decompressed) repo objects to files. From d3000a7e5de952ed5096ccb6c46f0211fde93754 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Tue, 9 Aug 2016 00:33:12 +0200 Subject: [PATCH 2/5] LZ4: dynamically enlarge the (de)compression buffer, fixes #1453 the statically allocated COMPR_BUFFER was right size for chunks, but not for the archive item which could get larger if you have many millions of files/dirs. 
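A rough illustration of the sizing problem this patch addresses, written as a plain-Python sketch rather than borg's Cython (the LZ4 worst-case bound formula and the 20 MiB item size are assumptions for illustration; only the 1.1 * 2**23 figure comes from the old COMPR_BUFFER definition removed in the diff below)::

    def lz4_compress_bound(input_size: int) -> int:
        # documented LZ4 worst case for incompressible input
        return input_size + input_size // 255 + 16

    OLD_COMPR_BUFFER_SIZE = int(1.1 * 2**23)   # old static buffer, sized for <= 8 MiB chunks

    archive_item_size = 20 * 2**20             # hypothetical metadata item for millions of files
    # worst-case compressed size exceeds the static buffer -> compression would fail
    assert lz4_compress_bound(archive_item_size) > OLD_COMPR_BUFFER_SIZE
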
--- borg/archiver.py | 6 ++-- borg/compress.pyx | 60 ++++++++++++++++++++------------------ borg/helpers.py | 2 -- borg/key.py | 4 +-- borg/testsuite/compress.py | 29 +++++++++++------- 5 files changed, 54 insertions(+), 47 deletions(-) diff --git a/borg/archiver.py b/borg/archiver.py index bfd56bf0b..41373e259 100644 --- a/borg/archiver.py +++ b/borg/archiver.py @@ -25,7 +25,7 @@ EXIT_SUCCESS, EXIT_WARNING, EXIT_ERROR, log_multi, PatternMatcher, ErrorIgnoringTextIOWrapper from .logger import create_logger, setup_logging logger = create_logger() -from .compress import Compressor, COMPR_BUFFER +from .compress import Compressor from .upgrader import AtticRepositoryUpgrader, BorgRepositoryUpgrader from .repository import Repository from .cache import Cache @@ -240,9 +240,7 @@ def create_inner(archive, cache): dry_run = args.dry_run t0 = datetime.utcnow() if not dry_run: - compr_args = dict(buffer=COMPR_BUFFER) - compr_args.update(args.compression) - key.compressor = Compressor(**compr_args) + key.compressor = Compressor(**args.compression) with Cache(repository, key, manifest, do_files=args.cache_files, lock_wait=self.lock_wait) as cache: archive = Archive(repository, key, manifest, args.location.archive, cache=cache, create=True, checkpoint_interval=args.checkpoint_interval, diff --git a/borg/compress.pyx b/borg/compress.pyx index 3bb88def7..1330fbf2f 100644 --- a/borg/compress.pyx +++ b/borg/compress.pyx @@ -7,6 +7,7 @@ except ImportError: cdef extern from "lz4.h": int LZ4_compress_limitedOutput(const char* source, char* dest, int inputSize, int maxOutputSize) nogil int LZ4_decompress_safe(const char* source, char* dest, int inputSize, int maxOutputSize) nogil + int LZ4_compressBound(int inputSize) nogil cdef class CompressorBase: @@ -52,40 +53,35 @@ class CNONE(CompressorBase): return data -cdef class LZ4(CompressorBase): +class LZ4(CompressorBase): """ raw LZ4 compression / decompression (liblz4). Features: - lz4 is super fast - wrapper releases CPython's GIL to support multithreaded code - - buffer given by caller, avoiding frequent reallocation and buffer duplication - uses safe lz4 methods that never go beyond the end of the output buffer - - But beware: - - this is not very generic, the given buffer MUST be large enough to - handle all compression or decompression output (or it will fail). - - you must not do method calls to the same LZ4 instance from different - threads at the same time - create one LZ4 instance per thread! 
""" ID = b'\x01\x00' name = 'lz4' - cdef char *buffer # helper buffer for (de)compression output - cdef int bufsize # size of this buffer + def __init__(self, **kwargs): + self.buffer = None - def __cinit__(self, **kwargs): - buffer = kwargs['buffer'] - self.buffer = buffer - self.bufsize = len(buffer) + def _create_buffer(self, size): + # we keep a reference to the buffer until this instance is destroyed + self.buffer = bytes(int(size)) def compress(self, idata): if not isinstance(idata, bytes): idata = bytes(idata) # code below does not work with memoryview cdef int isize = len(idata) - cdef int osize = self.bufsize + cdef int osize cdef char *source = idata - cdef char *dest = self.buffer + cdef char *dest + osize = LZ4_compressBound(isize) + self._create_buffer(osize) + dest = self.buffer with nogil: osize = LZ4_compress_limitedOutput(source, dest, isize, osize) if not osize: @@ -97,15 +93,26 @@ cdef class LZ4(CompressorBase): idata = bytes(idata) # code below does not work with memoryview idata = super().decompress(idata) cdef int isize = len(idata) - cdef int osize = self.bufsize + cdef int osize + cdef int rsize cdef char *source = idata - cdef char *dest = self.buffer - with nogil: - osize = LZ4_decompress_safe(source, dest, isize, osize) - if osize < 0: - # malformed input data, buffer too small, ... - raise Exception('lz4 decompress failed') - return dest[:osize] + cdef char *dest + # a bit more than 8MB is enough for the usual data sizes yielded by the chunker. + # allocate more if isize * 3 is already bigger, to avoid having to resize often. + osize = max(int(1.1 * 2**23), isize * 3) + while True: + self._create_buffer(osize) + dest = self.buffer + with nogil: + rsize = LZ4_decompress_safe(source, dest, isize, osize) + if rsize >= 0: + break + if osize > 2 ** 30: + # this is insane, get out of here + raise Exception('lz4 decompress failed') + # likely the buffer was too small, get a bigger one: + osize = int(1.5 * osize) + return dest[:rsize] class LZMA(CompressorBase): @@ -192,8 +199,3 @@ class Compressor: return cls(**self.params).decompress(data) else: raise ValueError('No decompressor for this data found: %r.', data[:2]) - - -# a buffer used for (de)compression result, which can be slightly bigger -# than the chunk buffer in the worst (incompressible data) case, add 10%: -COMPR_BUFFER = bytes(int(1.1 * 2 ** 23)) # CHUNK_MAX_EXP == 23 diff --git a/borg/helpers.py b/borg/helpers.py index bacb434ba..4275d783e 100644 --- a/borg/helpers.py +++ b/borg/helpers.py @@ -492,8 +492,6 @@ def timestamp(s): def ChunkerParams(s): chunk_min, chunk_max, chunk_mask, window_size = s.split(',') if int(chunk_max) > 23: - # do not go beyond 2**23 (8MB) chunk size now, - # COMPR_BUFFER can only cope with up to this size raise ValueError('max. chunk size exponent must not be more than 23 (2^23 = 8MiB max. 
chunk size)') return int(chunk_min), int(chunk_max), int(chunk_mask), int(window_size) diff --git a/borg/key.py b/borg/key.py index be79dfc14..95178f7c8 100644 --- a/borg/key.py +++ b/borg/key.py @@ -12,7 +12,7 @@ logger = create_logger() from .crypto import AES, bytes_to_long, long_to_bytes, bytes_to_int, num_aes_blocks -from .compress import Compressor, COMPR_BUFFER +from .compress import Compressor import msgpack PREFIX = b'\0' * 8 @@ -70,7 +70,7 @@ def __init__(self, repository): self.TYPE_STR = bytes([self.TYPE]) self.repository = repository self.target = None # key location file path / repo obj - self.compressor = Compressor('none', buffer=COMPR_BUFFER) + self.compressor = Compressor('none') def id_hash(self, data): """Return HMAC hash using the "id" HMAC key diff --git a/borg/testsuite/compress.py b/borg/testsuite/compress.py index 1a4353583..ff9d42713 100644 --- a/borg/testsuite/compress.py +++ b/borg/testsuite/compress.py @@ -1,3 +1,4 @@ +import os import zlib try: import lzma @@ -11,13 +12,13 @@ buffer = bytes(2**16) data = b'fooooooooobaaaaaaaar' * 10 -params = dict(name='zlib', level=6, buffer=buffer) +params = dict(name='zlib', level=6) def test_get_compressor(): c = get_compressor(name='none') assert isinstance(c, CNONE) - c = get_compressor(name='lz4', buffer=buffer) + c = get_compressor(name='lz4') assert isinstance(c, LZ4) c = get_compressor(name='zlib') assert isinstance(c, ZLIB) @@ -35,13 +36,21 @@ def test_cnull(): def test_lz4(): - c = get_compressor(name='lz4', buffer=buffer) + c = get_compressor(name='lz4') cdata = c.compress(data) assert len(cdata) < len(data) assert data == c.decompress(cdata) assert data == Compressor(**params).decompress(cdata) # autodetect +def test_lz4_buffer_allocation(): + # test with a rather huge data object to see if buffer allocation / resizing works + data = os.urandom(50 * 2**20) # 50MiB incompressible data + c = get_compressor(name='lz4') + cdata = c.compress(data) + assert data == c.decompress(cdata) + + def test_zlib(): c = get_compressor(name='zlib') cdata = c.compress(data) @@ -83,16 +92,16 @@ def test_zlib_compat(): def test_compressor(): params_list = [ - dict(name='none', buffer=buffer), - dict(name='lz4', buffer=buffer), - dict(name='zlib', level=0, buffer=buffer), - dict(name='zlib', level=6, buffer=buffer), - dict(name='zlib', level=9, buffer=buffer), + dict(name='none'), + dict(name='lz4'), + dict(name='zlib', level=0), + dict(name='zlib', level=6), + dict(name='zlib', level=9), ] if lzma: params_list += [ - dict(name='lzma', level=0, buffer=buffer), - dict(name='lzma', level=6, buffer=buffer), + dict(name='lzma', level=0), + dict(name='lzma', level=6), # we do not test lzma on level 9 because of the huge memory needs ] for params in params_list: From b0e7bb5ddc41c71103cc83fbcf5b452133bb700e Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Tue, 9 Aug 2016 17:05:24 +0200 Subject: [PATCH 3/5] fixup: use thread-local buffer start with 0 bytes length (saves memory in case lz4 is not used). always grow when a bigger buffer is needed. avoid per-call reallocation / freeing / garbage. 
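The fixup keeps one grow-only scratch buffer per thread instead of one per LZ4 instance. A minimal pure-Python sketch of that pattern, using only the standard library (the real code in the diff below hands the buffer to the C functions as a char* under nogil and therefore uses bytes rather than bytearray)::

    import threading

    _local = threading.local()

    def get_buffer(size: int) -> bytearray:
        # grow-only, per-thread scratch buffer:
        # - starts out absent, so threads that never compress pay nothing
        # - only reallocated when a larger size is requested
        # - reused across calls, avoiding per-call allocation and garbage
        buf = getattr(_local, 'buffer', None)
        if buf is None or len(buf) < size:
            buf = bytearray(size)
            _local.buffer = buf
        return buf
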
--- borg/compress.pyx | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/borg/compress.pyx b/borg/compress.pyx index 1330fbf2f..13955a86b 100644 --- a/borg/compress.pyx +++ b/borg/compress.pyx @@ -1,3 +1,4 @@ +import threading import zlib try: import lzma @@ -10,6 +11,17 @@ cdef extern from "lz4.h": int LZ4_compressBound(int inputSize) nogil +thread_local = threading.local() +thread_local.buffer = bytes() + + +cdef char *get_buffer(size): + size = int(size) + if len(thread_local.buffer) < size: + thread_local.buffer = bytes(size) + return thread_local.buffer + + cdef class CompressorBase: """ base class for all (de)compression classes, @@ -66,11 +78,7 @@ class LZ4(CompressorBase): name = 'lz4' def __init__(self, **kwargs): - self.buffer = None - - def _create_buffer(self, size): - # we keep a reference to the buffer until this instance is destroyed - self.buffer = bytes(int(size)) + pass def compress(self, idata): if not isinstance(idata, bytes): @@ -80,8 +88,7 @@ class LZ4(CompressorBase): cdef char *source = idata cdef char *dest osize = LZ4_compressBound(isize) - self._create_buffer(osize) - dest = self.buffer + dest = get_buffer(osize) with nogil: osize = LZ4_compress_limitedOutput(source, dest, isize, osize) if not osize: @@ -101,8 +108,7 @@ class LZ4(CompressorBase): # allocate more if isize * 3 is already bigger, to avoid having to resize often. osize = max(int(1.1 * 2**23), isize * 3) while True: - self._create_buffer(osize) - dest = self.buffer + dest = get_buffer(osize) with nogil: rsize = LZ4_decompress_safe(source, dest, isize, osize) if rsize >= 0: From a360307938103e3dbd58b38b18c08df009f01ab4 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Mon, 8 Aug 2016 21:45:53 +0200 Subject: [PATCH 4/5] repo: do not put objects that we won't get, fixes #1451 we will not get() objects that have a segment entry larger than MAX_OBJECT_SIZE. thus we should never produce such entries. also: introduce repository.MAX_DATA_SIZE that gives the max payload size. --- borg/repository.py | 9 ++++++++- borg/testsuite/repository.py | 9 ++++++++- docs/changes.rst | 8 ++++++++ 3 files changed, 24 insertions(+), 2 deletions(-) diff --git a/borg/repository.py b/borg/repository.py index 66c0f6381..87bb4b169 100644 --- a/borg/repository.py +++ b/borg/repository.py @@ -731,8 +731,12 @@ def _read(self, fd, fmt, header, segment, offset, acceptable_tags): return size, tag, key, data def write_put(self, id, data, raise_full=False): + data_size = len(data) + if data_size > MAX_DATA_SIZE: + # this would push the segment entry size beyond MAX_OBJECT_SIZE. 
+ raise IntegrityError('More than allowed put data [{} > {}]'.format(data_size, MAX_DATA_SIZE)) fd = self.get_write_fd(raise_full=raise_full) - size = len(data) + self.put_header_fmt.size + size = data_size + self.put_header_fmt.size offset = self.offset header = self.header_no_crc_fmt.pack(size, TAG_PUT) crc = self.crc_fmt.pack(crc32(data, crc32(id, crc32(header))) & 0xffffffff) @@ -771,3 +775,6 @@ def close_segment(self): self._write_fd.close() sync_dir(os.path.dirname(self._write_fd.name)) self._write_fd = None + + +MAX_DATA_SIZE = MAX_OBJECT_SIZE - LoggedIO.put_header_fmt.size diff --git a/borg/testsuite/repository.py b/borg/testsuite/repository.py index bc08e097f..c50e785bb 100644 --- a/borg/testsuite/repository.py +++ b/borg/testsuite/repository.py @@ -8,7 +8,7 @@ from ..helpers import Location, IntegrityError from ..locking import Lock, LockFailed from ..remote import RemoteRepository, InvalidRPCMethod -from ..repository import Repository, LoggedIO, TAG_COMMIT +from ..repository import Repository, LoggedIO, TAG_COMMIT, MAX_DATA_SIZE from . import BaseTestCase @@ -128,6 +128,13 @@ def test_list(self): self.assert_equal(second_half, all[50:]) self.assert_equal(len(self.repository.list(limit=50)), 50) + def test_max_data_size(self): + max_data = b'x' * MAX_DATA_SIZE + self.repository.put(b'00000000000000000000000000000000', max_data) + self.assert_equal(self.repository.get(b'00000000000000000000000000000000'), max_data) + self.assert_raises(IntegrityError, + lambda: self.repository.put(b'00000000000000000000000000000001', max_data + b'x')) + class RepositoryCommitTestCase(RepositoryTestCaseBase): diff --git a/docs/changes.rst b/docs/changes.rst index ddfdb8f4e..305be063c 100644 --- a/docs/changes.rst +++ b/docs/changes.rst @@ -57,6 +57,14 @@ Security fixes: - fix security issue with remote repository access, #1428 +Bug fixes: + +- do not write objects to repository that are bigger than the allowed size, + borg will reject reading them, #1451. + IMPORTANT: if you created archives with many millions of files or + directories, please verify if you can open them successfully, + e.g. try a "borg list REPO::ARCHIVE". + Version 1.0.7rc1 (2016-08-05) ----------------------------- From 20392f8dd960ca23cca17f52ca481b1c9ea4e514 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Mon, 8 Aug 2016 22:00:34 +0200 Subject: [PATCH 5/5] repo: split size check into too small and too big also add a hint if somebody needs to restore an archive that has too big objects. --- borg/repository.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/borg/repository.py b/borg/repository.py index 87bb4b169..686f30a7d 100644 --- a/borg/repository.py +++ b/borg/repository.py @@ -712,9 +712,14 @@ def _read(self, fd, fmt, header, segment, offset, acceptable_tags): key = None else: raise TypeError("_read called with unsupported format") - if size > MAX_OBJECT_SIZE or size < fmt.size: - raise IntegrityError('Invalid segment entry size [segment {}, offset {}]'.format( - segment, offset)) + if size > MAX_OBJECT_SIZE: + # if you get this on an archive made with borg < 1.0.7 and millions of files and + # you need to restore it, you can disable this check by using "if False:" above. 
+ raise IntegrityError('Invalid segment entry size {} - too big [segment {}, offset {}]'.format( + size, segment, offset)) + if size < fmt.size: + raise IntegrityError('Invalid segment entry size {} - too small [segment {}, offset {}]'.format( + size, segment, offset)) length = size - fmt.size data = fd.read(length) if len(data) != length:
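
Taken together, patches 4 and 5 make write_put() and _read() enforce the same limit: nothing is written that could not be read back, and undersized/oversized segment entries are reported separately. A hedged sketch of that invariant in plain Python (the 20 MiB object limit and the '<LIB32s' PUT header layout are assumptions used only for illustration)::

    import struct

    MAX_OBJECT_SIZE = 20 * 1024 * 1024                      # assumed segment entry limit
    PUT_HEADER_FMT = struct.Struct('<LIB32s')               # assumed: crc32, size, tag, 32-byte id
    MAX_DATA_SIZE = MAX_OBJECT_SIZE - PUT_HEADER_FMT.size   # max payload per PUT entry

    def check_put(data: bytes) -> None:
        # reject on write what _read() would later reject on read (#1451)
        if len(data) > MAX_DATA_SIZE:
            raise ValueError('More than allowed put data [{} > {}]'.format(len(data), MAX_DATA_SIZE))

    def check_read(size: int, header_size: int) -> None:
        # split check: "too big" and "too small" get distinct error messages
        if size > MAX_OBJECT_SIZE:
            raise ValueError('Invalid segment entry size {} - too big'.format(size))
        if size < header_size:
            raise ValueError('Invalid segment entry size {} - too small'.format(size))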