From 6c1c87f7ae8cf3235894f4cec0f40dcd16cc96ba Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sat, 6 Aug 2016 01:28:02 +0200 Subject: [PATCH 1/5] add forgotten usage help file from build_usage --- docs/usage/debug-dump-repo-objs.rst.inc | 38 +++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 docs/usage/debug-dump-repo-objs.rst.inc diff --git a/docs/usage/debug-dump-repo-objs.rst.inc b/docs/usage/debug-dump-repo-objs.rst.inc new file mode 100644 index 000000000..4fcd45ae8 --- /dev/null +++ b/docs/usage/debug-dump-repo-objs.rst.inc @@ -0,0 +1,38 @@ +.. IMPORTANT: this file is auto-generated from borg's built-in help, do not edit! + +.. _borg_debug-dump-repo-objs: + +borg debug-dump-repo-objs +------------------------- +:: + + usage: borg debug-dump-repo-objs [-h] [--critical] [--error] [--warning] + [--info] [--debug] [--lock-wait N] + [--show-rc] [--no-files-cache] [--umask M] + [--remote-path PATH] + REPOSITORY + + dump (decrypted, decompressed) repo objects + + positional arguments: + REPOSITORY repo to dump + + optional arguments: + -h, --help show this help message and exit + --critical work on log level CRITICAL + --error work on log level ERROR + --warning work on log level WARNING (default) + --info, -v, --verbose + work on log level INFO + --debug work on log level DEBUG + --lock-wait N wait for the lock, but max. N seconds (default: 1). + --show-rc show/log the return code (rc) + --no-files-cache do not load/update the file metadata cache used to + detect unchanged files + --umask M set umask to M (local and remote, default: 0077) + --remote-path PATH set remote path to executable (default: "borg") + +Description +~~~~~~~~~~~ + +This command dumps raw (but decrypted and decompressed) repo objects to files. From d3000a7e5de952ed5096ccb6c46f0211fde93754 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Tue, 9 Aug 2016 00:33:12 +0200 Subject: [PATCH 2/5] LZ4: dynamically enlarge the (de)compression buffer, fixes #1453 the statically allocated COMPR_BUFFER was right size for chunks, but not for the archive item which could get larger if you have many millions of files/dirs. 
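A rough illustration of the sizing problem this patch addresses, written as a plain-Python sketch rather than borg's Cython (the LZ4 worst-case bound formula and the 20 MiB item size are assumptions for illustration; only the 1.1 * 2**23 figure comes from the old COMPR_BUFFER definition removed in the diff below)::

    def lz4_compress_bound(input_size: int) -> int:
        # documented LZ4 worst case for incompressible input
        return input_size + input_size // 255 + 16

    OLD_COMPR_BUFFER_SIZE = int(1.1 * 2**23)   # old static buffer, sized for <= 8 MiB chunks

    archive_item_size = 20 * 2**20             # hypothetical metadata item for millions of files
    # worst-case compressed size exceeds the static buffer -> compression would fail
    assert lz4_compress_bound(archive_item_size) > OLD_COMPR_BUFFER_SIZE
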
--- borg/archiver.py | 6 ++-- borg/compress.pyx | 60 ++++++++++++++++++++------------------ borg/helpers.py | 2 -- borg/key.py | 4 +-- borg/testsuite/compress.py | 29 +++++++++++------- 5 files changed, 54 insertions(+), 47 deletions(-) diff --git a/borg/archiver.py b/borg/archiver.py index bfd56bf0b..41373e259 100644 --- a/borg/archiver.py +++ b/borg/archiver.py @@ -25,7 +25,7 @@ EXIT_SUCCESS, EXIT_WARNING, EXIT_ERROR, log_multi, PatternMatcher, ErrorIgnoringTextIOWrapper from .logger import create_logger, setup_logging logger = create_logger() -from .compress import Compressor, COMPR_BUFFER +from .compress import Compressor from .upgrader import AtticRepositoryUpgrader, BorgRepositoryUpgrader from .repository import Repository from .cache import Cache @@ -240,9 +240,7 @@ def create_inner(archive, cache): dry_run = args.dry_run t0 = datetime.utcnow() if not dry_run: - compr_args = dict(buffer=COMPR_BUFFER) - compr_args.update(args.compression) - key.compressor = Compressor(**compr_args) + key.compressor = Compressor(**args.compression) with Cache(repository, key, manifest, do_files=args.cache_files, lock_wait=self.lock_wait) as cache: archive = Archive(repository, key, manifest, args.location.archive, cache=cache, create=True, checkpoint_interval=args.checkpoint_interval, diff --git a/borg/compress.pyx b/borg/compress.pyx index 3bb88def7..1330fbf2f 100644 --- a/borg/compress.pyx +++ b/borg/compress.pyx @@ -7,6 +7,7 @@ except ImportError: cdef extern from "lz4.h": int LZ4_compress_limitedOutput(const char* source, char* dest, int inputSize, int maxOutputSize) nogil int LZ4_decompress_safe(const char* source, char* dest, int inputSize, int maxOutputSize) nogil + int LZ4_compressBound(int inputSize) nogil cdef class CompressorBase: @@ -52,40 +53,35 @@ class CNONE(CompressorBase): return data -cdef class LZ4(CompressorBase): +class LZ4(CompressorBase): """ raw LZ4 compression / decompression (liblz4). Features: - lz4 is super fast - wrapper releases CPython's GIL to support multithreaded code - - buffer given by caller, avoiding frequent reallocation and buffer duplication - uses safe lz4 methods that never go beyond the end of the output buffer - - But beware: - - this is not very generic, the given buffer MUST be large enough to - handle all compression or decompression output (or it will fail). - - you must not do method calls to the same LZ4 instance from different - threads at the same time - create one LZ4 instance per thread! 
""" ID = b'\x01\x00' name = 'lz4' - cdef char *buffer # helper buffer for (de)compression output - cdef int bufsize # size of this buffer + def __init__(self, **kwargs): + self.buffer = None - def __cinit__(self, **kwargs): - buffer = kwargs['buffer'] - self.buffer = buffer - self.bufsize = len(buffer) + def _create_buffer(self, size): + # we keep a reference to the buffer until this instance is destroyed + self.buffer = bytes(int(size)) def compress(self, idata): if not isinstance(idata, bytes): idata = bytes(idata) # code below does not work with memoryview cdef int isize = len(idata) - cdef int osize = self.bufsize + cdef int osize cdef char *source = idata - cdef char *dest = self.buffer + cdef char *dest + osize = LZ4_compressBound(isize) + self._create_buffer(osize) + dest = self.buffer with nogil: osize = LZ4_compress_limitedOutput(source, dest, isize, osize) if not osize: @@ -97,15 +93,26 @@ cdef class LZ4(CompressorBase): idata = bytes(idata) # code below does not work with memoryview idata = super().decompress(idata) cdef int isize = len(idata) - cdef int osize = self.bufsize + cdef int osize + cdef int rsize cdef char *source = idata - cdef char *dest = self.buffer - with nogil: - osize = LZ4_decompress_safe(source, dest, isize, osize) - if osize < 0: - # malformed input data, buffer too small, ... - raise Exception('lz4 decompress failed') - return dest[:osize] + cdef char *dest + # a bit more than 8MB is enough for the usual data sizes yielded by the chunker. + # allocate more if isize * 3 is already bigger, to avoid having to resize often. + osize = max(int(1.1 * 2**23), isize * 3) + while True: + self._create_buffer(osize) + dest = self.buffer + with nogil: + rsize = LZ4_decompress_safe(source, dest, isize, osize) + if rsize >= 0: + break + if osize > 2 ** 30: + # this is insane, get out of here + raise Exception('lz4 decompress failed') + # likely the buffer was too small, get a bigger one: + osize = int(1.5 * osize) + return dest[:rsize] class LZMA(CompressorBase): @@ -192,8 +199,3 @@ class Compressor: return cls(**self.params).decompress(data) else: raise ValueError('No decompressor for this data found: %r.', data[:2]) - - -# a buffer used for (de)compression result, which can be slightly bigger -# than the chunk buffer in the worst (incompressible data) case, add 10%: -COMPR_BUFFER = bytes(int(1.1 * 2 ** 23)) # CHUNK_MAX_EXP == 23 diff --git a/borg/helpers.py b/borg/helpers.py index bacb434ba..4275d783e 100644 --- a/borg/helpers.py +++ b/borg/helpers.py @@ -492,8 +492,6 @@ def timestamp(s): def ChunkerParams(s): chunk_min, chunk_max, chunk_mask, window_size = s.split(',') if int(chunk_max) > 23: - # do not go beyond 2**23 (8MB) chunk size now, - # COMPR_BUFFER can only cope with up to this size raise ValueError('max. chunk size exponent must not be more than 23 (2^23 = 8MiB max. 
chunk size)') return int(chunk_min), int(chunk_max), int(chunk_mask), int(window_size) diff --git a/borg/key.py b/borg/key.py index be79dfc14..95178f7c8 100644 --- a/borg/key.py +++ b/borg/key.py @@ -12,7 +12,7 @@ logger = create_logger() from .crypto import AES, bytes_to_long, long_to_bytes, bytes_to_int, num_aes_blocks -from .compress import Compressor, COMPR_BUFFER +from .compress import Compressor import msgpack PREFIX = b'\0' * 8 @@ -70,7 +70,7 @@ def __init__(self, repository): self.TYPE_STR = bytes([self.TYPE]) self.repository = repository self.target = None # key location file path / repo obj - self.compressor = Compressor('none', buffer=COMPR_BUFFER) + self.compressor = Compressor('none') def id_hash(self, data): """Return HMAC hash using the "id" HMAC key diff --git a/borg/testsuite/compress.py b/borg/testsuite/compress.py index 1a4353583..ff9d42713 100644 --- a/borg/testsuite/compress.py +++ b/borg/testsuite/compress.py @@ -1,3 +1,4 @@ +import os import zlib try: import lzma @@ -11,13 +12,13 @@ buffer = bytes(2**16) data = b'fooooooooobaaaaaaaar' * 10 -params = dict(name='zlib', level=6, buffer=buffer) +params = dict(name='zlib', level=6) def test_get_compressor(): c = get_compressor(name='none') assert isinstance(c, CNONE) - c = get_compressor(name='lz4', buffer=buffer) + c = get_compressor(name='lz4') assert isinstance(c, LZ4) c = get_compressor(name='zlib') assert isinstance(c, ZLIB) @@ -35,13 +36,21 @@ def test_cnull(): def test_lz4(): - c = get_compressor(name='lz4', buffer=buffer) + c = get_compressor(name='lz4') cdata = c.compress(data) assert len(cdata) < len(data) assert data == c.decompress(cdata) assert data == Compressor(**params).decompress(cdata) # autodetect +def test_lz4_buffer_allocation(): + # test with a rather huge data object to see if buffer allocation / resizing works + data = os.urandom(50 * 2**20) # 50MiB incompressible data + c = get_compressor(name='lz4') + cdata = c.compress(data) + assert data == c.decompress(cdata) + + def test_zlib(): c = get_compressor(name='zlib') cdata = c.compress(data) @@ -83,16 +92,16 @@ def test_zlib_compat(): def test_compressor(): params_list = [ - dict(name='none', buffer=buffer), - dict(name='lz4', buffer=buffer), - dict(name='zlib', level=0, buffer=buffer), - dict(name='zlib', level=6, buffer=buffer), - dict(name='zlib', level=9, buffer=buffer), + dict(name='none'), + dict(name='lz4'), + dict(name='zlib', level=0), + dict(name='zlib', level=6), + dict(name='zlib', level=9), ] if lzma: params_list += [ - dict(name='lzma', level=0, buffer=buffer), - dict(name='lzma', level=6, buffer=buffer), + dict(name='lzma', level=0), + dict(name='lzma', level=6), # we do not test lzma on level 9 because of the huge memory needs ] for params in params_list: From b0e7bb5ddc41c71103cc83fbcf5b452133bb700e Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Tue, 9 Aug 2016 17:05:24 +0200 Subject: [PATCH 3/5] fixup: use thread-local buffer start with 0 bytes length (saves memory in case lz4 is not used). always grow when a bigger buffer is needed. avoid per-call reallocation / freeing / garbage. 
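The fixup keeps one grow-only scratch buffer per thread instead of one per LZ4 instance. A minimal pure-Python sketch of that pattern, using only the standard library (the real code in the diff below hands the buffer to the C functions as a char* under nogil and therefore uses bytes rather than bytearray)::

    import threading

    _local = threading.local()

    def get_buffer(size: int) -> bytearray:
        # grow-only, per-thread scratch buffer:
        # - starts out absent, so threads that never compress pay nothing
        # - only reallocated when a larger size is requested
        # - reused across calls, avoiding per-call allocation and garbage
        buf = getattr(_local, 'buffer', None)
        if buf is None or len(buf) < size:
            buf = bytearray(size)
            _local.buffer = buf
        return buf
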
--- borg/compress.pyx | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/borg/compress.pyx b/borg/compress.pyx index 1330fbf2f..13955a86b 100644 --- a/borg/compress.pyx +++ b/borg/compress.pyx @@ -1,3 +1,4 @@ +import threading import zlib try: import lzma @@ -10,6 +11,17 @@ cdef extern from "lz4.h": int LZ4_compressBound(int inputSize) nogil +thread_local = threading.local() +thread_local.buffer = bytes() + + +cdef char *get_buffer(size): + size = int(size) + if len(thread_local.buffer) < size: + thread_local.buffer = bytes(size) + return thread_local.buffer + + cdef class CompressorBase: """ base class for all (de)compression classes, @@ -66,11 +78,7 @@ class LZ4(CompressorBase): name = 'lz4' def __init__(self, **kwargs): - self.buffer = None - - def _create_buffer(self, size): - # we keep a reference to the buffer until this instance is destroyed - self.buffer = bytes(int(size)) + pass def compress(self, idata): if not isinstance(idata, bytes): @@ -80,8 +88,7 @@ class LZ4(CompressorBase): cdef char *source = idata cdef char *dest osize = LZ4_compressBound(isize) - self._create_buffer(osize) - dest = self.buffer + dest = get_buffer(osize) with nogil: osize = LZ4_compress_limitedOutput(source, dest, isize, osize) if not osize: @@ -101,8 +108,7 @@ class LZ4(CompressorBase): # allocate more if isize * 3 is already bigger, to avoid having to resize often. osize = max(int(1.1 * 2**23), isize * 3) while True: - self._create_buffer(osize) - dest = self.buffer + dest = get_buffer(osize) with nogil: rsize = LZ4_decompress_safe(source, dest, isize, osize) if rsize >= 0: From a360307938103e3dbd58b38b18c08df009f01ab4 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Mon, 8 Aug 2016 21:45:53 +0200 Subject: [PATCH 4/5] repo: do not put objects that we won't get, fixes #1451 we will not get() objects that have a segment entry larger than MAX_OBJECT_SIZE. thus we should never produce such entries. also: introduce repository.MAX_DATA_SIZE that gives the max payload size. --- borg/repository.py | 9 ++++++++- borg/testsuite/repository.py | 9 ++++++++- docs/changes.rst | 8 ++++++++ 3 files changed, 24 insertions(+), 2 deletions(-) diff --git a/borg/repository.py b/borg/repository.py index 66c0f6381..87bb4b169 100644 --- a/borg/repository.py +++ b/borg/repository.py @@ -731,8 +731,12 @@ def _read(self, fd, fmt, header, segment, offset, acceptable_tags): return size, tag, key, data def write_put(self, id, data, raise_full=False): + data_size = len(data) + if data_size > MAX_DATA_SIZE: + # this would push the segment entry size beyond MAX_OBJECT_SIZE. 
+ raise IntegrityError('More than allowed put data [{} > {}]'.format(data_size, MAX_DATA_SIZE)) fd = self.get_write_fd(raise_full=raise_full) - size = len(data) + self.put_header_fmt.size + size = data_size + self.put_header_fmt.size offset = self.offset header = self.header_no_crc_fmt.pack(size, TAG_PUT) crc = self.crc_fmt.pack(crc32(data, crc32(id, crc32(header))) & 0xffffffff) @@ -771,3 +775,6 @@ def close_segment(self): self._write_fd.close() sync_dir(os.path.dirname(self._write_fd.name)) self._write_fd = None + + +MAX_DATA_SIZE = MAX_OBJECT_SIZE - LoggedIO.put_header_fmt.size diff --git a/borg/testsuite/repository.py b/borg/testsuite/repository.py index bc08e097f..c50e785bb 100644 --- a/borg/testsuite/repository.py +++ b/borg/testsuite/repository.py @@ -8,7 +8,7 @@ from ..helpers import Location, IntegrityError from ..locking import Lock, LockFailed from ..remote import RemoteRepository, InvalidRPCMethod -from ..repository import Repository, LoggedIO, TAG_COMMIT +from ..repository import Repository, LoggedIO, TAG_COMMIT, MAX_DATA_SIZE from . import BaseTestCase @@ -128,6 +128,13 @@ def test_list(self): self.assert_equal(second_half, all[50:]) self.assert_equal(len(self.repository.list(limit=50)), 50) + def test_max_data_size(self): + max_data = b'x' * MAX_DATA_SIZE + self.repository.put(b'00000000000000000000000000000000', max_data) + self.assert_equal(self.repository.get(b'00000000000000000000000000000000'), max_data) + self.assert_raises(IntegrityError, + lambda: self.repository.put(b'00000000000000000000000000000001', max_data + b'x')) + class RepositoryCommitTestCase(RepositoryTestCaseBase): diff --git a/docs/changes.rst b/docs/changes.rst index ddfdb8f4e..305be063c 100644 --- a/docs/changes.rst +++ b/docs/changes.rst @@ -57,6 +57,14 @@ Security fixes: - fix security issue with remote repository access, #1428 +Bug fixes: + +- do not write objects to repository that are bigger than the allowed size, + borg will reject reading them, #1451. + IMPORTANT: if you created archives with many millions of files or + directories, please verify if you can open them successfully, + e.g. try a "borg list REPO::ARCHIVE". + Version 1.0.7rc1 (2016-08-05) ----------------------------- From 20392f8dd960ca23cca17f52ca481b1c9ea4e514 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Mon, 8 Aug 2016 22:00:34 +0200 Subject: [PATCH 5/5] repo: split size check into too small and too big also add a hint if somebody needs to restore an archive that has too big objects. --- borg/repository.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/borg/repository.py b/borg/repository.py index 87bb4b169..686f30a7d 100644 --- a/borg/repository.py +++ b/borg/repository.py @@ -712,9 +712,14 @@ def _read(self, fd, fmt, header, segment, offset, acceptable_tags): key = None else: raise TypeError("_read called with unsupported format") - if size > MAX_OBJECT_SIZE or size < fmt.size: - raise IntegrityError('Invalid segment entry size [segment {}, offset {}]'.format( - segment, offset)) + if size > MAX_OBJECT_SIZE: + # if you get this on an archive made with borg < 1.0.7 and millions of files and + # you need to restore it, you can disable this check by using "if False:" above. 
+ raise IntegrityError('Invalid segment entry size {} - too big [segment {}, offset {}]'.format( + size, segment, offset)) + if size < fmt.size: + raise IntegrityError('Invalid segment entry size {} - too small [segment {}, offset {}]'.format( + size, segment, offset)) length = size - fmt.size data = fd.read(length) if len(data) != length:
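
Taken together, patches 4 and 5 make write_put() and _read() enforce the same limit: nothing is written that could not be read back, and undersized/oversized segment entries are reported separately. A hedged sketch of that invariant in plain Python (the 20 MiB object limit and the '<LIB32s' PUT header layout are assumptions used only for illustration)::

    import struct

    MAX_OBJECT_SIZE = 20 * 1024 * 1024                      # assumed segment entry limit
    PUT_HEADER_FMT = struct.Struct('<LIB32s')               # assumed: crc32, size, tag, 32-byte id
    MAX_DATA_SIZE = MAX_OBJECT_SIZE - PUT_HEADER_FMT.size   # max payload per PUT entry

    def check_put(data: bytes) -> None:
        # reject on write what _read() would later reject on read (#1451)
        if len(data) > MAX_DATA_SIZE:
            raise ValueError('More than allowed put data [{} > {}]'.format(len(data), MAX_DATA_SIZE))

    def check_read(size: int, header_size: int) -> None:
        # split check: "too big" and "too small" get distinct error messages
        if size > MAX_OBJECT_SIZE:
            raise ValueError('Invalid segment entry size {} - too big'.format(size))
        if size < header_size:
            raise ValueError('Invalid segment entry size {} - too small'.format(size))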