diff --git a/docs/changes.rst b/docs/changes.rst
index 70cf34129..95e1c612a 100644
--- a/docs/changes.rst
+++ b/docs/changes.rst
@@ -130,6 +130,14 @@ Security fixes:
 
 - fix security issue with remote repository access, #1428
 
+Bug fixes:
+
+- do not write objects to repository that are bigger than the allowed size,
+  borg will reject reading them, #1451.
+  IMPORTANT: if you created archives with many millions of files or
+  directories, please verify if you can open them successfully,
+  e.g. try a "borg list REPO::ARCHIVE".
+
 Version 1.0.7rc1 (2016-08-05)
 -----------------------------
 
diff --git a/docs/usage/debug-dump-repo-objs.rst.inc b/docs/usage/debug-dump-repo-objs.rst.inc
new file mode 100644
index 000000000..4fcd45ae8
--- /dev/null
+++ b/docs/usage/debug-dump-repo-objs.rst.inc
@@ -0,0 +1,38 @@
+.. IMPORTANT: this file is auto-generated from borg's built-in help, do not edit!
+
+.. _borg_debug-dump-repo-objs:
+
+borg debug-dump-repo-objs
+-------------------------
+::
+
+    usage: borg debug-dump-repo-objs [-h] [--critical] [--error] [--warning]
+                                     [--info] [--debug] [--lock-wait N]
+                                     [--show-rc] [--no-files-cache] [--umask M]
+                                     [--remote-path PATH]
+                                     REPOSITORY
+
+    dump (decrypted, decompressed) repo objects
+
+    positional arguments:
+        REPOSITORY          repo to dump
+
+    optional arguments:
+        -h, --help          show this help message and exit
+        --critical          work on log level CRITICAL
+        --error             work on log level ERROR
+        --warning           work on log level WARNING (default)
+        --info, -v, --verbose
+                            work on log level INFO
+        --debug             work on log level DEBUG
+        --lock-wait N       wait for the lock, but max. N seconds (default: 1).
+        --show-rc           show/log the return code (rc)
+        --no-files-cache    do not load/update the file metadata cache used to
+                            detect unchanged files
+        --umask M           set umask to M (local and remote, default: 0077)
+        --remote-path PATH  set remote path to executable (default: "borg")
+
+Description
+~~~~~~~~~~~
+
+This command dumps raw (but decrypted and decompressed) repo objects to files.
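The compress.pyx diff that follows replaces the caller-provided (de)compression buffer with a thread-local, grow-on-demand buffer, so callers no longer have to allocate and pass one buffer per thread. A pure-Python sketch of the buffer strategy (names mirror the Cython code; note that an attribute assigned to a threading.local at import time is visible only in the importing thread, which is why this sketch uses getattr with a default)::

    import threading

    thread_local = threading.local()

    def get_buffer(size):
        # Grow-on-demand: reuse this thread's buffer when it is big enough,
        # otherwise replace it with a larger one. Every thread owns its own
        # buffer, so concurrent (de)compression never shares output memory.
        size = int(size)
        if len(getattr(thread_local, 'buffer', b'')) < size:
            thread_local.buffer = bytes(size)
        return thread_local.buffer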
""" ID = b'\x01\x00' name = 'lz4' - cdef char *buffer # helper buffer for (de)compression output - cdef int bufsize # size of this buffer - - def __cinit__(self, **kwargs): - buffer = kwargs['buffer'] - self.buffer = buffer - self.bufsize = len(buffer) + def __init__(self, **kwargs): + pass def compress(self, idata): if not isinstance(idata, bytes): idata = bytes(idata) # code below does not work with memoryview cdef int isize = len(idata) - cdef int osize = self.bufsize + cdef int osize cdef char *source = idata - cdef char *dest = self.buffer + cdef char *dest + osize = LZ4_compressBound(isize) + dest = get_buffer(osize) with nogil: osize = LZ4_compress_limitedOutput(source, dest, isize, osize) if not osize: @@ -97,15 +100,25 @@ cdef class LZ4(CompressorBase): idata = bytes(idata) # code below does not work with memoryview idata = super().decompress(idata) cdef int isize = len(idata) - cdef int osize = self.bufsize + cdef int osize + cdef int rsize cdef char *source = idata - cdef char *dest = self.buffer - with nogil: - osize = LZ4_decompress_safe(source, dest, isize, osize) - if osize < 0: - # malformed input data, buffer too small, ... - raise Exception('lz4 decompress failed') - return dest[:osize] + cdef char *dest + # a bit more than 8MB is enough for the usual data sizes yielded by the chunker. + # allocate more if isize * 3 is already bigger, to avoid having to resize often. + osize = max(int(1.1 * 2**23), isize * 3) + while True: + dest = get_buffer(osize) + with nogil: + rsize = LZ4_decompress_safe(source, dest, isize, osize) + if rsize >= 0: + break + if osize > 2 ** 30: + # this is insane, get out of here + raise Exception('lz4 decompress failed') + # likely the buffer was too small, get a bigger one: + osize = int(1.5 * osize) + return dest[:rsize] class LZMA(CompressorBase): @@ -192,8 +205,3 @@ class Compressor: return cls(**self.params).decompress(data) else: raise ValueError('No decompressor for this data found: %r.', data[:2]) - - -# a buffer used for (de)compression result, which can be slightly bigger -# than the chunk buffer in the worst (incompressible data) case, add 10%: -COMPR_BUFFER = bytes(int(1.1 * 2 ** 23)) # CHUNK_MAX_EXP == 23 diff --git a/src/borg/helpers.py b/src/borg/helpers.py index 450f33648..1761b5d7f 100644 --- a/src/borg/helpers.py +++ b/src/borg/helpers.py @@ -38,7 +38,7 @@ from . import crypto from . import hashindex from . import shellpattern from .constants import * # NOQA -from .compress import COMPR_BUFFER, get_compressor +from .compress import get_compressor # meta dict, data bytes _Chunk = namedtuple('_Chunk', 'meta data') @@ -470,8 +470,6 @@ def ChunkerParams(s): return CHUNKER_PARAMS chunk_min, chunk_max, chunk_mask, window_size = s.split(',') if int(chunk_max) > 23: - # do not go beyond 2**23 (8MB) chunk size now, - # COMPR_BUFFER can only cope with up to this size raise ValueError('max. chunk size exponent must not be more than 23 (2^23 = 8MiB max. chunk size)') return int(chunk_min), int(chunk_max), int(chunk_mask), int(window_size) @@ -1538,16 +1536,14 @@ class CompressionDecider2: # if we compress the data here to decide, we can even update the chunk data # and modify the metadata as desired. 
diff --git a/src/borg/helpers.py b/src/borg/helpers.py
index 450f33648..1761b5d7f 100644
--- a/src/borg/helpers.py
+++ b/src/borg/helpers.py
@@ -38,7 +38,7 @@ from . import crypto
 from . import hashindex
 from . import shellpattern
 from .constants import *  # NOQA
-from .compress import COMPR_BUFFER, get_compressor
+from .compress import get_compressor
 
 # meta dict, data bytes
 _Chunk = namedtuple('_Chunk', 'meta data')
@@ -470,8 +470,6 @@ def ChunkerParams(s):
         return CHUNKER_PARAMS
     chunk_min, chunk_max, chunk_mask, window_size = s.split(',')
     if int(chunk_max) > 23:
-        # do not go beyond 2**23 (8MB) chunk size now,
-        # COMPR_BUFFER can only cope with up to this size
         raise ValueError('max. chunk size exponent must not be more than 23 (2^23 = 8MiB max. chunk size)')
     return int(chunk_min), int(chunk_max), int(chunk_mask), int(window_size)
 
@@ -1538,16 +1536,14 @@ class CompressionDecider2:
         # if we compress the data here to decide, we can even update the chunk data
         # and modify the metadata as desired.
         compr_spec = chunk.meta.get('compress', self.compression)
-        compr_args = dict(buffer=COMPR_BUFFER)
-        compr_args.update(compr_spec)
-        if compr_args['name'] == 'auto':
+        if compr_spec['name'] == 'auto':
             # we did not decide yet, use heuristic:
-            compr_args, chunk = self.heuristic_lz4(compr_args, chunk)
-        return compr_args, chunk
+            compr_spec, chunk = self.heuristic_lz4(compr_spec, chunk)
+        return compr_spec, chunk
 
     def heuristic_lz4(self, compr_args, chunk):
         meta, data = chunk
-        lz4 = get_compressor('lz4', buffer=compr_args['buffer'])
+        lz4 = get_compressor('lz4')
         cdata = lz4.compress(data)
         data_len = len(data)
         cdata_len = len(cdata)
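With the buffer plumbing gone, heuristic_lz4() above only needs a plain lz4 compressor to probe each chunk's compressibility before the configured (possibly slower) algorithm is applied. A hedged sketch of such a decision; the cut-off ratio and the return values are assumptions for illustration, not the code above::

    def decide_compression(data, compr_spec, lz4_compress, ratio=0.97):
        # Probe once with fast lz4; if the chunk barely shrinks, assume it is
        # incompressible and skip the configured algorithm entirely.
        cdata = lz4_compress(data)
        if len(cdata) < ratio * len(data):
            return compr_spec          # compresses well: keep the configured spec
        return dict(name='none')       # incompressible: store as-is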
diff --git a/src/borg/key.py b/src/borg/key.py
index b122b638c..dd64b0fd1 100644
--- a/src/borg/key.py
+++ b/src/borg/key.py
@@ -13,7 +13,7 @@ from .logger import create_logger
 logger = create_logger()
 
 from .constants import *  # NOQA
-from .compress import Compressor, COMPR_BUFFER, get_compressor
+from .compress import Compressor, get_compressor
 from .crypto import AES, bytes_to_long, long_to_bytes, bytes_to_int, num_aes_blocks, hmac_sha256
 from .helpers import Chunk
 from .helpers import Error, IntegrityError
@@ -89,7 +89,7 @@ class KeyBase:
         self.repository = repository
         self.target = None  # key location file path / repo obj
         self.compression_decider2 = CompressionDecider2(CompressionSpec('none'))
-        self.compressor = Compressor('none', buffer=COMPR_BUFFER)  # for decompression
+        self.compressor = Compressor('none')  # for decompression
 
     def id_hash(self, data):
         """Return HMAC hash using the "id" HMAC key
diff --git a/src/borg/repository.py b/src/borg/repository.py
index 0dcee680c..c4923f154 100644
--- a/src/borg/repository.py
+++ b/src/borg/repository.py
@@ -909,9 +909,14 @@ class LoggedIO:
             key = None
         else:
             raise TypeError("_read called with unsupported format")
-        if size > MAX_OBJECT_SIZE or size < fmt.size:
-            raise IntegrityError('Invalid segment entry size [segment {}, offset {}]'.format(
-                segment, offset))
+        if size > MAX_OBJECT_SIZE:
+            # if you get this on an archive made with borg < 1.0.7 and millions of files and
+            # you need to restore it, you can disable this check by using "if False:" above.
+            raise IntegrityError('Invalid segment entry size {} - too big [segment {}, offset {}]'.format(
+                size, segment, offset))
+        if size < fmt.size:
+            raise IntegrityError('Invalid segment entry size {} - too small [segment {}, offset {}]'.format(
+                size, segment, offset))
         length = size - fmt.size
         if read_data:
             data = fd.read(length)
@@ -942,8 +947,12 @@ class LoggedIO:
         return size, tag, key, data
 
     def write_put(self, id, data, raise_full=False):
+        data_size = len(data)
+        if data_size > MAX_DATA_SIZE:
+            # this would push the segment entry size beyond MAX_OBJECT_SIZE.
+            raise IntegrityError('More than allowed put data [{} > {}]'.format(data_size, MAX_DATA_SIZE))
         fd = self.get_write_fd(raise_full=raise_full)
-        size = len(data) + self.put_header_fmt.size
+        size = data_size + self.put_header_fmt.size
         offset = self.offset
         header = self.header_no_crc_fmt.pack(size, TAG_PUT)
         crc = self.crc_fmt.pack(crc32(data, crc32(id, crc32(header))) & 0xffffffff)
@@ -972,3 +981,6 @@ class LoggedIO:
         crc = self.crc_fmt.pack(crc32(header) & 0xffffffff)
         fd.write(b''.join((crc, header)))
         self.close_segment()
+
+
+MAX_DATA_SIZE = MAX_OBJECT_SIZE - LoggedIO.put_header_fmt.size
diff --git a/src/borg/testsuite/compress.py b/src/borg/testsuite/compress.py
index 1a4353583..ff9d42713 100644
--- a/src/borg/testsuite/compress.py
+++ b/src/borg/testsuite/compress.py
@@ -1,3 +1,4 @@
+import os
 import zlib
 try:
     import lzma
@@ -11,13 +12,13 @@ from ..compress import get_compressor, Compressor, CNONE, ZLIB, LZ4
 
 buffer = bytes(2**16)
 data = b'fooooooooobaaaaaaaar' * 10
-params = dict(name='zlib', level=6, buffer=buffer)
+params = dict(name='zlib', level=6)
 
 
 def test_get_compressor():
     c = get_compressor(name='none')
     assert isinstance(c, CNONE)
-    c = get_compressor(name='lz4', buffer=buffer)
+    c = get_compressor(name='lz4')
     assert isinstance(c, LZ4)
     c = get_compressor(name='zlib')
     assert isinstance(c, ZLIB)
@@ -35,13 +36,21 @@ def test_cnull():
 
 
 def test_lz4():
-    c = get_compressor(name='lz4', buffer=buffer)
+    c = get_compressor(name='lz4')
     cdata = c.compress(data)
     assert len(cdata) < len(data)
     assert data == c.decompress(cdata)
     assert data == Compressor(**params).decompress(cdata)  # autodetect
 
 
+def test_lz4_buffer_allocation():
+    # test with a rather huge data object to see if buffer allocation / resizing works
+    data = os.urandom(50 * 2**20)  # 50MiB incompressible data
+    c = get_compressor(name='lz4')
+    cdata = c.compress(data)
+    assert data == c.decompress(cdata)
+
+
 def test_zlib():
     c = get_compressor(name='zlib')
     cdata = c.compress(data)
@@ -83,16 +92,16 @@ def test_zlib_compat():
 
 def test_compressor():
     params_list = [
-        dict(name='none', buffer=buffer),
-        dict(name='lz4', buffer=buffer),
-        dict(name='zlib', level=0, buffer=buffer),
-        dict(name='zlib', level=6, buffer=buffer),
-        dict(name='zlib', level=9, buffer=buffer),
+        dict(name='none'),
+        dict(name='lz4'),
+        dict(name='zlib', level=0),
+        dict(name='zlib', level=6),
+        dict(name='zlib', level=9),
     ]
     if lzma:
         params_list += [
-            dict(name='lzma', level=0, buffer=buffer),
-            dict(name='lzma', level=6, buffer=buffer),
+            dict(name='lzma', level=0),
+            dict(name='lzma', level=6),
             # we do not test lzma on level 9 because of the huge memory needs
         ]
     for params in params_list:
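MAX_DATA_SIZE, introduced at the bottom of repository.py above, caps a single PUT payload so that the complete segment entry (header plus data) never exceeds MAX_OBJECT_SIZE; rejecting oversized data at write time guarantees _read() never meets an entry it would flag as 'too big'. The repository test that follows exercises exactly this boundary. The arithmetic, with assumed constants (the real values live in src/borg/repository.py)::

    import struct

    # Assumed for illustration: a PUT header is crc32 (4) + size (4) +
    # tag (1) + 32-byte key = 41 bytes, and objects are capped at 20 MiB.
    put_header_fmt = struct.Struct('<IIB32s')
    MAX_OBJECT_SIZE = 20 * 1024 * 1024

    MAX_DATA_SIZE = MAX_OBJECT_SIZE - put_header_fmt.size
    # A put of exactly MAX_DATA_SIZE bytes fills an entry to MAX_OBJECT_SIZE;
    # one byte more and write_put() raises IntegrityError instead of writing
    # an object that could never be read back.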
diff --git a/src/borg/testsuite/repository.py b/src/borg/testsuite/repository.py
index 613e46aec..0fc1461de 100644
--- a/src/borg/testsuite/repository.py
+++ b/src/borg/testsuite/repository.py
@@ -13,7 +13,7 @@ from ..helpers import Location
 from ..helpers import IntegrityError
 from ..locking import Lock, LockFailed
 from ..remote import RemoteRepository, InvalidRPCMethod, ConnectionClosedWithHint, handle_remote_line
-from ..repository import Repository, LoggedIO, MAGIC
+from ..repository import Repository, LoggedIO, MAGIC, MAX_DATA_SIZE
 from . import BaseTestCase
 
 
@@ -142,6 +142,13 @@ class RepositoryTestCase(RepositoryTestCaseBase):
         self.assert_equal(second_half, all[50:])
         self.assert_equal(len(self.repository.list(limit=50)), 50)
 
+    def test_max_data_size(self):
+        max_data = b'x' * MAX_DATA_SIZE
+        self.repository.put(b'00000000000000000000000000000000', max_data)
+        self.assert_equal(self.repository.get(b'00000000000000000000000000000000'), max_data)
+        self.assert_raises(IntegrityError,
+                           lambda: self.repository.put(b'00000000000000000000000000000001', max_data + b'x'))
+
 
 class LocalRepositoryTestCase(RepositoryTestCaseBase):
     # test case that doesn't work with remote repositories
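A plausible way to run just the new tests from a source checkout, assuming pytest is installed (equivalent to ``python -m pytest -k ...`` on the command line)::

    import pytest

    pytest.main(['src/borg/testsuite/compress.py', '-k', 'lz4_buffer_allocation'])
    pytest.main(['src/borg/testsuite/repository.py', '-k', 'max_data_size'])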