Merge pull request #1420 from enkore/f/recreate1.1rc1

recreate goals for 1.1rc1
This commit is contained in:
TW 2016-08-14 18:04:41 +02:00 committed by GitHub
commit 6e9debb027
7 changed files with 120 additions and 28 deletions

View File: archive.py

@@ -19,6 +19,7 @@ logger = create_logger()
 from . import xattr
 from .cache import ChunkListEntry
 from .chunker import Chunker
+from .compress import Compressor
 from .constants import *  # NOQA
 from .hashindex import ChunkIndex, ChunkIndexEntry
 from .helpers import Manifest
@@ -1298,7 +1299,7 @@ class ArchiveRecreater:
     def __init__(self, repository, manifest, key, cache, matcher,
                  exclude_caches=False, exclude_if_present=None, keep_tag_files=False,
-                 chunker_params=None, compression=None, compression_files=None,
+                 chunker_params=None, compression=None, compression_files=None, always_recompress=False,
                  dry_run=False, stats=False, progress=False, file_status_printer=None):
         self.repository = repository
         self.key = key
@@ -1312,6 +1313,7 @@ class ArchiveRecreater:
         self.chunker_params = chunker_params or CHUNKER_PARAMS
         self.recompress = bool(compression)
+        self.always_recompress = always_recompress
         self.compression = compression or CompressionSpec('none')
         self.seen_chunks = set()
         self.compression_decider1 = CompressionDecider1(compression or CompressionSpec('none'),
@@ -1329,10 +1331,10 @@ class ArchiveRecreater:
         self.interrupt = False
         self.errors = False

-    def recreate(self, archive_name, comment=None):
+    def recreate(self, archive_name, comment=None, target_name=None):
         assert not self.is_temporary_archive(archive_name)
         archive = self.open_archive(archive_name)
-        target, resume_from = self.create_target_or_resume(archive)
+        target, resume_from = self.create_target_or_resume(archive, target_name)
         if self.exclude_if_present or self.exclude_caches:
             self.matcher_add_tagged_dirs(archive)
         if self.matcher.empty() and not self.recompress and not target.recreate_rechunkify and comment is None:
@@ -1342,7 +1344,8 @@ class ArchiveRecreater:
             self.process_items(archive, target, resume_from)
         except self.Interrupted as e:
             return self.save(archive, target, completed=False, metadata=e.metadata)
-        return self.save(archive, target, comment)
+        replace_original = target_name is None
+        return self.save(archive, target, comment, replace_original=replace_original)

     def process_items(self, archive, target, resume_from=None):
         matcher = self.matcher
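
The two hunks above implement one compact rule: a user-supplied target name means the original archive is kept; no target name means the original is replaced via the temporary '<name>.recreate' archive. A standalone sketch of that decision (the function and names are illustrative, not borg's API):

    # Sketch: --target keeps the original; without it, recreate replaces in place.
    def plan_recreate(archive_name, target_name=None):
        # Returns (name of the archive to write, whether to replace the original).
        replace_original = target_name is None
        return target_name or archive_name + '.recreate', replace_original

    assert plan_recreate('docs') == ('docs.recreate', True)
    assert plan_recreate('docs', 'docs-slim') == ('docs-slim', False)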
@@ -1404,7 +1407,6 @@ class ArchiveRecreater:
     def process_chunks(self, archive, target, item):
         """Return new chunk ID list for 'item'."""
-        # TODO: support --compression-from
         if not self.recompress and not target.recreate_rechunkify:
             for chunk_id, size, csize in item.chunks:
                 self.cache.chunk_incref(chunk_id, target.stats)
@@ -1412,13 +1414,22 @@ class ArchiveRecreater:
         new_chunks = self.process_partial_chunks(target)
         chunk_iterator = self.create_chunk_iterator(archive, target, item)
         consume(chunk_iterator, len(new_chunks))
+        compress = self.compression_decider1.decide(item.path)
         for chunk in chunk_iterator:
+            chunk.meta['compress'] = compress
             chunk_id = self.key.id_hash(chunk.data)
             if chunk_id in self.seen_chunks:
                 new_chunks.append(self.cache.chunk_incref(chunk_id, target.stats))
             else:
-                # TODO: detect / skip / --always-recompress
-                chunk_id, size, csize = self.cache.add_chunk(chunk_id, chunk, target.stats, overwrite=self.recompress)
+                compression_spec, chunk = self.key.compression_decider2.decide(chunk)
+                overwrite = self.recompress
+                if self.recompress and not self.always_recompress and chunk_id in self.cache.chunks:
+                    # Check if this chunk is already compressed the way we want it
+                    old_chunk = self.key.decrypt(None, self.repository.get(chunk_id), decompress=False)
+                    if Compressor.detect(old_chunk.data).name == compression_spec['name']:
+                        # Stored chunk has the same compression we wanted
+                        overwrite = False
+                chunk_id, size, csize = self.cache.add_chunk(chunk_id, chunk, target.stats, overwrite=overwrite)
                 new_chunks.append((chunk_id, size, csize))
                 self.seen_chunks.add(chunk_id)
                 if self.recompress:
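
The skip logic above needs only the first two bytes of a stored chunk: each compressor writes a recognizable header, so recreate can fetch the chunk, decrypt it with decompress=False, and compare the detected algorithm with the requested one, avoiding a decompress/recompress round trip. A self-contained sketch of that decision; the two-byte magics below are placeholders, not borg's actual framing:

    # Hypothetical magics; borg's real values live in the compressor classes.
    MAGICS = {b'\x01\x00': 'lz4', b'\x02\x00': 'lzma', b'\x03\x00': 'zlib'}

    def detect_name(stored):
        # Name the compression of a stored chunk by its header bytes.
        return MAGICS.get(bytes(stored[:2]), 'none')

    def should_overwrite(stored, wanted, recompress, always):
        # Rewrite the chunk unless it is already compressed the way we want.
        if not recompress:
            return False
        if always:
            return True
        return detect_name(stored) != wanted

    assert should_overwrite(b'\x01\x00data', 'lz4', recompress=True, always=False) is False
    assert should_overwrite(b'\x01\x00data', 'zlib', recompress=True, always=False) is True
    assert should_overwrite(b'\x01\x00data', 'lz4', recompress=True, always=True) is True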
@@ -1465,7 +1476,7 @@ class ArchiveRecreater:
         logger.debug('Copied %d chunks from a partially processed item', len(partial_chunks))
         return partial_chunks

-    def save(self, archive, target, comment=None, completed=True, metadata=None):
+    def save(self, archive, target, comment=None, completed=True, metadata=None, replace_original=True):
         """Save target archive. If completed, replace source. If not, save temporary with additional 'metadata' dict."""
         if self.dry_run:
             return completed
@@ -1477,6 +1488,7 @@ class ArchiveRecreater:
             'cmdline': archive.metadata[b'cmdline'],
             'recreate_cmdline': sys.argv,
         })
+        if replace_original:
             archive.delete(Statistics(), progress=self.progress)
             target.rename(archive.name)
         if self.stats:
@@ -1530,11 +1542,11 @@ class ArchiveRecreater:
         matcher.add(tag_files, True)
         matcher.add(tagged_dirs, False)

-    def create_target_or_resume(self, archive):
+    def create_target_or_resume(self, archive, target_name=None):
         """Create new target archive or resume from temporary archive, if it exists. Return archive, resume from path"""
         if self.dry_run:
             return self.FakeTargetArchive(), None
-        target_name = archive.name + '.recreate'
+        target_name = target_name or archive.name + '.recreate'
         resume = target_name in self.manifest.archives
         target, resume_from = None, None
         if resume:

View File: archiver.py

@@ -957,6 +957,7 @@ class Archiver:
                                      exclude_caches=args.exclude_caches, exclude_if_present=args.exclude_if_present,
                                      keep_tag_files=args.keep_tag_files, chunker_params=args.chunker_params,
                                      compression=args.compression, compression_files=args.compression_files,
+                                     always_recompress=args.always_recompress,
                                      progress=args.progress, stats=args.stats,
                                      file_status_printer=self.print_file_status,
                                      dry_run=args.dry_run)
@@ -968,8 +969,11 @@ class Archiver:
             if recreater.is_temporary_archive(name):
                 self.print_error('Refusing to work on temporary archive of prior recreate: %s', name)
                 return self.exit_code
-            recreater.recreate(name, args.comment)
+            recreater.recreate(name, args.comment, args.target)
         else:
+            if args.target is not None:
+                self.print_error('--target: Need to specify single archive')
+                return self.exit_code
             for archive in manifest.list_archive_infos(sort_by='ts'):
                 name = archive.name
                 if recreater.is_temporary_archive(name):
@@ -2007,6 +2011,9 @@ class Archiver:
         as in "borg create". If PATHs are specified the resulting archive
         will only contain files from these PATHs.

+        Note that all paths in an archive are relative, therefore absolute patterns/paths
+        will *not* match (--exclude, --exclude-from, --compression-from, PATHs).
+
         --compression: all chunks seen will be stored using the given method.
         Due to how Borg stores compressed size information this might display
         incorrect information for archives that were not recreated at the same time.
@@ -2035,6 +2042,8 @@ class Archiver:
         archive that is built during the operation exists at the same time at
         "<ARCHIVE>.recreate". The new archive will have a different archive ID.

+        With --target the original archive is not replaced; instead, a new archive is created.
+
         When rechunking space usage can be substantial, expect at least the entire
         deduplicated size of the archives using the previous chunker params.
         When recompressing approximately 1 % of the repository size or 512 MB
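
For example, a hypothetical invocation (repository path and archive names are placeholders) that writes a slimmed-down copy under a new name and leaves the original archive untouched:

    borg recreate --target archive-slim -e 'input/dir2/file3' /path/to/repo::archive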
@@ -2080,6 +2089,10 @@ class Archiver:
                                help='keep tag files of excluded caches/directories')

         archive_group = subparser.add_argument_group('Archive options')
+        archive_group.add_argument('--target', dest='target', metavar='TARGET', default=None,
+                                   type=archivename_validator(),
+                                   help='create a new archive with the name TARGET, do not replace existing archive '
+                                        '(only applies for a single archive)')
         archive_group.add_argument('--comment', dest='comment', metavar='COMMENT', default=None,
                                    help='add a comment text to the archive')
         archive_group.add_argument('--timestamp', dest='timestamp',
@@ -2098,6 +2111,9 @@ class Archiver:
                                    'zlib,0 .. zlib,9 == zlib (with level 0..9),\n'
                                    'lzma == lzma (default level 6),\n'
                                    'lzma,0 .. lzma,9 == lzma (with level 0..9).')
+        archive_group.add_argument('--always-recompress', dest='always_recompress', action='store_true',
+                                   help='always recompress chunks, don\'t skip chunks already compressed with the same '
+                                        'algorithm.')
         archive_group.add_argument('--compression-from', dest='compression_files',
                                    type=argparse.FileType('r'), action='append',
                                    metavar='COMPRESSIONCONFIG', help='read compression patterns from COMPRESSIONCONFIG, one per line')
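
A hypothetical invocation combining both options (paths are placeholders), forcing every chunk to be rewritten with lz4 even when it is already lz4-compressed:

    borg recreate --compression lz4 --always-recompress /path/to/repo::archive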

View File: compress.pyx

@@ -6,6 +6,8 @@ except ImportError:

 from .helpers import Buffer

+API_VERSION = 2
+
 cdef extern from "lz4.h":
     int LZ4_compress_limitedOutput(const char* source, char* dest, int inputSize, int maxOutputSize) nogil
     int LZ4_decompress_safe(const char* source, char* dest, int inputSize, int maxOutputSize) nogil
@@ -194,9 +196,14 @@ class Compressor:
         return self.compressor.compress(data)

     def decompress(self, data):
+        compressor_cls = self.detect(data)
+        return compressor_cls(**self.params).decompress(data)
+
+    @staticmethod
+    def detect(data):
         hdr = bytes(data[:2])  # detect() does not work with memoryview
         for cls in COMPRESSOR_LIST:
             if cls.detect(hdr):
-                return cls(**self.params).decompress(data)
+                return cls
         else:
             raise ValueError('No decompressor for this data found: %r.', data[:2])
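
The refactor above splits header sniffing out of decompress() so that callers, such as the recreate skip logic in archive.py, can ask how a chunk is compressed without decompressing it. A runnable Python model of the dispatch, with stand-in compressor classes and made-up magic bytes (the real classes and magics live in this file):

    class CNONE:
        name = 'none'
        magic = b'\x00\x00'  # made-up marker for this sketch

        def __init__(self, **params):
            self.params = params

        @classmethod
        def matches(cls, hdr):
            return hdr == cls.magic

        def decompress(self, data):
            return bytes(data[2:])

    class FakeLZ4(CNONE):
        name = 'lz4'
        magic = b'\x01\x00'

    COMPRESSOR_LIST = [CNONE, FakeLZ4]

    class Compressor:
        def __init__(self, **params):
            self.params = params

        @staticmethod
        def detect(data):
            hdr = bytes(data[:2])  # detect() does not work with memoryview
            for cls in COMPRESSOR_LIST:
                if cls.matches(hdr):
                    return cls
            raise ValueError('No decompressor for this data found: %r.' % (data[:2],))

        def decompress(self, data):
            # Detect the class from the header, then instantiate and decompress.
            return self.detect(data)(**self.params).decompress(data)

    assert Compressor.detect(b'\x01\x00abc').name == 'lz4'
    assert Compressor().decompress(b'\x00\x00abc') == b'abc'

Returning the class (not an instance) is what lets archive.py compare Compressor.detect(old_chunk.data).name against the target compression spec without doing any decompression.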

View File: helpers.py

@@ -84,11 +84,13 @@ class PlaceholderError(Error):

 def check_extension_modules():
-    from . import platform
+    from . import platform, compress
     if hashindex.API_VERSION != 3:
         raise ExtensionModuleError
     if chunker.API_VERSION != 2:
         raise ExtensionModuleError
+    if compress.API_VERSION != 2:
+        raise ExtensionModuleError
     if crypto.API_VERSION != 3:
         raise ExtensionModuleError
     if platform.API_VERSION != 3:
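
compress now exports an API_VERSION like the other compiled modules, so a mismatch between the Python sources and the built extensions fails fast at startup. A compact sketch of the same gate (expected versions copied from the checks above; the mapping-based structure is illustrative):

    EXPECTED_API_VERSIONS = {
        'hashindex': 3,
        'chunker': 2,
        'compress': 2,
        'crypto': 3,
        'platform': 3,
    }

    class ExtensionModuleError(RuntimeError):
        # Raised when a compiled module does not match the Python sources.
        pass

    def check_extension_modules(modules):
        # 'modules' maps a module name to anything exposing API_VERSION.
        for name, expected in EXPECTED_API_VERSIONS.items():
            if getattr(modules[name], 'API_VERSION', None) != expected:
                raise ExtensionModuleError(name)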

View File: key.py

@@ -105,9 +105,15 @@ class KeyBase:
     def encrypt(self, chunk):
         pass

-    def decrypt(self, id, data):
+    def decrypt(self, id, data, decompress=True):
         pass

+    def assert_id(self, id, data):
+        if id:
+            id_computed = self.id_hash(data)
+            if not compare_digest(id_computed, id):
+                raise IntegrityError('Chunk id verification failed')
+

 class PlaintextKey(KeyBase):
     TYPE = 0x02
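
assert_id centralizes the chunk-id verification that was previously duplicated in each decrypt() implementation. The id is recomputed from the plaintext and compared with compare_digest, a constant-time comparison that does not leak how many leading bytes matched. A standalone equivalent, with plain SHA-256 standing in for id_hash (which is a keyed MAC for encrypted repositories):

    import hashlib
    from hmac import compare_digest

    class IntegrityError(Exception):
        pass

    def assert_id(id, data, id_hash=lambda d: hashlib.sha256(d).digest()):
        if id:
            id_computed = id_hash(data)
            if not compare_digest(id_computed, id):
                raise IntegrityError('Chunk id verification failed')

    data = b'123456789'
    assert_id(hashlib.sha256(data).digest(), data)  # passes silently
    assert_id(None, data)  # no id given: nothing to verify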
@@ -130,12 +136,14 @@ class PlaintextKey(KeyBase):
         chunk = self.compress(chunk)
         return b''.join([self.TYPE_STR, chunk.data])

-    def decrypt(self, id, data):
+    def decrypt(self, id, data, decompress=True):
         if data[0] != self.TYPE:
             raise IntegrityError('Invalid encryption envelope')
-        data = self.compressor.decompress(memoryview(data)[1:])
-        if id and sha256(data).digest() != id:
-            raise IntegrityError('Chunk id verification failed')
+        payload = memoryview(data)[1:]
+        if not decompress:
+            return Chunk(payload)
+        data = self.compressor.decompress(payload)
+        self.assert_id(id, data)
         return Chunk(data)
@@ -166,7 +174,7 @@ class AESKeyBase(KeyBase):
         hmac = hmac_sha256(self.enc_hmac_key, data)
         return b''.join((self.TYPE_STR, hmac, data))

-    def decrypt(self, id, data):
+    def decrypt(self, id, data, decompress=True):
         if not (data[0] == self.TYPE or
                 data[0] == PassphraseKey.TYPE and isinstance(self, RepoKey)):
             raise IntegrityError('Invalid encryption envelope')
@@ -176,12 +184,11 @@ class AESKeyBase(KeyBase):
         if not compare_digest(hmac_computed, hmac_given):
             raise IntegrityError('Encryption envelope checksum mismatch')
         self.dec_cipher.reset(iv=PREFIX + data[33:41])
-        data = self.compressor.decompress(self.dec_cipher.decrypt(data_view[41:]))
-        if id:
-            hmac_given = id
-            hmac_computed = hmac_sha256(self.id_key, data)
-            if not compare_digest(hmac_computed, hmac_given):
-                raise IntegrityError('Chunk id verification failed')
+        payload = self.dec_cipher.decrypt(data_view[41:])
+        if not decompress:
+            return Chunk(payload)
+        data = self.compressor.decompress(payload)
+        self.assert_id(id, data)
         return Chunk(data)

     def extract_nonce(self, payload):
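
Read together, the offsets above describe the encrypted envelope: one type byte, a 32-byte HMAC-SHA256 over the rest, 8 bytes of nonce/IV material at offsets 33..41, then the ciphertext; decompress=False stops right after decryption and returns the still-compressed payload. A small parser for that layout, offered as a reading of this diff rather than a public API:

    def split_envelope(data):
        # | type: 1 byte | hmac: 32 bytes | iv part: 8 bytes | ciphertext |
        view = memoryview(data)
        type_byte = view[0]
        mac = bytes(view[1:33])
        iv_part = bytes(view[33:41])
        ciphertext = view[41:]
        return type_byte, mac, iv_part, ciphertext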

View File: testsuite/archiver.py

@@ -1522,6 +1522,28 @@ class ArchiverTestCase(ArchiverTestCaseBase):
         self.cmd('init', self.repository_location, exit_code=1)
         assert not os.path.exists(self.repository_location)

+    def test_recreate_target_rc(self):
+        self.cmd('init', self.repository_location)
+        output = self.cmd('recreate', self.repository_location, '--target=asdf', exit_code=2)
+        assert 'Need to specify single archive' in output
+
+    def test_recreate_target(self):
+        self.create_test_files()
+        self.cmd('init', self.repository_location)
+        archive = self.repository_location + '::test0'
+        self.cmd('create', archive, 'input')
+        original_archive = self.cmd('list', self.repository_location)
+        self.cmd('recreate', archive, 'input/dir2', '-e', 'input/dir2/file3', '--target=new-archive')
+        archives = self.cmd('list', self.repository_location)
+        assert original_archive in archives
+        assert 'new-archive' in archives
+
+        archive = self.repository_location + '::new-archive'
+        listing = self.cmd('list', '--short', archive)
+        assert 'file1' not in listing
+        assert 'dir2/file2' in listing
+        assert 'dir2/file3' not in listing
+
     def test_recreate_basic(self):
         self.create_test_files()
         self.create_regular_file('dir2/file3', size=1024 * 80)

View File: testsuite/key.py

@@ -43,6 +43,14 @@ class TestKey:
         monkeypatch.setenv('BORG_KEYS_DIR', tmpdir)
         return tmpdir

+    @pytest.fixture(params=(
+        KeyfileKey,
+        PlaintextKey
+    ))
+    def key(self, request, monkeypatch):
+        monkeypatch.setenv('BORG_PASSPHRASE', 'test')
+        return request.param.create(self.MockRepository(), self.MockArgs())
+
     class MockRepository:
         class _Location:
             orig = '/some/place'
@@ -155,6 +163,24 @@ class TestKey:
         id[12] = 0
         key.decrypt(id, data)

+    def test_decrypt_decompress(self, key):
+        plaintext = Chunk(b'123456789')
+        encrypted = key.encrypt(plaintext)
+        assert key.decrypt(None, encrypted, decompress=False) != plaintext
+        assert key.decrypt(None, encrypted) == plaintext
+
+    def test_assert_id(self, key):
+        plaintext = b'123456789'
+        id = key.id_hash(plaintext)
+        key.assert_id(id, plaintext)
+        id_changed = bytearray(id)
+        id_changed[0] += 1
+        with pytest.raises(IntegrityError):
+            key.assert_id(id_changed, plaintext)
+        plaintext_changed = plaintext + b'1'
+        with pytest.raises(IntegrityError):
+            key.assert_id(id, plaintext_changed)
+

 class TestPassphrase:
     def test_passphrase_new_verification(self, capsys, monkeypatch):
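
Assuming the usual pytest layout of the borg test suite (the selection expression and invocation are illustrative), the tests added in this PR can be run selectively with:

    python -m pytest -k 'recreate_target or decrypt_decompress or assert_id'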