Merge pull request #1921 from enkore/f/recreate-fixes

recreate fixes
commit e9d7f928e2, authored by enkore, 2016-12-03 00:02:01 +01:00, committed by GitHub
3 changed files with 26 additions and 35 deletions

File: archive.py

@@ -1394,10 +1394,6 @@ class ArchiveChecker:
 
 
 class ArchiveRecreater:
-    class FakeTargetArchive:
-        def __init__(self):
-            self.stats = Statistics()
-
     class Interrupted(Exception):
         def __init__(self, metadata=None):
             self.metadata = metadata or {}
@@ -1421,6 +1417,9 @@ class ArchiveRecreater:
 
         self.exclude_if_present = exclude_if_present or []
         self.keep_tag_files = keep_tag_files
+        self.rechunkify = chunker_params is not None
+        if self.rechunkify:
+            logger.debug('Rechunking archives to %s', chunker_params)
         self.chunker_params = chunker_params or CHUNKER_PARAMS
         self.recompress = bool(compression)
         self.always_recompress = always_recompress
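
The new rechunkify flag records whether the user passed --chunker-params at all: only an explicit request may trigger rechunking, while self.chunker_params still falls back to CHUNKER_PARAMS so the target archive always has concrete parameters. A minimal standalone sketch of these semantics (the default tuple matches borg 1.x defaults as far as I know, but treat it as an assumption here):

    CHUNKER_PARAMS = (19, 23, 21, 4095)  # assumed borg 1.x defaults

    def chunker_setup(chunker_params=None):
        rechunkify = chunker_params is not None        # user explicitly asked?
        effective = chunker_params or CHUNKER_PARAMS   # target params either way
        return rechunkify, effective

    assert chunker_setup() == (False, CHUNKER_PARAMS)  # no flag: never rechunk
    assert chunker_setup((10, 23, 16, 4095)) == (True, (10, 23, 16, 4095))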
@@ -1434,7 +1433,7 @@ class ArchiveRecreater:
         self.stats = stats
         self.progress = progress
         self.print_file_status = file_status_printer or (lambda *args: None)
-        self.checkpoint_interval = checkpoint_interval
+        self.checkpoint_interval = None if dry_run else checkpoint_interval
 
     def recreate(self, archive_name, comment=None, target_name=None):
         assert not self.is_temporary_archive(archive_name)
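
Nulling checkpoint_interval on dry runs means the checkpointing machinery can never fire while nothing is supposed to be written to the repository. A hedged sketch of the guard pattern this enables (maybe_checkpoint is a hypothetical helper, not borg's actual code):

    import time

    def maybe_checkpoint(last, interval, write_checkpoint):
        # interval=None (dry run) short-circuits: no checkpoint is ever written
        if interval is not None and time.monotonic() - last >= interval:
            write_checkpoint()
            return time.monotonic()
        return last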
@@ -1444,10 +1443,10 @@ class ArchiveRecreater:
         self.matcher_add_tagged_dirs(archive)
         if self.matcher.empty() and not self.recompress and not target.recreate_rechunkify and comment is None:
             logger.info("Skipping archive %s, nothing to do", archive_name)
-            return True
+            return
         self.process_items(archive, target)
         replace_original = target_name is None
-        return self.save(archive, target, comment, replace_original=replace_original)
+        self.save(archive, target, comment, replace_original=replace_original)
 
     def process_items(self, archive, target):
         matcher = self.matcher
@@ -1494,12 +1493,11 @@ class ArchiveRecreater:
             self.print_file_status(file_status(item.mode), item.path)
 
     def process_chunks(self, archive, target, item):
-        """Return new chunk ID list for 'item'."""
        if not self.recompress and not target.recreate_rechunkify:
            for chunk_id, size, csize in item.chunks:
                self.cache.chunk_incref(chunk_id, target.stats)
            return item.chunks
-        chunk_iterator = self.create_chunk_iterator(archive, target, list(item.chunks))
+        chunk_iterator = self.iter_chunks(archive, target, list(item.chunks))
        compress = self.compression_decider1.decide(item.path)
        chunk_processor = partial(self.chunk_processor, target, compress)
        target.chunk_file(item, self.cache, target.stats, chunk_iterator, chunk_processor)
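
When neither recompression nor rechunking is requested, process_chunks takes the fast path: it only bumps the reference count of each existing chunk and returns the old chunk list unchanged, so no chunk data is fetched, rechunked, or rewritten. A simplified model of that refcounting (borg's real cache also tracks sizes; this is an illustration only):

    refcounts = {}

    def chunk_incref(chunk_id):
        # Reusing a chunk is a metadata-only operation: the stored data stays
        # put, it just gains one more referencing archive.
        refcounts[chunk_id] = refcounts.get(chunk_id, 0) + 1
        return chunk_id

    item_chunks = [b'id1', b'id2', b'id1']
    new_chunks = [chunk_incref(cid) for cid in item_chunks]
    assert refcounts == {b'id1': 2, b'id2': 1}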
@@ -1517,24 +1515,22 @@ class ArchiveRecreater:
             if Compressor.detect(old_chunk.data).name == compression_spec['name']:
                 # Stored chunk has the same compression we wanted
                 overwrite = False
-        chunk_id, size, csize = self.cache.add_chunk(chunk_id, chunk, target.stats, overwrite=overwrite)
-        self.seen_chunks.add(chunk_id)
-        return chunk_id, size, csize
+        chunk_entry = self.cache.add_chunk(chunk_id, chunk, target.stats, overwrite=overwrite)
+        self.seen_chunks.add(chunk_entry.id)
+        return chunk_entry
 
-    def create_chunk_iterator(self, archive, target, chunks):
-        """Return iterator of chunks to store for 'item' from 'archive' in 'target'."""
+    def iter_chunks(self, archive, target, chunks):
         chunk_iterator = archive.pipeline.fetch_many([chunk_id for chunk_id, _, _ in chunks])
         if target.recreate_rechunkify:
             # The target.chunker will read the file contents through ChunkIteratorFileWrapper chunk-by-chunk
             # (does not load the entire file into memory)
             file = ChunkIteratorFileWrapper(chunk_iterator)
-            return target.chunker.chunkify(file)
+            yield from target.chunker.chunkify(file)
         else:
             for chunk in chunk_iterator:
                 yield chunk.data
 
     def save(self, archive, target, comment=None, replace_original=True):
-        """Save target archive. If completed, replace source. If not, save temporary with additional 'metadata' dict."""
         if self.dry_run:
             return
         timestamp = archive.ts.replace(tzinfo=None)
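
Beyond the rename, the important fix in iter_chunks is replacing 'return' with 'yield from'. Because the function body contains a yield, Python compiles the whole function as a generator, and a 'return value' inside a generator does not hand the value to the caller; it just ends the generator (the value only travels on StopIteration). The old rechunkify branch therefore produced an iterator that was exhausted immediately. A minimal demonstration of the pitfall:

    def broken(fast):
        if fast:
            return iter([1, 2, 3])  # inside a generator: ends it, yields nothing
        yield from [4, 5, 6]

    def fixed(fast):
        if fast:
            yield from iter([1, 2, 3])  # delegate to the sub-iterator instead
        else:
            yield from [4, 5, 6]

    assert list(broken(True)) == []        # silently empty
    assert list(fixed(True)) == [1, 2, 3]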
@@ -1591,12 +1587,13 @@ class ArchiveRecreater:
 
     def create_target(self, archive, target_name=None):
         """Create target archive."""
-        if self.dry_run:
-            return self.FakeTargetArchive(), None
         target_name = target_name or archive.name + '.recreate'
         target = self.create_target_archive(target_name)
         # If the archives use the same chunker params, then don't rechunkify
-        target.recreate_rechunkify = tuple(archive.metadata.get('chunker_params', [])) != self.chunker_params
+        source_chunker_params = tuple(archive.metadata.get('chunker_params', []))
+        target.recreate_rechunkify = self.rechunkify and source_chunker_params != target.chunker_params
+        if target.recreate_rechunkify:
+            logger.debug('Rechunking archive from %s to %s', source_chunker_params or '(unknown)', target.chunker_params)
         return target
 
     def create_target_archive(self, name):
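
create_target now combines both conditions: the user must have requested rechunking (self.rechunkify) and the source archive's recorded chunker params must differ from the target's. Archives that predate recording chunker_params yield an empty tuple, which compares unequal to any real params, so an explicit request still rechunks them. A standalone sketch of the decision:

    def needs_rechunkify(requested, source_params, target_params):
        # Only rechunk on explicit request AND an actual parameter difference.
        return requested and tuple(source_params) != target_params

    TARGET = (19, 23, 21, 4095)
    assert not needs_rechunkify(False, (10, 23, 16, 4095), TARGET)  # no --chunker-params
    assert not needs_rechunkify(True, TARGET, TARGET)               # same params: nothing to do
    assert needs_rechunkify(True, (), TARGET)                       # unknown source params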

File: archiver.py

@@ -1101,11 +1101,11 @@ class Archiver:
             if recreater.is_temporary_archive(name):
                 continue
             print('Processing', name)
-            if not recreater.recreate(name, args.comment):
-                break
-        manifest.write()
-        repository.commit()
-        cache.commit()
+            recreater.recreate(name, args.comment)
+        if not args.dry_run:
+            manifest.write()
+            repository.commit()
+            cache.commit()
         return self.exit_code
 
     @with_repository(manifest=False, exclusive=True)
@@ -2356,6 +2356,8 @@ class Archiver:
         recreate_epilog = textwrap.dedent("""
         Recreate the contents of existing archives.
 
+        This is an *experimental* feature. Do *not* use this on your only backup.
+
         --exclude, --exclude-from and PATH have the exact same semantics
         as in "borg create". If PATHs are specified the resulting archive
         will only contain files from these PATHs.
@@ -2372,15 +2374,6 @@ class Archiver:
         used to have upgraded Borg 0.xx or Attic archives deduplicate with
         Borg 1.x archives.
 
-        borg recreate is signal safe. Send either SIGINT (Ctrl-C on most terminals) or
-        SIGTERM to request termination.
-
-        Use the *exact same* command line to resume the operation later - changing excludes
-        or paths will lead to inconsistencies (changed excludes will only apply to newly
-        processed files/dirs). Changing compression leads to incorrect size information
-        (which does not cause any data loss, but can be misleading).
-        Changing chunker params between invocations might lead to data loss.
-
         USE WITH CAUTION.
         Depending on the PATHs and patterns given, recreate can be used to permanently
         delete files from archives.
@@ -2395,8 +2388,8 @@ class Archiver:
         When rechunking space usage can be substantial, expect at least the entire
         deduplicated size of the archives using the previous chunker params.
 
-        When recompressing approximately 1 % of the repository size or 512 MB
-        (whichever is greater) of additional space is used.
+        When recompressing expect approx. (throughput / checkpoint-interval) in space usage,
+        assuming all chunks are recompressed.
         """)
         subparser = subparsers.add_parser('recreate', parents=[common_parser], add_help=False,
                                           description=self.do_recreate.__doc__,
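
The reworded estimate ties transient space use to the amount of data recompressed between two checkpoints rather than to total repository size; read "(throughput / checkpoint-interval)" as "the data processed per checkpoint interval". A back-of-the-envelope calculation with assumed numbers (both values are illustrative, not taken from the PR):

    throughput = 50 * 1024**2        # assume 50 MiB/s recompression throughput
    checkpoint_interval = 300        # assume --checkpoint-interval 300 (seconds)
    transient = throughput * checkpoint_interval
    print(transient / 1024**3)       # ~14.6 GiB accumulated between checkpoints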

File: testsuite/archiver.py

@@ -1823,6 +1823,7 @@ class ArchiverTestCase(ArchiverTestCaseBase):
         self.cmd('recreate', self.repository_location, '--chunker-params', 'default')
         self.check_cache()
         # test1 and test2 do deduplicate after recreate
+        assert int(self.cmd('list', self.repository_location + '::test1', 'input/large_file', '--format={size}'))
         assert not int(self.cmd('list', self.repository_location + '::test1', 'input/large_file',
                                 '--format', '{unique_chunks}'))
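
The added size assertion guards the dedup check below it: if input/large_file were missing from the recreated archive, the unique_chunks query would come back empty or zero and the second assertion would pass vacuously. Paraphrased (inside the test case, with repo standing in for self.repository_location):

    size = int(self.cmd('list', repo + '::test1', 'input/large_file', '--format={size}'))
    unique = int(self.cmd('list', repo + '::test1', 'input/large_file', '--format', '{unique_chunks}'))
    assert size > 0     # the file is really there ...
    assert unique == 0  # ... so zero unique chunks truly means full deduplication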