From 116f67036f503405b80a4d8baa429a6166490b80 Mon Sep 17 00:00:00 2001
From: Thomas Waldmann
Date: Tue, 3 May 2022 03:01:03 +0200
Subject: [PATCH] transfer: copy archives from another repo

This is somewhat similar to borg recreate, but with a different focus and much simpler:

- not changing the compression algorithm
- not changing the chunking
- not excluding files inside an archive by path match
- only dealing with complete archives

but:

- different src and dst repo
- only reading each chunk once
- keeping the compressed payload (no decompression / recompression effort)

--dry-run can be used before and afterwards to check.
---
 src/borg/archiver.py | 105 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 105 insertions(+)

diff --git a/src/borg/archiver.py b/src/borg/archiver.py
index de9489ff0..49516fff6 100644
--- a/src/borg/archiver.py
+++ b/src/borg/archiver.py
@@ -338,6 +338,74 @@ class Archiver:
         ).serve()
         return EXIT_SUCCESS
 
+    @with_other_repository(manifest=True, key=True, compatibility=(Manifest.Operation.READ,))
+    @with_repository(exclusive=True, manifest=True, cache=True, compatibility=(Manifest.Operation.WRITE,))
+    def do_transfer(self, args, *,
+                    repository, manifest, key, cache,
+                    other_repository=None, other_manifest=None, other_key=None):
+        """transfer archives from another repository"""
+        dry_run = args.dry_run
+
+        args.consider_checkpoints = True
+        archive_names = tuple(x.name for x in other_manifest.archives.list_considering(args))
+        if not archive_names:
+            return EXIT_SUCCESS
+
+        for name in archive_names:
+            transfer_size = 0
+            present_size = 0
+            if name in manifest.archives and not dry_run:
+                print(f"{name}: archive is already present in destination repo, skipping.")
+            else:
+                if not dry_run:
+                    print(f"{name}: copying archive to destination repo...")
+                other_archive = Archive(other_repository, other_key, other_manifest, name)
+                archive = Archive(repository, key, manifest, name, cache=cache, create=True) if not dry_run else None
+                for item in other_archive.iter_items():
+                    if 'chunks' in item:
+                        chunks = []
+                        for chunk_id, size, _ in item.chunks:
+                            refcount = cache.seen_chunk(chunk_id, size)
+                            if refcount == 0:  # target repo does not yet have this chunk
+                                if not dry_run:
+                                    cdata = other_repository.get(chunk_id)
+                                    # keep the compressed payload as-is, avoid decompression / recompression
+                                    data = other_key.decrypt(chunk_id, cdata, decompress=False)
+                                    chunk_entry = cache.add_chunk(chunk_id, data, archive.stats, wait=False,
+                                                                  compress=False, size=size)
+                                    cache.repository.async_response(wait=False)
+                                    chunks.append(chunk_entry)
+                                transfer_size += size
+                            else:
+                                if not dry_run:
+                                    chunk_entry = cache.chunk_incref(chunk_id, archive.stats)
+                                    chunks.append(chunk_entry)
+                                present_size += size
+                        if not dry_run:
+                            item.chunks = chunks  # overwrite! IDs and sizes are the same, csizes are likely different
+                            archive.stats.nfiles += 1
+                    # TODO: filter the item data, get rid of legacy crap
+                    if not dry_run:
+                        archive.add_item(item)
+                if not dry_run:
+                    additional_metadata = {}
+                    # keep all metadata except archive version and stats. also do not keep
+                    # recreate_source_id, recreate_args, recreate_partial_chunks which were used only in 1.1.0b1 .. b2.
+                    for attr in ('cmdline', 'hostname', 'username', 'time', 'time_end', 'comment',
+                                 'chunker_params', 'recreate_cmdline'):
+                        if hasattr(other_archive.metadata, attr):
+                            additional_metadata[attr] = getattr(other_archive.metadata, attr)
+                    archive.save(stats=archive.stats, additional_metadata=additional_metadata)
+                    print(f"{name}: finished. "
" + f"transfer_size: {format_file_size(transfer_size)} " + f"present_size: {format_file_size(present_size)}") + else: + print(f"{name}: completed" if transfer_size == 0 else + f"{name}: incomplete, " + f"transfer_size: {format_file_size(transfer_size)} " + f"present_size: {format_file_size(present_size)}") + return EXIT_SUCCESS + @with_repository(create=True, exclusive=True, manifest=False) @with_other_repository(key=True, compatibility=(Manifest.Operation.READ, )) def do_init(self, args, repository, *, other_repository=None, other_key=None): @@ -4083,6 +4151,43 @@ class Archiver: help='archives to delete') define_archive_filters_group(subparser) + # borg transfer + transfer_epilog = process_epilog(""" + This command transfers archives from one repository to another repository. + + Suggested use: + + # initialize DST_REPO reusing key material from SRC_REPO, so that + # chunking and chunk id generation will work in the same way as before. + borg init --other-location=SRC_REPO --encryption=DST_ENC DST_REPO + + # transfer archives from SRC_REPO to DST_REPO + borg transfer --dry-run SRC_REPO DST_REPO # check what it would do + borg transfer SRC_REPO DST_REPO # do it! + borg transfer --dry-run SRC_REPO DST_REPO # check! anything left? + + The default is to transfer all archives, including checkpoint archives. + + You could use the misc. archive filter options to limit which archives it will + transfer, e.g. using the --prefix option. This is recommended for big + repositories with multiple data sets to keep the runtime per invocation lower. + """) + subparser = subparsers.add_parser('transfer', parents=[common_parser], add_help=False, + description=self.do_transfer.__doc__, + epilog=transfer_epilog, + formatter_class=argparse.RawDescriptionHelpFormatter, + help='transfer of archives from another repository') + subparser.set_defaults(func=self.do_transfer) + subparser.add_argument('-n', '--dry-run', dest='dry_run', action='store_true', + help='do not change repository, just check') + subparser.add_argument('other_location', metavar='SRC_REPOSITORY', + type=location_validator(archive=False, other=True), + help='source repository') + subparser.add_argument('location', metavar='DST_REPOSITORY', + type=location_validator(archive=False, other=False), + help='destination repository') + define_archive_filters_group(subparser) + # borg diff diff_epilog = process_epilog(""" This command finds differences (file contents, user/group/mode) between archives.