From c2ed0d9ee0325bf0b7be60b943ed05d3301e53a9 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Thu, 17 May 2018 21:59:07 +0200 Subject: [PATCH 1/4] implement borg debug search-repo-objs searchterm --- src/borg/archiver.py | 82 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/src/borg/archiver.py b/src/borg/archiver.py index 8d64c53b2..d1a699f65 100644 --- a/src/borg/archiver.py +++ b/src/borg/archiver.py @@ -1956,6 +1956,73 @@ class Archiver: print('Done.') return EXIT_SUCCESS + @with_repository(manifest=False) + def do_debug_search_repo_objs(self, args, repository): + """search for byte sequences in repo objects, repo index MUST be current/correct""" + context = 32 + + def print_finding(info, wanted, data, offset): + before = data[offset - context:offset] + after = data[offset + len(wanted):offset + len(wanted) + context] + print('%s: %s %s %s == %r %r %r' % (info, before.hex(), wanted.hex(), after.hex(), + before, wanted, after)) + + wanted = args.wanted + try: + if wanted.startswith('hex:'): + wanted = unhexlify(wanted[4:]) + elif wanted.startswith('str:'): + wanted = wanted[4:].encode('utf-8') + else: + raise ValueError('unsupported search term') + except (ValueError, UnicodeEncodeError): + wanted = None + if not wanted: + self.print_error('search term needs to be hex:123abc or str:foobar style') + return EXIT_ERROR + + from .crypto.key import key_factory + # set up the key without depending on a manifest obj + ids = repository.list(limit=1, marker=None) + cdata = repository.get(ids[0]) + key = key_factory(repository, cdata) + + marker = None + last_data = b'' + last_id = None + i = 0 + while True: + result = repository.scan(limit=LIST_SCAN_LIMIT, marker=marker) # must use on-disk order scanning here + if not result: + break + marker = result[-1] + for id in result: + cdata = repository.get(id) + give_id = id if id != Manifest.MANIFEST_ID else None + data = key.decrypt(give_id, cdata) + + # try to locate wanted 
sequence crossing the border of last_data and data + boundary_data = last_data[-(len(wanted) - 1):] + data[:len(wanted) - 1] + if wanted in boundary_data: + boundary_data = last_data[-(len(wanted) - 1 + context):] + data[:len(wanted) - 1 + context] + offset = boundary_data.find(wanted) + info = '%d %s | %s' % (i, last_id.hex(), id.hex()) + print_finding(info, wanted, boundary_data, offset) + + # try to locate wanted sequence in data + count = data.count(wanted) + if count: + offset = data.find(wanted)  # only determine first occurrence's offset + info = "%d %s #%d" % (i, id.hex(), count) + print_finding(info, wanted, data, offset) + + last_id, last_data = id, data + i += 1 + if i % 10000 == 0: + print('%d objects processed.' % i) + print('Done.') + return EXIT_SUCCESS + @with_repository(manifest=False) def do_debug_get_obj(self, args, repository): """get object contents from the repository and write it into file""" @@ -3996,6 +4063,21 @@ class Archiver: type=location_validator(archive=False), help='repo to dump') + debug_search_repo_objs_epilog = process_epilog(""" + This command searches raw (but decrypted and decompressed) repo objects for a specific bytes sequence. + """) + subparser = debug_parsers.add_parser('search-repo-objs', parents=[common_parser], add_help=False, + description=self.do_debug_search_repo_objs.__doc__, + epilog=debug_search_repo_objs_epilog, + formatter_class=argparse.RawDescriptionHelpFormatter, + help='search repo objects (debug)') + subparser.set_defaults(func=self.do_debug_search_repo_objs) + subparser.add_argument('location', metavar='REPOSITORY', + type=location_validator(archive=False), + help='repo to search') + subparser.add_argument('wanted', metavar='WANTED', type=str, + help='term to search the repo for, either 0x1234abcd hex term or a string') + debug_get_obj_epilog = process_epilog(""" This command gets an object from the repository. 
""") From 2b7d65b0243567c66d80a616683684d892995102 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Mon, 21 May 2018 00:27:26 +0200 Subject: [PATCH 2/4] use repository.scan() for dump-repo-objs to get on-disk order it is also more efficient, avoids random access. --- src/borg/archiver.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/borg/archiver.py b/src/borg/archiver.py index d1a699f65..108b19476 100644 --- a/src/borg/archiver.py +++ b/src/borg/archiver.py @@ -1930,7 +1930,7 @@ class Archiver: @with_repository(manifest=False) def do_debug_dump_repo_objs(self, args, repository): - """dump (decrypted, decompressed) repo objects""" + """dump (decrypted, decompressed) repo objects, repo index MUST be current/correct""" from .crypto.key import key_factory # set up the key without depending on a manifest obj ids = repository.list(limit=1, marker=None) @@ -1940,7 +1940,7 @@ class Archiver: marker = None i = 0 while True: - result = repository.list(limit=LIST_SCAN_LIMIT, marker=marker) + result = repository.scan(limit=LIST_SCAN_LIMIT, marker=marker) # must use on-disk order scanning here if not result: break marker = result[-1] From 8738e85967c3d6e515cc203bfa4f80958ff0a05e Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Wed, 30 May 2018 18:43:04 +0200 Subject: [PATCH 3/4] implement borg debug dump-repo-objs --ghost intended as a last resort measure to export all segment file contents in a relatively easy to use format. if you want to dig into a damaged repo (e.g. missing segment files, missing commits) and you know what you do. note: dump-repo-objs --ghost must not use repo.list() because this would need the repo index and call get_transaction_id and check_transaction methods, which can easily fail on a damaged repo. thus we use the same low level scan method as we use anyway to get some encrypted piece of data to setup the decryption "key". 
--- src/borg/archiver.py | 66 ++++++++++++++++++++++++++++++------------ src/borg/repository.py | 17 +++++++++++ 2 files changed, 65 insertions(+), 18 deletions(-) diff --git a/src/borg/archiver.py b/src/borg/archiver.py index 108b19476..470862244 100644 --- a/src/borg/archiver.py +++ b/src/borg/archiver.py @@ -74,7 +74,7 @@ from .patterns import PatternMatcher from .item import Item from .platform import get_flags, get_process_id, SyncFile from .remote import RepositoryServer, RemoteRepository, cache_if_remote -from .repository import Repository, LIST_SCAN_LIMIT +from .repository import Repository, LIST_SCAN_LIMIT, TAG_PUT, TAG_DELETE, TAG_COMMIT from .selftest import selftest from .upgrader import AtticRepositoryUpgrader, BorgRepositoryUpgrader @@ -1932,27 +1932,55 @@ class Archiver: def do_debug_dump_repo_objs(self, args, repository): """dump (decrypted, decompressed) repo objects, repo index MUST be current/correct""" from .crypto.key import key_factory - # set up the key without depending on a manifest obj - ids = repository.list(limit=1, marker=None) - cdata = repository.get(ids[0]) - key = key_factory(repository, cdata) - marker = None - i = 0 - while True: - result = repository.scan(limit=LIST_SCAN_LIMIT, marker=marker) # must use on-disk order scanning here - if not result: - break - marker = result[-1] - for id in result: - cdata = repository.get(id) + def decrypt_dump(i, id, cdata, tag=None, segment=None, offset=None): + if cdata is not None: give_id = id if id != Manifest.MANIFEST_ID else None data = key.decrypt(give_id, cdata) - filename = '%06d_%s.obj' % (i, bin_to_hex(id)) - print('Dumping', filename) - with open(filename, 'wb') as fd: - fd.write(data) + else: + data = b'' + tag_str = '' if tag is None else '_' + tag + segment_str = '_' + str(segment) if segment is not None else '' + offset_str = '_' + str(offset) if offset is not None else '' + id_str = '_' + bin_to_hex(id) if id is not None else '' + filename = '%06d%s%s%s%s.obj' % (i, tag_str, 
segment_str, offset_str, id_str) + print('Dumping', filename) + with open(filename, 'wb') as fd: + fd.write(data) + + if args.ghost: + # dump ghosty stuff from segment files: not yet committed objects, deleted / superseded objects, commit tags + + # set up the key without depending on a manifest obj + for id, cdata, tag, segment, offset in repository.scan_low_level(): + if tag == TAG_PUT: + key = key_factory(repository, cdata) + break + i = 0 + for id, cdata, tag, segment, offset in repository.scan_low_level(): + if tag == TAG_PUT: + decrypt_dump(i, id, cdata, tag='put', segment=segment, offset=offset) + elif tag == TAG_DELETE: + decrypt_dump(i, id, None, tag='del', segment=segment, offset=offset) + elif tag == TAG_COMMIT: + decrypt_dump(i, None, None, tag='commit', segment=segment, offset=offset) i += 1 + else: + # set up the key without depending on a manifest obj + ids = repository.list(limit=1, marker=None) + cdata = repository.get(ids[0]) + key = key_factory(repository, cdata) + marker = None + i = 0 + while True: + result = repository.scan(limit=LIST_SCAN_LIMIT, marker=marker) # must use on-disk order scanning here + if not result: + break + marker = result[-1] + for id in result: + cdata = repository.get(id) + decrypt_dump(i, id, cdata) + i += 1 print('Done.') return EXIT_SUCCESS @@ -4062,6 +4090,8 @@ subparser.add_argument('location', metavar='REPOSITORY', type=location_validator(archive=False), help='repo to dump') + subparser.add_argument('--ghost', dest='ghost', action='store_true', + help='dump all segment file contents, including deleted/uncommitted objects and commits.') debug_search_repo_objs_epilog = process_epilog(""" This command searches raw (but decrypted and decompressed) repo objects for a specific bytes sequence. 
diff --git a/src/borg/repository.py b/src/borg/repository.py index 7d1bcdd6f..1a9ce9acc 100644 --- a/src/borg/repository.py +++ b/src/borg/repository.py @@ -962,6 +962,23 @@ class Repository: logger.info('Completed repository check, no problems found.') return not error_found or repair + def scan_low_level(self): + """Very low level scan over all segment file entries. + + It does NOT care about what's committed and what not. + It does NOT care whether an object might be deleted or superseded later. + It just yields anything it finds in the segment files. + + This is intended as a last-resort way to get access to all repo contents of damaged repos, + when there is uncommitted, but valuable data in there... + """ + for segment, filename in self.io.segment_iterator(): + try: + for tag, key, offset, data in self.io.iter_objects(segment, include_data=True): + yield key, data, tag, segment, offset + except IntegrityError as err: + logger.error('Segment %d (%s) has IntegrityError(s) [%s] - skipping.' 
% (segment, filename, str(err))) + def _rollback(self, *, cleanup): """ """ From 24812b03172365e7f3989ae98d49095cd7a4dba5 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Thu, 31 May 2018 14:55:43 +0200 Subject: [PATCH 4/4] dump-repo-objs: filename layout improvements --- src/borg/archiver.py | 2 +- src/borg/testsuite/archiver.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/borg/archiver.py b/src/borg/archiver.py index 470862244..9dd8bfbfc 100644 --- a/src/borg/archiver.py +++ b/src/borg/archiver.py @@ -1943,7 +1943,7 @@ class Archiver: segment_str = '_' + str(segment) if segment is not None else '' offset_str = '_' + str(offset) if offset is not None else '' id_str = '_' + bin_to_hex(id) if id is not None else '' - filename = '%06d%s%s%s%s.obj' % (i, tag_str, segment_str, offset_str, id_str) + filename = '%08d%s%s%s%s.obj' % (i, segment_str, offset_str, tag_str, id_str) print('Dumping', filename) with open(filename, 'wb') as fd: fd.write(data) diff --git a/src/borg/testsuite/archiver.py b/src/borg/testsuite/archiver.py index af203d545..a165866ca 100644 --- a/src/borg/testsuite/archiver.py +++ b/src/borg/testsuite/archiver.py @@ -2335,7 +2335,7 @@ class ArchiverTestCase(ArchiverTestCaseBase): with changedir('output'): output = self.cmd('debug', 'dump-repo-objs', self.repository_location) output_dir = sorted(os.listdir('output')) - assert len(output_dir) > 0 and output_dir[0].startswith('000000_') + assert len(output_dir) > 0 and output_dir[0].startswith('00000000_') assert 'Done.' in output def test_debug_put_get_delete_obj(self):