From c2ed0d9ee0325bf0b7be60b943ed05d3301e53a9 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Thu, 17 May 2018 21:59:07 +0200 Subject: [PATCH 1/4] implement borg debug search-repo-objs searchterm --- src/borg/archiver.py | 82 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/src/borg/archiver.py b/src/borg/archiver.py index 8d64c53b2..d1a699f65 100644 --- a/src/borg/archiver.py +++ b/src/borg/archiver.py @@ -1956,6 +1956,73 @@ class Archiver: print('Done.') return EXIT_SUCCESS + @with_repository(manifest=False) + def do_debug_search_repo_objs(self, args, repository): + """search for byte sequences in repo objects, repo index MUST be current/correct""" + context = 32 + + def print_finding(info, wanted, data, offset): + before = data[offset - context:offset] + after = data[offset + len(wanted):offset + len(wanted) + context] + print('%s: %s %s %s == %r %r %r' % (info, before.hex(), wanted.hex(), after.hex(), + before, wanted, after)) + + wanted = args.wanted + try: + if wanted.startswith('hex:'): + wanted = unhexlify(wanted[4:]) + elif wanted.startswith('str:'): + wanted = wanted[4:].encode('utf-8') + else: + raise ValueError('unsupported search term') + except (ValueError, UnicodeEncodeError): + wanted = None + if not wanted: + self.print_error('search term needs to be hex:123abc or str:foobar style') + return EXIT_ERROR + + from .crypto.key import key_factory + # set up the key without depending on a manifest obj + ids = repository.list(limit=1, marker=None) + cdata = repository.get(ids[0]) + key = key_factory(repository, cdata) + + marker = None + last_data = b'' + last_id = None + i = 0 + while True: + result = repository.scan(limit=LIST_SCAN_LIMIT, marker=marker) # must use on-disk order scanning here + if not result: + break + marker = result[-1] + for id in result: + cdata = repository.get(id) + give_id = id if id != Manifest.MANIFEST_ID else None + data = key.decrypt(give_id, cdata) + + # try to locate wanted 
sequence crossing the border of last_data and data + boundary_data = last_data[-(len(wanted) - 1):] + data[:len(wanted) - 1] + if wanted in boundary_data: + boundary_data = last_data[-(len(wanted) - 1 + context):] + data[:len(wanted) - 1 + context] + offset = boundary_data.find(wanted) + info = '%d %s | %s' % (i, last_id.hex(), id.hex()) + print_finding(info, wanted, boundary_data, offset) + + # try to locate wanted sequence in data + count = data.count(wanted) + if count: + offset = data.find(wanted)  # only determine first occurrence's offset + info = "%d %s #%d" % (i, id.hex(), count) + print_finding(info, wanted, data, offset) + + last_id, last_data = id, data + i += 1 + if i % 10000 == 0: + print('%d objects processed.' % i) + print('Done.') + return EXIT_SUCCESS + @with_repository(manifest=False) def do_debug_get_obj(self, args, repository): """get object contents from the repository and write it into file""" @@ -3996,6 +4063,21 @@ class Archiver: type=location_validator(archive=False), help='repo to dump') + debug_search_repo_objs_epilog = process_epilog(""" + This command searches raw (but decrypted and decompressed) repo objects for a specific bytes sequence. + """) + subparser = debug_parsers.add_parser('search-repo-objs', parents=[common_parser], add_help=False, + description=self.do_debug_search_repo_objs.__doc__, + epilog=debug_search_repo_objs_epilog, + formatter_class=argparse.RawDescriptionHelpFormatter, + help='search repo objects (debug)') + subparser.set_defaults(func=self.do_debug_search_repo_objs) + subparser.add_argument('location', metavar='REPOSITORY', + type=location_validator(archive=False), + help='repo to search') + subparser.add_argument('wanted', metavar='WANTED', type=str, + help='term to search the repo for, either 0x1234abcd hex term or a string') + debug_get_obj_epilog = process_epilog(""" This command gets an object from the repository. 
""") From 2b7d65b0243567c66d80a616683684d892995102 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Mon, 21 May 2018 00:27:26 +0200 Subject: [PATCH 2/4] use repository.scan() for dump-repo-objs to get on-disk order it is also more efficient, avoids random access. --- src/borg/archiver.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/borg/archiver.py b/src/borg/archiver.py index d1a699f65..108b19476 100644 --- a/src/borg/archiver.py +++ b/src/borg/archiver.py @@ -1930,7 +1930,7 @@ class Archiver: @with_repository(manifest=False) def do_debug_dump_repo_objs(self, args, repository): - """dump (decrypted, decompressed) repo objects""" + """dump (decrypted, decompressed) repo objects, repo index MUST be current/correct""" from .crypto.key import key_factory # set up the key without depending on a manifest obj ids = repository.list(limit=1, marker=None) @@ -1940,7 +1940,7 @@ class Archiver: marker = None i = 0 while True: - result = repository.list(limit=LIST_SCAN_LIMIT, marker=marker) + result = repository.scan(limit=LIST_SCAN_LIMIT, marker=marker) # must use on-disk order scanning here if not result: break marker = result[-1] From 8738e85967c3d6e515cc203bfa4f80958ff0a05e Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Wed, 30 May 2018 18:43:04 +0200 Subject: [PATCH 3/4] implement borg debug dump-repo-objs --ghost intended as a last resort measure to export all segment file contents in a relatively easy to use format. if you want to dig into a damaged repo (e.g. missing segment files, missing commits) and you know what you do. note: dump-repo-objs --ghost must not use repo.list() because this would need the repo index and call get_transaction_id and check_transaction methods, which can easily fail on a damaged repo. thus we use the same low level scan method as we use anyway to get some encrypted piece of data to setup the decryption "key". 
--- src/borg/archiver.py | 66 ++++++++++++++++++++++++++++++------------ src/borg/repository.py | 17 +++++++++++ 2 files changed, 65 insertions(+), 18 deletions(-) diff --git a/src/borg/archiver.py b/src/borg/archiver.py index 108b19476..470862244 100644 --- a/src/borg/archiver.py +++ b/src/borg/archiver.py @@ -74,7 +74,7 @@ from .patterns import PatternMatcher from .item import Item from .platform import get_flags, get_process_id, SyncFile from .remote import RepositoryServer, RemoteRepository, cache_if_remote -from .repository import Repository, LIST_SCAN_LIMIT +from .repository import Repository, LIST_SCAN_LIMIT, TAG_PUT, TAG_DELETE, TAG_COMMIT from .selftest import selftest from .upgrader import AtticRepositoryUpgrader, BorgRepositoryUpgrader @@ -1932,27 +1932,55 @@ class Archiver: def do_debug_dump_repo_objs(self, args, repository): """dump (decrypted, decompressed) repo objects, repo index MUST be current/correct""" from .crypto.key import key_factory - # set up the key without depending on a manifest obj - ids = repository.list(limit=1, marker=None) - cdata = repository.get(ids[0]) - key = key_factory(repository, cdata) - marker = None - i = 0 - while True: - result = repository.scan(limit=LIST_SCAN_LIMIT, marker=marker) # must use on-disk order scanning here - if not result: - break - marker = result[-1] - for id in result: - cdata = repository.get(id) + def decrypt_dump(i, id, cdata, tag=None, segment=None, offset=None): + if cdata is not None: give_id = id if id != Manifest.MANIFEST_ID else None data = key.decrypt(give_id, cdata) - filename = '%06d_%s.obj' % (i, bin_to_hex(id)) - print('Dumping', filename) - with open(filename, 'wb') as fd: - fd.write(data) + else: + data = b'' + tag_str = '' if tag is None else '_' + tag + segment_str = '_' + str(segment) if segment is not None else '' + offset_str = '_' + str(offset) if offset is not None else '' + id_str = '_' + bin_to_hex(id) if id is not None else '' + filename = '%06d%s%s%s%s.obj' % (i, tag_str, 
segment_str, offset_str, id_str) + print('Dumping', filename) + with open(filename, 'wb') as fd: + fd.write(data) + + if args.ghost: + # dump ghosty stuff from segment files: not yet committed objects, deleted / superseded objects, commit tags + + # set up the key without depending on a manifest obj + for id, cdata, tag, segment, offset in repository.scan_low_level(): + if tag == TAG_PUT: + key = key_factory(repository, cdata) + break + i = 0 + for id, cdata, tag, segment, offset in repository.scan_low_level(): + if tag == TAG_PUT: + decrypt_dump(i, id, cdata, tag='put', segment=segment, offset=offset) + elif tag == TAG_DELETE: + decrypt_dump(i, id, None, tag='del', segment=segment, offset=offset) + elif tag == TAG_COMMIT: + decrypt_dump(i, None, None, tag='commit', segment=segment, offset=offset) i += 1 + else: + # set up the key without depending on a manifest obj + ids = repository.list(limit=1, marker=None) + cdata = repository.get(ids[0]) + key = key_factory(repository, cdata) + marker = None + i = 0 + while True: + result = repository.scan(limit=LIST_SCAN_LIMIT, marker=marker) # must use on-disk order scanning here + if not result: + break + marker = result[-1] + for id in result: + cdata = repository.get(id) + decrypt_dump(i, id, cdata) + i += 1 print('Done.') return EXIT_SUCCESS @@ -4062,6 +4090,8 @@ subparser.add_argument('location', metavar='REPOSITORY', type=location_validator(archive=False), help='repo to dump') + subparser.add_argument('--ghost', dest='ghost', action='store_true', + help='dump all segment file contents, including deleted/uncommitted objects and commits.') debug_search_repo_objs_epilog = process_epilog(""" This command searches raw (but decrypted and decompressed) repo objects for a specific bytes sequence. 
diff --git a/src/borg/repository.py b/src/borg/repository.py index 7d1bcdd6f..1a9ce9acc 100644 --- a/src/borg/repository.py +++ b/src/borg/repository.py @@ -962,6 +962,23 @@ class Repository: logger.info('Completed repository check, no problems found.') return not error_found or repair + def scan_low_level(self): + """Very low level scan over all segment file entries. + + It does NOT care about what's committed and what not. + It does NOT care whether an object might be deleted or superseded later. + It just yields anything it finds in the segment files. + + This is intended as a last-resort way to get access to all repo contents of damaged repos, + when there is uncommitted, but valuable data in there... + """ + for segment, filename in self.io.segment_iterator(): + try: + for tag, key, offset, data in self.io.iter_objects(segment, include_data=True): + yield key, data, tag, segment, offset + except IntegrityError as err: + logger.error('Segment %d (%s) has IntegrityError(s) [%s] - skipping.' 
% (segment, filename, str(err))) + def _rollback(self, *, cleanup): """ """ From 24812b03172365e7f3989ae98d49095cd7a4dba5 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Thu, 31 May 2018 14:55:43 +0200 Subject: [PATCH 4/4] dump-repo-objs: filename layout improvements --- src/borg/archiver.py | 2 +- src/borg/testsuite/archiver.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/borg/archiver.py b/src/borg/archiver.py index 470862244..9dd8bfbfc 100644 --- a/src/borg/archiver.py +++ b/src/borg/archiver.py @@ -1943,7 +1943,7 @@ class Archiver: segment_str = '_' + str(segment) if segment is not None else '' offset_str = '_' + str(offset) if offset is not None else '' id_str = '_' + bin_to_hex(id) if id is not None else '' - filename = '%06d%s%s%s%s.obj' % (i, tag_str, segment_str, offset_str, id_str) + filename = '%08d%s%s%s%s.obj' % (i, segment_str, offset_str, tag_str, id_str) print('Dumping', filename) with open(filename, 'wb') as fd: fd.write(data) diff --git a/src/borg/testsuite/archiver.py b/src/borg/testsuite/archiver.py index af203d545..a165866ca 100644 --- a/src/borg/testsuite/archiver.py +++ b/src/borg/testsuite/archiver.py @@ -2335,7 +2335,7 @@ class ArchiverTestCase(ArchiverTestCaseBase): with changedir('output'): output = self.cmd('debug', 'dump-repo-objs', self.repository_location) output_dir = sorted(os.listdir('output')) - assert len(output_dir) > 0 and output_dir[0].startswith('000000_') + assert len(output_dir) > 0 and output_dir[0].startswith('00000000_') assert 'Done.' in output def test_debug_put_get_delete_obj(self):