Reviewed version of the borg "debug dump-repo-objs --ghost" / "debug search-repo-objs" patch.
Line breaks and indentation were reconstructed from the hunk line counts and Python syntax.
Changes relative to the original '+' lines (hunk line counts are unchanged; the "index" blob ids were not recomputed):
- print_finding(): clamp the start of the "before" slice so a match near the start of the data does not wrap around.
- search loop: the boundary window is now empty for 1-byte search terms (a [-0:] slice selected all of last_data).
- WANTED help text now names the hex:/str: prefixes the command actually parses.
- comment/docstring typos: superceded -> superseded, occurance -> occurrence.

diff --git a/src/borg/archiver.py b/src/borg/archiver.py
index 8d64c53b2..9dd8bfbfc 100644
--- a/src/borg/archiver.py
+++ b/src/borg/archiver.py
@@ -74,7 +74,7 @@ from .patterns import PatternMatcher
 from .item import Item
 from .platform import get_flags, get_process_id, SyncFile
 from .remote import RepositoryServer, RemoteRepository, cache_if_remote
-from .repository import Repository, LIST_SCAN_LIMIT
+from .repository import Repository, LIST_SCAN_LIMIT, TAG_PUT, TAG_DELETE, TAG_COMMIT
 from .selftest import selftest
 from .upgrader import AtticRepositoryUpgrader, BorgRepositoryUpgrader
 
@@ -1930,7 +1930,85 @@ class Archiver:
 
     @with_repository(manifest=False)
     def do_debug_dump_repo_objs(self, args, repository):
-        """dump (decrypted, decompressed) repo objects"""
+        """dump (decrypted, decompressed) repo objects, repo index MUST be current/correct"""
+        from .crypto.key import key_factory
+
+        def decrypt_dump(i, id, cdata, tag=None, segment=None, offset=None):
+            if cdata is not None:
+                give_id = id if id != Manifest.MANIFEST_ID else None
+                data = key.decrypt(give_id, cdata)
+            else:
+                data = b''
+            tag_str = '' if tag is None else '_' + tag
+            segment_str = '_' + str(segment) if segment is not None else ''
+            offset_str = '_' + str(offset) if offset is not None else ''
+            id_str = '_' + bin_to_hex(id) if id is not None else ''
+            filename = '%08d%s%s%s%s.obj' % (i, segment_str, offset_str, tag_str, id_str)
+            print('Dumping', filename)
+            with open(filename, 'wb') as fd:
+                fd.write(data)
+
+        if args.ghost:
+            # dump ghosty stuff from segment files: not yet committed objects, deleted / superseded objects, commit tags
+
+            # set up the key without depending on a manifest obj
+            for id, cdata, tag, segment, offset in repository.scan_low_level():
+                if tag == TAG_PUT:
+                    key = key_factory(repository, cdata)
+                    break
+            i = 0
+            for id, cdata, tag, segment, offset in repository.scan_low_level():
+                if tag == TAG_PUT:
+                    decrypt_dump(i, id, cdata, tag='put', segment=segment, offset=offset)
+                elif tag == TAG_DELETE:
+                    decrypt_dump(i, id, None, tag='del', segment=segment, offset=offset)
+                elif tag == TAG_COMMIT:
+                    decrypt_dump(i, None, None, tag='commit', segment=segment, offset=offset)
+                i += 1
+        else:
+            # set up the key without depending on a manifest obj
+            ids = repository.list(limit=1, marker=None)
+            cdata = repository.get(ids[0])
+            key = key_factory(repository, cdata)
+            marker = None
+            i = 0
+            while True:
+                result = repository.scan(limit=LIST_SCAN_LIMIT, marker=marker)  # must use on-disk order scanning here
+                if not result:
+                    break
+                marker = result[-1]
+                for id in result:
+                    cdata = repository.get(id)
+                    decrypt_dump(i, id, cdata)
+                    i += 1
+        print('Done.')
+        return EXIT_SUCCESS
+
+    @with_repository(manifest=False)
+    def do_debug_search_repo_objs(self, args, repository):
+        """search for byte sequences in repo objects, repo index MUST be current/correct"""
+        context = 32
+
+        def print_finding(info, wanted, data, offset):
+            before = data[max(0, offset - context):offset]  # clamp: a negative start would wrap around
+            after = data[offset + len(wanted):offset + len(wanted) + context]
+            print('%s: %s %s %s == %r %r %r' % (info, before.hex(), wanted.hex(), after.hex(),
+                                                before, wanted, after))
+
+        wanted = args.wanted
+        try:
+            if wanted.startswith('hex:'):
+                wanted = unhexlify(wanted[4:])
+            elif wanted.startswith('str:'):
+                wanted = wanted[4:].encode('utf-8')
+            else:
+                raise ValueError('unsupported search term')
+        except (ValueError, UnicodeEncodeError):
+            wanted = None
+        if not wanted:
+            self.print_error('search term needs to be hex:123abc or str:foobar style')
+            return EXIT_ERROR
+
         from .crypto.key import key_factory
         # set up the key without depending on a manifest obj
         ids = repository.list(limit=1, marker=None)
@@ -1938,9 +2016,11 @@ class Archiver:
         key = key_factory(repository, cdata)
 
         marker = None
+        last_data = b''
+        last_id = None
         i = 0
         while True:
-            result = repository.list(limit=LIST_SCAN_LIMIT, marker=marker)
+            result = repository.scan(limit=LIST_SCAN_LIMIT, marker=marker)  # must use on-disk order scanning here
             if not result:
                 break
             marker = result[-1]
@@ -1948,11 +2028,26 @@ class Archiver:
                 cdata = repository.get(id)
                 give_id = id if id != Manifest.MANIFEST_ID else None
                 data = key.decrypt(give_id, cdata)
-                filename = '%06d_%s.obj' % (i, bin_to_hex(id))
-                print('Dumping', filename)
-                with open(filename, 'wb') as fd:
-                    fd.write(data)
+
+                # try to locate wanted sequence crossing the border of last_data and data (empty for 1-byte terms)
+                boundary_data = last_data[max(0, len(last_data) - len(wanted) + 1):] + data[:len(wanted) - 1]
+                if wanted in boundary_data:
+                    boundary_data = last_data[-(len(wanted) - 1 + context):] + data[:len(wanted) - 1 + context]
+                    offset = boundary_data.find(wanted)
+                    info = '%d %s | %s' % (i, last_id.hex(), id.hex())
+                    print_finding(info, wanted, boundary_data, offset)
+
+                # try to locate wanted sequence in data
+                count = data.count(wanted)
+                if count:
+                    offset = data.find(wanted)  # only determine first occurrence's offset
+                    info = "%d %s #%d" % (i, id.hex(), count)
+                    print_finding(info, wanted, data, offset)
+
+                last_id, last_data = id, data
                 i += 1
+                if i % 10000 == 0:
+                    print('%d objects processed.' % i)
         print('Done.')
         return EXIT_SUCCESS
 
@@ -3995,6 +4090,23 @@ class Archiver:
         subparser.add_argument('location', metavar='REPOSITORY',
                                type=location_validator(archive=False),
                                help='repo to dump')
+        subparser.add_argument('--ghost', dest='ghost', action='store_true',
+                               help='dump all segment file contents, including deleted/uncommitted objects and commits.')
+
+        debug_search_repo_objs_epilog = process_epilog("""
+        This command searches raw (but decrypted and decompressed) repo objects for a specific bytes sequence.
+        """)
+        subparser = debug_parsers.add_parser('search-repo-objs', parents=[common_parser], add_help=False,
+                                             description=self.do_debug_search_repo_objs.__doc__,
+                                             epilog=debug_search_repo_objs_epilog,
+                                             formatter_class=argparse.RawDescriptionHelpFormatter,
+                                             help='search repo objects (debug)')
+        subparser.set_defaults(func=self.do_debug_search_repo_objs)
+        subparser.add_argument('location', metavar='REPOSITORY',
+                               type=location_validator(archive=False),
+                               help='repo to search')
+        subparser.add_argument('wanted', metavar='WANTED', type=str,
+                               help='term to search the repo for, either hex:123abc or str:foobar style')
 
         debug_get_obj_epilog = process_epilog("""
         This command gets an object from the repository.
diff --git a/src/borg/repository.py b/src/borg/repository.py
index 7d1bcdd6f..1a9ce9acc 100644
--- a/src/borg/repository.py
+++ b/src/borg/repository.py
@@ -962,6 +962,23 @@ class Repository:
             logger.info('Completed repository check, no problems found.')
         return not error_found or repair
 
+    def scan_low_level(self):
+        """Very low level scan over all segment file entries.
+
+        It does NOT care about what's committed and what not.
+        It does NOT care whether an object might be deleted or superseded later.
+        It just yields anything it finds in the segment files.
+
+        This is intended as a last-resort way to get access to all repo contents of damaged repos,
+        when there is uncommitted, but valuable data in there...
+        """
+        for segment, filename in self.io.segment_iterator():
+            try:
+                for tag, key, offset, data in self.io.iter_objects(segment, include_data=True):
+                    yield key, data, tag, segment, offset
+            except IntegrityError as err:
+                logger.error('Segment %d (%s) has IntegrityError(s) [%s] - skipping.' % (segment, filename, str(err)))
+
     def _rollback(self, *, cleanup):
         """
         """
diff --git a/src/borg/testsuite/archiver.py b/src/borg/testsuite/archiver.py
index af203d545..a165866ca 100644
--- a/src/borg/testsuite/archiver.py
+++ b/src/borg/testsuite/archiver.py
@@ -2335,7 +2335,7 @@ class ArchiverTestCase(ArchiverTestCaseBase):
         with changedir('output'):
             output = self.cmd('debug', 'dump-repo-objs', self.repository_location)
         output_dir = sorted(os.listdir('output'))
-        assert len(output_dir) > 0 and output_dir[0].startswith('000000_')
+        assert len(output_dir) > 0 and output_dir[0].startswith('00000000_')
         assert 'Done.' in output
 
     def test_debug_put_get_delete_obj(self):