@staticmethod
def compare_chunk_contents(chunks1, chunks2):
    """Compare the contents of two chunk streams.

    The streams (like the ones returned by
    :meth:`.DownloadPipeline.fetch_many`) may be split into chunks at
    different positions; only the concatenated bytes are compared.
    Zero-length chunks are ignored wherever they occur.

    :param chunks1: iterable of bytes-like chunks
    :param chunks2: iterable of bytes-like chunks
    :return: True if both streams concatenate to the same byte sequence
    """
    end = object()

    def next_nonempty(chunks):
        # Zero-length chunks carry no data. Skipping them keeps a stray
        # empty chunk from being mistaken for additional content.
        for chunk in chunks:
            view = memoryview(chunk)
            if len(view):
                return view
        return end

    # Accept any iterable, not only iterators.
    chunks1 = iter(chunks1)
    chunks2 = iter(chunks2)
    a = b = end
    alen = ai = 0
    blen = bi = 0
    while True:
        if ai == alen:
            a = next_nonempty(chunks1)
            if a is end:
                # chunks1 is exhausted: equal only if chunks2 has no data left.
                return bi == blen and next_nonempty(chunks2) is end
            alen, ai = len(a), 0
        if bi == blen:
            b = next_nonempty(chunks2)
            if b is end:
                # ``a`` still holds unread data here, so chunks1 is longer.
                return False
            blen, bi = len(b), 0
        # Compare the largest window available in both current chunks.
        slicelen = min(alen - ai, blen - bi)
        if a[ai:ai + slicelen] != b[bi:bi + slicelen]:
            return False
        ai += slicelen
        bi += slicelen


@staticmethod
def build_matcher(excludes, paths):
    """Build the PatternMatcher shared by the extract and diff commands.

    :param excludes: exclude patterns (may be empty/None); added to the
        matcher with the value False
    :param paths: path strings (may be empty/None); each one is parsed as a
        PathPrefixPattern and added to the matcher with the value True
    :return: tuple ``(matcher, include_patterns)``; include_patterns is the
        list of patterns created from *paths*
    """
    matcher = PatternMatcher()
    if excludes:
        matcher.add(excludes, False)
    include_patterns = []
    if paths:
        include_patterns.extend(parse_pattern(i, PathPrefixPattern) for i in paths)
        matcher.add(include_patterns, True)
    # The fallback is True only when no include paths were given.
    matcher.fallback = not include_patterns
    return matcher, include_patterns
""" @@ -304,17 +344,7 @@ def do_extract(self, args): archive = Archive(repository, key, manifest, args.location.archive, numeric_owner=args.numeric_owner) - matcher = PatternMatcher() - if args.excludes: - matcher.add(args.excludes, False) - - include_patterns = [] - - if args.paths: - include_patterns.extend(parse_pattern(i, PathPrefixPattern) for i in args.paths) - matcher.add(include_patterns, True) - - matcher.fallback = not include_patterns + matcher, include_patterns = self.build_matcher(args.excludes, args.paths) output_list = args.output_list dry_run = args.dry_run @@ -353,6 +383,123 @@ def do_extract(self, args): self.print_warning("Include pattern '%s' never matched.", pattern) return self.exit_code + def do_diff(self, args): + """Diff contents of two archives""" + def format_bytes(count): + if count is None: + return "" + return format_file_size(count) + + def fetch_and_compare_chunks(chunk_ids1, chunk_ids2, archive1, archive2): + chunks1 = archive1.pipeline.fetch_many(chunk_ids1) + chunks2 = archive2.pipeline.fetch_many(chunk_ids2) + return self.compare_chunk_contents(chunks1, chunks2) + + def get_owner(item): + if args.numeric_owner: + return item[b'uid'], item[b'gid'] + else: + return item[b'user'], item[b'group'] + + def compare_items(path, item1, item2, deleted=False): + """ + Compare two items with identical paths. 
+ :param deleted: Whether one of the items has been deleted + """ + if not deleted: + if item1[b'mode'] != item2[b'mode']: + print(remove_surrogates(path), 'different mode') + print('\t', args.location.archive, stat.filemode(item1[b'mode'])) + print('\t', args.archive2, stat.filemode(item2[b'mode'])) + + user1, group1 = get_owner(item1) + user2, group2 = get_owner(item2) + if user1 != user2 or group1 != group2: + print(remove_surrogates(path), 'different owner') + print('\t', args.location.archive, 'user=%s, group=%s' % (user1, group1)) + print('\t', args.archive2, 'user=%s, group=%s' % (user2, group2)) + + if not stat.S_ISREG(item1[b'mode']): + return + if b'chunks' not in item1 or b'chunks' not in item2: + # At least one of the items is a link + if item1.get(b'source') != item2.get(b'source'): + print(remove_surrogates(path), 'different link') + print('\t', args.location.archive, item1.get(b'source', '')) + print('\t', args.archive2, item2.get(b'source', '')) + return + if deleted or not can_compare_chunk_ids or item1[b'chunks'] != item2[b'chunks']: + # Contents are different + chunk_ids1 = [c[0] for c in item1[b'chunks']] + chunk_ids2 = [c[0] for c in item2[b'chunks']] + chunk_id_set1 = set(chunk_ids1) + chunk_id_set2 = set(chunk_ids2) + total1 = None if item1.get(b'deleted') else sum(c[1] for c in item1[b'chunks']) + total2 = None if item2.get(b'deleted') else sum(c[1] for c in item2[b'chunks']) + if (not can_compare_chunk_ids and total1 == total2 and not deleted and + fetch_and_compare_chunks(chunk_ids1, chunk_ids2, archive1, archive2)): + return + added = sum(c[1] for c in (chunk_id_set2 - chunk_id_set1)) + removed = sum(c[1] for c in (chunk_id_set1 - chunk_id_set2)) + print(remove_surrogates(path), 'different contents') + print('\t +%s, -%s, %s, %s' % (format_bytes(added), format_bytes(removed), + format_bytes(total1), format_bytes(total2))) + + def compare_archives(archive1, archive2, matcher): + orphans_archive1 = {} + orphans_archive2 = {} + for item1, 
item2 in zip_longest( + archive1.iter_items(lambda item: matcher.match(item[b'path'])), + archive2.iter_items(lambda item: matcher.match(item[b'path'])), + ): + if item1 and item2 and item1[b'path'] == item2[b'path']: + compare_items(item1[b'path'], item1, item2) + continue + if item1: + matching_orphan = orphans_archive2.pop(item1[b'path'], None) + if matching_orphan: + compare_items(item1[b'path'], item1, matching_orphan) + else: + orphans_archive1[item1[b'path']] = item1 + if item2: + matching_orphan = orphans_archive1.pop(item2[b'path'], None) + if matching_orphan: + compare_items(item2[b'path'], matching_orphan, item2) + else: + orphans_archive2[item2[b'path']] = item2 + # At this point orphans_* contain items that had no matching partner in the other archive + for added in orphans_archive2.values(): + compare_items(added[b'path'], { + b'deleted': True, + b'chunks': [], + }, added, deleted=True) + for deleted in orphans_archive1.values(): + compare_items(deleted[b'path'], deleted, { + b'deleted': True, + b'chunks': [], + }, deleted=True) + + repository = self.open_repository(args) + manifest, key = Manifest.load(repository) + archive1 = Archive(repository, key, manifest, args.location.archive) + archive2 = Archive(repository, key, manifest, args.archive2) + + can_compare_chunk_ids = archive1.metadata.get(b'chunker_params', False) == archive2.metadata.get( + b'chunker_params', True) or args.same_chunker_params + if not can_compare_chunk_ids: + self.print_warning('--chunker-params might be different between archives, diff will be slow.\n' + 'If you know for certain that they are the same, pass --same-chunker-params ' + 'to override this check.') + + matcher, include_patterns = self.build_matcher(args.excludes, args.paths) + + compare_archives(archive1, archive2, matcher) + + for pattern in include_patterns: + if pattern.match_count == 0: + self.print_warning("Include pattern '%s' never matched.", pattern) + return self.exit_code + def do_rename(self, args): 
"""Rename an existing archive""" repository = self.open_repository(args, exclusive=True) @@ -649,7 +796,7 @@ def do_debug_put_obj(self, args): for path in args.paths: with open(path, "rb") as f: data = f.read() - h = sha256(data) # XXX hardcoded + h = hashlib.sha256(data) # XXX hardcoded repository.put(h.digest(), data) print("object %s put." % h.hexdigest()) repository.commit() @@ -1085,6 +1232,41 @@ def build_parser(self, args=None, prog=None): subparser.add_argument('paths', metavar='PATH', nargs='*', type=str, help='paths to extract; patterns are supported') + diff_epilog = textwrap.dedent(""" + This command finds differences in files (contents, user, group, mode) between archives. + + Both archives need to be in the same repository, and a repository location may only + be specified for ARCHIVE1. + + See the output of the "borg help patterns" command for more help on exclude patterns. + """) + subparser = subparsers.add_parser('diff', parents=[common_parser], + description=self.do_diff.__doc__, + epilog=diff_epilog, + formatter_class=argparse.RawDescriptionHelpFormatter, + help='find differences in archive contents') + subparser.set_defaults(func=self.do_diff) + subparser.add_argument('-e', '--exclude', dest='excludes', + type=parse_pattern, action='append', + metavar="PATTERN", help='exclude paths matching PATTERN') + subparser.add_argument('--exclude-from', dest='exclude_files', + type=argparse.FileType('r'), action='append', + metavar='EXCLUDEFILE', help='read exclude patterns from EXCLUDEFILE, one per line') + subparser.add_argument('--numeric-owner', dest='numeric_owner', + action='store_true', default=False, + help='only obey numeric user and group identifiers') + subparser.add_argument('--same-chunker-params', dest='same_chunker_params', + action='store_true', default=False, + help='Override check of chunker parameters.') + subparser.add_argument('location', metavar='ARCHIVE1', + type=location_validator(archive=True), + help='archive') + 
class DiffArchiverTestCase(ArchiverTestCaseBase):
    """End-to-end test of the "diff" command on archives of a changing tree."""
    create_test_files = ArchiverTestCase.create_test_files
    create_regular_file = ArchiverTestCase.create_regular_file

    def test_basic_functionality(self):
        """Create three archives and check the diff output between them."""
        self.create_test_files()
        self.cmd('init', self.repository_location)
        os.chmod('input/dir2', stat.S_IFDIR | 0o755)
        self.create_regular_file('file3', size=1024)
        self.cmd('create', self.repository_location + '::test0', 'input')
        # replace 'hardlink' with a file
        os.unlink('input/hardlink')
        self.create_regular_file('hardlink', size=1024 * 80)
        # replace directory with a file
        os.unlink('input/dir2/file2')
        os.rmdir('input/dir2')
        self.create_regular_file('dir2', size=1024 * 80)
        os.chmod('input/dir2', stat.S_IFREG | 0o755)
        self.create_regular_file('file3', size=1024, contents=b'0')
        self.cmd('create', self.repository_location + '::test1a', 'input')
        self.cmd('create', '--chunker-params', '16,18,17,4095', self.repository_location + '::test1b', 'input')

        def do_asserts(output, archive):
            # The diff command prints every detail line with print('\t', ...),
            # so each of them starts with a tab followed by a space.
            assert 'input/file3 different contents' in output
            assert 'input/hardlink different mode' in output
            assert ('input/hardlink different link\n'
                    '\t test0 input/file1\n'
                    '\t test%s ' % archive) in output
            assert ('input/dir2 different mode\n'
                    '\t test0 drwxr-xr-x\n'
                    '\t test%s -rwxr-xr-x\n' % archive) in output
            assert 'input/dir2/file2 different contents' in output

        do_asserts(self.cmd('diff', self.repository_location + '::test0', 'test1a'), '1a')
        # We expect exit_code=1 due to the chunker params warning
        do_asserts(self.cmd('diff', self.repository_location + '::test0', 'test1b', exit_code=1), '1b')


def test_compare_chunk_contents():
    """Check Archiver.compare_chunk_contents in both argument orders."""
    def ccc(a, b):
        # The comparison must be symmetric, so run it in both directions.
        forward = Archiver.compare_chunk_contents(iter(a), iter(b))
        backward = Archiver.compare_chunk_contents(iter(b), iter(a))
        assert forward == backward
        return forward

    # same contents, different chunk boundaries
    assert ccc([b'1234', b'567A', b'bC'], [b'1', b'23', b'4567A', b'b', b'C'])
    # one iterator exhausted before the other
    assert not ccc([b'12345'], [b'1234', b'56'])
    # content mismatch
    assert not ccc([b'1234', b'65'], [b'1234', b'56'])
    # first is the prefix of second
    assert not ccc([b'1234', b'56'], [b'1234', b'565'])