Merge pull request #740 from enkore/feature-diff

borg diff: find different files between archives
This commit is contained in:
TW 2016-03-17 00:23:23 +01:00
commit f0cb6379b7
4 changed files with 266 additions and 13 deletions

View File

@ -7,6 +7,7 @@ Borg Contributors ("The Borg Collective")
- Yuri D'Elia
- Michael Hanselmann <public@hansmi.ch>
- Teemu Toivanen <public@profnetti.fi>
- Marian Beermann <public@enkore.de>
Borg is a fork of Attic.

View File

@ -145,6 +145,7 @@ class Archive:
self.numeric_owner = numeric_owner
if start is None:
start = datetime.utcnow()
self.chunker_params = chunker_params
self.start = start
if end is None:
end = datetime.utcnow()
@ -261,6 +262,7 @@ Number of files: {0.stats.nfiles}'''.format(
'username': getuser(),
'time': start.isoformat(),
'time_end': end.isoformat(),
'chunker_params': self.chunker_params,
})
data = msgpack.packb(metadata, unicode_errors='surrogateescape')
self.id = self.key.id_hash(data)

View File

@ -1,9 +1,10 @@
from binascii import hexlify, unhexlify
from datetime import datetime
from hashlib import sha256
from itertools import zip_longest
from operator import attrgetter
import argparse
import functools
import hashlib
import inspect
import io
import os
@ -81,6 +82,45 @@ class Archiver:
if self.output_list and (self.output_filter is None or status in self.output_filter):
logger.info("%1s %s", status, remove_surrogates(path))
@staticmethod
def compare_chunk_contents(chunks1, chunks2):
    """Compare two chunk iterators (like returned by :meth:`.DownloadPipeline.fetch_many`)

    The two streams are considered equal when the concatenation of all chunks
    of *chunks1* is byte-for-byte identical to the concatenation of all chunks
    of *chunks2*; the chunk boundaries themselves do not have to line up.

    Zero-length chunks carry no data and are skipped, so they never influence
    the result (not even when they are the last thing an iterator yields).

    :param chunks1: iterable yielding bytes-like chunks
    :param chunks2: iterable yielding bytes-like chunks
    :return: True if both streams hold the same bytes, False otherwise
    """
    end = object()  # unique sentinel: "this stream has no more data"
    chunks1 = iter(chunks1)
    chunks2 = iter(chunks2)

    def next_chunk(chunks):
        # Return the next non-empty chunk as a memoryview (slicing a
        # memoryview does not copy), or the sentinel when exhausted.
        for chunk in chunks:
            if len(chunk):
                return memoryview(chunk)
        return end

    a = b = b''
    ai = bi = 0  # read offsets into the current chunk of each stream
    while True:
        # Refill whichever side has been consumed completely.
        if ai == len(a):
            a, ai = next_chunk(chunks1), 0
        # If stream 1 is exhausted, stream 2 is only polled when its current
        # chunk is used up as well; leftover bytes in `b` already prove a
        # length mismatch.
        if a is not end or bi == len(b):
            if bi == len(b):
                b, bi = next_chunk(chunks2), 0
        if a is end or b is end:
            # Equal only if both streams ran out at the same byte position.
            return a is b
        slicelen = min(len(a) - ai, len(b) - bi)
        if a[ai:ai + slicelen] != b[bi:bi + slicelen]:
            return False
        ai += slicelen
        bi += slicelen
@staticmethod
def build_matcher(excludes, paths):
    """Create a PatternMatcher from exclude patterns and include paths.

    :param excludes: exclude patterns; added to the matcher with value False
                     when non-empty
    :param paths: path strings; each one is converted with
                  ``parse_pattern(path, PathPrefixPattern)`` and the resulting
                  patterns are added to the matcher with value True
    :return: tuple ``(matcher, include_patterns)``; *include_patterns* is the
             (possibly empty) list of patterns built from *paths*
    """
    matcher = PatternMatcher()
    if excludes:
        matcher.add(excludes, False)
    if paths:
        include_patterns = [parse_pattern(path, PathPrefixPattern) for path in paths]
        matcher.add(include_patterns, True)
    else:
        include_patterns = []
    # The fallback is True exactly when no include pattern was given.
    matcher.fallback = not include_patterns
    return matcher, include_patterns
def do_serve(self, args):
"""Start in server mode. This command is usually not used manually.
"""
@ -305,17 +345,7 @@ class Archiver:
archive = Archive(repository, key, manifest, args.location.archive,
numeric_owner=args.numeric_owner)
matcher = PatternMatcher()
if args.excludes:
matcher.add(args.excludes, False)
include_patterns = []
if args.paths:
include_patterns.extend(parse_pattern(i, PathPrefixPattern) for i in args.paths)
matcher.add(include_patterns, True)
matcher.fallback = not include_patterns
matcher, include_patterns = self.build_matcher(args.excludes, args.paths)
output_list = args.output_list
dry_run = args.dry_run
@ -354,6 +384,123 @@ class Archiver:
self.print_warning("Include pattern '%s' never matched.", pattern)
return self.exit_code
def do_diff(self, args):
    """Diff contents of two archives"""
    # NOTE: the docstring above is also used as the argparse description of
    # the "diff" subcommand, so it is kept as a single short sentence.
    #
    # Overview: both archives are opened from the same repository, their
    # (pattern-filtered) items are paired up by path and every pair is
    # compared for mode, owner, link target and file contents. Differences
    # are printed to stdout. Returns self.exit_code.

    def format_bytes(count):
        # None marks the side of a comparison on which the item does not exist.
        if count is None:
            return "<deleted>"
        return format_file_size(count)

    def fetch_and_compare_chunks(chunk_ids1, chunk_ids2, archive1, archive2):
        # Byte-wise comparison of the actual chunk data; used when chunk ids
        # of the two archives can not be compared directly.
        chunks1 = archive1.pipeline.fetch_many(chunk_ids1)
        chunks2 = archive2.pipeline.fetch_many(chunk_ids2)
        return self.compare_chunk_contents(chunks1, chunks2)

    def get_owner(item):
        # Returns a (user, group) pair, numeric or by name depending on
        # --numeric-owner.
        if args.numeric_owner:
            return item[b'uid'], item[b'gid']
        else:
            return item[b'user'], item[b'group']

    def sum_chunk_sizes(item, chunk_ids):
        # Total size of those chunks of `item` whose id is in `chunk_ids`.
        # A chunk entry is indexed as c[0] == id, c[1] == size.
        return sum(c[1] for c in item[b'chunks'] if c[0] in chunk_ids)

    def compare_items(path, item1, item2, deleted=False):
        """
        Compare two items with identical paths.

        :param path: path shared by both items
        :param item1: item from the first archive (or a placeholder with
                      ``b'deleted': True`` if the path only exists in the second)
        :param item2: item from the second archive (or such a placeholder)
        :param deleted: Whether one of the items has been deleted
        """
        if not deleted:
            if item1[b'mode'] != item2[b'mode']:
                print(remove_surrogates(path), 'different mode')
                print('\t', args.location.archive, stat.filemode(item1[b'mode']))
                print('\t', args.archive2, stat.filemode(item2[b'mode']))

            user1, group1 = get_owner(item1)
            user2, group2 = get_owner(item2)
            if user1 != user2 or group1 != group2:
                print(remove_surrogates(path), 'different owner')
                print('\t', args.location.archive, 'user=%s, group=%s' % (user1, group1))
                print('\t', args.archive2, 'user=%s, group=%s' % (user2, group2))

            # NOTE(review): everything that is not a regular file in the first
            # archive is done here, so link targets of such items are never
            # compared below - confirm that this is intended.
            if not stat.S_ISREG(item1[b'mode']):
                return

        if b'chunks' not in item1 or b'chunks' not in item2:
            # At least one of the items is a link
            if item1.get(b'source') != item2.get(b'source'):
                print(remove_surrogates(path), 'different link')
                print('\t', args.location.archive, item1.get(b'source', '<regular file>'))
                print('\t', args.archive2, item2.get(b'source', '<regular file>'))
            return

        if deleted or not can_compare_chunk_ids or item1[b'chunks'] != item2[b'chunks']:
            # Contents are different (or have to be checked the slow way)
            chunk_ids1 = [c[0] for c in item1[b'chunks']]
            chunk_ids2 = [c[0] for c in item2[b'chunks']]
            chunk_id_set1 = set(chunk_ids1)
            chunk_id_set2 = set(chunk_ids2)
            total1 = None if item1.get(b'deleted') else sum(c[1] for c in item1[b'chunks'])
            total2 = None if item2.get(b'deleted') else sum(c[1] for c in item2[b'chunks'])
            # Chunk ids are not comparable: files of equal size may still be
            # identical, so compare the real data before reporting anything.
            if (not can_compare_chunk_ids and total1 == total2 and not deleted and
                    fetch_and_compare_chunks(chunk_ids1, chunk_ids2, archive1, archive2)):
                return
            # The id sets only contain ids, so the sizes have to be looked up
            # in the items' chunk lists (summing over the set elements
            # themselves would add up bytes of the ids instead of sizes).
            added = sum_chunk_sizes(item2, chunk_id_set2 - chunk_id_set1)
            removed = sum_chunk_sizes(item1, chunk_id_set1 - chunk_id_set2)
            print(remove_surrogates(path), 'different contents')
            print('\t +%s, -%s, %s, %s' % (format_bytes(added), format_bytes(removed),
                                           format_bytes(total1), format_bytes(total2)))

    def compare_archives(archive1, archive2, matcher):
        # Items of both archives are consumed in lock-step. Whenever the two
        # current items do not share a path, each one is parked in its
        # archive's orphan dict until an item with the same path shows up on
        # the other side.
        orphans_archive1 = {}
        orphans_archive2 = {}
        for item1, item2 in zip_longest(
                archive1.iter_items(lambda item: matcher.match(item[b'path'])),
                archive2.iter_items(lambda item: matcher.match(item[b'path'])),
        ):
            if item1 and item2 and item1[b'path'] == item2[b'path']:
                compare_items(item1[b'path'], item1, item2)
                continue
            if item1:
                matching_orphan = orphans_archive2.pop(item1[b'path'], None)
                if matching_orphan:
                    compare_items(item1[b'path'], item1, matching_orphan)
                else:
                    orphans_archive1[item1[b'path']] = item1
            if item2:
                matching_orphan = orphans_archive1.pop(item2[b'path'], None)
                if matching_orphan:
                    compare_items(item2[b'path'], matching_orphan, item2)
                else:
                    orphans_archive2[item2[b'path']] = item2
        # At this point orphans_* contain items that had no matching partner in the other archive
        for added in orphans_archive2.values():
            compare_items(added[b'path'], {
                b'deleted': True,
                b'chunks': [],
            }, added, deleted=True)
        for deleted in orphans_archive1.values():
            compare_items(deleted[b'path'], deleted, {
                b'deleted': True,
                b'chunks': [],
            }, deleted=True)

    repository = self.open_repository(args)
    manifest, key = Manifest.load(repository)
    archive1 = Archive(repository, key, manifest, args.location.archive)
    archive2 = Archive(repository, key, manifest, args.archive2)

    # The different defaults (False / True) make sure the comparison fails
    # if either archive has no chunker_params recorded in its metadata.
    can_compare_chunk_ids = archive1.metadata.get(b'chunker_params', False) == archive2.metadata.get(
        b'chunker_params', True) or args.same_chunker_params
    if not can_compare_chunk_ids:
        self.print_warning('--chunker-params might be different between archives, diff will be slow.\n'
                           'If you know for certain that they are the same, pass --same-chunker-params '
                           'to override this check.')

    matcher, include_patterns = self.build_matcher(args.excludes, args.paths)

    compare_archives(archive1, archive2, matcher)

    for pattern in include_patterns:
        if pattern.match_count == 0:
            self.print_warning("Include pattern '%s' never matched.", pattern)
    return self.exit_code
def do_rename(self, args):
"""Rename an existing archive"""
repository = self.open_repository(args, exclusive=True)
@ -650,7 +797,7 @@ class Archiver:
for path in args.paths:
with open(path, "rb") as f:
data = f.read()
h = sha256(data) # XXX hardcoded
h = hashlib.sha256(data) # XXX hardcoded
repository.put(h.digest(), data)
print("object %s put." % h.hexdigest())
repository.commit()
@ -1095,6 +1242,41 @@ class Archiver:
subparser.add_argument('paths', metavar='PATH', nargs='*', type=str,
help='paths to extract; patterns are supported')
diff_epilog = textwrap.dedent("""
This command finds differences in files (contents, user, group, mode) between archives.
Both archives need to be in the same repository, and a repository location may only
be specified for ARCHIVE1.
See the output of the "borg help patterns" command for more help on exclude patterns.
""")
subparser = subparsers.add_parser('diff', parents=[common_parser],
description=self.do_diff.__doc__,
epilog=diff_epilog,
formatter_class=argparse.RawDescriptionHelpFormatter,
help='find differences in archive contents')
subparser.set_defaults(func=self.do_diff)
subparser.add_argument('-e', '--exclude', dest='excludes',
type=parse_pattern, action='append',
metavar="PATTERN", help='exclude paths matching PATTERN')
subparser.add_argument('--exclude-from', dest='exclude_files',
type=argparse.FileType('r'), action='append',
metavar='EXCLUDEFILE', help='read exclude patterns from EXCLUDEFILE, one per line')
subparser.add_argument('--numeric-owner', dest='numeric_owner',
action='store_true', default=False,
help='only consider numeric user and group identifiers')
subparser.add_argument('--same-chunker-params', dest='same_chunker_params',
action='store_true', default=False,
help='Override check of chunker parameters.')
subparser.add_argument('location', metavar='ARCHIVE1',
type=location_validator(archive=True),
help='archive')
subparser.add_argument('archive2', metavar='ARCHIVE2',
type=archivename_validator(),
help='archive to compare with ARCHIVE1 (no repository location)')
subparser.add_argument('paths', metavar='PATH', nargs='*', type=str,
help='paths to compare; patterns are supported')
rename_epilog = textwrap.dedent("""
This command renames an archive in the repository.
""")

View File

@ -1143,6 +1143,43 @@ class RemoteArchiverTestCase(ArchiverTestCase):
pass
class DiffArchiverTestCase(ArchiverTestCaseBase):
    """Tests for the ``diff`` command."""

    # Reuse the fixture helpers of the main archiver test case.
    create_test_files = ArchiverTestCase.create_test_files
    create_regular_file = ArchiverTestCase.create_regular_file

    def test_basic_functionality(self):
        """Diff an archive against two later ones (same / different chunker params)."""
        self.create_test_files()
        self.cmd('init', self.repository_location)

        os.chmod('input/dir2', stat.S_IFDIR | 0o755)
        self.create_regular_file('file3', size=1024)
        self.cmd('create', self.repository_location + '::test0', 'input')

        # replace 'hardlink' with a file
        os.unlink('input/hardlink')
        self.create_regular_file('hardlink', size=1024 * 80)
        # replace directory with a file
        os.unlink('input/dir2/file2')
        os.rmdir('input/dir2')
        self.create_regular_file('dir2', size=1024 * 80)
        os.chmod('input/dir2', stat.S_IFREG | 0o755)
        self.create_regular_file('file3', size=1024, contents=b'0')

        self.cmd('create', self.repository_location + '::test1a', 'input')
        self.cmd('create', '--chunker-params', '16,18,17,4095', self.repository_location + '::test1b', 'input')

        def do_asserts(output, archive):
            # Detail lines are emitted via print('\t', name, value), i.e. they
            # start with a tab followed by the separating space.
            assert 'input/file3 different contents' in output
            assert 'input/hardlink different mode' in output
            assert ('input/hardlink different link\n'
                    '\t test0 input/file1\n'
                    '\t test%s <regular file>' % archive) in output
            assert ('input/dir2 different mode\n'
                    '\t test0 drwxr-xr-x\n'
                    '\t test%s -rwxr-xr-x\n' % archive) in output
            assert 'input/dir2/file2 different contents' in output

        do_asserts(self.cmd('diff', self.repository_location + '::test0', 'test1a'), '1a')
        # We expect exit_code=1 due to the chunker params warning
        do_asserts(self.cmd('diff', self.repository_location + '::test0', 'test1b', exit_code=1), '1b')
def test_get_args():
archiver = Archiver()
# everything normal:
@ -1162,3 +1199,34 @@ def test_get_args():
args = archiver.get_args(['borg', 'serve', '--restrict-to-path=/p1', '--restrict-to-path=/p2', ],
'borg init /')
assert args.func == archiver.do_serve
def test_compare_chunk_contents():
    """Check Archiver.compare_chunk_contents on differently chunked byte streams."""

    def ccc(a, b):
        # The comparison has to be symmetric, so always run it both ways.
        forward = Archiver.compare_chunk_contents(iter(a), iter(b))
        backward = Archiver.compare_chunk_contents(iter(b), iter(a))
        assert forward == backward
        return forward

    # same bytes, different chunk boundaries
    coarse = [b'1234', b'567A', b'bC']
    fine = [b'1', b'23', b'4567A', b'b', b'C']
    assert ccc(coarse, fine)

    # one iterator exhausted before the other
    assert not ccc([b'12345'], [b'1234', b'56'])

    # content mismatch
    assert not ccc([b'1234', b'65'], [b'1234', b'56'])

    # first is the prefix of second
    assert not ccc([b'1234', b'56'], [b'1234', b'565'])