mirror of
https://github.com/borgbackup/borg.git
synced 2024-12-22 07:43:06 +00:00
borg diff: find different files between archives
This commit is contained in:
parent
5e5bab3239
commit
26fe2a35cd
4 changed files with 266 additions and 13 deletions
1
AUTHORS
1
AUTHORS
|
@ -7,6 +7,7 @@ Borg Contributors ("The Borg Collective")
|
|||
- Yuri D'Elia
|
||||
- Michael Hanselmann <public@hansmi.ch>
|
||||
- Teemu Toivanen <public@profnetti.fi>
|
||||
- Marian Beermann <public@enkore.de>
|
||||
|
||||
Borg is a fork of Attic.
|
||||
|
||||
|
|
|
@ -145,6 +145,7 @@ def __init__(self, repository, key, manifest, name, cache=None, create=False,
|
|||
self.numeric_owner = numeric_owner
|
||||
if start is None:
|
||||
start = datetime.utcnow()
|
||||
self.chunker_params = chunker_params
|
||||
self.start = start
|
||||
if end is None:
|
||||
end = datetime.utcnow()
|
||||
|
@ -261,6 +262,7 @@ def save(self, name=None, timestamp=None):
|
|||
'username': getuser(),
|
||||
'time': start.isoformat(),
|
||||
'time_end': end.isoformat(),
|
||||
'chunker_params': self.chunker_params,
|
||||
})
|
||||
data = msgpack.packb(metadata, unicode_errors='surrogateescape')
|
||||
self.id = self.key.id_hash(data)
|
||||
|
|
208
borg/archiver.py
208
borg/archiver.py
|
@ -1,9 +1,10 @@
|
|||
from binascii import hexlify, unhexlify
|
||||
from datetime import datetime
|
||||
from hashlib import sha256
|
||||
from itertools import zip_longest
|
||||
from operator import attrgetter
|
||||
import argparse
|
||||
import functools
|
||||
import hashlib
|
||||
import inspect
|
||||
import io
|
||||
import os
|
||||
|
@ -81,6 +82,45 @@ def print_file_status(self, status, path):
|
|||
if self.output_list and (self.output_filter is None or status in self.output_filter):
|
||||
logger.info("%1s %s", status, remove_surrogates(path))
|
||||
|
||||
@staticmethod
|
||||
def compare_chunk_contents(chunks1, chunks2):
|
||||
"""Compare two chunk iterators (like returned by :meth:`.DownloadPipeline.fetch_many`)"""
|
||||
end = object()
|
||||
alen = ai = 0
|
||||
blen = bi = 0
|
||||
while True:
|
||||
if not alen - ai:
|
||||
a = next(chunks1, end)
|
||||
if a is end:
|
||||
return not blen - bi and next(chunks2, end) is end
|
||||
a = memoryview(a)
|
||||
alen = len(a)
|
||||
ai = 0
|
||||
if not blen - bi:
|
||||
b = next(chunks2, end)
|
||||
if b is end:
|
||||
return not alen - ai and next(chunks1, end) is end
|
||||
b = memoryview(b)
|
||||
blen = len(b)
|
||||
bi = 0
|
||||
slicelen = min(alen - ai, blen - bi)
|
||||
if a[ai:ai + slicelen] != b[bi:bi + slicelen]:
|
||||
return False
|
||||
ai += slicelen
|
||||
bi += slicelen
|
||||
|
||||
@staticmethod
|
||||
def build_matcher(excludes, paths):
|
||||
matcher = PatternMatcher()
|
||||
if excludes:
|
||||
matcher.add(excludes, False)
|
||||
include_patterns = []
|
||||
if paths:
|
||||
include_patterns.extend(parse_pattern(i, PathPrefixPattern) for i in paths)
|
||||
matcher.add(include_patterns, True)
|
||||
matcher.fallback = not include_patterns
|
||||
return matcher, include_patterns
|
||||
|
||||
def do_serve(self, args):
|
||||
"""Start in server mode. This command is usually not used manually.
|
||||
"""
|
||||
|
@ -304,17 +344,7 @@ def do_extract(self, args):
|
|||
archive = Archive(repository, key, manifest, args.location.archive,
|
||||
numeric_owner=args.numeric_owner)
|
||||
|
||||
matcher = PatternMatcher()
|
||||
if args.excludes:
|
||||
matcher.add(args.excludes, False)
|
||||
|
||||
include_patterns = []
|
||||
|
||||
if args.paths:
|
||||
include_patterns.extend(parse_pattern(i, PathPrefixPattern) for i in args.paths)
|
||||
matcher.add(include_patterns, True)
|
||||
|
||||
matcher.fallback = not include_patterns
|
||||
matcher, include_patterns = self.build_matcher(args.excludes, args.paths)
|
||||
|
||||
output_list = args.output_list
|
||||
dry_run = args.dry_run
|
||||
|
@ -353,6 +383,123 @@ def do_extract(self, args):
|
|||
self.print_warning("Include pattern '%s' never matched.", pattern)
|
||||
return self.exit_code
|
||||
|
||||
def do_diff(self, args):
|
||||
"""Diff contents of two archives"""
|
||||
def format_bytes(count):
|
||||
if count is None:
|
||||
return "<deleted>"
|
||||
return format_file_size(count)
|
||||
|
||||
def fetch_and_compare_chunks(chunk_ids1, chunk_ids2, archive1, archive2):
|
||||
chunks1 = archive1.pipeline.fetch_many(chunk_ids1)
|
||||
chunks2 = archive2.pipeline.fetch_many(chunk_ids2)
|
||||
return self.compare_chunk_contents(chunks1, chunks2)
|
||||
|
||||
def get_owner(item):
|
||||
if args.numeric_owner:
|
||||
return item[b'uid'], item[b'gid']
|
||||
else:
|
||||
return item[b'user'], item[b'group']
|
||||
|
||||
def compare_items(path, item1, item2, deleted=False):
|
||||
"""
|
||||
Compare two items with identical paths.
|
||||
:param deleted: Whether one of the items has been deleted
|
||||
"""
|
||||
if not deleted:
|
||||
if item1[b'mode'] != item2[b'mode']:
|
||||
print(remove_surrogates(path), 'different mode')
|
||||
print('\t', args.location.archive, stat.filemode(item1[b'mode']))
|
||||
print('\t', args.archive2, stat.filemode(item2[b'mode']))
|
||||
|
||||
user1, group1 = get_owner(item1)
|
||||
user2, group2 = get_owner(item2)
|
||||
if user1 != user2 or group1 != group2:
|
||||
print(remove_surrogates(path), 'different owner')
|
||||
print('\t', args.location.archive, 'user=%s, group=%s' % (user1, group1))
|
||||
print('\t', args.archive2, 'user=%s, group=%s' % (user2, group2))
|
||||
|
||||
if not stat.S_ISREG(item1[b'mode']):
|
||||
return
|
||||
if b'chunks' not in item1 or b'chunks' not in item2:
|
||||
# At least one of the items is a link
|
||||
if item1.get(b'source') != item2.get(b'source'):
|
||||
print(remove_surrogates(path), 'different link')
|
||||
print('\t', args.location.archive, item1.get(b'source', '<regular file>'))
|
||||
print('\t', args.archive2, item2.get(b'source', '<regular file>'))
|
||||
return
|
||||
if deleted or not can_compare_chunk_ids or item1[b'chunks'] != item2[b'chunks']:
|
||||
# Contents are different
|
||||
chunk_ids1 = [c[0] for c in item1[b'chunks']]
|
||||
chunk_ids2 = [c[0] for c in item2[b'chunks']]
|
||||
chunk_id_set1 = set(chunk_ids1)
|
||||
chunk_id_set2 = set(chunk_ids2)
|
||||
total1 = None if item1.get(b'deleted') else sum(c[1] for c in item1[b'chunks'])
|
||||
total2 = None if item2.get(b'deleted') else sum(c[1] for c in item2[b'chunks'])
|
||||
if (not can_compare_chunk_ids and total1 == total2 and not deleted and
|
||||
fetch_and_compare_chunks(chunk_ids1, chunk_ids2, archive1, archive2)):
|
||||
return
|
||||
added = sum(c[1] for c in (chunk_id_set2 - chunk_id_set1))
|
||||
removed = sum(c[1] for c in (chunk_id_set1 - chunk_id_set2))
|
||||
print(remove_surrogates(path), 'different contents')
|
||||
print('\t +%s, -%s, %s, %s' % (format_bytes(added), format_bytes(removed),
|
||||
format_bytes(total1), format_bytes(total2)))
|
||||
|
||||
def compare_archives(archive1, archive2, matcher):
|
||||
orphans_archive1 = {}
|
||||
orphans_archive2 = {}
|
||||
for item1, item2 in zip_longest(
|
||||
archive1.iter_items(lambda item: matcher.match(item[b'path'])),
|
||||
archive2.iter_items(lambda item: matcher.match(item[b'path'])),
|
||||
):
|
||||
if item1 and item2 and item1[b'path'] == item2[b'path']:
|
||||
compare_items(item1[b'path'], item1, item2)
|
||||
continue
|
||||
if item1:
|
||||
matching_orphan = orphans_archive2.pop(item1[b'path'], None)
|
||||
if matching_orphan:
|
||||
compare_items(item1[b'path'], item1, matching_orphan)
|
||||
else:
|
||||
orphans_archive1[item1[b'path']] = item1
|
||||
if item2:
|
||||
matching_orphan = orphans_archive1.pop(item2[b'path'], None)
|
||||
if matching_orphan:
|
||||
compare_items(item2[b'path'], matching_orphan, item2)
|
||||
else:
|
||||
orphans_archive2[item2[b'path']] = item2
|
||||
# At this point orphans_* contain items that had no matching partner in the other archive
|
||||
for added in orphans_archive2.values():
|
||||
compare_items(added[b'path'], {
|
||||
b'deleted': True,
|
||||
b'chunks': [],
|
||||
}, added, deleted=True)
|
||||
for deleted in orphans_archive1.values():
|
||||
compare_items(deleted[b'path'], deleted, {
|
||||
b'deleted': True,
|
||||
b'chunks': [],
|
||||
}, deleted=True)
|
||||
|
||||
repository = self.open_repository(args)
|
||||
manifest, key = Manifest.load(repository)
|
||||
archive1 = Archive(repository, key, manifest, args.location.archive)
|
||||
archive2 = Archive(repository, key, manifest, args.archive2)
|
||||
|
||||
can_compare_chunk_ids = archive1.metadata.get(b'chunker_params', False) == archive2.metadata.get(
|
||||
b'chunker_params', True) or args.same_chunker_params
|
||||
if not can_compare_chunk_ids:
|
||||
self.print_warning('--chunker-params might be different between archives, diff will be slow.\n'
|
||||
'If you know for certain that they are the same, pass --same-chunker-params '
|
||||
'to override this check.')
|
||||
|
||||
matcher, include_patterns = self.build_matcher(args.excludes, args.paths)
|
||||
|
||||
compare_archives(archive1, archive2, matcher)
|
||||
|
||||
for pattern in include_patterns:
|
||||
if pattern.match_count == 0:
|
||||
self.print_warning("Include pattern '%s' never matched.", pattern)
|
||||
return self.exit_code
|
||||
|
||||
def do_rename(self, args):
|
||||
"""Rename an existing archive"""
|
||||
repository = self.open_repository(args, exclusive=True)
|
||||
|
@ -649,7 +796,7 @@ def do_debug_put_obj(self, args):
|
|||
for path in args.paths:
|
||||
with open(path, "rb") as f:
|
||||
data = f.read()
|
||||
h = sha256(data) # XXX hardcoded
|
||||
h = hashlib.sha256(data) # XXX hardcoded
|
||||
repository.put(h.digest(), data)
|
||||
print("object %s put." % h.hexdigest())
|
||||
repository.commit()
|
||||
|
@ -1085,6 +1232,41 @@ def build_parser(self, args=None, prog=None):
|
|||
subparser.add_argument('paths', metavar='PATH', nargs='*', type=str,
|
||||
help='paths to extract; patterns are supported')
|
||||
|
||||
diff_epilog = textwrap.dedent("""
|
||||
This command finds differences in files (contents, user, group, mode) between archives.
|
||||
|
||||
Both archives need to be in the same repository, and a repository location may only
|
||||
be specified for ARCHIVE1.
|
||||
|
||||
See the output of the "borg help patterns" command for more help on exclude patterns.
|
||||
""")
|
||||
subparser = subparsers.add_parser('diff', parents=[common_parser],
|
||||
description=self.do_diff.__doc__,
|
||||
epilog=diff_epilog,
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
help='find differences in archive contents')
|
||||
subparser.set_defaults(func=self.do_diff)
|
||||
subparser.add_argument('-e', '--exclude', dest='excludes',
|
||||
type=parse_pattern, action='append',
|
||||
metavar="PATTERN", help='exclude paths matching PATTERN')
|
||||
subparser.add_argument('--exclude-from', dest='exclude_files',
|
||||
type=argparse.FileType('r'), action='append',
|
||||
metavar='EXCLUDEFILE', help='read exclude patterns from EXCLUDEFILE, one per line')
|
||||
subparser.add_argument('--numeric-owner', dest='numeric_owner',
|
||||
action='store_true', default=False,
|
||||
help='only obey numeric user and group identifiers')
|
||||
subparser.add_argument('--same-chunker-params', dest='same_chunker_params',
|
||||
action='store_true', default=False,
|
||||
help='Override check of chunker parameters.')
|
||||
subparser.add_argument('location', metavar='ARCHIVE1',
|
||||
type=location_validator(archive=True),
|
||||
help='archive')
|
||||
subparser.add_argument('archive2', metavar='ARCHIVE2',
|
||||
type=str,
|
||||
help='archive to compare with ARCHIVE1 (no repository location)')
|
||||
subparser.add_argument('paths', metavar='PATH', nargs='*', type=str,
|
||||
help='paths to compare; patterns are supported')
|
||||
|
||||
rename_epilog = textwrap.dedent("""
|
||||
This command renames an archive in the repository.
|
||||
""")
|
||||
|
|
|
@ -1143,6 +1143,43 @@ def test_debug_put_get_delete_obj(self):
|
|||
pass
|
||||
|
||||
|
||||
class DiffArchiverTestCase(ArchiverTestCaseBase):
|
||||
create_test_files = ArchiverTestCase.create_test_files
|
||||
create_regular_file = ArchiverTestCase.create_regular_file
|
||||
|
||||
def test_basic_functionality(self):
|
||||
self.create_test_files()
|
||||
self.cmd('init', self.repository_location)
|
||||
os.chmod('input/dir2', stat.S_IFDIR | 0o755)
|
||||
self.create_regular_file('file3', size=1024)
|
||||
self.cmd('create', self.repository_location + '::test0', 'input')
|
||||
# replace 'hardlink' with a file
|
||||
os.unlink('input/hardlink')
|
||||
self.create_regular_file('hardlink', size=1024 * 80)
|
||||
# replace directory with a file
|
||||
os.unlink('input/dir2/file2')
|
||||
os.rmdir('input/dir2')
|
||||
self.create_regular_file('dir2', size=1024 * 80)
|
||||
os.chmod('input/dir2', stat.S_IFREG | 0o755)
|
||||
self.create_regular_file('file3', size=1024, contents=b'0')
|
||||
self.cmd('create', self.repository_location + '::test1a', 'input')
|
||||
self.cmd('create', '--chunker-params', '16,18,17,4095', self.repository_location + '::test1b', 'input')
|
||||
|
||||
def do_asserts(output, archive):
|
||||
assert 'input/file3 different contents' in output
|
||||
assert 'input/hardlink different mode' in output
|
||||
assert ('input/hardlink different link\n'
|
||||
' test0 input/file1\n'
|
||||
' test%s <regular file>' % archive) in output
|
||||
assert ('input/dir2 different mode\n'
|
||||
' test0 drwxr-xr-x\n'
|
||||
' test%s -rwxr-xr-x\n' % archive) in output
|
||||
assert 'input/dir2/file2 different contents' in output
|
||||
do_asserts(self.cmd('diff', self.repository_location + '::test0', 'test1a'), '1a')
|
||||
# We expect exit_code=1 due to the chunker params warning
|
||||
do_asserts(self.cmd('diff', self.repository_location + '::test0', 'test1b', exit_code=1), '1b')
|
||||
|
||||
|
||||
def test_get_args():
|
||||
archiver = Archiver()
|
||||
# everything normal:
|
||||
|
@ -1162,3 +1199,34 @@ def test_get_args():
|
|||
args = archiver.get_args(['borg', 'serve', '--restrict-to-path=/p1', '--restrict-to-path=/p2', ],
|
||||
'borg init /')
|
||||
assert args.func == archiver.do_serve
|
||||
|
||||
|
||||
def test_compare_chunk_contents():
|
||||
def ccc(a, b):
|
||||
compare1 = Archiver.compare_chunk_contents(iter(a), iter(b))
|
||||
compare2 = Archiver.compare_chunk_contents(iter(b), iter(a))
|
||||
assert compare1 == compare2
|
||||
return compare1
|
||||
assert ccc([
|
||||
b'1234', b'567A', b'bC'
|
||||
], [
|
||||
b'1', b'23', b'4567A', b'b', b'C'
|
||||
])
|
||||
# one iterator exhausted before the other
|
||||
assert not ccc([
|
||||
b'12345',
|
||||
], [
|
||||
b'1234', b'56'
|
||||
])
|
||||
# content mismatch
|
||||
assert not ccc([
|
||||
b'1234', b'65'
|
||||
], [
|
||||
b'1234', b'56'
|
||||
])
|
||||
# first is the prefix of second
|
||||
assert not ccc([
|
||||
b'1234', b'56'
|
||||
], [
|
||||
b'1234', b'565'
|
||||
])
|
||||
|
|
Loading…
Reference in a new issue