Merge pull request #2842 from imsodin/diffmount-diff_refactor

Refactor diff functionality
This commit is contained in:
enkore 2017-08-21 13:49:50 +02:00 committed by GitHub
commit 5fb2b4b0a6
4 changed files with 267 additions and 229 deletions

View File

@ -5,12 +5,13 @@ import socket
import stat
import sys
import time
from collections import OrderedDict
from contextlib import contextmanager
from datetime import datetime, timezone, timedelta
from functools import partial
from getpass import getuser
from io import BytesIO
from itertools import groupby
from itertools import groupby, zip_longest
from shutil import get_terminal_size
import msgpack
@ -40,7 +41,7 @@ from .helpers import bin_to_hex
from .helpers import safe_ns
from .helpers import ellipsis_truncate, ProgressIndicatorPercent, log_multi
from .patterns import PathPrefixPattern, FnmatchPattern, IECommand
from .item import Item, ArchiveItem
from .item import Item, ArchiveItem, ItemDiff
from .platform import acl_get, acl_set, set_flags, get_flags, swidth
from .remote import cache_if_remote
from .repository import Repository, LIST_SCAN_LIMIT
@ -819,6 +820,89 @@ Utilization of max. archive size: {csize_max:.0%}
# Was this EPERM due to the O_NOATIME flag? Try again without it:
return os.open(path, flags_normal)
@staticmethod
def compare_archives_iter(archive1, archive2, matcher=None, can_compare_chunk_ids=False):
"""
Yields tuples with a path and an ItemDiff instance describing changes/indicating equality.
:param matcher: PatternMatcher class to restrict results to only matching paths.
:param can_compare_chunk_ids: Whether --chunker-params are the same for both archives.
"""
def hardlink_master_seen(item):
return 'source' not in item or not hardlinkable(item.mode) or item.source in hardlink_masters
def is_hardlink_master(item):
return item.get('hardlink_master', True) and 'source' not in item
def update_hardlink_masters(item1, item2):
if is_hardlink_master(item1) or is_hardlink_master(item2):
hardlink_masters[item1.path] = (item1, item2)
def has_hardlink_master(item, hardlink_masters):
return hardlinkable(item.mode) and item.get('source') in hardlink_masters
def compare_items(item1, item2):
if has_hardlink_master(item1, hardlink_masters):
item1 = hardlink_masters[item1.source][0]
if has_hardlink_master(item2, hardlink_masters):
item2 = hardlink_masters[item2.source][1]
return ItemDiff(item1, item2,
archive1.pipeline.fetch_many([c.id for c in item1.get('chunks', [])]),
archive2.pipeline.fetch_many([c.id for c in item2.get('chunks', [])]),
can_compare_chunk_ids=can_compare_chunk_ids)
def defer_if_necessary(item1, item2):
"""Adds item tuple to deferred if necessary and returns True, if items were deferred"""
update_hardlink_masters(item1, item2)
defer = not hardlink_master_seen(item1) or not hardlink_master_seen(item2)
if defer:
deferred.append((item1, item2))
return defer
orphans_archive1 = OrderedDict()
orphans_archive2 = OrderedDict()
deferred = []
hardlink_masters = {}
for item1, item2 in zip_longest(
archive1.iter_items(lambda item: matcher.match(item.path)),
archive2.iter_items(lambda item: matcher.match(item.path)),
):
if item1 and item2 and item1.path == item2.path:
if not defer_if_necessary(item1, item2):
yield (item1.path, compare_items(item1, item2))
continue
if item1:
matching_orphan = orphans_archive2.pop(item1.path, None)
if matching_orphan:
if not defer_if_necessary(item1, matching_orphan):
yield (item1.path, compare_items(item1, matching_orphan))
else:
orphans_archive1[item1.path] = item1
if item2:
matching_orphan = orphans_archive1.pop(item2.path, None)
if matching_orphan:
if not defer_if_necessary(matching_orphan, item2):
yield (matching_orphan.path, compare_items(matching_orphan, item2))
else:
orphans_archive2[item2.path] = item2
# At this point orphans_* contain items that had no matching partner in the other archive
for added in orphans_archive2.values():
path = added.path
deleted_item = Item.create_deleted(path)
update_hardlink_masters(deleted_item, added)
yield (path, compare_items(deleted_item, added))
for deleted in orphans_archive1.values():
path = deleted.path
deleted_item = Item.create_deleted(path)
update_hardlink_masters(deleted, deleted_item)
yield (path, compare_items(deleted, deleted_item))
for item1, item2 in deferred:
assert hardlink_master_seen(item1)
assert hardlink_master_seen(item2)
yield (path, compare_items(item1, item2))
class MetadataCollector:
def __init__(self, *, noatime, noctime, numeric_owner):

View File

@ -195,33 +195,6 @@ class Archiver:
else:
logging.getLogger('borg.output.list').info("%1s %s", status, remove_surrogates(path))
@staticmethod
def compare_chunk_contents(chunks1, chunks2):
"""Compare two chunk iterators (like returned by :meth:`.DownloadPipeline.fetch_many`)"""
end = object()
alen = ai = 0
blen = bi = 0
while True:
if not alen - ai:
a = next(chunks1, end)
if a is end:
return not blen - bi and next(chunks2, end) is end
a = memoryview(a)
alen = len(a)
ai = 0
if not blen - bi:
b = next(chunks2, end)
if b is end:
return not alen - ai and next(chunks1, end) is end
b = memoryview(b)
blen = len(b)
bi = 0
slicelen = min(alen - ai, blen - bi)
if a[ai:ai + slicelen] != b[bi:bi + slicelen]:
return False
ai += slicelen
bi += slicelen
@staticmethod
def build_matcher(inclexcl_patterns, include_paths):
matcher = PatternMatcher()
@ -967,195 +940,9 @@ class Archiver:
@with_archive
def do_diff(self, args, repository, manifest, key, archive):
"""Diff contents of two archives"""
def fetch_and_compare_chunks(chunk_ids1, chunk_ids2, archive1, archive2):
chunks1 = archive1.pipeline.fetch_many(chunk_ids1)
chunks2 = archive2.pipeline.fetch_many(chunk_ids2)
return self.compare_chunk_contents(chunks1, chunks2)
def sum_chunk_size(item, consider_ids=None):
if item.get('deleted'):
size = None
else:
if consider_ids is not None: # consider only specific chunks
size = sum(chunk.size for chunk in item.chunks if chunk.id in consider_ids)
else: # consider all chunks
size = item.get_size()
return size
def get_owner(item):
if args.numeric_owner:
return item.uid, item.gid
else:
return item.user, item.group
def get_mode(item):
if 'mode' in item:
return stat.filemode(item.mode)
else:
return [None]
def has_hardlink_master(item, hardlink_masters):
return hardlinkable(item.mode) and item.get('source') in hardlink_masters
def compare_link(item1, item2):
# These are the simple link cases. For special cases, e.g. if a
# regular file is replaced with a link or vice versa, it is
# indicated in compare_mode instead.
if item1.get('deleted'):
return 'added link'
elif item2.get('deleted'):
return 'removed link'
elif 'source' in item1 and 'source' in item2 and item1.source != item2.source:
return 'changed link'
def contents_changed(item1, item2):
if can_compare_chunk_ids:
return item1.chunks != item2.chunks
else:
if sum_chunk_size(item1) != sum_chunk_size(item2):
return True
else:
chunk_ids1 = [c.id for c in item1.chunks]
chunk_ids2 = [c.id for c in item2.chunks]
return not fetch_and_compare_chunks(chunk_ids1, chunk_ids2, archive1, archive2)
def compare_content(path, item1, item2):
if contents_changed(item1, item2):
if item1.get('deleted'):
return 'added {:>13}'.format(format_file_size(sum_chunk_size(item2)))
if item2.get('deleted'):
return 'removed {:>11}'.format(format_file_size(sum_chunk_size(item1)))
if not can_compare_chunk_ids:
return 'modified'
chunk_ids1 = {c.id for c in item1.chunks}
chunk_ids2 = {c.id for c in item2.chunks}
added_ids = chunk_ids2 - chunk_ids1
removed_ids = chunk_ids1 - chunk_ids2
added = sum_chunk_size(item2, added_ids)
removed = sum_chunk_size(item1, removed_ids)
return '{:>9} {:>9}'.format(format_file_size(added, precision=1, sign=True),
format_file_size(-removed, precision=1, sign=True))
def compare_directory(item1, item2):
if item2.get('deleted') and not item1.get('deleted'):
return 'removed directory'
elif item1.get('deleted') and not item2.get('deleted'):
return 'added directory'
def compare_owner(item1, item2):
user1, group1 = get_owner(item1)
user2, group2 = get_owner(item2)
if user1 != user2 or group1 != group2:
return '[{}:{} -> {}:{}]'.format(user1, group1, user2, group2)
def compare_mode(item1, item2):
if item1.mode != item2.mode:
return '[{} -> {}]'.format(get_mode(item1), get_mode(item2))
def compare_items(output, path, item1, item2, hardlink_masters, deleted=False):
"""
Compare two items with identical paths.
:param deleted: Whether one of the items has been deleted
"""
changes = []
if has_hardlink_master(item1, hardlink_masters):
item1 = hardlink_masters[item1.source][0]
if has_hardlink_master(item2, hardlink_masters):
item2 = hardlink_masters[item2.source][1]
if get_mode(item1)[0] == 'l' or get_mode(item2)[0] == 'l':
changes.append(compare_link(item1, item2))
if 'chunks' in item1 and 'chunks' in item2:
changes.append(compare_content(path, item1, item2))
if get_mode(item1)[0] == 'd' or get_mode(item2)[0] == 'd':
changes.append(compare_directory(item1, item2))
if not deleted:
changes.append(compare_owner(item1, item2))
changes.append(compare_mode(item1, item2))
changes = [x for x in changes if x]
if changes:
output_line = (remove_surrogates(path), ' '.join(changes))
if args.sort:
output.append(output_line)
else:
print_output(output_line)
def print_output(line):
print("{:<19} {}".format(line[1], line[0]))
def compare_archives(archive1, archive2, matcher):
def hardlink_master_seen(item):
return 'source' not in item or not hardlinkable(item.mode) or item.source in hardlink_masters
def is_hardlink_master(item):
return item.get('hardlink_master', True) and 'source' not in item
def update_hardlink_masters(item1, item2):
if is_hardlink_master(item1) or is_hardlink_master(item2):
hardlink_masters[item1.path] = (item1, item2)
def compare_or_defer(item1, item2):
update_hardlink_masters(item1, item2)
if not hardlink_master_seen(item1) or not hardlink_master_seen(item2):
deferred.append((item1, item2))
else:
compare_items(output, item1.path, item1, item2, hardlink_masters)
orphans_archive1 = collections.OrderedDict()
orphans_archive2 = collections.OrderedDict()
deferred = []
hardlink_masters = {}
output = []
for item1, item2 in zip_longest(
archive1.iter_items(lambda item: matcher.match(item.path)),
archive2.iter_items(lambda item: matcher.match(item.path)),
):
if item1 and item2 and item1.path == item2.path:
compare_or_defer(item1, item2)
continue
if item1:
matching_orphan = orphans_archive2.pop(item1.path, None)
if matching_orphan:
compare_or_defer(item1, matching_orphan)
else:
orphans_archive1[item1.path] = item1
if item2:
matching_orphan = orphans_archive1.pop(item2.path, None)
if matching_orphan:
compare_or_defer(matching_orphan, item2)
else:
orphans_archive2[item2.path] = item2
# At this point orphans_* contain items that had no matching partner in the other archive
deleted_item = Item(
deleted=True,
chunks=[],
mode=0,
)
for added in orphans_archive2.values():
path = added.path
deleted_item.path = path
update_hardlink_masters(deleted_item, added)
compare_items(output, path, deleted_item, added, hardlink_masters, deleted=True)
for deleted in orphans_archive1.values():
path = deleted.path
deleted_item.path = path
update_hardlink_masters(deleted, deleted_item)
compare_items(output, path, deleted, deleted_item, hardlink_masters, deleted=True)
for item1, item2 in deferred:
assert hardlink_master_seen(item1)
assert hardlink_master_seen(item2)
compare_items(output, item1.path, item1, item2, hardlink_masters)
for line in sorted(output):
print_output(line)
def print_output(diff, path):
print("{:<19} {}".format(diff, path))
archive1 = archive
archive2 = Archive(repository, key, manifest, args.archive2,
@ -1170,7 +957,15 @@ class Archiver:
matcher = self.build_matcher(args.patterns, args.paths)
compare_archives(archive1, archive2, matcher)
diffs = Archive.compare_archives_iter(archive1, archive2, matcher, can_compare_chunk_ids=can_compare_chunk_ids)
# Conversion to string and filtering for diff.equal to save memory if sorting
diffs = ((path, str(diff)) for path, diff in diffs if not diff.equal)
if args.sort:
diffs = sorted(diffs)
for path, diff in diffs:
print_output(diff, path)
for pattern in matcher.get_unmatched_include_patterns():
self.print_warning("Include pattern '%s' never matched.", pattern)

View File

@ -5,6 +5,7 @@ from .constants import ITEM_KEYS
from .helpers import safe_encode, safe_decode
from .helpers import bigint_to_int, int_to_bigint
from .helpers import StableDict
from .helpers import format_file_size
cdef extern from "_item.c":
object _object_to_optr(object obj)
@ -184,19 +185,22 @@ class Item(PropDict):
part = PropDict._make_property('part', int)
def get_size(self, hardlink_masters=None, memorize=False, compressed=False, from_chunks=False):
def get_size(self, hardlink_masters=None, memorize=False, compressed=False, from_chunks=False, consider_ids=None):
"""
Determine the (uncompressed or compressed) size of this item.
For hardlink slaves, the size is computed via the hardlink master's
chunk list, if available (otherwise size will be returned as 0).
If memorize is True, the computed size value will be stored into the item.
:param hardlink_masters: If given, the size of hardlink slaves is computed via the hardlink master's chunk list,
otherwise size will be returned as 0.
:param memorize: Whether the computed size value will be stored into the item.
:param compressed: Whether the compressed or uncompressed size will be returned.
:param from_chunks: If true, size is computed from chunks even if a precomputed value is available.
:param consider_ids: Returns the size of the given ids only.
"""
attr = 'csize' if compressed else 'size'
assert not (compressed and memorize), 'Item does not have a csize field.'
assert not (consider_ids is not None and memorize), "Can't store size when considering only certain ids"
try:
if from_chunks:
if from_chunks or consider_ids is not None:
raise AttributeError
size = getattr(self, attr)
except AttributeError:
@ -226,7 +230,10 @@ class Item(PropDict):
chunks, _ = hardlink_masters.get(master, (None, None))
if chunks is None:
return 0
size = sum(getattr(ChunkListEntry(*chunk), attr) for chunk in chunks)
if consider_ids is not None:
size = sum(getattr(ChunkListEntry(*chunk), attr) for chunk in chunks if chunk.id in consider_ids)
else:
size = sum(getattr(ChunkListEntry(*chunk), attr) for chunk in chunks)
# if requested, memorize the precomputed (c)size for items that have an own chunks list:
if memorize and having_chunks:
setattr(self, attr, size)
@ -251,6 +258,21 @@ class Item(PropDict):
def from_optr(self, optr):
return _optr_to_object(optr)
@classmethod
def create_deleted(cls, path):
return cls(deleted=True, chunks=[], mode=0, path=path)
def is_link(self):
return self._is_type(stat.S_ISLNK)
def is_dir(self):
return self._is_type(stat.S_ISDIR)
def _is_type(self, typetest):
try:
return typetest(self.mode)
except AttributeError:
return False
class EncryptedKey(PropDict):
@ -358,3 +380,140 @@ class ManifestItem(PropDict):
timestamp = PropDict._make_property('timestamp', str, 'surrogate-escaped str', encode=safe_encode, decode=safe_decode)
config = PropDict._make_property('config', dict)
item_keys = PropDict._make_property('item_keys', tuple)
class ItemDiff:
"""
Comparison of two items from different archives.
The items may have different paths and still be considered equal (e.g. for renames).
It does not include extended or time attributes in the comparison.
"""
def __init__(self, item1, item2, chunk_iterator1, chunk_iterator2, numeric_owner=False, can_compare_chunk_ids=False):
self._item1 = item1
self._item2 = item2
self._numeric_owner = numeric_owner
self._can_compare_chunk_ids = can_compare_chunk_ids
self.equal = self._equal(chunk_iterator1, chunk_iterator2)
def __repr__(self):
if self.equal:
return 'equal'
changes = []
if self._item1.is_link() or self._item2.is_link():
changes.append(self._link_string())
if 'chunks' in self._item1 and 'chunks' in self._item2:
changes.append(self._content_string())
if self._item1.is_dir() or self._item2.is_dir():
changes.append(self._dir_string())
if not (self._item1.get('deleted') or self._item2.get('deleted')):
changes.append(self._owner_string())
changes.append(self._mode_string())
return ' '.join((x for x in changes if x))
def _equal(self, chunk_iterator1, chunk_iterator2):
# if both are deleted, there is nothing at path regardless of what was deleted
if self._item1.get('deleted') and self._item2.get('deleted'):
return True
attr_list = ['deleted', 'mode', 'source']
attr_list += ['uid', 'gid'] if self._numeric_owner else ['user', 'group']
for attr in attr_list:
if self._item1.get(attr) != self._item2.get(attr):
return False
if 'mode' in self._item1: # mode of item1 and item2 is equal
if (self._item1.is_link() and 'source' in self._item1 and 'source' in self._item2
and self._item1.source != self._item2.source):
return False
if 'chunks' in self._item1 and 'chunks' in self._item2:
return self._content_equal(chunk_iterator1, chunk_iterator2)
return True
def _link_string(self):
if self._item1.get('deleted'):
return 'added link'
if self._item2.get('deleted'):
return 'removed link'
if 'source' in self._item1 and 'source' in self._item2 and self._item1.source != self._item2.source:
return 'changed link'
def _content_string(self):
if self._item1.get('deleted'):
return ('added {:>13}'.format(format_file_size(self._item2.get_size())))
if self._item2.get('deleted'):
return ('removed {:>11}'.format(format_file_size(self._item1.get_size())))
if not self._can_compare_chunk_ids:
return 'modified'
chunk_ids1 = {c.id for c in self._item1.chunks}
chunk_ids2 = {c.id for c in self._item2.chunks}
added_ids = chunk_ids2 - chunk_ids1
removed_ids = chunk_ids1 - chunk_ids2
added = self._item2.get_size(consider_ids=added_ids)
removed = self._item1.get_size(consider_ids=removed_ids)
return ('{:>9} {:>9}'.format(format_file_size(added, precision=1, sign=True),
format_file_size(-removed, precision=1, sign=True)))
def _dir_string(self):
if self._item2.get('deleted') and not self._item1.get('deleted'):
return 'removed directory'
if self._item1.get('deleted') and not self._item2.get('deleted'):
return 'added directory'
def _owner_string(self):
u_attr, g_attr = ('uid', 'gid') if self._numeric_owner else ('user', 'group')
u1, g1 = self._item1.get(u_attr), self._item1.get(g_attr)
u2, g2 = self._item2.get(u_attr), self._item2.get(g_attr)
if (u1, g1) != (u2, g2):
return '[{}:{} -> {}:{}]'.format(u1, g1, u2, g2)
def _mode_string(self):
if 'mode' in self._item1 and 'mode' in self._item2 and self._item1.mode != self._item2.mode:
return '[{} -> {}]'.format(stat.filemode(self._item1.mode), stat.filemode(self._item2.mode))
def _content_equal(self, chunk_iterator1, chunk_iterator2):
if self._can_compare_chunk_ids:
return self._item1.chunks == self._item2.chunks
if self._item1.get_size() != self._item2.get_size():
return False
return ItemDiff._chunk_content_equal(chunk_iterator1, chunk_iterator2)
@staticmethod
def _chunk_content_equal(chunks1, chunks2):
"""
Compare chunk content and return True if they are identical.
The chunks must be given as chunk iterators (like returned by :meth:`.DownloadPipeline.fetch_many`).
"""
end = object()
alen = ai = 0
blen = bi = 0
while True:
if not alen - ai:
a = next(chunks1, end)
if a is end:
return not blen - bi and next(chunks2, end) is end
a = memoryview(a)
alen = len(a)
ai = 0
if not blen - bi:
b = next(chunks2, end)
if b is end:
return not alen - ai and next(chunks1, end) is end
b = memoryview(b)
blen = len(b)
bi = 0
slicelen = min(alen - ai, blen - bi)
if a[ai:ai + slicelen] != b[bi:bi + slicelen]:
return False
ai += slicelen
bi += slicelen

View File

@ -47,7 +47,7 @@ from ..helpers import bin_to_hex
from ..helpers import MAX_S
from ..nanorst import RstToTextLazy, rst_to_terminal
from ..patterns import IECommand, PatternMatcher, parse_pattern
from ..item import Item
from ..item import Item, ItemDiff
from ..logger import setup_logging
from ..remote import RemoteRepository, PathNotAllowed
from ..repository import Repository
@ -3430,12 +3430,12 @@ def test_get_args():
assert args.func == archiver.do_serve
def test_compare_chunk_contents():
def test_chunk_content_equal():
def ccc(a, b):
chunks_a = [data for data in a]
chunks_b = [data for data in b]
compare1 = Archiver.compare_chunk_contents(iter(chunks_a), iter(chunks_b))
compare2 = Archiver.compare_chunk_contents(iter(chunks_b), iter(chunks_a))
compare1 = ItemDiff._chunk_content_equal(iter(chunks_a), iter(chunks_b))
compare2 = ItemDiff._chunk_content_equal(iter(chunks_b), iter(chunks_a))
assert compare1 == compare2
return compare1
assert ccc([