Refactor the diff functionality

This factors out a lot of the logic in do_diff in archiver.py to Archive in
archive.py and a new class ItemDiff in item.pyx. The idea is to move methods
to the classes that are affected and to make it reusable, primarily for a new
option to fuse (#2475).
This commit is contained in:
Simon Frei 2017-07-19 14:29:14 +02:00
parent 37f75519cf
commit 9dc22d230f
4 changed files with 174 additions and 136 deletions

View File

@ -5,12 +5,13 @@ import socket
import stat
import sys
import time
from collections import OrderedDict
from contextlib import contextmanager
from datetime import datetime, timezone, timedelta
from functools import partial
from getpass import getuser
from io import BytesIO
from itertools import groupby
from itertools import groupby, zip_longest
from shutil import get_terminal_size
import msgpack
@ -40,7 +41,7 @@ from .helpers import bin_to_hex
from .helpers import safe_ns
from .helpers import ellipsis_truncate, ProgressIndicatorPercent, log_multi
from .patterns import PathPrefixPattern, FnmatchPattern, IECommand
from .item import Item, ArchiveItem
from .item import Item, ArchiveItem, ItemDiff
from .platform import acl_get, acl_set, set_flags, get_flags, swidth
from .remote import cache_if_remote
from .repository import Repository, LIST_SCAN_LIMIT
@ -819,34 +820,14 @@ Utilization of max. archive size: {csize_max:.0%}
# Was this EPERM due to the O_NOATIME flag? Try again without it:
return os.open(path, flags_normal)
def compare_archives(archive1, archive2, matcher):
@staticmethod
def compare_archives_iter(archive1, archive2, matcher=None, can_compare_chunk_ids=False):
"""
Yields tuples with a path and an ItemDiff instance describing changes/indicating equality.
def fetch_and_compare_chunks(chunk_ids1, chunk_ids2, archive1, archive2):
chunks1 = archive1.pipeline.fetch_many(chunk_ids1)
chunks2 = archive2.pipeline.fetch_many(chunk_ids2)
return self.compare_chunk_contents(chunks1, chunks2)
def sum_chunk_size(item, consider_ids=None):
if item.get('deleted'):
size = None
else:
if consider_ids is not None: # consider only specific chunks
size = sum(chunk.size for chunk in item.chunks if chunk.id in consider_ids)
else: # consider all chunks
size = item.get_size()
return size
def get_owner(item):
if args.numeric_owner:
return item.uid, item.gid
else:
return item.user, item.group
def get_mode(item):
if 'mode' in item:
return stat.filemode(item.mode)
else:
return [None]
:param matcher: PatternMatcher class to restrict results to only matching paths.
:param can_compare_chunk_ids: Whether --chunker-params are the same for both archives.
"""
def hardlink_master_seen(item):
return 'source' not in item or not hardlinkable(item.mode) or item.source in hardlink_masters
@ -861,93 +842,66 @@ Utilization of max. archive size: {csize_max:.0%}
def has_hardlink_master(item, hardlink_masters):
return hardlinkable(item.mode) and item.get('source') in hardlink_masters
def compare_items(output, path, item1, item2, hardlink_masters, deleted=False):
"""
Compare two items with identical paths.
:param deleted: Whether one of the items has been deleted
"""
changes = []
def compare_items(item1, item2):
if has_hardlink_master(item1, hardlink_masters):
item1 = hardlink_masters[item1.source][0]
if has_hardlink_master(item2, hardlink_masters):
item2 = hardlink_masters[item2.source][1]
return ItemDiff(item1, item2,
archive1.pipeline.fetch_many([c.id for c in item1.get('chunks', [])]),
archive2.pipeline.fetch_many([c.id for c in item2.get('chunks', [])]),
can_compare_chunk_ids=can_compare_chunk_ids)
if get_mode(item1)[0] == 'l' or get_mode(item2)[0] == 'l':
changes.append(compare_link(item1, item2))
if 'chunks' in item1 and 'chunks' in item2:
changes.append(compare_content(path, item1, item2))
if get_mode(item1)[0] == 'd' or get_mode(item2)[0] == 'd':
changes.append(compare_directory(item1, item2))
if not deleted:
changes.append(compare_owner(item1, item2))
changes.append(compare_mode(item1, item2))
changes = [x for x in changes if x]
if changes:
output_line = (remove_surrogates(path), ' '.join(changes))
if args.sort:
output.append(output_line)
else:
print_output(output_line)
def compare_or_defer(item1, item2):
def defer_if_necessary(item1, item2):
"""Adds item tuple to deferred if necessary and returns True, if items were deferred"""
update_hardlink_masters(item1, item2)
if not hardlink_master_seen(item1) or not hardlink_master_seen(item2):
defer = not hardlink_master_seen(item1) or not hardlink_master_seen(item2)
if defer:
deferred.append((item1, item2))
else:
compare_items(output, item1.path, item1, item2, hardlink_masters)
return defer
orphans_archive1 = collections.OrderedDict()
orphans_archive2 = collections.OrderedDict()
orphans_archive1 = OrderedDict()
orphans_archive2 = OrderedDict()
deferred = []
hardlink_masters = {}
output = []
for item1, item2 in zip_longest(
archive1.iter_items(lambda item: matcher.match(item.path)),
archive2.iter_items(lambda item: matcher.match(item.path)),
):
if item1 and item2 and item1.path == item2.path:
compare_or_defer(item1, item2)
if not defer_if_necessary(item1, item2):
yield (item1.path, compare_items(item1, item2))
continue
if item1:
matching_orphan = orphans_archive2.pop(item1.path, None)
if matching_orphan:
compare_or_defer(item1, matching_orphan)
if not defer_if_necessary(item1, matching_orphan):
yield (item1.path, compare_items(item1, matching_orphan))
else:
orphans_archive1[item1.path] = item1
if item2:
matching_orphan = orphans_archive1.pop(item2.path, None)
if matching_orphan:
compare_or_defer(matching_orphan, item2)
if not defer_if_necessary(matching_orphan, item2):
yield (matching_orphan.path, compare_items(matching_orphan, item2))
else:
orphans_archive2[item2.path] = item2
# At this point orphans_* contain items that had no matching partner in the other archive
deleted_item = Item(
deleted=True,
chunks=[],
mode=0,
)
for added in orphans_archive2.values():
path = added.path
deleted_item.path = path
deleted_item = Item.create_deleted(path)
update_hardlink_masters(deleted_item, added)
compare_items(output, path, deleted_item, added, hardlink_masters, deleted=True)
yield (path, compare_items(deleted_item, added))
for deleted in orphans_archive1.values():
path = deleted.path
deleted_item.path = path
deleted_item = Item.create_deleted(path)
update_hardlink_masters(deleted, deleted_item)
compare_items(output, path, deleted, deleted_item, hardlink_masters, deleted=True)
yield (path, compare_items(deleted, deleted_item))
for item1, item2 in deferred:
assert hardlink_master_seen(item1)
assert hardlink_master_seen(item2)
compare_items(output, item1.path, item1, item2, hardlink_masters)
yield (path, compare_items(item1, item2))
class MetadataCollector:

View File

@ -940,8 +940,8 @@ class Archiver:
def do_diff(self, args, repository, manifest, key, archive):
"""Diff contents of two archives"""
def print_output(line):
print("{:<19} {}".format(line[1], line[0]))
def print_output(diff, path):
print("{:<19} {}".format(diff, path))
archive1 = archive
archive2 = Archive(repository, key, manifest, args.archive2,
@ -956,10 +956,15 @@ class Archiver:
matcher = self.build_matcher(args.patterns, args.paths)
compare_archives(archive1, archive2, matcher)
diffs = Archive.compare_archives_iter(archive1, archive2, matcher, can_compare_chunk_ids=can_compare_chunk_ids)
# Conversion to string and filtering for diff.equal to save memory if sorting
diffs = ((path, str(diff)) for path, diff in diffs if not diff.equal)
for line in sorted(output):
print_output(line)
if args.sort:
diffs = sorted(diffs)
for path, diff in diffs:
print_output(diff, path)
for pattern in matcher.get_unmatched_include_patterns():
self.print_warning("Include pattern '%s' never matched.", pattern)

View File

@ -5,6 +5,7 @@ from .constants import ITEM_KEYS
from .helpers import safe_encode, safe_decode
from .helpers import bigint_to_int, int_to_bigint
from .helpers import StableDict
from .helpers import format_file_size
cdef extern from "_item.c":
object _object_to_optr(object obj)
@ -184,19 +185,22 @@ class Item(PropDict):
part = PropDict._make_property('part', int)
def get_size(self, hardlink_masters=None, memorize=False, compressed=False, from_chunks=False):
def get_size(self, hardlink_masters=None, memorize=False, compressed=False, from_chunks=False, consider_ids=None):
"""
Determine the (uncompressed or compressed) size of this item.
For hardlink slaves, the size is computed via the hardlink master's
chunk list, if available (otherwise size will be returned as 0).
If memorize is True, the computed size value will be stored into the item.
:param hardlink_masters: If given, the size of hardlink slaves is computed via the hardlink master's chunk list,
otherwise size will be returned as 0.
:param memorize: Whether the computed size value will be stored into the item.
:param compressed: Whether the compressed or uncompressed size will be returned.
:param from_chunks: If true, size is computed from chunks even if a precomputed value is available.
:param consider_ids: Returns the size of the given ids only.
"""
attr = 'csize' if compressed else 'size'
assert not (compressed and memorize), 'Item does not have a csize field.'
assert not (consider_ids is not None and memorize), "Can't store size when considering only certain ids"
try:
if from_chunks:
if from_chunks or consider_ids is not None:
raise AttributeError
size = getattr(self, attr)
except AttributeError:
@ -226,7 +230,10 @@ class Item(PropDict):
chunks, _ = hardlink_masters.get(master, (None, None))
if chunks is None:
return 0
size = sum(getattr(ChunkListEntry(*chunk), attr) for chunk in chunks)
if consider_ids is not None:
size = sum(getattr(ChunkListEntry(*chunk), attr) for chunk in chunks if chunk.id in consider_ids)
else:
size = sum(getattr(ChunkListEntry(*chunk), attr) for chunk in chunks)
# if requested, memorize the precomputed (c)size for items that have an own chunks list:
if memorize and having_chunks:
setattr(self, attr, size)
@ -251,6 +258,21 @@ class Item(PropDict):
def from_optr(self, optr):
return _optr_to_object(optr)
@classmethod
def create_deleted(cls, path):
return cls(deleted=True, chunks=[], mode=0, path=path)
def is_link(self):
return self._is_type(stat.S_ISLNK)
def is_dir(self):
return self._is_type(stat.S_ISDIR)
def _is_type(self, typetest):
try:
return typetest(self.mode)
except AttributeError:
return False
class EncryptedKey(PropDict):
@ -359,62 +381,119 @@ class ManifestItem(PropDict):
config = PropDict._make_property('config', dict)
item_keys = PropDict._make_property('item_keys', tuple)
def compare_link(item1, item2):
# These are the simple link cases. For special cases, e.g. if a
# regular file is replaced with a link or vice versa, it is
# indicated in compare_mode instead.
if item1.get('deleted'):
class ItemDiff:
"""
Comparison of two items from different archives.
The items may have different paths and still be considered equal (e.g. for renames).
It does not include extended or time attributes in the comparison.
"""
def __init__(self, item1, item2, chunk_iterator1, chunk_iterator2, numeric_owner=False, can_compare_chunk_ids=False):
self._item1 = item1
self._item2 = item2
self._numeric_owner = numeric_owner
self._can_compare_chunk_ids = can_compare_chunk_ids
self.equal = self._equal(chunk_iterator1, chunk_iterator2)
def __repr__(self):
if self.equal:
return 'equal'
changes = []
if self._item1.is_link() or self._item2.is_link():
changes.append(self._link_string())
if 'chunks' in self._item1 and 'chunks' in self._item2:
changes.append(self._content_string())
if self._item1.is_dir() or self._item2.is_dir():
changes.append(self._dir_string())
if not (self._item1.get('deleted') or self._item2.get('deleted')):
changes.append(self._owner_string())
changes.append(self._mode_string())
return ' '.join((x for x in changes if x))
def _equal(self, chunk_iterator1, chunk_iterator2):
# if both are deleted, there is nothing at path regardless of what was deleted
if self._item1.get('deleted') and self._item2.get('deleted'):
return True
attr_list = ['deleted', 'mode', 'source']
attr_list += ['uid', 'gid'] if self._numeric_owner else ['user', 'group']
for attr in attr_list:
if self._item1.get(attr) != self._item2.get(attr):
return False
if 'mode' in self._item1: # mode of item1 and item2 is equal
if (self._item1.is_link() and 'source' in self._item1 and 'source' in self._item2
and self._item1.source != self._item2.source):
return False
if 'chunks' in self._item1 and 'chunks' in self._item2:
return self._content_equal(chunk_iterator1, chunk_iterator2)
return True
def _link_string(self):
if self._item1.get('deleted'):
return 'added link'
if item2.get('deleted'):
if self._item2.get('deleted'):
return 'removed link'
if 'source' in item1 and 'source' in item2 and item1.source != item2.source:
if 'source' in self._item1 and 'source' in self._item2 and self._item1.source != self._item2.source:
return 'changed link'
def compare_content(path, item1, item2):
if contents_changed(item1, item2):
if item1.get('deleted'):
return 'added {:>13}'.format(format_file_size(sum_chunk_size(item2)))
if item2.get('deleted'):
return 'removed {:>11}'.format(format_file_size(sum_chunk_size(item1)))
if not can_compare_chunk_ids:
def _content_string(self):
if self._item1.get('deleted'):
return ('added {:>13}'.format(format_file_size(self._item2.get_size())))
if self._item2.get('deleted'):
return ('removed {:>11}'.format(format_file_size(self._item1.get_size())))
if not self._can_compare_chunk_ids:
return 'modified'
chunk_ids1 = {c.id for c in item1.chunks}
chunk_ids2 = {c.id for c in item2.chunks}
chunk_ids1 = {c.id for c in self._item1.chunks}
chunk_ids2 = {c.id for c in self._item2.chunks}
added_ids = chunk_ids2 - chunk_ids1
removed_ids = chunk_ids1 - chunk_ids2
added = sum_chunk_size(item2, added_ids)
removed = sum_chunk_size(item1, removed_ids)
return '{:>9} {:>9}'.format(format_file_size(added, precision=1, sign=True),
format_file_size(-removed, precision=1, sign=True))
added = self._item2.get_size(consider_ids=added_ids)
removed = self._item1.get_size(consider_ids=removed_ids)
return ('{:>9} {:>9}'.format(format_file_size(added, precision=1, sign=True),
format_file_size(-removed, precision=1, sign=True)))
def compare_directory(item1, item2):
if item2.get('deleted') and not item1.get('deleted'):
def _dir_string(self):
if self._item2.get('deleted') and not self._item1.get('deleted'):
return 'removed directory'
if item1.get('deleted') and not item2.get('deleted'):
if self._item1.get('deleted') and not self._item2.get('deleted'):
return 'added directory'
def compare_owner(item1, item2):
user1, group1 = get_owner(item1)
user2, group2 = get_owner(item2)
if user1 != user2 or group1 != group2:
return '[{}:{} -> {}:{}]'.format(user1, group1, user2, group2)
def _owner_string(self):
u_attr, g_attr = ('uid', 'gid') if self._numeric_owner else ('user', 'group')
u1, g1 = self._item1.get(u_attr), self._item1.get(g_attr)
u2, g2 = self._item2.get(u_attr), self._item2.get(g_attr)
if (u1, g1) != (u2, g2):
return '[{}:{} -> {}:{}]'.format(u1, g1, u2, g2)
def compare_mode(item1, item2):
if item1.mode != item2.mode:
return '[{} -> {}]'.format(get_mode(item1), get_mode(item2))
def _mode_string(self):
if 'mode' in self._item1 and 'mode' in self._item2 and self._item1.mode != self._item2.mode:
return '[{} -> {}]'.format(stat.filemode(self._item1.mode), stat.filemode(self._item2.mode))
def contents_changed(item1, item2):
if can_compare_chunk_ids:
return item1.chunks != item2.chunks
if sum_chunk_size(item1) != sum_chunk_size(item2):
return True
chunk_ids1 = [c.id for c in item1.chunks]
chunk_ids2 = [c.id for c in item2.chunks]
return not fetch_and_compare_chunks(chunk_ids1, chunk_ids2, archive1, archive2)
def _content_equal(self, chunk_iterator1, chunk_iterator2):
if self._can_compare_chunk_ids:
return self._item1.chunks == self._item2.chunks
if self._item1.get_size() != self._item2.get_size():
return False
return ItemDiff._chunk_content_equal(chunk_iterator1, chunk_iterator2)
@staticmethod
def compare_chunk_contents(chunks1, chunks2):
"""Compare two chunk iterators (like returned by :meth:`.DownloadPipeline.fetch_many`)"""
def _chunk_content_equal(chunks1, chunks2):
"""
Compare chunk content and return True if they are identical.
The chunks must be given as chunk iterators (like returned by :meth:`.DownloadPipeline.fetch_many`).
"""
end = object()
alen = ai = 0
blen = bi = 0

View File

@ -47,7 +47,7 @@ from ..helpers import bin_to_hex
from ..helpers import MAX_S
from ..nanorst import RstToTextLazy, rst_to_terminal
from ..patterns import IECommand, PatternMatcher, parse_pattern
from ..item import Item
from ..item import Item, ItemDiff
from ..logger import setup_logging
from ..remote import RemoteRepository, PathNotAllowed
from ..repository import Repository
@ -3402,12 +3402,12 @@ def test_get_args():
assert args.func == archiver.do_serve
def test_compare_chunk_contents():
def test_chunk_content_equal():
def ccc(a, b):
chunks_a = [data for data in a]
chunks_b = [data for data in b]
compare1 = Archiver.compare_chunk_contents(iter(chunks_a), iter(chunks_b))
compare2 = Archiver.compare_chunk_contents(iter(chunks_b), iter(chunks_a))
compare1 = ItemDiff._chunk_content_equal(iter(chunks_a), iter(chunks_b))
compare2 = ItemDiff._chunk_content_equal(iter(chunks_b), iter(chunks_a))
assert compare1 == compare2
return compare1
assert ccc([