Refactor the diff functionality

This factors much of the logic out of do_diff in archiver.py, moving it to
Archive in archive.py and to a new class ItemDiff in item.pyx. The idea is to
move methods onto the classes they operate on and to make the diff logic
reusable, primarily for a new option to fuse (#2475).
This commit is contained in:
Simon Frei 2017-07-19 14:29:14 +02:00
parent 37f75519cf
commit 9dc22d230f
4 changed files with 174 additions and 136 deletions

View File

@ -5,12 +5,13 @@ import socket
import stat import stat
import sys import sys
import time import time
from collections import OrderedDict
from contextlib import contextmanager from contextlib import contextmanager
from datetime import datetime, timezone, timedelta from datetime import datetime, timezone, timedelta
from functools import partial from functools import partial
from getpass import getuser from getpass import getuser
from io import BytesIO from io import BytesIO
from itertools import groupby from itertools import groupby, zip_longest
from shutil import get_terminal_size from shutil import get_terminal_size
import msgpack import msgpack
@ -40,7 +41,7 @@ from .helpers import bin_to_hex
from .helpers import safe_ns from .helpers import safe_ns
from .helpers import ellipsis_truncate, ProgressIndicatorPercent, log_multi from .helpers import ellipsis_truncate, ProgressIndicatorPercent, log_multi
from .patterns import PathPrefixPattern, FnmatchPattern, IECommand from .patterns import PathPrefixPattern, FnmatchPattern, IECommand
from .item import Item, ArchiveItem from .item import Item, ArchiveItem, ItemDiff
from .platform import acl_get, acl_set, set_flags, get_flags, swidth from .platform import acl_get, acl_set, set_flags, get_flags, swidth
from .remote import cache_if_remote from .remote import cache_if_remote
from .repository import Repository, LIST_SCAN_LIMIT from .repository import Repository, LIST_SCAN_LIMIT
@ -819,34 +820,14 @@ Utilization of max. archive size: {csize_max:.0%}
# Was this EPERM due to the O_NOATIME flag? Try again without it: # Was this EPERM due to the O_NOATIME flag? Try again without it:
return os.open(path, flags_normal) return os.open(path, flags_normal)
def compare_archives(archive1, archive2, matcher): @staticmethod
def compare_archives_iter(archive1, archive2, matcher=None, can_compare_chunk_ids=False):
"""
Yields tuples with a path and an ItemDiff instance describing changes/indicating equality.
def fetch_and_compare_chunks(chunk_ids1, chunk_ids2, archive1, archive2): :param matcher: PatternMatcher class to restrict results to only matching paths.
chunks1 = archive1.pipeline.fetch_many(chunk_ids1) :param can_compare_chunk_ids: Whether --chunker-params are the same for both archives.
chunks2 = archive2.pipeline.fetch_many(chunk_ids2) """
return self.compare_chunk_contents(chunks1, chunks2)
def sum_chunk_size(item, consider_ids=None):
if item.get('deleted'):
size = None
else:
if consider_ids is not None: # consider only specific chunks
size = sum(chunk.size for chunk in item.chunks if chunk.id in consider_ids)
else: # consider all chunks
size = item.get_size()
return size
def get_owner(item):
if args.numeric_owner:
return item.uid, item.gid
else:
return item.user, item.group
def get_mode(item):
if 'mode' in item:
return stat.filemode(item.mode)
else:
return [None]
def hardlink_master_seen(item): def hardlink_master_seen(item):
return 'source' not in item or not hardlinkable(item.mode) or item.source in hardlink_masters return 'source' not in item or not hardlinkable(item.mode) or item.source in hardlink_masters
@ -861,93 +842,66 @@ Utilization of max. archive size: {csize_max:.0%}
def has_hardlink_master(item, hardlink_masters): def has_hardlink_master(item, hardlink_masters):
return hardlinkable(item.mode) and item.get('source') in hardlink_masters return hardlinkable(item.mode) and item.get('source') in hardlink_masters
def compare_items(output, path, item1, item2, hardlink_masters, deleted=False): def compare_items(item1, item2):
"""
Compare two items with identical paths.
:param deleted: Whether one of the items has been deleted
"""
changes = []
if has_hardlink_master(item1, hardlink_masters): if has_hardlink_master(item1, hardlink_masters):
item1 = hardlink_masters[item1.source][0] item1 = hardlink_masters[item1.source][0]
if has_hardlink_master(item2, hardlink_masters): if has_hardlink_master(item2, hardlink_masters):
item2 = hardlink_masters[item2.source][1] item2 = hardlink_masters[item2.source][1]
return ItemDiff(item1, item2,
archive1.pipeline.fetch_many([c.id for c in item1.get('chunks', [])]),
archive2.pipeline.fetch_many([c.id for c in item2.get('chunks', [])]),
can_compare_chunk_ids=can_compare_chunk_ids)
if get_mode(item1)[0] == 'l' or get_mode(item2)[0] == 'l': def defer_if_necessary(item1, item2):
changes.append(compare_link(item1, item2)) """Adds item tuple to deferred if necessary and returns True, if items were deferred"""
if 'chunks' in item1 and 'chunks' in item2:
changes.append(compare_content(path, item1, item2))
if get_mode(item1)[0] == 'd' or get_mode(item2)[0] == 'd':
changes.append(compare_directory(item1, item2))
if not deleted:
changes.append(compare_owner(item1, item2))
changes.append(compare_mode(item1, item2))
changes = [x for x in changes if x]
if changes:
output_line = (remove_surrogates(path), ' '.join(changes))
if args.sort:
output.append(output_line)
else:
print_output(output_line)
def compare_or_defer(item1, item2):
update_hardlink_masters(item1, item2) update_hardlink_masters(item1, item2)
if not hardlink_master_seen(item1) or not hardlink_master_seen(item2): defer = not hardlink_master_seen(item1) or not hardlink_master_seen(item2)
if defer:
deferred.append((item1, item2)) deferred.append((item1, item2))
else: return defer
compare_items(output, item1.path, item1, item2, hardlink_masters)
orphans_archive1 = collections.OrderedDict() orphans_archive1 = OrderedDict()
orphans_archive2 = collections.OrderedDict() orphans_archive2 = OrderedDict()
deferred = [] deferred = []
hardlink_masters = {} hardlink_masters = {}
output = []
for item1, item2 in zip_longest( for item1, item2 in zip_longest(
archive1.iter_items(lambda item: matcher.match(item.path)), archive1.iter_items(lambda item: matcher.match(item.path)),
archive2.iter_items(lambda item: matcher.match(item.path)), archive2.iter_items(lambda item: matcher.match(item.path)),
): ):
if item1 and item2 and item1.path == item2.path: if item1 and item2 and item1.path == item2.path:
compare_or_defer(item1, item2) if not defer_if_necessary(item1, item2):
yield (item1.path, compare_items(item1, item2))
continue continue
if item1: if item1:
matching_orphan = orphans_archive2.pop(item1.path, None) matching_orphan = orphans_archive2.pop(item1.path, None)
if matching_orphan: if matching_orphan:
compare_or_defer(item1, matching_orphan) if not defer_if_necessary(item1, matching_orphan):
yield (item1.path, compare_items(item1, matching_orphan))
else: else:
orphans_archive1[item1.path] = item1 orphans_archive1[item1.path] = item1
if item2: if item2:
matching_orphan = orphans_archive1.pop(item2.path, None) matching_orphan = orphans_archive1.pop(item2.path, None)
if matching_orphan: if matching_orphan:
compare_or_defer(matching_orphan, item2) if not defer_if_necessary(matching_orphan, item2):
yield (matching_orphan.path, compare_items(matching_orphan, item2))
else: else:
orphans_archive2[item2.path] = item2 orphans_archive2[item2.path] = item2
# At this point orphans_* contain items that had no matching partner in the other archive # At this point orphans_* contain items that had no matching partner in the other archive
deleted_item = Item(
deleted=True,
chunks=[],
mode=0,
)
for added in orphans_archive2.values(): for added in orphans_archive2.values():
path = added.path path = added.path
deleted_item.path = path deleted_item = Item.create_deleted(path)
update_hardlink_masters(deleted_item, added) update_hardlink_masters(deleted_item, added)
compare_items(output, path, deleted_item, added, hardlink_masters, deleted=True) yield (path, compare_items(deleted_item, added))
for deleted in orphans_archive1.values(): for deleted in orphans_archive1.values():
path = deleted.path path = deleted.path
deleted_item.path = path deleted_item = Item.create_deleted(path)
update_hardlink_masters(deleted, deleted_item) update_hardlink_masters(deleted, deleted_item)
compare_items(output, path, deleted, deleted_item, hardlink_masters, deleted=True) yield (path, compare_items(deleted, deleted_item))
for item1, item2 in deferred: for item1, item2 in deferred:
assert hardlink_master_seen(item1) assert hardlink_master_seen(item1)
assert hardlink_master_seen(item2) assert hardlink_master_seen(item2)
compare_items(output, item1.path, item1, item2, hardlink_masters) yield (path, compare_items(item1, item2))
class MetadataCollector: class MetadataCollector:

View File

@ -940,8 +940,8 @@ class Archiver:
def do_diff(self, args, repository, manifest, key, archive): def do_diff(self, args, repository, manifest, key, archive):
"""Diff contents of two archives""" """Diff contents of two archives"""
def print_output(line): def print_output(diff, path):
print("{:<19} {}".format(line[1], line[0])) print("{:<19} {}".format(diff, path))
archive1 = archive archive1 = archive
archive2 = Archive(repository, key, manifest, args.archive2, archive2 = Archive(repository, key, manifest, args.archive2,
@ -956,10 +956,15 @@ class Archiver:
matcher = self.build_matcher(args.patterns, args.paths) matcher = self.build_matcher(args.patterns, args.paths)
compare_archives(archive1, archive2, matcher) diffs = Archive.compare_archives_iter(archive1, archive2, matcher, can_compare_chunk_ids=can_compare_chunk_ids)
# Conversion to string and filtering for diff.equal to save memory if sorting
diffs = ((path, str(diff)) for path, diff in diffs if not diff.equal)
for line in sorted(output): if args.sort:
print_output(line) diffs = sorted(diffs)
for path, diff in diffs:
print_output(diff, path)
for pattern in matcher.get_unmatched_include_patterns(): for pattern in matcher.get_unmatched_include_patterns():
self.print_warning("Include pattern '%s' never matched.", pattern) self.print_warning("Include pattern '%s' never matched.", pattern)

View File

@ -5,6 +5,7 @@ from .constants import ITEM_KEYS
from .helpers import safe_encode, safe_decode from .helpers import safe_encode, safe_decode
from .helpers import bigint_to_int, int_to_bigint from .helpers import bigint_to_int, int_to_bigint
from .helpers import StableDict from .helpers import StableDict
from .helpers import format_file_size
cdef extern from "_item.c": cdef extern from "_item.c":
object _object_to_optr(object obj) object _object_to_optr(object obj)
@ -184,19 +185,22 @@ class Item(PropDict):
part = PropDict._make_property('part', int) part = PropDict._make_property('part', int)
def get_size(self, hardlink_masters=None, memorize=False, compressed=False, from_chunks=False): def get_size(self, hardlink_masters=None, memorize=False, compressed=False, from_chunks=False, consider_ids=None):
""" """
Determine the (uncompressed or compressed) size of this item. Determine the (uncompressed or compressed) size of this item.
For hardlink slaves, the size is computed via the hardlink master's :param hardlink_masters: If given, the size of hardlink slaves is computed via the hardlink master's chunk list,
chunk list, if available (otherwise size will be returned as 0). otherwise size will be returned as 0.
:param memorize: Whether the computed size value will be stored into the item.
If memorize is True, the computed size value will be stored into the item. :param compressed: Whether the compressed or uncompressed size will be returned.
:param from_chunks: If true, size is computed from chunks even if a precomputed value is available.
:param consider_ids: Returns the size of the given ids only.
""" """
attr = 'csize' if compressed else 'size' attr = 'csize' if compressed else 'size'
assert not (compressed and memorize), 'Item does not have a csize field.' assert not (compressed and memorize), 'Item does not have a csize field.'
assert not (consider_ids is not None and memorize), "Can't store size when considering only certain ids"
try: try:
if from_chunks: if from_chunks or consider_ids is not None:
raise AttributeError raise AttributeError
size = getattr(self, attr) size = getattr(self, attr)
except AttributeError: except AttributeError:
@ -226,6 +230,9 @@ class Item(PropDict):
chunks, _ = hardlink_masters.get(master, (None, None)) chunks, _ = hardlink_masters.get(master, (None, None))
if chunks is None: if chunks is None:
return 0 return 0
if consider_ids is not None:
size = sum(getattr(ChunkListEntry(*chunk), attr) for chunk in chunks if chunk.id in consider_ids)
else:
size = sum(getattr(ChunkListEntry(*chunk), attr) for chunk in chunks) size = sum(getattr(ChunkListEntry(*chunk), attr) for chunk in chunks)
# if requested, memorize the precomputed (c)size for items that have an own chunks list: # if requested, memorize the precomputed (c)size for items that have an own chunks list:
if memorize and having_chunks: if memorize and having_chunks:
@ -251,6 +258,21 @@ class Item(PropDict):
def from_optr(self, optr): def from_optr(self, optr):
return _optr_to_object(optr) return _optr_to_object(optr)
@classmethod
def create_deleted(cls, path):
return cls(deleted=True, chunks=[], mode=0, path=path)
def is_link(self):
return self._is_type(stat.S_ISLNK)
def is_dir(self):
return self._is_type(stat.S_ISDIR)
def _is_type(self, typetest):
try:
return typetest(self.mode)
except AttributeError:
return False
class EncryptedKey(PropDict): class EncryptedKey(PropDict):
@ -359,62 +381,119 @@ class ManifestItem(PropDict):
config = PropDict._make_property('config', dict) config = PropDict._make_property('config', dict)
item_keys = PropDict._make_property('item_keys', tuple) item_keys = PropDict._make_property('item_keys', tuple)
def compare_link(item1, item2): class ItemDiff:
# These are the simple link cases. For special cases, e.g. if a """
# regular file is replaced with a link or vice versa, it is Comparison of two items from different archives.
# indicated in compare_mode instead.
if item1.get('deleted'): The items may have different paths and still be considered equal (e.g. for renames).
It does not include extended or time attributes in the comparison.
"""
def __init__(self, item1, item2, chunk_iterator1, chunk_iterator2, numeric_owner=False, can_compare_chunk_ids=False):
self._item1 = item1
self._item2 = item2
self._numeric_owner = numeric_owner
self._can_compare_chunk_ids = can_compare_chunk_ids
self.equal = self._equal(chunk_iterator1, chunk_iterator2)
def __repr__(self):
if self.equal:
return 'equal'
changes = []
if self._item1.is_link() or self._item2.is_link():
changes.append(self._link_string())
if 'chunks' in self._item1 and 'chunks' in self._item2:
changes.append(self._content_string())
if self._item1.is_dir() or self._item2.is_dir():
changes.append(self._dir_string())
if not (self._item1.get('deleted') or self._item2.get('deleted')):
changes.append(self._owner_string())
changes.append(self._mode_string())
return ' '.join((x for x in changes if x))
def _equal(self, chunk_iterator1, chunk_iterator2):
# if both are deleted, there is nothing at path regardless of what was deleted
if self._item1.get('deleted') and self._item2.get('deleted'):
return True
attr_list = ['deleted', 'mode', 'source']
attr_list += ['uid', 'gid'] if self._numeric_owner else ['user', 'group']
for attr in attr_list:
if self._item1.get(attr) != self._item2.get(attr):
return False
if 'mode' in self._item1: # mode of item1 and item2 is equal
if (self._item1.is_link() and 'source' in self._item1 and 'source' in self._item2
and self._item1.source != self._item2.source):
return False
if 'chunks' in self._item1 and 'chunks' in self._item2:
return self._content_equal(chunk_iterator1, chunk_iterator2)
return True
def _link_string(self):
if self._item1.get('deleted'):
return 'added link' return 'added link'
if item2.get('deleted'): if self._item2.get('deleted'):
return 'removed link' return 'removed link'
if 'source' in item1 and 'source' in item2 and item1.source != item2.source: if 'source' in self._item1 and 'source' in self._item2 and self._item1.source != self._item2.source:
return 'changed link' return 'changed link'
def compare_content(path, item1, item2): def _content_string(self):
if contents_changed(item1, item2): if self._item1.get('deleted'):
if item1.get('deleted'): return ('added {:>13}'.format(format_file_size(self._item2.get_size())))
return 'added {:>13}'.format(format_file_size(sum_chunk_size(item2))) if self._item2.get('deleted'):
if item2.get('deleted'): return ('removed {:>11}'.format(format_file_size(self._item1.get_size())))
return 'removed {:>11}'.format(format_file_size(sum_chunk_size(item1))) if not self._can_compare_chunk_ids:
if not can_compare_chunk_ids:
return 'modified' return 'modified'
chunk_ids1 = {c.id for c in item1.chunks} chunk_ids1 = {c.id for c in self._item1.chunks}
chunk_ids2 = {c.id for c in item2.chunks} chunk_ids2 = {c.id for c in self._item2.chunks}
added_ids = chunk_ids2 - chunk_ids1 added_ids = chunk_ids2 - chunk_ids1
removed_ids = chunk_ids1 - chunk_ids2 removed_ids = chunk_ids1 - chunk_ids2
added = sum_chunk_size(item2, added_ids) added = self._item2.get_size(consider_ids=added_ids)
removed = sum_chunk_size(item1, removed_ids) removed = self._item1.get_size(consider_ids=removed_ids)
return '{:>9} {:>9}'.format(format_file_size(added, precision=1, sign=True), return ('{:>9} {:>9}'.format(format_file_size(added, precision=1, sign=True),
format_file_size(-removed, precision=1, sign=True)) format_file_size(-removed, precision=1, sign=True)))
def compare_directory(item1, item2): def _dir_string(self):
if item2.get('deleted') and not item1.get('deleted'): if self._item2.get('deleted') and not self._item1.get('deleted'):
return 'removed directory' return 'removed directory'
if item1.get('deleted') and not item2.get('deleted'): if self._item1.get('deleted') and not self._item2.get('deleted'):
return 'added directory' return 'added directory'
def compare_owner(item1, item2): def _owner_string(self):
user1, group1 = get_owner(item1) u_attr, g_attr = ('uid', 'gid') if self._numeric_owner else ('user', 'group')
user2, group2 = get_owner(item2) u1, g1 = self._item1.get(u_attr), self._item1.get(g_attr)
if user1 != user2 or group1 != group2: u2, g2 = self._item2.get(u_attr), self._item2.get(g_attr)
return '[{}:{} -> {}:{}]'.format(user1, group1, user2, group2) if (u1, g1) != (u2, g2):
return '[{}:{} -> {}:{}]'.format(u1, g1, u2, g2)
def compare_mode(item1, item2): def _mode_string(self):
if item1.mode != item2.mode: if 'mode' in self._item1 and 'mode' in self._item2 and self._item1.mode != self._item2.mode:
return '[{} -> {}]'.format(get_mode(item1), get_mode(item2)) return '[{} -> {}]'.format(stat.filemode(self._item1.mode), stat.filemode(self._item2.mode))
def contents_changed(item1, item2): def _content_equal(self, chunk_iterator1, chunk_iterator2):
if can_compare_chunk_ids: if self._can_compare_chunk_ids:
return item1.chunks != item2.chunks return self._item1.chunks == self._item2.chunks
if sum_chunk_size(item1) != sum_chunk_size(item2): if self._item1.get_size() != self._item2.get_size():
return True return False
chunk_ids1 = [c.id for c in item1.chunks] return ItemDiff._chunk_content_equal(chunk_iterator1, chunk_iterator2)
chunk_ids2 = [c.id for c in item2.chunks]
return not fetch_and_compare_chunks(chunk_ids1, chunk_ids2, archive1, archive2)
@staticmethod @staticmethod
def compare_chunk_contents(chunks1, chunks2): def _chunk_content_equal(chunks1, chunks2):
"""Compare two chunk iterators (like returned by :meth:`.DownloadPipeline.fetch_many`)""" """
Compare chunk content and return True if they are identical.
The chunks must be given as chunk iterators (like returned by :meth:`.DownloadPipeline.fetch_many`).
"""
end = object() end = object()
alen = ai = 0 alen = ai = 0
blen = bi = 0 blen = bi = 0

View File

@ -47,7 +47,7 @@ from ..helpers import bin_to_hex
from ..helpers import MAX_S from ..helpers import MAX_S
from ..nanorst import RstToTextLazy, rst_to_terminal from ..nanorst import RstToTextLazy, rst_to_terminal
from ..patterns import IECommand, PatternMatcher, parse_pattern from ..patterns import IECommand, PatternMatcher, parse_pattern
from ..item import Item from ..item import Item, ItemDiff
from ..logger import setup_logging from ..logger import setup_logging
from ..remote import RemoteRepository, PathNotAllowed from ..remote import RemoteRepository, PathNotAllowed
from ..repository import Repository from ..repository import Repository
@ -3402,12 +3402,12 @@ def test_get_args():
assert args.func == archiver.do_serve assert args.func == archiver.do_serve
def test_compare_chunk_contents(): def test_chunk_content_equal():
def ccc(a, b): def ccc(a, b):
chunks_a = [data for data in a] chunks_a = [data for data in a]
chunks_b = [data for data in b] chunks_b = [data for data in b]
compare1 = Archiver.compare_chunk_contents(iter(chunks_a), iter(chunks_b)) compare1 = ItemDiff._chunk_content_equal(iter(chunks_a), iter(chunks_b))
compare2 = Archiver.compare_chunk_contents(iter(chunks_b), iter(chunks_a)) compare2 = ItemDiff._chunk_content_equal(iter(chunks_b), iter(chunks_a))
assert compare1 == compare2 assert compare1 == compare2
return compare1 return compare1
assert ccc([ assert ccc([