From 9dc22d230f07c7ddb06cf3cf3bd90a41110305f9 Mon Sep 17 00:00:00 2001 From: Simon Frei Date: Wed, 19 Jul 2017 14:29:14 +0200 Subject: [PATCH] Refactor the diff functionality This factors out a lot of the logic in do_diff in archiver.py to Archive in archive.py and a new class ItemDiff in item.pyx. The idea is to move methods to the classes that are affected and to make it reusable, primarily for a new option to fuse (#2475). --- src/borg/archive.py | 112 +++++++-------------- src/borg/archiver.py | 15 ++- src/borg/item.pyx | 175 ++++++++++++++++++++++++--------- src/borg/testsuite/archiver.py | 8 +- 4 files changed, 174 insertions(+), 136 deletions(-) diff --git a/src/borg/archive.py b/src/borg/archive.py index a3829b661..00aa9f78e 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -5,12 +5,13 @@ import socket import stat import sys import time +from collections import OrderedDict from contextlib import contextmanager from datetime import datetime, timezone, timedelta from functools import partial from getpass import getuser from io import BytesIO -from itertools import groupby +from itertools import groupby, zip_longest from shutil import get_terminal_size import msgpack @@ -40,7 +41,7 @@ from .helpers import bin_to_hex from .helpers import safe_ns from .helpers import ellipsis_truncate, ProgressIndicatorPercent, log_multi from .patterns import PathPrefixPattern, FnmatchPattern, IECommand -from .item import Item, ArchiveItem +from .item import Item, ArchiveItem, ItemDiff from .platform import acl_get, acl_set, set_flags, get_flags, swidth from .remote import cache_if_remote from .repository import Repository, LIST_SCAN_LIMIT @@ -819,34 +820,14 @@ Utilization of max. archive size: {csize_max:.0%} # Was this EPERM due to the O_NOATIME flag? Try again without it: return os.open(path, flags_normal) - def compare_archives(archive1, archive2, matcher): + @staticmethod + def compare_archives_iter(archive1, archive2, matcher=None, can_compare_chunk_ids=False): + """ + Yields tuples with a path and an ItemDiff instance describing changes/indicating equality. - def fetch_and_compare_chunks(chunk_ids1, chunk_ids2, archive1, archive2): - chunks1 = archive1.pipeline.fetch_many(chunk_ids1) - chunks2 = archive2.pipeline.fetch_many(chunk_ids2) - return self.compare_chunk_contents(chunks1, chunks2) - - def sum_chunk_size(item, consider_ids=None): - if item.get('deleted'): - size = None - else: - if consider_ids is not None: # consider only specific chunks - size = sum(chunk.size for chunk in item.chunks if chunk.id in consider_ids) - else: # consider all chunks - size = item.get_size() - return size - - def get_owner(item): - if args.numeric_owner: - return item.uid, item.gid - else: - return item.user, item.group - - def get_mode(item): - if 'mode' in item: - return stat.filemode(item.mode) - else: - return [None] + :param matcher: PatternMatcher class to restrict results to only matching paths. + :param can_compare_chunk_ids: Whether --chunker-params are the same for both archives. + """ def hardlink_master_seen(item): return 'source' not in item or not hardlinkable(item.mode) or item.source in hardlink_masters @@ -861,93 +842,66 @@ Utilization of max. archive size: {csize_max:.0%} def has_hardlink_master(item, hardlink_masters): return hardlinkable(item.mode) and item.get('source') in hardlink_masters - def compare_items(output, path, item1, item2, hardlink_masters, deleted=False): - """ - Compare two items with identical paths. - :param deleted: Whether one of the items has been deleted - """ - changes = [] - + def compare_items(item1, item2): if has_hardlink_master(item1, hardlink_masters): item1 = hardlink_masters[item1.source][0] - if has_hardlink_master(item2, hardlink_masters): item2 = hardlink_masters[item2.source][1] + return ItemDiff(item1, item2, + archive1.pipeline.fetch_many([c.id for c in item1.get('chunks', [])]), + archive2.pipeline.fetch_many([c.id for c in item2.get('chunks', [])]), + can_compare_chunk_ids=can_compare_chunk_ids) - if get_mode(item1)[0] == 'l' or get_mode(item2)[0] == 'l': - changes.append(compare_link(item1, item2)) - - if 'chunks' in item1 and 'chunks' in item2: - changes.append(compare_content(path, item1, item2)) - - if get_mode(item1)[0] == 'd' or get_mode(item2)[0] == 'd': - changes.append(compare_directory(item1, item2)) - - if not deleted: - changes.append(compare_owner(item1, item2)) - changes.append(compare_mode(item1, item2)) - - changes = [x for x in changes if x] - if changes: - output_line = (remove_surrogates(path), ' '.join(changes)) - - if args.sort: - output.append(output_line) - else: - print_output(output_line) - - def compare_or_defer(item1, item2): + def defer_if_necessary(item1, item2): + """Adds item tuple to deferred if necessary and returns True, if items were deferred""" update_hardlink_masters(item1, item2) - if not hardlink_master_seen(item1) or not hardlink_master_seen(item2): + defer = not hardlink_master_seen(item1) or not hardlink_master_seen(item2) + if defer: deferred.append((item1, item2)) - else: - compare_items(output, item1.path, item1, item2, hardlink_masters) + return defer - orphans_archive1 = collections.OrderedDict() - orphans_archive2 = collections.OrderedDict() + orphans_archive1 = OrderedDict() + orphans_archive2 = OrderedDict() deferred = [] hardlink_masters = {} - output = [] for item1, item2 in zip_longest( archive1.iter_items(lambda item: matcher.match(item.path)), archive2.iter_items(lambda item: matcher.match(item.path)), ): if item1 and item2 and item1.path == item2.path: - compare_or_defer(item1, item2) + if not defer_if_necessary(item1, item2): + yield (item1.path, compare_items(item1, item2)) continue if item1: matching_orphan = orphans_archive2.pop(item1.path, None) if matching_orphan: - compare_or_defer(item1, matching_orphan) + if not defer_if_necessary(item1, matching_orphan): + yield (item1.path, compare_items(item1, matching_orphan)) else: orphans_archive1[item1.path] = item1 if item2: matching_orphan = orphans_archive1.pop(item2.path, None) if matching_orphan: - compare_or_defer(matching_orphan, item2) + if not defer_if_necessary(matching_orphan, item2): + yield (matching_orphan.path, compare_items(matching_orphan, item2)) else: orphans_archive2[item2.path] = item2 # At this point orphans_* contain items that had no matching partner in the other archive - deleted_item = Item( - deleted=True, - chunks=[], - mode=0, - ) for added in orphans_archive2.values(): path = added.path - deleted_item.path = path + deleted_item = Item.create_deleted(path) update_hardlink_masters(deleted_item, added) - compare_items(output, path, deleted_item, added, hardlink_masters, deleted=True) + yield (path, compare_items(deleted_item, added)) for deleted in orphans_archive1.values(): path = deleted.path - deleted_item.path = path + deleted_item = Item.create_deleted(path) update_hardlink_masters(deleted, deleted_item) - compare_items(output, path, deleted, deleted_item, hardlink_masters, deleted=True) + yield (path, compare_items(deleted, deleted_item)) for item1, item2 in deferred: assert hardlink_master_seen(item1) assert hardlink_master_seen(item2) - compare_items(output, item1.path, item1, item2, hardlink_masters) + yield (path, compare_items(item1, item2)) class MetadataCollector: diff --git a/src/borg/archiver.py b/src/borg/archiver.py index 1b6ce0324..4d2eb72ef 100644 --- a/src/borg/archiver.py +++ b/src/borg/archiver.py @@ -940,8 +940,8 @@ class Archiver: def do_diff(self, args, repository, manifest, key, archive): """Diff contents of two archives""" - def print_output(line): - print("{:<19} {}".format(line[1], line[0])) + def print_output(diff, path): + print("{:<19} {}".format(diff, path)) archive1 = archive archive2 = Archive(repository, key, manifest, args.archive2, @@ -956,10 +956,15 @@ class Archiver: matcher = self.build_matcher(args.patterns, args.paths) - compare_archives(archive1, archive2, matcher) + diffs = Archive.compare_archives_iter(archive1, archive2, matcher, can_compare_chunk_ids=can_compare_chunk_ids) + # Conversion to string and filtering for diff.equal to save memory if sorting + diffs = ((path, str(diff)) for path, diff in diffs if not diff.equal) - for line in sorted(output): - print_output(line) + if args.sort: + diffs = sorted(diffs) + + for path, diff in diffs: + print_output(diff, path) for pattern in matcher.get_unmatched_include_patterns(): self.print_warning("Include pattern '%s' never matched.", pattern) diff --git a/src/borg/item.pyx b/src/borg/item.pyx index 3d4f61fb1..a8a21c045 100644 --- a/src/borg/item.pyx +++ b/src/borg/item.pyx @@ -5,6 +5,7 @@ from .constants import ITEM_KEYS from .helpers import safe_encode, safe_decode from .helpers import bigint_to_int, int_to_bigint from .helpers import StableDict +from .helpers import format_file_size cdef extern from "_item.c": object _object_to_optr(object obj) @@ -184,19 +185,22 @@ class Item(PropDict): part = PropDict._make_property('part', int) - def get_size(self, hardlink_masters=None, memorize=False, compressed=False, from_chunks=False): + def get_size(self, hardlink_masters=None, memorize=False, compressed=False, from_chunks=False, consider_ids=None): """ Determine the (uncompressed or compressed) size of this item. - For hardlink slaves, the size is computed via the hardlink master's - chunk list, if available (otherwise size will be returned as 0). - - If memorize is True, the computed size value will be stored into the item. + :param hardlink_masters: If given, the size of hardlink slaves is computed via the hardlink master's chunk list, + otherwise size will be returned as 0. + :param memorize: Whether the computed size value will be stored into the item. + :param compressed: Whether the compressed or uncompressed size will be returned. + :param from_chunks: If true, size is computed from chunks even if a precomputed value is available. + :param consider_ids: Returns the size of the given ids only. """ attr = 'csize' if compressed else 'size' assert not (compressed and memorize), 'Item does not have a csize field.' + assert not (consider_ids is not None and memorize), "Can't store size when considering only certain ids" try: - if from_chunks: + if from_chunks or consider_ids is not None: raise AttributeError size = getattr(self, attr) except AttributeError: @@ -226,7 +230,10 @@ class Item(PropDict): chunks, _ = hardlink_masters.get(master, (None, None)) if chunks is None: return 0 - size = sum(getattr(ChunkListEntry(*chunk), attr) for chunk in chunks) + if consider_ids is not None: + size = sum(getattr(ChunkListEntry(*chunk), attr) for chunk in chunks if chunk.id in consider_ids) + else: + size = sum(getattr(ChunkListEntry(*chunk), attr) for chunk in chunks) # if requested, memorize the precomputed (c)size for items that have an own chunks list: if memorize and having_chunks: setattr(self, attr, size) @@ -251,6 +258,21 @@ class Item(PropDict): def from_optr(self, optr): return _optr_to_object(optr) + @classmethod + def create_deleted(cls, path): + return cls(deleted=True, chunks=[], mode=0, path=path) + + def is_link(self): + return self._is_type(stat.S_ISLNK) + + def is_dir(self): + return self._is_type(stat.S_ISDIR) + + def _is_type(self, typetest): + try: + return typetest(self.mode) + except AttributeError: + return False class EncryptedKey(PropDict): @@ -359,62 +381,119 @@ class ManifestItem(PropDict): config = PropDict._make_property('config', dict) item_keys = PropDict._make_property('item_keys', tuple) - def compare_link(item1, item2): - # These are the simple link cases. For special cases, e.g. if a - # regular file is replaced with a link or vice versa, it is - # indicated in compare_mode instead. - if item1.get('deleted'): +class ItemDiff: + """ + Comparison of two items from different archives. + + The items may have different paths and still be considered equal (e.g. for renames). + It does not include extended or time attributes in the comparison. + """ + + def __init__(self, item1, item2, chunk_iterator1, chunk_iterator2, numeric_owner=False, can_compare_chunk_ids=False): + self._item1 = item1 + self._item2 = item2 + self._numeric_owner = numeric_owner + self._can_compare_chunk_ids = can_compare_chunk_ids + self.equal = self._equal(chunk_iterator1, chunk_iterator2) + + def __repr__(self): + if self.equal: + return 'equal' + + changes = [] + + if self._item1.is_link() or self._item2.is_link(): + changes.append(self._link_string()) + + if 'chunks' in self._item1 and 'chunks' in self._item2: + changes.append(self._content_string()) + + if self._item1.is_dir() or self._item2.is_dir(): + changes.append(self._dir_string()) + + if not (self._item1.get('deleted') or self._item2.get('deleted')): + changes.append(self._owner_string()) + changes.append(self._mode_string()) + + return ' '.join((x for x in changes if x)) + + def _equal(self, chunk_iterator1, chunk_iterator2): + # if both are deleted, there is nothing at path regardless of what was deleted + if self._item1.get('deleted') and self._item2.get('deleted'): + return True + + attr_list = ['deleted', 'mode', 'source'] + attr_list += ['uid', 'gid'] if self._numeric_owner else ['user', 'group'] + for attr in attr_list: + if self._item1.get(attr) != self._item2.get(attr): + return False + + if 'mode' in self._item1: # mode of item1 and item2 is equal + if (self._item1.is_link() and 'source' in self._item1 and 'source' in self._item2 + and self._item1.source != self._item2.source): + return False + + if 'chunks' in self._item1 and 'chunks' in self._item2: + return self._content_equal(chunk_iterator1, chunk_iterator2) + + return True + + def _link_string(self): + if self._item1.get('deleted'): return 'added link' - if item2.get('deleted'): + if self._item2.get('deleted'): return 'removed link' - if 'source' in item1 and 'source' in item2 and item1.source != item2.source: + if 'source' in self._item1 and 'source' in self._item2 and self._item1.source != self._item2.source: return 'changed link' -def compare_content(path, item1, item2): - if contents_changed(item1, item2): - if item1.get('deleted'): - return 'added {:>13}'.format(format_file_size(sum_chunk_size(item2))) - if item2.get('deleted'): - return 'removed {:>11}'.format(format_file_size(sum_chunk_size(item1))) - if not can_compare_chunk_ids: + def _content_string(self): + if self._item1.get('deleted'): + return ('added {:>13}'.format(format_file_size(self._item2.get_size()))) + if self._item2.get('deleted'): + return ('removed {:>11}'.format(format_file_size(self._item1.get_size()))) + if not self._can_compare_chunk_ids: return 'modified' - chunk_ids1 = {c.id for c in item1.chunks} - chunk_ids2 = {c.id for c in item2.chunks} + chunk_ids1 = {c.id for c in self._item1.chunks} + chunk_ids2 = {c.id for c in self._item2.chunks} added_ids = chunk_ids2 - chunk_ids1 removed_ids = chunk_ids1 - chunk_ids2 - added = sum_chunk_size(item2, added_ids) - removed = sum_chunk_size(item1, removed_ids) - return '{:>9} {:>9}'.format(format_file_size(added, precision=1, sign=True), - format_file_size(-removed, precision=1, sign=True)) + added = self._item2.get_size(consider_ids=added_ids) + removed = self._item1.get_size(consider_ids=removed_ids) + return ('{:>9} {:>9}'.format(format_file_size(added, precision=1, sign=True), + format_file_size(-removed, precision=1, sign=True))) - def compare_directory(item1, item2): - if item2.get('deleted') and not item1.get('deleted'): + def _dir_string(self): + if self._item2.get('deleted') and not self._item1.get('deleted'): return 'removed directory' - if item1.get('deleted') and not item2.get('deleted'): + if self._item1.get('deleted') and not self._item2.get('deleted'): return 'added directory' - def compare_owner(item1, item2): - user1, group1 = get_owner(item1) - user2, group2 = get_owner(item2) - if user1 != user2 or group1 != group2: - return '[{}:{} -> {}:{}]'.format(user1, group1, user2, group2) + def _owner_string(self): + u_attr, g_attr = ('uid', 'gid') if self._numeric_owner else ('user', 'group') + u1, g1 = self._item1.get(u_attr), self._item1.get(g_attr) + u2, g2 = self._item2.get(u_attr), self._item2.get(g_attr) + if (u1, g1) != (u2, g2): + return '[{}:{} -> {}:{}]'.format(u1, g1, u2, g2) - def compare_mode(item1, item2): - if item1.mode != item2.mode: - return '[{} -> {}]'.format(get_mode(item1), get_mode(item2)) + def _mode_string(self): + if 'mode' in self._item1 and 'mode' in self._item2 and self._item1.mode != self._item2.mode: + return '[{} -> {}]'.format(stat.filemode(self._item1.mode), stat.filemode(self._item2.mode)) - def contents_changed(item1, item2): - if can_compare_chunk_ids: - return item1.chunks != item2.chunks - if sum_chunk_size(item1) != sum_chunk_size(item2): - return True - chunk_ids1 = [c.id for c in item1.chunks] - chunk_ids2 = [c.id for c in item2.chunks] - return not fetch_and_compare_chunks(chunk_ids1, chunk_ids2, archive1, archive2) + def _content_equal(self, chunk_iterator1, chunk_iterator2): + if self._can_compare_chunk_ids: + return self._item1.chunks == self._item2.chunks + if self._item1.get_size() != self._item2.get_size(): + return False + return ItemDiff._chunk_content_equal(chunk_iterator1, chunk_iterator2) @staticmethod - def compare_chunk_contents(chunks1, chunks2): - """Compare two chunk iterators (like returned by :meth:`.DownloadPipeline.fetch_many`)""" + def _chunk_content_equal(chunks1, chunks2): + """ + Compare chunk content and return True if they are identical. + + The chunks must be given as chunk iterators (like returned by :meth:`.DownloadPipeline.fetch_many`). + """ + end = object() alen = ai = 0 blen = bi = 0 diff --git a/src/borg/testsuite/archiver.py b/src/borg/testsuite/archiver.py index 0d28eb310..c65a42ae6 100644 --- a/src/borg/testsuite/archiver.py +++ b/src/borg/testsuite/archiver.py @@ -47,7 +47,7 @@ from ..helpers import bin_to_hex from ..helpers import MAX_S from ..nanorst import RstToTextLazy, rst_to_terminal from ..patterns import IECommand, PatternMatcher, parse_pattern -from ..item import Item +from ..item import Item, ItemDiff from ..logger import setup_logging from ..remote import RemoteRepository, PathNotAllowed from ..repository import Repository @@ -3402,12 +3402,12 @@ def test_get_args(): assert args.func == archiver.do_serve -def test_compare_chunk_contents(): +def test_chunk_content_equal(): def ccc(a, b): chunks_a = [data for data in a] chunks_b = [data for data in b] - compare1 = Archiver.compare_chunk_contents(iter(chunks_a), iter(chunks_b)) - compare2 = Archiver.compare_chunk_contents(iter(chunks_b), iter(chunks_a)) + compare1 = ItemDiff._chunk_content_equal(iter(chunks_a), iter(chunks_b)) + compare2 = ItemDiff._chunk_content_equal(iter(chunks_b), iter(chunks_a)) assert compare1 == compare2 return compare1 assert ccc([