From f13aef5d83ebcbb2ba825bef481cf0bad70b2a58 Mon Sep 17 00:00:00 2001 From: Marian Beermann Date: Sun, 17 Apr 2016 16:41:03 +0200 Subject: [PATCH] create: Visit files in inode order --- borg/archiver.py | 90 ++++++++++++++++++++++++-------------- borg/helpers.py | 66 ++++++++++++++++++++++++++++ borg/testsuite/archiver.py | 11 +++-- 3 files changed, 132 insertions(+), 35 deletions(-) diff --git a/borg/archiver.py b/borg/archiver.py index c1b209da8..39dddea56 100644 --- a/borg/archiver.py +++ b/borg/archiver.py @@ -25,6 +25,7 @@ log_multi, PatternMatcher, ItemFormatter from .logger import create_logger, setup_logging logger = create_logger() +from . import helpers from .compress import Compressor, COMPR_BUFFER from .upgrader import AtticRepositoryUpgrader, BorgRepositoryUpgrader from .repository import Repository @@ -247,17 +248,18 @@ def create_inner(archive, cache): self.print_file_status(status, path) continue path = os.path.normpath(path) + try: + st = os.lstat(path) + except OSError as e: + self.print_warning('%s: %s', path, e) + continue if args.one_file_system: - try: - restrict_dev = os.lstat(path).st_dev - except OSError as e: - self.print_warning('%s: %s', path, e) - continue + restrict_dev = st.st_dev else: restrict_dev = None self._process(archive, cache, matcher, args.exclude_caches, args.exclude_if_present, args.keep_tag_files, skip_inodes, path, restrict_dev, - read_special=args.read_special, dry_run=dry_run) + read_special=args.read_special, dry_run=dry_run, st=st) if not dry_run: archive.save(comment=args.comment, timestamp=args.timestamp) if args.progress: @@ -292,16 +294,16 @@ def create_inner(archive, cache): def _process(self, archive, cache, matcher, exclude_caches, exclude_if_present, keep_tag_files, skip_inodes, path, restrict_dev, - read_special=False, dry_run=False): + read_special=False, dry_run=False, st=None): if not matcher.match(path): self.print_file_status('x', path) return - - try: - st = os.lstat(path) - except OSError as e: - 
self.print_warning('%s: %s', path, e) - return + if st is None: + try: + st = os.lstat(path) + except OSError as e: + self.print_warning('%s: %s', path, e) + return if (st.st_ino, st.st_dev) in skip_inodes: return # Entering a new filesystem? @@ -331,15 +333,15 @@ def _process(self, archive, cache, matcher, exclude_caches, exclude_if_present, if not dry_run: status = archive.process_dir(path, st) try: - entries = os.listdir(path) + entries = helpers.scandir_inorder(path) except OSError as e: status = 'E' self.print_warning('%s: %s', path, e) else: - for filename in sorted(entries): - entry_path = os.path.normpath(os.path.join(path, filename)) + for dirent in entries: + normpath = os.path.normpath(dirent.path) self._process(archive, cache, matcher, exclude_caches, exclude_if_present, - keep_tag_files, skip_inodes, entry_path, restrict_dev, + keep_tag_files, skip_inodes, normpath, restrict_dev, read_special=read_special, dry_run=dry_run) elif stat.S_ISLNK(st.st_mode): if not dry_run: @@ -461,7 +463,7 @@ def get_mode(item): return [None] def has_hardlink_master(item, hardlink_masters): - return item.get(b'source') in hardlink_masters and get_mode(item)[0] != 'l' + return stat.S_ISREG(item[b'mode']) and item.get(b'source') in hardlink_masters def compare_link(item1, item2): # These are the simple link cases. For special cases, e.g. 
if a @@ -524,9 +526,6 @@ def compare_items(output, path, item1, item2, hardlink_masters, deleted=False): """ changes = [] - if item1.get(b'hardlink_master') or item2.get(b'hardlink_master'): - hardlink_masters[path] = (item1, item2) - if has_hardlink_master(item1, hardlink_masters): item1 = hardlink_masters[item1[b'source']][0] @@ -559,8 +558,26 @@ def print_output(line): print("{:<19} {}".format(line[1], line[0])) def compare_archives(archive1, archive2, matcher): + def hardlink_master_seen(item): + return b'source' not in item or not stat.S_ISREG(item[b'mode']) or item[b'source'] in hardlink_masters + + def is_hardlink_master(item): + return item.get(b'hardlink_master', True) and b'source' not in item + + def update_hardlink_masters(item1, item2): + if is_hardlink_master(item1) or is_hardlink_master(item2): + hardlink_masters[item1[b'path']] = (item1, item2) + + def compare_or_defer(item1, item2): + update_hardlink_masters(item1, item2) + if not hardlink_master_seen(item1) or not hardlink_master_seen(item2): + deferred.append((item1, item2)) + else: + compare_items(output, item1[b'path'], item1, item2, hardlink_masters) + orphans_archive1 = collections.OrderedDict() orphans_archive2 = collections.OrderedDict() + deferred = [] hardlink_masters = {} output = [] @@ -569,31 +586,40 @@ def compare_archives(archive1, archive2, matcher): archive2.iter_items(lambda item: matcher.match(item[b'path'])), ): if item1 and item2 and item1[b'path'] == item2[b'path']: - compare_items(output, item1[b'path'], item1, item2, hardlink_masters) + compare_or_defer(item1, item2) continue if item1: matching_orphan = orphans_archive2.pop(item1[b'path'], None) if matching_orphan: - compare_items(output, item1[b'path'], item1, matching_orphan, hardlink_masters) + compare_or_defer(item1, matching_orphan) else: orphans_archive1[item1[b'path']] = item1 if item2: matching_orphan = orphans_archive1.pop(item2[b'path'], None) if matching_orphan: - compare_items(output, item2[b'path'], 
matching_orphan, item2, hardlink_masters) + compare_or_defer(matching_orphan, item2) else: orphans_archive2[item2[b'path']] = item2 # At this point orphans_* contain items that had no matching partner in the other archive + deleted_item = { + b'deleted': True, + b'chunks': [], + b'mode': 0, + } for added in orphans_archive2.values(): - compare_items(output, added[b'path'], { - b'deleted': True, - b'chunks': [], - }, added, hardlink_masters, deleted=True) + path = added[b'path'] + deleted_item[b'path'] = path + update_hardlink_masters(deleted_item, added) + compare_items(output, path, deleted_item, added, hardlink_masters, deleted=True) for deleted in orphans_archive1.values(): - compare_items(output, deleted[b'path'], deleted, { - b'deleted': True, - b'chunks': [], - }, hardlink_masters, deleted=True) + path = deleted[b'path'] + deleted_item[b'path'] = path + update_hardlink_masters(deleted, deleted_item) + compare_items(output, path, deleted, deleted_item, hardlink_masters, deleted=True) + for item1, item2 in deferred: + assert hardlink_master_seen(item1) + assert hardlink_master_seen(item2) + compare_items(output, item1[b'path'], item1, item2, hardlink_masters) for line in sorted(output): print_output(line) diff --git a/borg/helpers.py b/borg/helpers.py index 908b10b78..7c49f41e5 100644 --- a/borg/helpers.py +++ b/borg/helpers.py @@ -6,6 +6,7 @@ import hashlib from itertools import islice import os +import os.path import stat import textwrap import pwd @@ -1340,3 +1341,68 @@ def consume(iterator, n=None): else: # advance to the empty slice starting at position n next(islice(iterator, n, n), None) + +# GenericDirEntry, scandir_generic (c) 2012 Ben Hoyt +# from the python-scandir package (3-clause BSD license, just like us, so no troubles here) +# note: simplified version + + +class GenericDirEntry: + __slots__ = ('name', '_scandir_path', '_path') + + def __init__(self, scandir_path, name): + self._scandir_path = scandir_path + self.name = name + self._path = None 
+ + @property + def path(self): + if self._path is None: + self._path = os.path.join(self._scandir_path, self.name) + return self._path + + def stat(self, follow_symlinks=True): + assert not follow_symlinks + return os.lstat(self.path) + + def _check_type(self, type): + st = self.stat(False) + return stat.S_IFMT(st.st_mode) == type + + def is_dir(self, follow_symlinks=True): + assert not follow_symlinks + return self._check_type(stat.S_IFDIR) + + def is_file(self, follow_symlinks=True): + assert not follow_symlinks + return self._check_type(stat.S_IFREG) + + def is_symlink(self): + return self._check_type(stat.S_IFLNK) + + def inode(self): + st = self.stat(False) + return st.st_ino + + def __repr__(self): + return '<{0}: {1!r}>'.format(self.__class__.__name__, self.path) + + +def scandir_generic(path='.'): + """Like os.listdir(), but yield DirEntry objects instead of returning a list of names.""" + for name in sorted(os.listdir(path)): + yield GenericDirEntry(path, name) + +try: + from os import scandir +except ImportError: + try: + # Try python-scandir on Python 3.4 + from scandir import scandir + except ImportError: + # If python-scandir is not installed, then use a version that is just as slow as listdir. + scandir = scandir_generic + + +def scandir_inorder(path='.'): + return sorted(scandir(path), key=lambda dirent: dirent.inode()) diff --git a/borg/testsuite/archiver.py b/borg/testsuite/archiver.py index 71b80d4d6..93a2a9be1 100644 --- a/borg/testsuite/archiver.py +++ b/borg/testsuite/archiver.py @@ -17,7 +17,7 @@ import pytest -from .. import xattr +from .. 
import xattr, helpers from ..archive import Archive, ChunkBuffer, ArchiveRecreater from ..archiver import Archiver from ..cache import Cache @@ -1314,11 +1314,16 @@ def _test_recreate_interrupt(self, change_args, interrupt_early): assert 'dir2/abcdef' in files assert 'file1' not in files + # The _test_recreate_interrupt helper requires a deterministic (alphabetic) order of the files to easily check if + # resumption works correctly. Patch scandir_inorder to work in alphabetic order. + def test_recreate_interrupt(self): - self._test_recreate_interrupt(False, True) + with patch.object(helpers, 'scandir_inorder', helpers.scandir_generic): + self._test_recreate_interrupt(False, True) def test_recreate_interrupt2(self): - self._test_recreate_interrupt(True, False) + with patch.object(helpers, 'scandir_inorder', helpers.scandir_generic): + self._test_recreate_interrupt(True, False) def _test_recreate_chunker_interrupt_patch(self): real_add_chunk = Cache.add_chunk