1
0
Fork 0
mirror of https://github.com/borgbackup/borg.git synced 2025-02-24 15:12:00 +00:00

create: Visit files in inode order

This commit is contained in:
Marian Beermann 2016-04-17 16:41:03 +02:00
parent 5b6377e0c2
commit f13aef5d83
No known key found for this signature in database
GPG key ID: 9B8450B91D1362C1
3 changed files with 132 additions and 35 deletions

View file

@ -25,6 +25,7 @@
log_multi, PatternMatcher, ItemFormatter log_multi, PatternMatcher, ItemFormatter
from .logger import create_logger, setup_logging from .logger import create_logger, setup_logging
logger = create_logger() logger = create_logger()
from . import helpers
from .compress import Compressor, COMPR_BUFFER from .compress import Compressor, COMPR_BUFFER
from .upgrader import AtticRepositoryUpgrader, BorgRepositoryUpgrader from .upgrader import AtticRepositoryUpgrader, BorgRepositoryUpgrader
from .repository import Repository from .repository import Repository
@ -247,17 +248,18 @@ def create_inner(archive, cache):
self.print_file_status(status, path) self.print_file_status(status, path)
continue continue
path = os.path.normpath(path) path = os.path.normpath(path)
if args.one_file_system:
try: try:
restrict_dev = os.lstat(path).st_dev st = os.lstat(path)
except OSError as e: except OSError as e:
self.print_warning('%s: %s', path, e) self.print_warning('%s: %s', path, e)
continue continue
if args.one_file_system:
restrict_dev = st.st_dev
else: else:
restrict_dev = None restrict_dev = None
self._process(archive, cache, matcher, args.exclude_caches, args.exclude_if_present, self._process(archive, cache, matcher, args.exclude_caches, args.exclude_if_present,
args.keep_tag_files, skip_inodes, path, restrict_dev, args.keep_tag_files, skip_inodes, path, restrict_dev,
read_special=args.read_special, dry_run=dry_run) read_special=args.read_special, dry_run=dry_run, st=st)
if not dry_run: if not dry_run:
archive.save(comment=args.comment, timestamp=args.timestamp) archive.save(comment=args.comment, timestamp=args.timestamp)
if args.progress: if args.progress:
@ -292,11 +294,11 @@ def create_inner(archive, cache):
def _process(self, archive, cache, matcher, exclude_caches, exclude_if_present, def _process(self, archive, cache, matcher, exclude_caches, exclude_if_present,
keep_tag_files, skip_inodes, path, restrict_dev, keep_tag_files, skip_inodes, path, restrict_dev,
read_special=False, dry_run=False): read_special=False, dry_run=False, st=None):
if not matcher.match(path): if not matcher.match(path):
self.print_file_status('x', path) self.print_file_status('x', path)
return return
if st is None:
try: try:
st = os.lstat(path) st = os.lstat(path)
except OSError as e: except OSError as e:
@ -331,15 +333,15 @@ def _process(self, archive, cache, matcher, exclude_caches, exclude_if_present,
if not dry_run: if not dry_run:
status = archive.process_dir(path, st) status = archive.process_dir(path, st)
try: try:
entries = os.listdir(path) entries = helpers.scandir_inorder(path)
except OSError as e: except OSError as e:
status = 'E' status = 'E'
self.print_warning('%s: %s', path, e) self.print_warning('%s: %s', path, e)
else: else:
for filename in sorted(entries): for dirent in entries:
entry_path = os.path.normpath(os.path.join(path, filename)) normpath = os.path.normpath(dirent.path)
self._process(archive, cache, matcher, exclude_caches, exclude_if_present, self._process(archive, cache, matcher, exclude_caches, exclude_if_present,
keep_tag_files, skip_inodes, entry_path, restrict_dev, keep_tag_files, skip_inodes, normpath, restrict_dev,
read_special=read_special, dry_run=dry_run) read_special=read_special, dry_run=dry_run)
elif stat.S_ISLNK(st.st_mode): elif stat.S_ISLNK(st.st_mode):
if not dry_run: if not dry_run:
@ -461,7 +463,7 @@ def get_mode(item):
return [None] return [None]
def has_hardlink_master(item, hardlink_masters): def has_hardlink_master(item, hardlink_masters):
return item.get(b'source') in hardlink_masters and get_mode(item)[0] != 'l' return stat.S_ISREG(item[b'mode']) and item.get(b'source') in hardlink_masters
def compare_link(item1, item2): def compare_link(item1, item2):
# These are the simple link cases. For special cases, e.g. if a # These are the simple link cases. For special cases, e.g. if a
@ -524,9 +526,6 @@ def compare_items(output, path, item1, item2, hardlink_masters, deleted=False):
""" """
changes = [] changes = []
if item1.get(b'hardlink_master') or item2.get(b'hardlink_master'):
hardlink_masters[path] = (item1, item2)
if has_hardlink_master(item1, hardlink_masters): if has_hardlink_master(item1, hardlink_masters):
item1 = hardlink_masters[item1[b'source']][0] item1 = hardlink_masters[item1[b'source']][0]
@ -559,8 +558,26 @@ def print_output(line):
print("{:<19} {}".format(line[1], line[0])) print("{:<19} {}".format(line[1], line[0]))
def compare_archives(archive1, archive2, matcher): def compare_archives(archive1, archive2, matcher):
def hardlink_master_seen(item):
return b'source' not in item or not stat.S_ISREG(item[b'mode']) or item[b'source'] in hardlink_masters
def is_hardlink_master(item):
return item.get(b'hardlink_master', True) and b'source' not in item
def update_hardlink_masters(item1, item2):
if is_hardlink_master(item1) or is_hardlink_master(item2):
hardlink_masters[item1[b'path']] = (item1, item2)
def compare_or_defer(item1, item2):
update_hardlink_masters(item1, item2)
if not hardlink_master_seen(item1) or not hardlink_master_seen(item2):
deferred.append((item1, item2))
else:
compare_items(output, item1[b'path'], item1, item2, hardlink_masters)
orphans_archive1 = collections.OrderedDict() orphans_archive1 = collections.OrderedDict()
orphans_archive2 = collections.OrderedDict() orphans_archive2 = collections.OrderedDict()
deferred = []
hardlink_masters = {} hardlink_masters = {}
output = [] output = []
@ -569,31 +586,40 @@ def compare_archives(archive1, archive2, matcher):
archive2.iter_items(lambda item: matcher.match(item[b'path'])), archive2.iter_items(lambda item: matcher.match(item[b'path'])),
): ):
if item1 and item2 and item1[b'path'] == item2[b'path']: if item1 and item2 and item1[b'path'] == item2[b'path']:
compare_items(output, item1[b'path'], item1, item2, hardlink_masters) compare_or_defer(item1, item2)
continue continue
if item1: if item1:
matching_orphan = orphans_archive2.pop(item1[b'path'], None) matching_orphan = orphans_archive2.pop(item1[b'path'], None)
if matching_orphan: if matching_orphan:
compare_items(output, item1[b'path'], item1, matching_orphan, hardlink_masters) compare_or_defer(item1, matching_orphan)
else: else:
orphans_archive1[item1[b'path']] = item1 orphans_archive1[item1[b'path']] = item1
if item2: if item2:
matching_orphan = orphans_archive1.pop(item2[b'path'], None) matching_orphan = orphans_archive1.pop(item2[b'path'], None)
if matching_orphan: if matching_orphan:
compare_items(output, item2[b'path'], matching_orphan, item2, hardlink_masters) compare_or_defer(matching_orphan, item2)
else: else:
orphans_archive2[item2[b'path']] = item2 orphans_archive2[item2[b'path']] = item2
# At this point orphans_* contain items that had no matching partner in the other archive # At this point orphans_* contain items that had no matching partner in the other archive
deleted_item = {
b'deleted': True,
b'chunks': [],
b'mode': 0,
}
for added in orphans_archive2.values(): for added in orphans_archive2.values():
compare_items(output, added[b'path'], { path = added[b'path']
b'deleted': True, deleted_item[b'path'] = path
b'chunks': [], update_hardlink_masters(deleted_item, added)
}, added, hardlink_masters, deleted=True) compare_items(output, path, deleted_item, added, hardlink_masters, deleted=True)
for deleted in orphans_archive1.values(): for deleted in orphans_archive1.values():
compare_items(output, deleted[b'path'], deleted, { path = deleted[b'path']
b'deleted': True, deleted_item[b'path'] = path
b'chunks': [], update_hardlink_masters(deleted, deleted_item)
}, hardlink_masters, deleted=True) compare_items(output, path, deleted, deleted_item, hardlink_masters, deleted=True)
for item1, item2 in deferred:
assert hardlink_master_seen(item1)
assert hardlink_master_seen(item2)
compare_items(output, item1[b'path'], item1, item2, hardlink_masters)
for line in sorted(output): for line in sorted(output):
print_output(line) print_output(line)

View file

@ -6,6 +6,7 @@
import hashlib import hashlib
from itertools import islice from itertools import islice
import os import os
import os.path
import stat import stat
import textwrap import textwrap
import pwd import pwd
@ -1340,3 +1341,68 @@ def consume(iterator, n=None):
else: else:
# advance to the empty slice starting at position n # advance to the empty slice starting at position n
next(islice(iterator, n, n), None) next(islice(iterator, n, n), None)
# GenericDirEntry, scandir_generic (c) 2012 Ben Hoyt
# from the python-scandir package (3-clause BSD license, just like us, so no troubles here)
# note: simplified version
class GenericDirEntry:
    """Minimal stand-in for os.DirEntry, built from a directory path and a name.

    Used by the fallback scandir implementation on platforms without a native
    scandir(). Note: stat results are NOT cached — every is_*()/inode() call
    performs a fresh lstat() on the entry.
    """
    __slots__ = ('name', '_scandir_path', '_path')

    def __init__(self, scandir_path, name):
        self._scandir_path = scandir_path
        self.name = name
        # Full path is joined lazily on first access of the *path* property.
        self._path = None

    @property
    def path(self):
        # Join lazily and memoize; callers typically ask for the path repeatedly.
        if self._path is None:
            self._path = os.path.join(self._scandir_path, self.name)
        return self._path

    def stat(self, follow_symlinks=True):
        # Only lstat() semantics are supported here; callers must explicitly
        # pass follow_symlinks=False (mirrors how borg invokes DirEntry.stat).
        assert not follow_symlinks
        return os.lstat(self.path)

    def _check_type(self, mode_type):
        """Return True if the entry's file type bits equal *mode_type*
        (a stat.S_IF* constant). Parameter renamed from *type* to avoid
        shadowing the builtin."""
        st = self.stat(False)
        return stat.S_IFMT(st.st_mode) == mode_type

    def is_dir(self, follow_symlinks=True):
        assert not follow_symlinks
        return self._check_type(stat.S_IFDIR)

    def is_file(self, follow_symlinks=True):
        assert not follow_symlinks
        return self._check_type(stat.S_IFREG)

    def is_symlink(self):
        return self._check_type(stat.S_IFLNK)

    def inode(self):
        """Return the inode number of the entry (via lstat, not cached)."""
        st = self.stat(False)
        return st.st_ino

    def __repr__(self):
        return '<{0}: {1!r}>'.format(self.__class__.__name__, self.path)
def scandir_generic(path='.'):
    """Fallback scandir: like os.listdir(), but yield GenericDirEntry objects
    (in alphabetical name order) instead of returning a list of names."""
    names = sorted(os.listdir(path))
    for entry_name in names:
        yield GenericDirEntry(path, entry_name)
# Bind the best available scandir() implementation, preferring native speed:
try:
    # Python >= 3.5 ships os.scandir in the standard library.
    from os import scandir
except ImportError:
    try:
        # Try python-scandir on Python 3.4
        from scandir import scandir
    except ImportError:
        # If python-scandir is not installed, then use a version that is just as slow as listdir.
        scandir = scandir_generic
def scandir_inorder(path='.'):
    """Return the directory entries of *path* as a list sorted by inode
    number, so that callers visit files in on-disk order."""
    entries = list(scandir(path))
    entries.sort(key=lambda entry: entry.inode())
    return entries

View file

@ -17,7 +17,7 @@
import pytest import pytest
from .. import xattr from .. import xattr, helpers
from ..archive import Archive, ChunkBuffer, ArchiveRecreater from ..archive import Archive, ChunkBuffer, ArchiveRecreater
from ..archiver import Archiver from ..archiver import Archiver
from ..cache import Cache from ..cache import Cache
@ -1314,10 +1314,15 @@ def _test_recreate_interrupt(self, change_args, interrupt_early):
assert 'dir2/abcdef' in files assert 'dir2/abcdef' in files
assert 'file1' not in files assert 'file1' not in files
# The _test_create_interrupt requires a deterministic (alphabetic) order of the files to easily check if
# resumption works correctly. Patch scandir_inorder to work in alphabetic order.
def test_recreate_interrupt(self): def test_recreate_interrupt(self):
with patch.object(helpers, 'scandir_inorder', helpers.scandir_generic):
self._test_recreate_interrupt(False, True) self._test_recreate_interrupt(False, True)
def test_recreate_interrupt2(self): def test_recreate_interrupt2(self):
with patch.object(helpers, 'scandir_inorder', helpers.scandir_generic):
self._test_recreate_interrupt(True, False) self._test_recreate_interrupt(True, False)
def _test_recreate_chunker_interrupt_patch(self): def _test_recreate_chunker_interrupt_patch(self):