1
0
Fork 0
mirror of https://github.com/borgbackup/borg.git synced 2025-02-24 15:12:00 +00:00

create: Visit files in inode order

This commit is contained in:
Marian Beermann 2016-04-17 16:41:03 +02:00
parent 5b6377e0c2
commit f13aef5d83
No known key found for this signature in database
GPG key ID: 9B8450B91D1362C1
3 changed files with 132 additions and 35 deletions

View file

@ -25,6 +25,7 @@
log_multi, PatternMatcher, ItemFormatter log_multi, PatternMatcher, ItemFormatter
from .logger import create_logger, setup_logging from .logger import create_logger, setup_logging
logger = create_logger() logger = create_logger()
from . import helpers
from .compress import Compressor, COMPR_BUFFER from .compress import Compressor, COMPR_BUFFER
from .upgrader import AtticRepositoryUpgrader, BorgRepositoryUpgrader from .upgrader import AtticRepositoryUpgrader, BorgRepositoryUpgrader
from .repository import Repository from .repository import Repository
@ -247,17 +248,18 @@ def create_inner(archive, cache):
self.print_file_status(status, path) self.print_file_status(status, path)
continue continue
path = os.path.normpath(path) path = os.path.normpath(path)
if args.one_file_system:
try: try:
restrict_dev = os.lstat(path).st_dev st = os.lstat(path)
except OSError as e: except OSError as e:
self.print_warning('%s: %s', path, e) self.print_warning('%s: %s', path, e)
continue continue
if args.one_file_system:
restrict_dev = st.st_dev
else: else:
restrict_dev = None restrict_dev = None
self._process(archive, cache, matcher, args.exclude_caches, args.exclude_if_present, self._process(archive, cache, matcher, args.exclude_caches, args.exclude_if_present,
args.keep_tag_files, skip_inodes, path, restrict_dev, args.keep_tag_files, skip_inodes, path, restrict_dev,
read_special=args.read_special, dry_run=dry_run) read_special=args.read_special, dry_run=dry_run, st=st)
if not dry_run: if not dry_run:
archive.save(comment=args.comment, timestamp=args.timestamp) archive.save(comment=args.comment, timestamp=args.timestamp)
if args.progress: if args.progress:
@ -292,11 +294,11 @@ def create_inner(archive, cache):
def _process(self, archive, cache, matcher, exclude_caches, exclude_if_present, def _process(self, archive, cache, matcher, exclude_caches, exclude_if_present,
keep_tag_files, skip_inodes, path, restrict_dev, keep_tag_files, skip_inodes, path, restrict_dev,
read_special=False, dry_run=False): read_special=False, dry_run=False, st=None):
if not matcher.match(path): if not matcher.match(path):
self.print_file_status('x', path) self.print_file_status('x', path)
return return
if st is None:
try: try:
st = os.lstat(path) st = os.lstat(path)
except OSError as e: except OSError as e:
@ -331,15 +333,15 @@ def _process(self, archive, cache, matcher, exclude_caches, exclude_if_present,
if not dry_run: if not dry_run:
status = archive.process_dir(path, st) status = archive.process_dir(path, st)
try: try:
entries = os.listdir(path) entries = helpers.scandir_inorder(path)
except OSError as e: except OSError as e:
status = 'E' status = 'E'
self.print_warning('%s: %s', path, e) self.print_warning('%s: %s', path, e)
else: else:
for filename in sorted(entries): for dirent in entries:
entry_path = os.path.normpath(os.path.join(path, filename)) normpath = os.path.normpath(dirent.path)
self._process(archive, cache, matcher, exclude_caches, exclude_if_present, self._process(archive, cache, matcher, exclude_caches, exclude_if_present,
keep_tag_files, skip_inodes, entry_path, restrict_dev, keep_tag_files, skip_inodes, normpath, restrict_dev,
read_special=read_special, dry_run=dry_run) read_special=read_special, dry_run=dry_run)
elif stat.S_ISLNK(st.st_mode): elif stat.S_ISLNK(st.st_mode):
if not dry_run: if not dry_run:
@ -461,7 +463,7 @@ def get_mode(item):
return [None] return [None]
def has_hardlink_master(item, hardlink_masters): def has_hardlink_master(item, hardlink_masters):
return item.get(b'source') in hardlink_masters and get_mode(item)[0] != 'l' return stat.S_ISREG(item[b'mode']) and item.get(b'source') in hardlink_masters
def compare_link(item1, item2): def compare_link(item1, item2):
# These are the simple link cases. For special cases, e.g. if a # These are the simple link cases. For special cases, e.g. if a
@ -524,9 +526,6 @@ def compare_items(output, path, item1, item2, hardlink_masters, deleted=False):
""" """
changes = [] changes = []
if item1.get(b'hardlink_master') or item2.get(b'hardlink_master'):
hardlink_masters[path] = (item1, item2)
if has_hardlink_master(item1, hardlink_masters): if has_hardlink_master(item1, hardlink_masters):
item1 = hardlink_masters[item1[b'source']][0] item1 = hardlink_masters[item1[b'source']][0]
@ -559,8 +558,26 @@ def print_output(line):
print("{:<19} {}".format(line[1], line[0])) print("{:<19} {}".format(line[1], line[0]))
def compare_archives(archive1, archive2, matcher): def compare_archives(archive1, archive2, matcher):
def hardlink_master_seen(item):
return b'source' not in item or not stat.S_ISREG(item[b'mode']) or item[b'source'] in hardlink_masters
def is_hardlink_master(item):
return item.get(b'hardlink_master', True) and b'source' not in item
def update_hardlink_masters(item1, item2):
if is_hardlink_master(item1) or is_hardlink_master(item2):
hardlink_masters[item1[b'path']] = (item1, item2)
def compare_or_defer(item1, item2):
update_hardlink_masters(item1, item2)
if not hardlink_master_seen(item1) or not hardlink_master_seen(item2):
deferred.append((item1, item2))
else:
compare_items(output, item1[b'path'], item1, item2, hardlink_masters)
orphans_archive1 = collections.OrderedDict() orphans_archive1 = collections.OrderedDict()
orphans_archive2 = collections.OrderedDict() orphans_archive2 = collections.OrderedDict()
deferred = []
hardlink_masters = {} hardlink_masters = {}
output = [] output = []
@ -569,31 +586,40 @@ def compare_archives(archive1, archive2, matcher):
archive2.iter_items(lambda item: matcher.match(item[b'path'])), archive2.iter_items(lambda item: matcher.match(item[b'path'])),
): ):
if item1 and item2 and item1[b'path'] == item2[b'path']: if item1 and item2 and item1[b'path'] == item2[b'path']:
compare_items(output, item1[b'path'], item1, item2, hardlink_masters) compare_or_defer(item1, item2)
continue continue
if item1: if item1:
matching_orphan = orphans_archive2.pop(item1[b'path'], None) matching_orphan = orphans_archive2.pop(item1[b'path'], None)
if matching_orphan: if matching_orphan:
compare_items(output, item1[b'path'], item1, matching_orphan, hardlink_masters) compare_or_defer(item1, matching_orphan)
else: else:
orphans_archive1[item1[b'path']] = item1 orphans_archive1[item1[b'path']] = item1
if item2: if item2:
matching_orphan = orphans_archive1.pop(item2[b'path'], None) matching_orphan = orphans_archive1.pop(item2[b'path'], None)
if matching_orphan: if matching_orphan:
compare_items(output, item2[b'path'], matching_orphan, item2, hardlink_masters) compare_or_defer(matching_orphan, item2)
else: else:
orphans_archive2[item2[b'path']] = item2 orphans_archive2[item2[b'path']] = item2
# At this point orphans_* contain items that had no matching partner in the other archive # At this point orphans_* contain items that had no matching partner in the other archive
deleted_item = {
b'deleted': True,
b'chunks': [],
b'mode': 0,
}
for added in orphans_archive2.values(): for added in orphans_archive2.values():
compare_items(output, added[b'path'], { path = added[b'path']
b'deleted': True, deleted_item[b'path'] = path
b'chunks': [], update_hardlink_masters(deleted_item, added)
}, added, hardlink_masters, deleted=True) compare_items(output, path, deleted_item, added, hardlink_masters, deleted=True)
for deleted in orphans_archive1.values(): for deleted in orphans_archive1.values():
compare_items(output, deleted[b'path'], deleted, { path = deleted[b'path']
b'deleted': True, deleted_item[b'path'] = path
b'chunks': [], update_hardlink_masters(deleted, deleted_item)
}, hardlink_masters, deleted=True) compare_items(output, path, deleted, deleted_item, hardlink_masters, deleted=True)
for item1, item2 in deferred:
assert hardlink_master_seen(item1)
assert hardlink_master_seen(item2)
compare_items(output, item1[b'path'], item1, item2, hardlink_masters)
for line in sorted(output): for line in sorted(output):
print_output(line) print_output(line)

View file

@ -6,6 +6,7 @@
import hashlib import hashlib
from itertools import islice from itertools import islice
import os import os
import os.path
import stat import stat
import textwrap import textwrap
import pwd import pwd
@ -1340,3 +1341,68 @@ def consume(iterator, n=None):
else: else:
# advance to the empty slice starting at position n # advance to the empty slice starting at position n
next(islice(iterator, n, n), None) next(islice(iterator, n, n), None)
# GenericDirEntry, scandir_generic (c) 2012 Ben Hoyt
# from the python-scandir package (3-clause BSD license, just like us, so no troubles here)
# note: simplified version
class GenericDirEntry:
    """Minimal stand-in for os.DirEntry, built from a directory path and a name.

    Used by the fallback scandir implementation on platforms without a native
    scandir(). Note: stat results are NOT cached — every is_*()/inode() call
    performs a fresh lstat() on the entry.
    """
    __slots__ = ('name', '_scandir_path', '_path')

    def __init__(self, scandir_path, name):
        self._scandir_path = scandir_path
        self.name = name
        # Full path is joined lazily on first access of the *path* property.
        self._path = None

    @property
    def path(self):
        # Join lazily and memoize; callers typically ask for the path repeatedly.
        if self._path is None:
            self._path = os.path.join(self._scandir_path, self.name)
        return self._path

    def stat(self, follow_symlinks=True):
        # Only lstat() semantics are supported here; callers must explicitly
        # pass follow_symlinks=False (mirrors how borg invokes DirEntry.stat).
        assert not follow_symlinks
        return os.lstat(self.path)

    def _check_type(self, mode_type):
        """Return True if the entry's file type bits equal *mode_type*
        (a stat.S_IF* constant). Parameter renamed from *type* to avoid
        shadowing the builtin."""
        st = self.stat(False)
        return stat.S_IFMT(st.st_mode) == mode_type

    def is_dir(self, follow_symlinks=True):
        assert not follow_symlinks
        return self._check_type(stat.S_IFDIR)

    def is_file(self, follow_symlinks=True):
        assert not follow_symlinks
        return self._check_type(stat.S_IFREG)

    def is_symlink(self):
        return self._check_type(stat.S_IFLNK)

    def inode(self):
        """Return the inode number of the entry (via lstat, not cached)."""
        st = self.stat(False)
        return st.st_ino

    def __repr__(self):
        return '<{0}: {1!r}>'.format(self.__class__.__name__, self.path)
def scandir_generic(path='.'):
    """Fallback scandir: like os.listdir(), but yield GenericDirEntry objects
    (in alphabetical name order) instead of returning a list of names."""
    names = sorted(os.listdir(path))
    for entry_name in names:
        yield GenericDirEntry(path, entry_name)
# Bind the best available scandir() implementation, preferring native speed:
try:
    # Python >= 3.5 ships os.scandir in the standard library.
    from os import scandir
except ImportError:
    try:
        # Try python-scandir on Python 3.4
        from scandir import scandir
    except ImportError:
        # If python-scandir is not installed, then use a version that is just as slow as listdir.
        scandir = scandir_generic
def scandir_inorder(path='.'):
    """Return the directory entries of *path* as a list sorted by inode
    number, so that callers visit files in on-disk order."""
    entries = list(scandir(path))
    entries.sort(key=lambda entry: entry.inode())
    return entries

View file

@ -17,7 +17,7 @@
import pytest import pytest
from .. import xattr from .. import xattr, helpers
from ..archive import Archive, ChunkBuffer, ArchiveRecreater from ..archive import Archive, ChunkBuffer, ArchiveRecreater
from ..archiver import Archiver from ..archiver import Archiver
from ..cache import Cache from ..cache import Cache
@ -1314,10 +1314,15 @@ def _test_recreate_interrupt(self, change_args, interrupt_early):
assert 'dir2/abcdef' in files assert 'dir2/abcdef' in files
assert 'file1' not in files assert 'file1' not in files
# The _test_create_interrupt requires a deterministic (alphabetic) order of the files to easily check if
# resumption works correctly. Patch scandir_inorder to work in alphabetic order.
def test_recreate_interrupt(self): def test_recreate_interrupt(self):
with patch.object(helpers, 'scandir_inorder', helpers.scandir_generic):
self._test_recreate_interrupt(False, True) self._test_recreate_interrupt(False, True)
def test_recreate_interrupt2(self): def test_recreate_interrupt2(self):
with patch.object(helpers, 'scandir_inorder', helpers.scandir_generic):
self._test_recreate_interrupt(True, False) self._test_recreate_interrupt(True, False)
def _test_recreate_chunker_interrupt_patch(self): def _test_recreate_chunker_interrupt_patch(self):