diff --git a/borg/archive.py b/borg/archive.py index 94bfb5686..8d7e94f95 100644 --- a/borg/archive.py +++ b/borg/archive.py @@ -298,7 +298,19 @@ Number of files: {0.stats.nfiles}'''.format( cache.rollback() return stats - def extract_item(self, item, restore_attrs=True, dry_run=False, stdout=False, sparse=False): + def extract_item(self, item, restore_attrs=True, dry_run=False, stdout=False, sparse=False, + hardlink_masters=None, original_path=None): + """ + Extract archive item. + + :param item: the item to extract + :param restore_attrs: restore file attributes + :param dry_run: do not write any data + :param stdout: write extracted data to stdout + :param sparse: write sparse files (chunk-granularity, independent of the original being sparse) + :param hardlink_masters: maps paths to (chunks, link_target) for extracting subtrees with hardlinks correctly + :param original_path: b'path' key as stored in archive + """ if dry_run or stdout: if b'chunks' in item: for data in self.pipeline.fetch_many([c[0] for c in item[b'chunks']], is_preloaded=True): @@ -308,6 +320,7 @@ Number of files: {0.stats.nfiles}'''.format( sys.stdout.buffer.flush() return + original_path = original_path or item[b'path'] dest = self.cwd if item[b'path'].startswith('/') or item[b'path'].startswith('..'): raise Exception('Path should be relative and local') @@ -327,25 +340,36 @@ Number of files: {0.stats.nfiles}'''.format( if stat.S_ISREG(mode): if not os.path.exists(os.path.dirname(path)): os.makedirs(os.path.dirname(path)) + # Hard link? 
if b'source' in item: source = os.path.join(dest, item[b'source']) if os.path.exists(path): os.unlink(path) - os.link(source, path) - else: - with open(path, 'wb') as fd: - ids = [c[0] for c in item[b'chunks']] - for data in self.pipeline.fetch_many(ids, is_preloaded=True): - if sparse and self.zeros.startswith(data): - # all-zero chunk: create a hole in a sparse file - fd.seek(len(data), 1) - else: - fd.write(data) - pos = fd.tell() - fd.truncate(pos) - fd.flush() - self.restore_attrs(path, item, fd=fd.fileno()) + if not hardlink_masters: + os.link(source, path) + return + item[b'chunks'], link_target = hardlink_masters[item[b'source']] + if link_target: + # Hard link was extracted previously, just link + os.link(link_target, path) + return + # Extract chunks, since the item which had the chunks was not extracted + with open(path, 'wb') as fd: + ids = [c[0] for c in item[b'chunks']] + for data in self.pipeline.fetch_many(ids, is_preloaded=True): + if sparse and self.zeros.startswith(data): + # all-zero chunk: create a hole in a sparse file + fd.seek(len(data), 1) + else: + fd.write(data) + pos = fd.tell() + fd.truncate(pos) + fd.flush() + self.restore_attrs(path, item, fd=fd.fileno()) + if hardlink_masters: + # Update master entry with extracted file path, so that following hardlinks don't extract twice. 
+ hardlink_masters[item.get(b'source') or original_path] = (None, path) elif stat.S_ISDIR(mode): if not os.path.exists(path): os.makedirs(path) @@ -527,7 +551,10 @@ Number of files: {0.stats.nfiles}'''.format( source = self.hard_links.get((st.st_ino, st.st_dev)) if (st.st_ino, st.st_dev) in self.hard_links: item = self.stat_attrs(st, path) - item.update({b'path': safe_path, b'source': source}) + item.update({ + b'path': safe_path, + b'source': source, + }) self.add_item(item) status = 'h' # regular file, hardlink (to already seen inodes) return status @@ -549,7 +576,10 @@ Number of files: {0.stats.nfiles}'''.format( status = 'U' # regular file, unchanged else: status = 'A' # regular file, added - item = {b'path': safe_path} + item = { + b'path': safe_path, + b'hardlink_master': st.st_nlink > 1, # item is a hard link and has the chunks + } # Only chunkify the file if needed if chunks is None: fh = Archive._open_rb(path) @@ -587,7 +617,7 @@ Number of files: {0.stats.nfiles}'''.format( # this set must be kept complete, otherwise the RobustUnpacker might malfunction: -ITEM_KEYS = set([b'path', b'source', b'rdev', b'chunks', +ITEM_KEYS = set([b'path', b'source', b'rdev', b'chunks', b'hardlink_master', b'mode', b'user', b'group', b'uid', b'gid', b'mtime', b'atime', b'ctime', b'xattrs', b'bsdflags', b'acl_nfs4', b'acl_access', b'acl_default', b'acl_extended', ]) diff --git a/borg/archiver.py b/borg/archiver.py index 7b99c3bb5..9e07cf032 100644 --- a/borg/archiver.py +++ b/borg/archiver.py @@ -359,8 +359,20 @@ class Archiver: sparse = args.sparse strip_components = args.strip_components dirs = [] - for item in archive.iter_items(lambda item: matcher.match(item[b'path']), preload=True): + partial_extract = not matcher.empty() or strip_components + hardlink_masters = {} if partial_extract else None + + def item_is_hardlink_master(item): + return (partial_extract and stat.S_ISREG(item[b'mode']) and + item.get(b'hardlink_master', True) and b'source' not in item) + + for item 
in archive.iter_items(preload=True, + filter=lambda item: item_is_hardlink_master(item) or matcher.match(item[b'path'])): orig_path = item[b'path'] + if item_is_hardlink_master(item): + hardlink_masters[orig_path] = (item.get(b'chunks'), item.get(b'source')) + if not matcher.match(item[b'path']): + continue if strip_components: item[b'path'] = os.sep.join(orig_path.split(os.sep)[strip_components:]) if not item[b'path']: @@ -378,7 +390,8 @@ class Archiver: dirs.append(item) archive.extract_item(item, restore_attrs=False) else: - archive.extract_item(item, stdout=stdout, sparse=sparse) + archive.extract_item(item, stdout=stdout, sparse=sparse, hardlink_masters=hardlink_masters, + original_path=orig_path) except OSError as e: self.print_warning('%s: %s', remove_surrogates(orig_path), e) @@ -1205,6 +1218,15 @@ class Archiver: Both archives need to be in the same repository, and a repository location may only be specified for ARCHIVE1. + For archives created with Borg 1.1 or newer diff automatically detects whether + the archives are created with the same chunker params. If so, only chunk IDs + are compared, which is very fast. + + For archives prior to Borg 1.1 chunk contents are compared by default. + If you did not create the archives with different chunker params, + pass --same-chunker-params. + Note that the chunker params changed from Borg 0.xx to 1.0. + See the output of the "borg help patterns" command for more help on exclude patterns. """) subparser = subparsers.add_parser('diff', parents=[common_parser], @@ -1282,7 +1304,7 @@ class Archiver: See the "borg help patterns" command for more help on exclude patterns. 
- The following keys are available for --format: + The following keys are available for --format when listing files: """) + ItemFormatter.keys_help() subparser = subparsers.add_parser('list', parents=[common_parser], @@ -1309,7 +1331,7 @@ class Archiver: type=location_validator(), help='repository/archive to list contents of') subparser.add_argument('paths', metavar='PATH', nargs='*', type=str, - help='paths to extract; patterns are supported') + help='paths to list; patterns are supported') mount_epilog = textwrap.dedent(""" This command mounts an archive as a FUSE filesystem. This can be useful for diff --git a/borg/helpers.py b/borg/helpers.py index 0e9cba7de..a53023ae5 100644 --- a/borg/helpers.py +++ b/borg/helpers.py @@ -293,6 +293,9 @@ class PatternMatcher: # Value to return from match function when none of the patterns match. self.fallback = fallback + def empty(self): + return not len(self._items) + def add(self, patterns, value): """Add list of patterns to internal list. The given value is returned from the match function when one of the given patterns matches. 
@@ -1125,16 +1128,27 @@ class ItemFormatter: 'NL': os.linesep, } KEY_DESCRIPTIONS = { - 'NEWLINE': 'OS dependent line separator', - 'NL': 'alias of NEWLINE', - 'NUL': 'NUL character for creating print0 / xargs -0 like ouput, see bpath', - 'csize': 'compressed size', 'bpath': 'verbatim POSIX path, can contain any character except NUL', 'path': 'path interpreted as text (might be missing non-text characters, see bpath)', 'source': 'link target for links (identical to linktarget)', + 'extra': 'prepends {source} with " -> " for soft links and " link to " for hard links', + + 'csize': 'compressed size', 'num_chunks': 'number of chunks in this file', 'unique_chunks': 'number of unique chunks in this file', + + 'NEWLINE': 'OS dependent line separator', + 'NL': 'alias of NEWLINE', + 'NUL': 'NUL character for creating print0 / xargs -0 like ouput, see bpath', } + KEY_GROUPS = ( + ('type', 'mode', 'uid', 'gid', 'user', 'group', 'path', 'bpath', 'source', 'linktarget'), + ('size', 'csize', 'num_chunks', 'unique_chunks'), + ('mtime', 'ctime', 'atime', 'isomtime', 'isoctime', 'isoatime'), + tuple(sorted(hashlib.algorithms_guaranteed)), + ('archiveid', 'archivename', 'extra'), + ('NEWLINE', 'NL', 'NUL', 'SPACE', 'TAB', 'CR', 'LF'), + ) @classmethod def available_keys(cls): @@ -1149,16 +1163,21 @@ class ItemFormatter: keys = [] keys.extend(formatter.call_keys.keys()) keys.extend(formatter.get_item_data(fake_item).keys()) - return sorted(keys, key=lambda s: (s.isupper(), s)) + return keys @classmethod def keys_help(cls): help = [] - for key in cls.available_keys(): - text = " - " + key - if key in cls.KEY_DESCRIPTIONS: - text += ": " + cls.KEY_DESCRIPTIONS[key] - help.append(text) + keys = cls.available_keys() + for group in cls.KEY_GROUPS: + for key in group: + keys.remove(key) + text = " - " + key + if key in cls.KEY_DESCRIPTIONS: + text += ": " + cls.KEY_DESCRIPTIONS[key] + help.append(text) + help.append("") + assert not keys, str(keys) return "\n".join(help) def 
__init__(self, archive, format): diff --git a/borg/testsuite/archiver.py b/borg/testsuite/archiver.py index 924a1e540..99f85a3b9 100644 --- a/borg/testsuite/archiver.py +++ b/borg/testsuite/archiver.py @@ -467,6 +467,49 @@ class ArchiverTestCase(ArchiverTestCaseBase): with self.assert_creates_file('input/dir/file'): self.cmd('extract', self.repository_location + '::test', '--strip-components', '0') + def _extract_hardlinks_setup(self): + os.mkdir(os.path.join(self.input_path, 'dir1')) + os.mkdir(os.path.join(self.input_path, 'dir1/subdir')) + + self.create_regular_file('source') + os.link(os.path.join(self.input_path, 'source'), + os.path.join(self.input_path, 'abba')) + os.link(os.path.join(self.input_path, 'source'), + os.path.join(self.input_path, 'dir1/hardlink')) + os.link(os.path.join(self.input_path, 'source'), + os.path.join(self.input_path, 'dir1/subdir/hardlink')) + + self.create_regular_file('dir1/source2') + os.link(os.path.join(self.input_path, 'dir1/source2'), + os.path.join(self.input_path, 'dir1/aaaa')) + + self.cmd('init', self.repository_location) + self.cmd('create', self.repository_location + '::test', 'input') + + def test_strip_components_links(self): + self._extract_hardlinks_setup() + with changedir('output'): + self.cmd('extract', self.repository_location + '::test', '--strip-components', '2') + assert os.stat('hardlink').st_nlink == 2 + assert os.stat('subdir/hardlink').st_nlink == 2 + assert os.stat('aaaa').st_nlink == 2 + assert os.stat('source2').st_nlink == 2 + with changedir('output'): + self.cmd('extract', self.repository_location + '::test') + assert os.stat('input/dir1/hardlink').st_nlink == 4 + + def test_extract_hardlinks(self): + self._extract_hardlinks_setup() + with changedir('output'): + self.cmd('extract', self.repository_location + '::test', 'input/dir1') + assert os.stat('input/dir1/hardlink').st_nlink == 2 + assert os.stat('input/dir1/subdir/hardlink').st_nlink == 2 + assert os.stat('input/dir1/aaaa').st_nlink == 2 + 
assert os.stat('input/dir1/source2').st_nlink == 2 + with changedir('output'): + self.cmd('extract', self.repository_location + '::test') + assert os.stat('input/dir1/hardlink').st_nlink == 4 + def test_extract_include_exclude(self): self.cmd('init', self.repository_location) self.create_regular_file('file1', size=1024 * 80) diff --git a/docs/usage.rst b/docs/usage.rst index b8284e254..70db163d6 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -374,6 +374,52 @@ Examples ... + +.. include:: usage/diff.rst.inc + +Examples +~~~~~~~~ +:: + + $ borg init testrepo + $ mkdir testdir + $ cd testdir + $ echo asdf > file1 + $ dd if=/dev/urandom bs=1M count=4 > file2 + $ touch file3 + $ borg create ../testrepo::archive1 . + + $ chmod a+x file1 + $ echo "something" >> file2 + $ borg create ../testrepo::archive2 . + + $ rm file3 + $ touch file4 + $ borg create ../testrepo::archive3 . + + $ cd .. + $ borg diff testrepo::archive1 archive2 + file1 different mode + archive1 -rw-r--r-- + archive2 -rwxr-xr-x + file2 different contents + +28 B, -31 B, 4.19 MB, 4.19 MB + + $ borg diff testrepo::archive2 archive3 + file3 different contents + +0 B, -0 B, 0 B, + + $ borg diff testrepo::archive1 archive3 + file1 different mode + archive1 -rw-r--r-- + archive3 -rwxr-xr-x + file2 different contents + +28 B, -31 B, 4.19 MB, 4.19 MB + file3 different contents + +0 B, -0 B, 0 B, + file4 different contents + +0 B, -0 B, , 0 B + .. include:: usage/delete.rst.inc Examples diff --git a/docs/usage/diff.rst.inc b/docs/usage/diff.rst.inc index a74ef35ac..ba07b8203 100644 --- a/docs/usage/diff.rst.inc +++ b/docs/usage/diff.rst.inc @@ -48,4 +48,13 @@ This command finds differences in files (contents, user, group, mode) between ar Both archives need to be in the same repository, and a repository location may only be specified for ARCHIVE1. +For archives created with Borg 1.1 or newer diff automatically detects whether +the archives are created with the same chunker params. 
If so, only chunk IDs +are compared, which is very fast. + +For archives prior to Borg 1.1 chunk contents are compared by default. +If you did not create the archives with different chunker params, +pass --same-chunker-params. +Note that the chunker params changed from Borg 0.xx to 1.0. + See the output of the "borg help patterns" command for more help on exclude patterns. diff --git a/docs/usage/list.rst.inc b/docs/usage/list.rst.inc index 02e40b70a..b1996a6b5 100644 --- a/docs/usage/list.rst.inc +++ b/docs/usage/list.rst.inc @@ -6,15 +6,16 @@ borg list usage: borg list [-h] [-v] [--debug] [--lock-wait N] [--show-version] [--show-rc] [--no-files-cache] [--umask M] - [--remote-path PATH] [--short] [--list-format LISTFORMAT] - [-P PREFIX] - [REPOSITORY_OR_ARCHIVE] + [--remote-path PATH] [--short] [--format FORMAT] [-P PREFIX] + [-e PATTERN] [--exclude-from EXCLUDEFILE] + [REPOSITORY_OR_ARCHIVE] [PATH [PATH ...]] List archive or repository contents positional arguments: REPOSITORY_OR_ARCHIVE repository/archive to list contents of + PATH paths to list; patterns are supported optional arguments: -h, --help show this help message and exit @@ -30,15 +31,64 @@ borg list --umask M set umask to M (local and remote, default: 0077) --remote-path PATH set remote path to executable (default: "borg") --short only print file/directory names, nothing else - --list-format LISTFORMAT - specify format for archive file listing (default: - "{mode} {user:6} {group:6} {size:8d} {isomtime} - {path}{extra}{NEWLINE}") Special "{formatkeys}" exists - to list available keys + --format FORMAT, --list-format FORMAT + specify format for file listing (default: "{mode} + {user:6} {group:6} {size:8d} {isomtime} + {path}{extra}{NL}") -P PREFIX, --prefix PREFIX only consider archive names starting with this prefix + -e PATTERN, --exclude PATTERN + exclude paths matching PATTERN + --exclude-from EXCLUDEFILE + read exclude patterns from EXCLUDEFILE, one per line Description ~~~~~~~~~~~ This command 
lists the contents of a repository or an archive. + +See the "borg help patterns" command for more help on exclude patterns. + +The following keys are available for --format when listing files: + + - type + - mode + - uid + - gid + - user + - group + - path: path interpreted as text (might be missing non-text characters, see bpath) + - bpath: verbatim POSIX path, can contain any character except NUL + - source: link target for links (identical to linktarget) + - linktarget + + - size + - csize: compressed size + - num_chunks: number of chunks in this file + - unique_chunks: number of unique chunks in this file + + - mtime + - ctime + - atime + - isomtime + - isoctime + - isoatime + + - md5 + - sha1 + - sha224 + - sha256 + - sha384 + - sha512 + + - archiveid + - archivename + - extra: prepends {source} with " -> " for soft links and " link to " for hard links + + - NEWLINE: OS dependent line separator + - NL: alias of NEWLINE + - NUL: NUL character for creating print0 / xargs -0 like output, see bpath + - SPACE + - TAB + - CR + - LF