From 2bafece093b6c261f5124f59e9f1c2e53c68bd61 Mon Sep 17 00:00:00 2001 From: Michael Hanselmann Date: Wed, 16 Dec 2015 00:14:02 +0100 Subject: [PATCH] Implement exclusions using regular expressions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The existing option to exclude files and directories, “--exclude”, is implemented using fnmatch[1]. fnmatch matches the slash (“/”) with “*” and thus makes it impossible to write patterns where a directory with a given name should be excluded at a specific depth in the directory hierarchy, but not anywhere else. Consider this structure: home/ home/aaa home/aaa/.thumbnails home/user home/user/img home/user/img/.thumbnails fnmatch incorrectly excludes “home/user/img/.thumbnails” with a pattern of “home/*/.thumbnails” when the intention is to exclude “.thumbnails” in all home directories while retaining directories with the same name in all other locations. With this change regular expressions are introduced as an additional pattern syntax. The syntax is selected using a prefix on “--exclude”'s value. “re:” is for regular expressions and “fm:”, the default, selects fnmatch. Selecting the syntax is necessary when regular expressions are desired or when the desired fnmatch pattern starts with two alphanumeric characters followed by a colon (e.g. “aa:something/*”). The exclusion described above can be implemented as follows: --exclude 're:^home/[^/]+/\.thumbnails$' The “--exclude-from” option permits loading exclusions from a text file where the same prefixes can now be used, e.g. “re:\.tmp$”. The documentation has been extended and now not only describes the two pattern styles, but also the file format supported by “--exclude-from”. This change has been discussed in issue #43 and in change request #497.
[1] https://docs.python.org/3/library/fnmatch.html Signed-off-by: Michael Hanselmann --- borg/archiver.py | 68 ++++++++++++++++++----- borg/helpers.py | 42 +++++++++++++- borg/testsuite/archiver.py | 73 ++++++++++++++++++++++++ borg/testsuite/helpers.py | 111 +++++++++++++++++++++++++++++++++++-- docs/usage.rst | 5 ++ 5 files changed, 278 insertions(+), 21 deletions(-) diff --git a/borg/archiver.py b/borg/archiver.py index e3abe1026..beee16052 100644 --- a/borg/archiver.py +++ b/borg/archiver.py @@ -17,7 +17,7 @@ import traceback from . import __version__ from .helpers import Error, location_validator, format_time, format_file_size, \ - format_file_mode, ExcludePattern, IncludePattern, exclude_path, adjust_patterns, to_localtime, timestamp, \ + format_file_mode, parse_pattern, IncludePattern, exclude_path, adjust_patterns, to_localtime, timestamp, \ get_cache_dir, get_keys_dir, prune_within, prune_split, unhexlify, \ Manifest, remove_surrogates, update_excludes, format_archive, check_extension_modules, Statistics, \ dir_is_tagged, bigint_to_int, ChunkerParams, CompressionSpec, is_slow_msgpack, yes, sysinfo, \ @@ -598,17 +598,43 @@ class Archiver: helptext = {} helptext['patterns'] = textwrap.dedent(''' - Exclude patterns use a variant of shell pattern syntax, with '*' matching any - number of characters, '?' matching any single character, '[...]' matching any - single character specified, including ranges, and '[!...]' matching any - character not specified. For the purpose of these patterns, the path - separator ('\\' for Windows and '/' on other systems) is not treated - specially. For a path to match a pattern, it must completely match from - start to end, or must match from the start to just before a path separator. - Except for the root path, paths will never end in the path separator when - matching is attempted. Thus, if a given pattern ends in a path separator, a - '*' is appended before matching is attempted. 
Patterns with wildcards should - be quoted to protect them from shell expansion. + Exclusion patterns support two separate styles, fnmatch and regular + expressions. If followed by a colon (':') the first two characters of + a pattern are used as a style selector. Explicit style selection is necessary + when regular expressions are desired or when the desired fnmatch pattern + starts with two alphanumeric characters followed by a colon (i.e. + `aa:something/*`). + + `Fnmatch <https://docs.python.org/3/library/fnmatch.html>`_ patterns use + a variant of shell pattern syntax, with '*' matching any number of + characters, '?' matching any single character, '[...]' matching any single + character specified, including ranges, and '[!...]' matching any character + not specified. The style selector is `fm`. For the purpose of these patterns, + the path separator ('\\' for Windows and '/' on other systems) is not treated + specially. For a path to match a pattern, it must completely match from start + to end, or must match from the start to just before a path separator. Except + for the root path, paths will never end in the path separator when matching + is attempted. Thus, if a given pattern ends in a path separator, a '*' is + appended before matching is attempted. + + Regular expressions similar to those found in Perl are supported with the + selection prefix `re:`. Unlike shell patterns regular expressions are not + required to match the complete path and any substring match is sufficient. It + is strongly recommended to anchor patterns to the start ('^'), to the end + ('$') or both. Path separators ('\\' for Windows and '/' on other systems) in + paths are always normalized to a forward slash ('/') before applying + a pattern. The regular expression syntax is described in the `Python + documentation for the re module + <https://docs.python.org/3/library/re.html>`_. + + Exclusions can be passed via the command line option `--exclude`. When used + from within a shell the patterns should be quoted to protect them from + expansion.
+ + The `--exclude-from` option permits loading exclusion patterns from a text + file with one pattern per line. Empty lines as well as lines starting with + the number sign ('#') are ignored. The optional style selector prefix is + also supported for patterns loaded from a file. Examples: @@ -624,6 +650,20 @@ class Archiver: # The file '/home/user/cache/important' is *not* backed up: $ borg create -e /home/user/cache/ backup / /home/user/cache/important + + # The contents of directories in '/home' are not backed up when their name + # ends in '.tmp' + $ borg create --exclude 're:^/home/[^/]+\.tmp/' backup / + + # Load exclusions from file + $ cat >exclude.txt < 2 and pattern[2] == ":" and pattern[:2].isalnum(): + (style, pattern) = (pattern[:2], pattern[3:]) + else: + style = _DEFAULT_PATTERN_STYLE + + cls = _PATTERN_STYLES.get(style, None) + + if cls is None: + raise ValueError("Unknown pattern style: {}".format(style)) + + return cls(pattern) + + def timestamp(s): """Convert a --timestamp=s argument to a datetime object""" try: diff --git a/borg/testsuite/archiver.py b/borg/testsuite/archiver.py index adb05e101..58d20c528 100644 --- a/borg/testsuite/archiver.py +++ b/borg/testsuite/archiver.py @@ -489,6 +489,79 @@ class ArchiverTestCase(ArchiverTestCaseBase): self.cmd('extract', '--exclude-from=' + self.exclude_file_path, self.repository_location + '::test') self.assert_equal(sorted(os.listdir('output/input')), ['file1', 'file3']) + def test_extract_include_exclude_regex(self): + self.cmd('init', self.repository_location) + self.create_regular_file('file1', size=1024 * 80) + self.create_regular_file('file2', size=1024 * 80) + self.create_regular_file('file3', size=1024 * 80) + self.create_regular_file('file4', size=1024 * 80) + self.create_regular_file('file333', size=1024 * 80) + + # Create with regular expression exclusion for file4 + self.cmd('create', '--exclude=re:input/file4$', self.repository_location + '::test', 'input') + with changedir('output'): + 
self.cmd('extract', self.repository_location + '::test') + self.assert_equal(sorted(os.listdir('output/input')), ['file1', 'file2', 'file3', 'file333']) + shutil.rmtree('output/input') + + # Extract with regular expression exclusion + with changedir('output'): + self.cmd('extract', '--exclude=re:file3+', self.repository_location + '::test') + self.assert_equal(sorted(os.listdir('output/input')), ['file1', 'file2']) + shutil.rmtree('output/input') + + # Combine --exclude with fnmatch and regular expression + with changedir('output'): + self.cmd('extract', '--exclude=input/file2', '--exclude=re:file[01]', self.repository_location + '::test') + self.assert_equal(sorted(os.listdir('output/input')), ['file3', 'file333']) + shutil.rmtree('output/input') + + # Combine --exclude-from and regular expression exclusion + with changedir('output'): + self.cmd('extract', '--exclude-from=' + self.exclude_file_path, '--exclude=re:file1', + '--exclude=re:file(\\d)\\1\\1$', self.repository_location + '::test') + self.assert_equal(sorted(os.listdir('output/input')), ['file3']) + + def test_extract_include_exclude_regex_from_file(self): + self.cmd('init', self.repository_location) + self.create_regular_file('file1', size=1024 * 80) + self.create_regular_file('file2', size=1024 * 80) + self.create_regular_file('file3', size=1024 * 80) + self.create_regular_file('file4', size=1024 * 80) + self.create_regular_file('file333', size=1024 * 80) + self.create_regular_file('aa:something', size=1024 * 80) + + # Create while excluding using mixed pattern styles + with open(self.exclude_file_path, 'wb') as fd: + fd.write(b're:input/file4$\n') + fd.write(b'fm:*aa:*thing\n') + + self.cmd('create', '--exclude-from=' + self.exclude_file_path, self.repository_location + '::test', 'input') + with changedir('output'): + self.cmd('extract', self.repository_location + '::test') + self.assert_equal(sorted(os.listdir('output/input')), ['file1', 'file2', 'file3', 'file333']) + shutil.rmtree('output/input') + 
+ # Exclude using regular expression + with open(self.exclude_file_path, 'wb') as fd: + fd.write(b're:file3+\n') + + with changedir('output'): + self.cmd('extract', '--exclude-from=' + self.exclude_file_path, self.repository_location + '::test') + self.assert_equal(sorted(os.listdir('output/input')), ['file1', 'file2']) + shutil.rmtree('output/input') + + # Mixed exclude pattern styles + with open(self.exclude_file_path, 'wb') as fd: + fd.write(b're:file(\\d)\\1\\1$\n') + fd.write(b'fm:nothingwillmatchthis\n') + fd.write(b'*/file1\n') + fd.write(b're:file2$\n') + + with changedir('output'): + self.cmd('extract', '--exclude-from=' + self.exclude_file_path, self.repository_location + '::test') + self.assert_equal(sorted(os.listdir('output/input')), ['file3']) + def test_exclude_caches(self): self.cmd('init', self.repository_location) self.create_regular_file('file1', size=1024 * 80) diff --git a/borg/testsuite/helpers.py b/borg/testsuite/helpers.py index e766ed441..a61bdd28e 100644 --- a/borg/testsuite/helpers.py +++ b/borg/testsuite/helpers.py @@ -10,9 +10,9 @@ import msgpack import msgpack.fallback from ..helpers import adjust_patterns, exclude_path, Location, format_file_size, format_timedelta, IncludePattern, ExcludePattern, make_path_safe, \ - prune_within, prune_split, get_cache_dir, Statistics, is_slow_msgpack, yes, \ + prune_within, prune_split, get_cache_dir, Statistics, is_slow_msgpack, yes, ExcludeRegex, \ StableDict, int_to_bigint, bigint_to_int, parse_timestamp, CompressionSpec, ChunkerParams, \ - ProgressIndicatorPercent, ProgressIndicatorEndless, load_excludes + ProgressIndicatorPercent, ProgressIndicatorEndless, load_excludes, parse_pattern from . import BaseTestCase, environment_variable, FakeInputs @@ -160,6 +160,15 @@ class FormatTimedeltaTestCase(BaseTestCase): ) +def check_patterns(files, paths, excludes, expected): + """Utility for testing exclusion patterns. 
+ """ + patterns = adjust_patterns(paths, excludes) + included = [path for path in files if not exclude_path(path, patterns)] + + assert included == (files if expected is None else expected) + + @pytest.mark.parametrize("paths, excludes, expected", [ # "None" means all files, i.e. none excluded ([], [], None), @@ -184,10 +193,44 @@ def test_patterns(paths, excludes, expected): '/var/log/messages', '/var/log/dmesg', ] - patterns = adjust_patterns(paths, [ExcludePattern(p) for p in excludes]) - included = [path for path in files if not exclude_path(path, patterns)] + check_patterns(files, paths, [ExcludePattern(p) for p in excludes], expected) - assert included == (files if expected is None else expected) + +@pytest.mark.parametrize("paths, excludes, expected", [ + # "None" means all files, i.e. none excluded + ([], [], None), + (['/'], [], None), + (['/'], ['.*'], []), + (['/'], ['^/'], []), + (['/'], ['^abc$'], None), + (['/'], ['^(?!/home/)'], + ['/home/user/.profile', '/home/user/.bashrc', '/home/user2/.profile', + '/home/user2/public_html/index.html']), + ]) +def test_patterns_regex(paths, excludes, expected): + files = [ + '/srv/data', '/foo/bar', '/home', + '/home/user/.profile', '/home/user/.bashrc', + '/home/user2/.profile', '/home/user2/public_html/index.html', + '/opt/log/messages.txt', '/opt/log/dmesg.txt', + ] + + patterns = [] + + for i in excludes: + pat = ExcludeRegex(i) + assert str(pat) == i + assert pat.pattern == i + patterns.append(pat) + + check_patterns(files, paths, patterns, expected) + + +def test_regex_pattern(): + # The forward slash must match the platform-specific path separator + assert ExcludeRegex("^/$").match("/") + assert ExcludeRegex("^/$").match(os.path.sep) + assert not ExcludeRegex(r"^\\$").match("/") @pytest.mark.skipif(sys.platform in ('darwin',), reason='all but OS X test') @@ -196,31 +239,40 @@ class PatternNonAsciiTestCase(BaseTestCase): pattern = 'b\N{LATIN SMALL LETTER A WITH ACUTE}' i = IncludePattern(pattern) e = 
ExcludePattern(pattern) + er = ExcludeRegex("^{}/foo$".format(pattern)) assert i.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo") assert not i.match("ba\N{COMBINING ACUTE ACCENT}/foo") assert e.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo") assert not e.match("ba\N{COMBINING ACUTE ACCENT}/foo") + assert er.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo") + assert not er.match("ba\N{COMBINING ACUTE ACCENT}/foo") def testDecomposedUnicode(self): pattern = 'ba\N{COMBINING ACUTE ACCENT}' i = IncludePattern(pattern) e = ExcludePattern(pattern) + er = ExcludeRegex("^{}/foo$".format(pattern)) assert not i.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo") assert i.match("ba\N{COMBINING ACUTE ACCENT}/foo") assert not e.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo") assert e.match("ba\N{COMBINING ACUTE ACCENT}/foo") + assert not er.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo") + assert er.match("ba\N{COMBINING ACUTE ACCENT}/foo") def testInvalidUnicode(self): pattern = str(b'ba\x80', 'latin1') i = IncludePattern(pattern) e = ExcludePattern(pattern) + er = ExcludeRegex("^{}/foo$".format(pattern)) assert not i.match("ba/foo") assert i.match(str(b"ba\x80/foo", 'latin1')) assert not e.match("ba/foo") assert e.match(str(b"ba\x80/foo", 'latin1')) + assert not er.match("ba/foo") + assert er.match(str(b"ba\x80/foo", 'latin1')) @pytest.mark.skipif(sys.platform not in ('darwin',), reason='OS X test') @@ -229,31 +281,40 @@ class OSXPatternNormalizationTestCase(BaseTestCase): pattern = 'b\N{LATIN SMALL LETTER A WITH ACUTE}' i = IncludePattern(pattern) e = ExcludePattern(pattern) + er = ExcludeRegex("^{}/foo$".format(pattern)) assert i.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo") assert i.match("ba\N{COMBINING ACUTE ACCENT}/foo") assert e.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo") assert e.match("ba\N{COMBINING ACUTE ACCENT}/foo") + assert er.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo") + assert er.match("ba\N{COMBINING ACUTE ACCENT}/foo") def 
testDecomposedUnicode(self): pattern = 'ba\N{COMBINING ACUTE ACCENT}' i = IncludePattern(pattern) e = ExcludePattern(pattern) + er = ExcludeRegex("^{}/foo$".format(pattern)) assert i.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo") assert i.match("ba\N{COMBINING ACUTE ACCENT}/foo") assert e.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo") assert e.match("ba\N{COMBINING ACUTE ACCENT}/foo") + assert er.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo") + assert er.match("ba\N{COMBINING ACUTE ACCENT}/foo") def testInvalidUnicode(self): pattern = str(b'ba\x80', 'latin1') i = IncludePattern(pattern) e = ExcludePattern(pattern) + er = ExcludeRegex("^{}/foo$".format(pattern)) assert not i.match("ba/foo") assert i.match(str(b"ba\x80/foo", 'latin1')) assert not e.match("ba/foo") assert e.match(str(b"ba\x80/foo", 'latin1')) + assert not er.match("ba/foo") + assert er.match(str(b"ba\x80/foo", 'latin1')) @pytest.mark.parametrize("lines, expected", [ @@ -271,6 +332,17 @@ class OSXPatternNormalizationTestCase(BaseTestCase): "", "# EOF"], ["/more/data", "/home"]), + (["re:.*"], []), + (["re:\s"], ["/data/something00.txt", "/more/data", "/home"]), + ([r"re:(.)(\1)"], ["/more/data", "/home", "/whitespace/at/end of filename \t "]), + (["", "", "", + "# This is a test with mixed pattern styles", + # Case-insensitive pattern + "re:(?i)BAR|ME$", + "", + "*whitespace*", + "fm:*/something00*"], + ["/more/data"]), ]) def test_patterns_from_file(tmpdir, lines, expected): files = [ @@ -291,6 +363,35 @@ def test_patterns_from_file(tmpdir, lines, expected): assert evaluate(str(exclfile)) == (files if expected is None else expected) +@pytest.mark.parametrize("pattern, cls", [ + ("", ExcludePattern), + + # Default style + ("*", ExcludePattern), + ("/data/*", ExcludePattern), + + # fnmatch style + ("fm:", ExcludePattern), + ("fm:*", ExcludePattern), + ("fm:/data/*", ExcludePattern), + ("fm:fm:/data/*", ExcludePattern), + + # Regular expression + ("re:", ExcludeRegex), + ("re:.*", ExcludeRegex), 
+ ("re:^/something/", ExcludeRegex), + ("re:re:^/something/", ExcludeRegex), + ]) +def test_parse_pattern(pattern, cls): + assert isinstance(parse_pattern(pattern), cls) + + +@pytest.mark.parametrize("pattern", ["aa:", "fo:*", "00:", "x1:abc"]) +def test_parse_pattern_error(pattern): + with pytest.raises(ValueError): + parse_pattern(pattern) + + def test_compression_specs(): with pytest.raises(ValueError): CompressionSpec('') diff --git a/docs/usage.rst b/docs/usage.rst index a3f4bfa25..891aed17e 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -231,6 +231,11 @@ Examples ~/src \ --exclude '*.pyc' + # Backup home directories excluding image thumbnails (i.e. only + # /home/*/.thumbnails is excluded, not /home/*/*/.thumbnails) + $ borg create /mnt/backup::my-files /home \ + --exclude 're:^/home/[^/]+/\.thumbnails/' + # Backup the root filesystem into an archive named "root-YYYY-MM-DD" # use zlib compression (good, but slow) - default is no compression NAME="root-`date +%Y-%m-%d`"