diff --git a/borg/archiver.py b/borg/archiver.py index e3abe1026..beee16052 100644 --- a/borg/archiver.py +++ b/borg/archiver.py @@ -17,7 +17,7 @@ import traceback from . import __version__ from .helpers import Error, location_validator, format_time, format_file_size, \ - format_file_mode, ExcludePattern, IncludePattern, exclude_path, adjust_patterns, to_localtime, timestamp, \ + format_file_mode, parse_pattern, IncludePattern, exclude_path, adjust_patterns, to_localtime, timestamp, \ get_cache_dir, get_keys_dir, prune_within, prune_split, unhexlify, \ Manifest, remove_surrogates, update_excludes, format_archive, check_extension_modules, Statistics, \ dir_is_tagged, bigint_to_int, ChunkerParams, CompressionSpec, is_slow_msgpack, yes, sysinfo, \ @@ -598,17 +598,43 @@ class Archiver: helptext = {} helptext['patterns'] = textwrap.dedent(''' - Exclude patterns use a variant of shell pattern syntax, with '*' matching any - number of characters, '?' matching any single character, '[...]' matching any - single character specified, including ranges, and '[!...]' matching any - character not specified. For the purpose of these patterns, the path - separator ('\\' for Windows and '/' on other systems) is not treated - specially. For a path to match a pattern, it must completely match from - start to end, or must match from the start to just before a path separator. - Except for the root path, paths will never end in the path separator when - matching is attempted. Thus, if a given pattern ends in a path separator, a - '*' is appended before matching is attempted. Patterns with wildcards should - be quoted to protect them from shell expansion. + Exclusion patterns support two separate styles, fnmatch and regular + expressions. If followed by a colon (':') the first two characters of + a pattern are used as a style selector. 
Explicit style selection is necessary + when regular expressions are desired or when the desired fnmatch pattern + starts with two alphanumeric characters followed by a colon (i.e. + `aa:something/*`). + + `Fnmatch <https://docs.python.org/3/library/fnmatch.html>`_ patterns use + a variant of shell pattern syntax, with '*' matching any number of + characters, '?' matching any single character, '[...]' matching any single + character specified, including ranges, and '[!...]' matching any character + not specified. The style selector is `fm`. For the purpose of these patterns, + the path separator ('\\' for Windows and '/' on other systems) is not treated + specially. For a path to match a pattern, it must completely match from start + to end, or must match from the start to just before a path separator. Except + for the root path, paths will never end in the path separator when matching + is attempted. Thus, if a given pattern ends in a path separator, a '*' is + appended before matching is attempted. + + Regular expressions similar to those found in Perl are supported with the + selection prefix `re:`. Unlike shell patterns regular expressions are not + required to match the complete path and any substring match is sufficient. It + is strongly recommended to anchor patterns to the start ('^'), to the end + ('$') or both. Path separators ('\\' for Windows and '/' on other systems) in + paths are always normalized to a forward slash ('/') before applying + a pattern. The regular expression syntax is described in the `Python + documentation for the re module + <https://docs.python.org/3/library/re.html>`_. + + Exclusions can be passed via the command line option `--exclude`. When used + from within a shell the patterns should be quoted to protect them from + expansion. + + The `--exclude-from` option permits loading exclusion patterns from a text + file with one pattern per line. Empty lines as well as lines starting with + the number sign ('#') are ignored. The optional style selector prefix is + also supported for patterns loaded from a file. 
Examples: @@ -624,6 +650,20 @@ class Archiver: # The file '/home/user/cache/important' is *not* backed up: $ borg create -e /home/user/cache/ backup / /home/user/cache/important + + # The contents of directories in '/home' are not backed up when their name + # ends in '.tmp' + $ borg create --exclude 're:^/home/[^/]+\.tmp/' backup / + + # Load exclusions from file + $ cat >exclude.txt < 2 and pattern[2] == ":" and pattern[:2].isalnum(): + (style, pattern) = (pattern[:2], pattern[3:]) + else: + style = _DEFAULT_PATTERN_STYLE + + cls = _PATTERN_STYLES.get(style, None) + + if cls is None: + raise ValueError("Unknown pattern style: {}".format(style)) + + return cls(pattern) + + def timestamp(s): """Convert a --timestamp=s argument to a datetime object""" try: diff --git a/borg/testsuite/archiver.py b/borg/testsuite/archiver.py index adb05e101..58d20c528 100644 --- a/borg/testsuite/archiver.py +++ b/borg/testsuite/archiver.py @@ -489,6 +489,79 @@ class ArchiverTestCase(ArchiverTestCaseBase): self.cmd('extract', '--exclude-from=' + self.exclude_file_path, self.repository_location + '::test') self.assert_equal(sorted(os.listdir('output/input')), ['file1', 'file3']) + def test_extract_include_exclude_regex(self): + self.cmd('init', self.repository_location) + self.create_regular_file('file1', size=1024 * 80) + self.create_regular_file('file2', size=1024 * 80) + self.create_regular_file('file3', size=1024 * 80) + self.create_regular_file('file4', size=1024 * 80) + self.create_regular_file('file333', size=1024 * 80) + + # Create with regular expression exclusion for file4 + self.cmd('create', '--exclude=re:input/file4$', self.repository_location + '::test', 'input') + with changedir('output'): + self.cmd('extract', self.repository_location + '::test') + self.assert_equal(sorted(os.listdir('output/input')), ['file1', 'file2', 'file3', 'file333']) + shutil.rmtree('output/input') + + # Extract with regular expression exclusion + with changedir('output'): + self.cmd('extract', 
'--exclude=re:file3+', self.repository_location + '::test') + self.assert_equal(sorted(os.listdir('output/input')), ['file1', 'file2']) + shutil.rmtree('output/input') + + # Combine --exclude with fnmatch and regular expression + with changedir('output'): + self.cmd('extract', '--exclude=input/file2', '--exclude=re:file[01]', self.repository_location + '::test') + self.assert_equal(sorted(os.listdir('output/input')), ['file3', 'file333']) + shutil.rmtree('output/input') + + # Combine --exclude-from and regular expression exclusion + with changedir('output'): + self.cmd('extract', '--exclude-from=' + self.exclude_file_path, '--exclude=re:file1', + '--exclude=re:file(\\d)\\1\\1$', self.repository_location + '::test') + self.assert_equal(sorted(os.listdir('output/input')), ['file3']) + + def test_extract_include_exclude_regex_from_file(self): + self.cmd('init', self.repository_location) + self.create_regular_file('file1', size=1024 * 80) + self.create_regular_file('file2', size=1024 * 80) + self.create_regular_file('file3', size=1024 * 80) + self.create_regular_file('file4', size=1024 * 80) + self.create_regular_file('file333', size=1024 * 80) + self.create_regular_file('aa:something', size=1024 * 80) + + # Create while excluding using mixed pattern styles + with open(self.exclude_file_path, 'wb') as fd: + fd.write(b're:input/file4$\n') + fd.write(b'fm:*aa:*thing\n') + + self.cmd('create', '--exclude-from=' + self.exclude_file_path, self.repository_location + '::test', 'input') + with changedir('output'): + self.cmd('extract', self.repository_location + '::test') + self.assert_equal(sorted(os.listdir('output/input')), ['file1', 'file2', 'file3', 'file333']) + shutil.rmtree('output/input') + + # Exclude using regular expression + with open(self.exclude_file_path, 'wb') as fd: + fd.write(b're:file3+\n') + + with changedir('output'): + self.cmd('extract', '--exclude-from=' + self.exclude_file_path, self.repository_location + '::test') + 
self.assert_equal(sorted(os.listdir('output/input')), ['file1', 'file2']) + shutil.rmtree('output/input') + + # Mixed exclude pattern styles + with open(self.exclude_file_path, 'wb') as fd: + fd.write(b're:file(\\d)\\1\\1$\n') + fd.write(b'fm:nothingwillmatchthis\n') + fd.write(b'*/file1\n') + fd.write(b're:file2$\n') + + with changedir('output'): + self.cmd('extract', '--exclude-from=' + self.exclude_file_path, self.repository_location + '::test') + self.assert_equal(sorted(os.listdir('output/input')), ['file3']) + def test_exclude_caches(self): self.cmd('init', self.repository_location) self.create_regular_file('file1', size=1024 * 80) diff --git a/borg/testsuite/helpers.py b/borg/testsuite/helpers.py index e766ed441..a61bdd28e 100644 --- a/borg/testsuite/helpers.py +++ b/borg/testsuite/helpers.py @@ -10,9 +10,9 @@ import msgpack import msgpack.fallback from ..helpers import adjust_patterns, exclude_path, Location, format_file_size, format_timedelta, IncludePattern, ExcludePattern, make_path_safe, \ - prune_within, prune_split, get_cache_dir, Statistics, is_slow_msgpack, yes, \ + prune_within, prune_split, get_cache_dir, Statistics, is_slow_msgpack, yes, ExcludeRegex, \ StableDict, int_to_bigint, bigint_to_int, parse_timestamp, CompressionSpec, ChunkerParams, \ - ProgressIndicatorPercent, ProgressIndicatorEndless, load_excludes + ProgressIndicatorPercent, ProgressIndicatorEndless, load_excludes, parse_pattern from . import BaseTestCase, environment_variable, FakeInputs @@ -160,6 +160,15 @@ class FormatTimedeltaTestCase(BaseTestCase): ) +def check_patterns(files, paths, excludes, expected): + """Utility for testing exclusion patterns. + """ + patterns = adjust_patterns(paths, excludes) + included = [path for path in files if not exclude_path(path, patterns)] + + assert included == (files if expected is None else expected) + + @pytest.mark.parametrize("paths, excludes, expected", [ # "None" means all files, i.e. 
none excluded ([], [], None), @@ -184,10 +193,44 @@ def test_patterns(paths, excludes, expected): '/var/log/messages', '/var/log/dmesg', ] - patterns = adjust_patterns(paths, [ExcludePattern(p) for p in excludes]) - included = [path for path in files if not exclude_path(path, patterns)] + check_patterns(files, paths, [ExcludePattern(p) for p in excludes], expected) - assert included == (files if expected is None else expected) + +@pytest.mark.parametrize("paths, excludes, expected", [ + # "None" means all files, i.e. none excluded + ([], [], None), + (['/'], [], None), + (['/'], ['.*'], []), + (['/'], ['^/'], []), + (['/'], ['^abc$'], None), + (['/'], ['^(?!/home/)'], + ['/home/user/.profile', '/home/user/.bashrc', '/home/user2/.profile', + '/home/user2/public_html/index.html']), + ]) +def test_patterns_regex(paths, excludes, expected): + files = [ + '/srv/data', '/foo/bar', '/home', + '/home/user/.profile', '/home/user/.bashrc', + '/home/user2/.profile', '/home/user2/public_html/index.html', + '/opt/log/messages.txt', '/opt/log/dmesg.txt', + ] + + patterns = [] + + for i in excludes: + pat = ExcludeRegex(i) + assert str(pat) == i + assert pat.pattern == i + patterns.append(pat) + + check_patterns(files, paths, patterns, expected) + + +def test_regex_pattern(): + # The forward slash must match the platform-specific path separator + assert ExcludeRegex("^/$").match("/") + assert ExcludeRegex("^/$").match(os.path.sep) + assert not ExcludeRegex(r"^\\$").match("/") @pytest.mark.skipif(sys.platform in ('darwin',), reason='all but OS X test') @@ -196,31 +239,40 @@ class PatternNonAsciiTestCase(BaseTestCase): pattern = 'b\N{LATIN SMALL LETTER A WITH ACUTE}' i = IncludePattern(pattern) e = ExcludePattern(pattern) + er = ExcludeRegex("^{}/foo$".format(pattern)) assert i.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo") assert not i.match("ba\N{COMBINING ACUTE ACCENT}/foo") assert e.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo") assert not e.match("ba\N{COMBINING ACUTE 
ACCENT}/foo") + assert er.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo") + assert not er.match("ba\N{COMBINING ACUTE ACCENT}/foo") def testDecomposedUnicode(self): pattern = 'ba\N{COMBINING ACUTE ACCENT}' i = IncludePattern(pattern) e = ExcludePattern(pattern) + er = ExcludeRegex("^{}/foo$".format(pattern)) assert not i.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo") assert i.match("ba\N{COMBINING ACUTE ACCENT}/foo") assert not e.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo") assert e.match("ba\N{COMBINING ACUTE ACCENT}/foo") + assert not er.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo") + assert er.match("ba\N{COMBINING ACUTE ACCENT}/foo") def testInvalidUnicode(self): pattern = str(b'ba\x80', 'latin1') i = IncludePattern(pattern) e = ExcludePattern(pattern) + er = ExcludeRegex("^{}/foo$".format(pattern)) assert not i.match("ba/foo") assert i.match(str(b"ba\x80/foo", 'latin1')) assert not e.match("ba/foo") assert e.match(str(b"ba\x80/foo", 'latin1')) + assert not er.match("ba/foo") + assert er.match(str(b"ba\x80/foo", 'latin1')) @pytest.mark.skipif(sys.platform not in ('darwin',), reason='OS X test') @@ -229,31 +281,40 @@ class OSXPatternNormalizationTestCase(BaseTestCase): pattern = 'b\N{LATIN SMALL LETTER A WITH ACUTE}' i = IncludePattern(pattern) e = ExcludePattern(pattern) + er = ExcludeRegex("^{}/foo$".format(pattern)) assert i.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo") assert i.match("ba\N{COMBINING ACUTE ACCENT}/foo") assert e.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo") assert e.match("ba\N{COMBINING ACUTE ACCENT}/foo") + assert er.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo") + assert er.match("ba\N{COMBINING ACUTE ACCENT}/foo") def testDecomposedUnicode(self): pattern = 'ba\N{COMBINING ACUTE ACCENT}' i = IncludePattern(pattern) e = ExcludePattern(pattern) + er = ExcludeRegex("^{}/foo$".format(pattern)) assert i.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo") assert i.match("ba\N{COMBINING ACUTE ACCENT}/foo") assert 
e.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo") assert e.match("ba\N{COMBINING ACUTE ACCENT}/foo") + assert er.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo") + assert er.match("ba\N{COMBINING ACUTE ACCENT}/foo") def testInvalidUnicode(self): pattern = str(b'ba\x80', 'latin1') i = IncludePattern(pattern) e = ExcludePattern(pattern) + er = ExcludeRegex("^{}/foo$".format(pattern)) assert not i.match("ba/foo") assert i.match(str(b"ba\x80/foo", 'latin1')) assert not e.match("ba/foo") assert e.match(str(b"ba\x80/foo", 'latin1')) + assert not er.match("ba/foo") + assert er.match(str(b"ba\x80/foo", 'latin1')) @pytest.mark.parametrize("lines, expected", [ @@ -271,6 +332,17 @@ class OSXPatternNormalizationTestCase(BaseTestCase): "", "# EOF"], ["/more/data", "/home"]), + (["re:.*"], []), + (["re:\s"], ["/data/something00.txt", "/more/data", "/home"]), + ([r"re:(.)(\1)"], ["/more/data", "/home", "/whitespace/at/end of filename \t "]), + (["", "", "", + "# This is a test with mixed pattern styles", + # Case-insensitive pattern + "re:(?i)BAR|ME$", + "", + "*whitespace*", + "fm:*/something00*"], + ["/more/data"]), ]) def test_patterns_from_file(tmpdir, lines, expected): files = [ @@ -291,6 +363,35 @@ def test_patterns_from_file(tmpdir, lines, expected): assert evaluate(str(exclfile)) == (files if expected is None else expected) +@pytest.mark.parametrize("pattern, cls", [ + ("", ExcludePattern), + + # Default style + ("*", ExcludePattern), + ("/data/*", ExcludePattern), + + # fnmatch style + ("fm:", ExcludePattern), + ("fm:*", ExcludePattern), + ("fm:/data/*", ExcludePattern), + ("fm:fm:/data/*", ExcludePattern), + + # Regular expression + ("re:", ExcludeRegex), + ("re:.*", ExcludeRegex), + ("re:^/something/", ExcludeRegex), + ("re:re:^/something/", ExcludeRegex), + ]) +def test_parse_pattern(pattern, cls): + assert isinstance(parse_pattern(pattern), cls) + + +@pytest.mark.parametrize("pattern", ["aa:", "fo:*", "00:", "x1:abc"]) +def test_parse_pattern_error(pattern): + 
with pytest.raises(ValueError): + parse_pattern(pattern) + + def test_compression_specs(): with pytest.raises(ValueError): CompressionSpec('') diff --git a/docs/usage.rst b/docs/usage.rst index a3f4bfa25..891aed17e 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -231,6 +231,11 @@ Examples ~/src \ --exclude '*.pyc' + # Backup home directories excluding image thumbnails (i.e. only + # /home/*/.thumbnails is excluded, not /home/*/*/.thumbnails) + $ borg create /mnt/backup::my-files /home \ + --exclude 're:^/home/[^/]+/\.thumbnails/' + # Backup the root filesystem into an archive named "root-YYYY-MM-DD" # use zlib compression (good, but slow) - default is no compression NAME="root-`date +%Y-%m-%d`"