From c7fb598ab9d25701fd87dfcb568bd2beafc69616 Mon Sep 17 00:00:00 2001 From: Michael Hanselmann Date: Thu, 21 Jan 2016 14:24:35 +0100 Subject: [PATCH] Add shell-style pattern syntax MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The fnmatch module in Python's standard library implements a pattern format for paths which is similar to shell patterns. However, “*” matches any character including path separators. This newly introduced pattern syntax with the selector “sh” no longer matches the path separator with “*”. Instead “**/” can be used to match zero or more directory levels. --- borg/archiver.py | 20 ++++-- borg/helpers.py | 31 +++++++-- borg/shellpattern.py | 62 ++++++++++++++++++ borg/testsuite/helpers.py | 49 +++++++++++++- borg/testsuite/shellpattern.py | 113 +++++++++++++++++++++++++++++++++ docs/usage.rst | 4 ++ 6 files changed, 268 insertions(+), 11 deletions(-) create mode 100644 borg/shellpattern.py create mode 100644 borg/testsuite/shellpattern.py diff --git a/borg/archiver.py b/borg/archiver.py index a076f7fa0..8005d73bc 100644 --- a/borg/archiver.py +++ b/borg/archiver.py @@ -642,11 +642,20 @@ class Archiver: matching any single character specified, including ranges, and '[!...]' matching any character not specified. For the purpose of these patterns, the path separator ('\\' for Windows and '/' on other systems) is not - treated specially. For a path to match a pattern, it must completely - match from start to end, or must match from the start to just before - a path separator. Except for the root path, paths will never end in the - path separator when matching is attempted. Thus, if a given pattern ends - in a path separator, a '*' is appended before matching is attempted. + treated specially. Wrap meta-characters in brackets for a literal match + (i.e. `[?]` to match the literal character `?`). 
For a path to match + a pattern, it must completely match from start to end, or must match from + the start to just before a path separator. Except for the root path, + paths will never end in the path separator when matching is attempted. + Thus, if a given pattern ends in a path separator, a '*' is appended + before matching is attempted. + + Shell-style patterns, selector `sh:` + + Like fnmatch patterns these are similar to shell patterns. The difference + is that the pattern may include `**/` for matching zero or more directory + levels, `*` for matching zero or more arbitrary characters with the + exception of any path separator. Regular expressions, selector `re:` @@ -701,6 +710,7 @@ class Archiver: *.tmp fm:aa:something/* re:^/home/[^/]\.tmp/ + sh:/home/*/.thumbnails EOF $ borg create --exclude-from exclude.txt backup / ''') diff --git a/borg/helpers.py b/borg/helpers.py index 91c9e0434..764d1dc4a 100644 --- a/borg/helpers.py +++ b/borg/helpers.py @@ -30,6 +30,7 @@ from . import __version__ as borg_version from . import hashindex from . import chunker from . import crypto +from . import shellpattern import msgpack import msgpack.fallback @@ -332,11 +333,9 @@ class PatternBase: raise NotImplementedError -# For both PathPrefixPattern and FnmatchPattern, we require that -# the pattern either match the whole path or an initial segment -# of the path up to but not including a path separator. To -# unify the two cases, we add a path separator to the end of -# the path before matching. +# For PathPrefixPattern, FnmatchPattern and ShellPattern, we require that the pattern either match the whole path +# or an initial segment of the path up to but not including a path separator. To unify the two cases, we add a path +# separator to the end of the path before matching. 
class ShellPattern(PatternBase):
    """Exclude paths by shell-style glob pattern (selector ``sh``).

    A pattern ending in the path separator excludes what is inside a
    directory while leaving the directory itself alone.
    """
    PREFIX = "sh"

    def _prepare(self, pattern):
        # Compile *pattern* once; the compiled expression is reused by _match().
        sep = os.path.sep
        normalized = os.path.normpath(pattern)

        # Append "**<sep>*" so the pattern also covers everything below the
        # location it names. With a trailing separator in the user's pattern
        # the result keeps a trailing separator as well.
        if pattern.endswith(sep):
            pieces = (normalized.rstrip(sep), "**", "*", "")
        else:
            pieces = (normalized, "**", "*")

        self.pattern = sep.join(pieces)
        self.regex = re.compile(shellpattern.translate(self.pattern))

    def _match(self, path):
        # The candidate path gets a trailing separator so that "whole path"
        # and "leading segment up to a separator" are matched the same way.
        candidate = path + os.path.sep
        found = self.regex.match(candidate)
        return found is not None
def translate(pat):
    """Translate a shell-style pattern to a regular expression.

    The pattern may include "**<sep>" (<sep> stands for the platform-specific path separator; "/" on POSIX systems)
    for matching zero or more directory levels, "*" for matching zero or more arbitrary characters with the
    exception of any path separator, and "?" for matching exactly one character other than the path separator.
    "[seq]" matches any character in seq and "[!seq]" any character not in seq. Wrap meta-characters in brackets
    for a literal match (i.e. "[?]" to match the literal character "?").

    The returned expression is anchored at the end of the string only; use it with ``re.match`` to anchor the
    start as well.

    This function is derived from the "fnmatch" module distributed with the Python standard library.

    Copyright (C) 2001-2016 Python Software Foundation. All rights reserved.

    TODO: support {alt1,alt2} shell-style alternatives

    :param pat: shell-style pattern
    :return: source text of the equivalent regular expression
    """
    sep = os.path.sep
    # Escape the separator once so that a backslash separator cannot leak into the expression unescaped.
    sep_re = re.escape(sep)
    not_sep = "[^%s]" % sep_re
    n = len(pat)
    i = 0
    parts = []

    while i < n:
        c = pat[i]
        i += 1

        if c == "*":
            if pat.startswith("*" + sep, i):
                # **/ == wildcard for 0+ full (relative) directory names with trailing slashes; the forward slash
                # stands for the platform-specific path separator
                parts.append("(?:%s*%s)*" % (not_sep, sep_re))
                i += 2
            else:
                # * == wildcard for name parts (does not cross path separator)
                parts.append(not_sep + "*")
        elif c == "?":
            # ? == any single character excluding path separator
            parts.append(not_sep)
        elif c == "[":
            # Find the closing bracket; a "]" directly after "[" or "[!" is a literal member of the set.
            j = i
            if j < n and pat[j] == "!":
                j += 1
            if j < n and pat[j] == "]":
                j += 1
            while j < n and pat[j] != "]":
                j += 1
            if j >= n:
                # No closing bracket: treat "[" as a literal character.
                parts.append("\\[")
            else:
                stuff = pat[i:j].replace("\\", "\\\\")
                i = j + 1
                if stuff[0] == "!":
                    # Shell negation "!" becomes regex negation "^".
                    stuff = "^" + stuff[1:]
                elif stuff[0] == "^":
                    # A leading "^" is a literal member in shell syntax, so escape it.
                    stuff = "\\" + stuff
                parts.append("[%s]" % stuff)
        else:
            parts.append(re.escape(c))

    # Global inline flags must come first: trailing "(?ms)" is deprecated since Python 3.6 and rejected by 3.11+.
    return "(?ms)" + "".join(parts) + r"\Z"
# Each case pairs a shell-style pattern with the subset of `files` (declared in the test body) that is expected
# to match it. ShellPattern normalizes the pattern with os.path.normpath, which is why the cases with repeated
# slashes, "/./" and "/../" are expected to behave like their cleaned-up forms.
@pytest.mark.parametrize("pattern, expected", [
    # "None" means all files, i.e. all match the given pattern
    ("*", None),
    ("**/*", None),
    ("/**/*", None),
    ("/./*", None),
    ("*/*", None),
    ("*///*", None),
    ("/home/u", []),
    ("/home/*",
     ["/home/user/.profile", "/home/user/.bashrc", "/home/user2/.profile", "/home/user2/public_html/index.html",
      "/home/foo/.thumbnails", "/home/foo/bar/.thumbnails"]),
    ("/home/user/*", ["/home/user/.profile", "/home/user/.bashrc"]),
    ("/etc/*/*", ["/etc/server/config", "/etc/server/hosts"]),
    ("/etc/**/*", ["/etc/server/config", "/etc/server/hosts"]),
    ("/etc/**/*/*", ["/etc/server/config", "/etc/server/hosts"]),
    ("*/.pr????e", []),
    ("**/.pr????e", ["/home/user/.profile", "/home/user2/.profile"]),
    ("///etc//////*", ["/etc/server/config", "/etc/server/hosts"]),
    ("/./home//..//home/user2/", ["/home/user2/.profile", "/home/user2/public_html/index.html"]),
    ("/./home//..//home/user2/**/*", ["/home/user2/.profile", "/home/user2/public_html/index.html"]),
    ("/srv*/", ["/srv/messages", "/srv/dmesg", "/srv2/blafasel"]),
    ("/srv*", ["/srv", "/srv/messages", "/srv/dmesg", "/srv2", "/srv2/blafasel"]),
    ("/srv/*", ["/srv/messages", "/srv/dmesg"]),
    ("/srv2/**", ["/srv2", "/srv2/blafasel"]),
    ("/srv2/**/", ["/srv2/blafasel"]),
    ("/home/*/.thumbnails", ["/home/foo/.thumbnails"]),
    ("/home/*/*/.thumbnails", ["/home/foo/bar/.thumbnails"]),
    ])
def test_patterns_shell(pattern, expected):
    """Check which of a fixed set of paths a ShellPattern built from *pattern* selects."""
    # Candidate paths offered to the pattern; "expected" lists are subsets of these.
    files = [
        "/etc/server/config", "/etc/server/hosts", "/home", "/home/user/.profile", "/home/user/.bashrc",
        "/home/user2/.profile", "/home/user2/public_html/index.html", "/srv", "/srv/messages", "/srv/dmesg",
        "/srv2", "/srv2/blafasel", "/home/foo/.thumbnails", "/home/foo/bar/.thumbnails",
    ]

    # NOTE(review): check_patterns is defined outside this view; presumably it compares the matched files
    # with `expected` and treats None as "all files" -- confirm against its definition.
    check_patterns(files, ShellPattern(pattern), expected)
import re

import pytest

from .. import shellpattern


def check(path, pattern):
    """Return True if *path* matches the expression that shellpattern.translate() builds for *pattern*."""
    compiled = re.compile(shellpattern.translate(pattern))

    # re.match only anchors at the start of the string.
    return bool(compiled.match(path))


# NOTE: the cases below spell the path separator as "/" while translate() works with os.path.sep, so they
# presuppose a platform whose separator is "/".
@pytest.mark.parametrize("path, patterns", [
    # Literal string
    ("foo/bar", ["foo/bar"]),
    ("foo\\bar", ["foo\\bar"]),

    # Non-ASCII
    ("foo/c/\u0152/e/bar", ["foo/*/\u0152/*/bar", "*/*/\u0152/*/*", "**/\u0152/*/*"]),
    ("\u00e4\u00f6\u00dc", ["???", "*", "\u00e4\u00f6\u00dc", "[\u00e4][\u00f6][\u00dc]"]),

    # Question mark
    ("foo", ["fo?"]),
    ("foo", ["f?o"]),
    ("foo", ["f??"]),
    ("foo", ["?oo"]),
    ("foo", ["?o?"]),
    ("foo", ["??o"]),
    ("foo", ["???"]),

    # Single asterisk
    ("", ["*"]),
    ("foo", ["*", "**", "***"]),
    ("foo", ["foo*"]),
    ("foobar", ["foo*"]),
    ("foobar", ["foo*bar"]),
    ("foobarbaz", ["foo*baz"]),
    ("bar", ["*bar"]),
    ("foobar", ["*bar"]),
    ("foo/bar", ["foo/*bar"]),
    ("foo/bar", ["foo/*ar"]),
    ("foo/bar", ["foo/*r"]),
    ("foo/bar", ["foo/*"]),
    ("foo/bar", ["foo*/bar"]),
    ("foo/bar", ["fo*/bar"]),
    ("foo/bar", ["f*/bar"]),
    ("foo/bar", ["*/bar"]),

    # Double asterisk (matches 0..n directory layers)
    ("foo/bar", ["foo/**/bar"]),
    ("foo/1/bar", ["foo/**/bar"]),
    ("foo/1/22/333/bar", ["foo/**/bar"]),
    ("foo/", ["foo/**/"]),
    ("foo/1/", ["foo/**/"]),
    ("foo/1/22/333/", ["foo/**/"]),
    ("bar", ["**/bar"]),
    ("1/bar", ["**/bar"]),
    ("1/22/333/bar", ["**/bar"]),
    ("foo/bar/baz", ["foo/**/*"]),

    # Set
    ("foo1", ["foo[12]"]),
    ("foo2", ["foo[12]"]),
    ("foo2/bar", ["foo[12]/*"]),
    ("f??f", ["f??f", "f[?][?]f"]),
    ("foo]", ["foo[]]"]),

    # Inverted set
    ("foo3", ["foo[!12]"]),
    ("foo^", ["foo[^!]"]),
    ("foo!", ["foo[^!]"]),
    ])
def test_match(path, patterns):
    """Every pattern listed for *path* must match it."""
    for p in patterns:
        assert check(path, p)


@pytest.mark.parametrize("path, patterns", [
    ("", ["?", "[]"]),
    ("foo", ["foo?"]),
    ("foo", ["?foo"]),
    ("foo", ["f?oo"]),

    # do not match path separator
    ("foo/ar", ["foo?ar"]),

    # do not match/cross over os.path.sep
    ("foo/bar", ["*"]),
    ("foo/bar", ["foo*bar"]),
    ("foo/bar", ["foo*ar"]),
    ("foo/bar", ["fo*bar"]),
    ("foo/bar", ["fo*ar"]),

    # Double asterisk
    ("foobar", ["foo/**/bar"]),

    # Two asterisks without slash do not match directory separator
    ("foo/bar", ["**"]),

    # Double asterisk not matching filename
    ("foo/bar", ["**/"]),

    # Set
    ("foo3", ["foo[12]"]),

    # Inverted set
    ("foo1", ["foo[!12]"]),
    ("foo2", ["foo[!12]"]),
    ])
def test_mismatch(path, patterns):
    """None of the patterns listed for *path* may match it."""
    for p in patterns:
        assert not check(path, p)