Merge pull request #584 from hansmi/shell-pattern

Add shell-style pattern syntax
This commit is contained in:
TW 2016-01-22 20:14:37 +01:00
commit 068c68b24c
6 changed files with 290 additions and 30 deletions

View File

@ -628,38 +628,50 @@ class Archiver:
helptext = {}
helptext['patterns'] = textwrap.dedent('''
Exclusion patterns support three separate styles, fnmatch, regular
Exclusion patterns support four separate styles, fnmatch, shell, regular
expressions and path prefixes. If followed by a colon (':') the first two
characters of a pattern are used as a style selector. Explicit style
selection is necessary when a non-default style is desired or when the
desired pattern starts with two alphanumeric characters followed by a colon
(i.e. `aa:something/*`).
`Fnmatch <https://docs.python.org/3/library/fnmatch.html>`_ patterns use
a variant of shell pattern syntax, with '*' matching any number of
characters, '?' matching any single character, '[...]' matching any single
character specified, including ranges, and '[!...]' matching any character
not specified. The style selector is `fm`. For the purpose of these patterns,
the path separator ('\\' for Windows and '/' on other systems) is not treated
specially. For a path to match a pattern, it must completely match from start
to end, or must match from the start to just before a path separator. Except
for the root path, paths will never end in the path separator when matching
is attempted. Thus, if a given pattern ends in a path separator, a '*' is
appended before matching is attempted.
`Fnmatch <https://docs.python.org/3/library/fnmatch.html>`_, selector `fm:`
Regular expressions similar to those found in Perl are supported with the
selection prefix `re:`. Unlike shell patterns regular expressions are not
required to match the complete path and any substring match is sufficient. It
is strongly recommended to anchor patterns to the start ('^'), to the end
('$') or both. Path separators ('\\' for Windows and '/' on other systems) in
paths are always normalized to a forward slash ('/') before applying
a pattern. The regular expression syntax is described in the `Python
documentation for the re module
<https://docs.python.org/3/library/re.html>`_.
These patterns use a variant of shell pattern syntax, with '*' matching
any number of characters, '?' matching any single character, '[...]'
matching any single character specified, including ranges, and '[!...]'
matching any character not specified. For the purpose of these patterns,
the path separator ('\\' for Windows and '/' on other systems) is not
treated specially. Wrap meta-characters in brackets for a literal match
(i.e. `[?]` to match the literal character `?`). For a path to match
a pattern, it must completely match from start to end, or must match from
the start to just before a path separator. Except for the root path,
paths will never end in the path separator when matching is attempted.
Thus, if a given pattern ends in a path separator, a '*' is appended
before matching is attempted.
Prefix path patterns can be selected with the prefix `pp:`. This pattern
style is useful to match whole sub-directories. The pattern `pp:/data/bar`
matches `/data/bar` and everything therein.
Shell-style patterns, selector `sh:`
Like fnmatch patterns these are similar to shell patterns. The difference
is that the pattern may include `**/` for matching zero or more directory
levels, and that `*` matches zero or more arbitrary characters with the
exception of any path separator.
Regular expressions, selector `re:`
Regular expressions similar to those found in Perl are supported. Unlike
shell patterns regular expressions are not required to match the complete
path and any substring match is sufficient. It is strongly recommended to
anchor patterns to the start ('^'), to the end ('$') or both. Path
separators ('\\' for Windows and '/' on other systems) in paths are
always normalized to a forward slash ('/') before applying a pattern. The
regular expression syntax is described in the `Python documentation for
the re module <https://docs.python.org/3/library/re.html>`_.
Prefix path, selector `pp:`
This pattern style is useful to match whole sub-directories. The pattern
`pp:/data/bar` matches `/data/bar` and everything therein.
Exclusions can be passed via the command line option `--exclude`. When used
from within a shell the patterns should be quoted to protect them from
@ -698,6 +710,7 @@ class Archiver:
*.tmp
fm:aa:something/*
re:^/home/[^/]\.tmp/
sh:/home/*/.thumbnails
EOF
$ borg create --exclude-from exclude.txt backup /
''')

View File

@ -30,6 +30,7 @@ from . import __version__ as borg_version
from . import hashindex
from . import chunker
from . import crypto
from . import shellpattern
import msgpack
import msgpack.fallback
@ -332,11 +333,9 @@ class PatternBase:
raise NotImplementedError
# For both PathPrefixPattern and FnmatchPattern, we require that
# the pattern either match the whole path or an initial segment
# of the path up to but not including a path separator. To
# unify the two cases, we add a path separator to the end of
# the path before matching.
# For PathPrefixPattern, FnmatchPattern and ShellPattern, we require that the pattern either match the whole path
# or an initial segment of the path up to but not including a path separator. To unify the two cases, we add a path
# separator to the end of the path before matching.
class PathPrefixPattern(PatternBase):
@ -376,6 +375,27 @@ class FnmatchPattern(PatternBase):
return (self.regex.match(path + os.path.sep) is not None)
class ShellPattern(PatternBase):
    """Exclude paths by shell-style glob pattern.

    A pattern ending in the path separator excludes what is inside the
    directory, but leaves the directory itself included.
    """
    PREFIX = "sh"

    def _prepare(self, pattern):
        # Normalize the pattern first, then extend it with "**<sep>*" so that
        # everything below a matching path is matched as well.
        sep = os.path.sep
        normalized = os.path.normpath(pattern)
        recursive_tail = sep + "**" + sep + "*"

        if pattern.endswith(sep):
            # Trailing separator: the extended pattern itself ends in a
            # separator.
            normalized = normalized.rstrip(sep) + recursive_tail + sep
        else:
            normalized = normalized + recursive_tail

        self.pattern = normalized
        self.regex = re.compile(shellpattern.translate(self.pattern))

    def _match(self, path):
        # A separator is appended so that "whole path" and "leading segment
        # followed by a separator" are handled by the same expression.
        found = self.regex.match(path + os.path.sep)
        return found is not None
class RegexPattern(PatternBase):
"""Regular expression to exclude.
"""
@ -397,6 +417,7 @@ _PATTERN_STYLES = set([
FnmatchPattern,
PathPrefixPattern,
RegexPattern,
ShellPattern,
])
_PATTERN_STYLE_BY_PREFIX = dict((i.PREFIX, i) for i in _PATTERN_STYLES)

62
borg/shellpattern.py Normal file
View File

@ -0,0 +1,62 @@
import re
import os
def translate(pat):
    """Translate a shell-style pattern to a regular expression.

    The pattern may include "**<sep>" (<sep> stands for the platform-specific path separator; "/" on POSIX systems) for
    matching zero or more directory levels and "*" for matching zero or more arbitrary characters with the exception of
    any path separator. Wrap meta-characters in brackets for a literal match (i.e. "[?]" to match the literal character
    "?").

    The returned string is a regular expression source. It starts with the inline flags "(?ms)" and is anchored at the
    end of the string, so it is meant to be used with ``re.match``.

    This function is derived from the "fnmatch" module distributed with the Python standard library.

    Copyright (C) 2001-2016 Python Software Foundation. All rights reserved.

    TODO: support {alt1,alt2} shell-style alternatives

    """
    sep = os.path.sep
    sep_re = re.escape(sep)
    # Loop-invariant fragments, built once.
    any_in_name = "[^%s]*" % sep_re                      # "*": stays within one path component
    one_in_name = "[^%s]" % sep_re                       # "?": one character, never the separator
    any_dirs = "(?:[^%s]*%s)*" % (sep_re, sep_re)        # "**<sep>": zero or more whole directories

    n = len(pat)
    i = 0
    parts = []

    while i < n:
        c = pat[i]
        i += 1

        if c == "*":
            if i + 1 < n and pat[i] == "*" and pat[i + 1] == sep:
                # **/ == wildcard for 0+ full (relative) directory names with trailing slashes; the forward slash
                # stands for the platform-specific path separator
                parts.append(any_dirs)
                i += 2
            else:
                # * == wildcard for name parts (does not cross path separator)
                parts.append(any_in_name)

        elif c == "?":
            # ? == any single character excluding path separator
            parts.append(one_in_name)

        elif c == "[":
            # Find the closing bracket; a "]" directly after "[" or "[!" is part of the set, not its end.
            j = i
            if j < n and pat[j] == "!":
                j += 1
            if j < n and pat[j] == "]":
                j += 1
            while j < n and pat[j] != "]":
                j += 1

            if j >= n:
                # Unterminated set: treat the opening bracket as a literal character.
                parts.append("\\[")
            else:
                stuff = pat[i:j].replace("\\", "\\\\")
                i = j + 1

                if stuff[0] == "!":
                    # Shell negation becomes regex negation.
                    stuff = "^" + stuff[1:]
                elif stuff[0] == "^":
                    # A leading "^" is literal in shell syntax, so it must be escaped for the regex.
                    stuff = "\\" + stuff

                parts.append("[%s]" % stuff)

        else:
            parts.append(re.escape(c))

    # Global inline flags must come first: placing them at the end is deprecated since Python 3.6 and rejected with
    # re.error since Python 3.11.
    return "(?ms)" + "".join(parts) + r"\Z"

View File

@ -12,7 +12,8 @@ import msgpack.fallback
from ..helpers import Location, format_file_size, format_timedelta, PathPrefixPattern, FnmatchPattern, make_path_safe, \
prune_within, prune_split, get_cache_dir, Statistics, is_slow_msgpack, yes, RegexPattern, \
StableDict, int_to_bigint, bigint_to_int, parse_timestamp, CompressionSpec, ChunkerParams, \
ProgressIndicatorPercent, ProgressIndicatorEndless, load_excludes, parse_pattern, PatternMatcher
ProgressIndicatorPercent, ProgressIndicatorEndless, load_excludes, parse_pattern, PatternMatcher, \
ShellPattern
from . import BaseTestCase, environment_variable, FakeInputs
@ -234,6 +235,45 @@ def test_patterns_fnmatch(pattern, expected):
check_patterns(files, FnmatchPattern(pattern), expected)
@pytest.mark.parametrize("pattern, expected", [
    # "None" means all files, i.e. all match the given pattern
    ("*", None),
    ("**/*", None),
    ("/**/*", None),
    ("/./*", None),
    ("*/*", None),
    ("*///*", None),
    ("/home/u", []),
    ("/home/*",
     ["/home/user/.profile", "/home/user/.bashrc", "/home/user2/.profile", "/home/user2/public_html/index.html",
      "/home/foo/.thumbnails", "/home/foo/bar/.thumbnails"]),
    ("/home/user/*", ["/home/user/.profile", "/home/user/.bashrc"]),
    ("/etc/*/*", ["/etc/server/config", "/etc/server/hosts"]),
    ("/etc/**/*", ["/etc/server/config", "/etc/server/hosts"]),
    ("/etc/**/*/*", ["/etc/server/config", "/etc/server/hosts"]),
    ("*/.pr????e", []),
    ("**/.pr????e", ["/home/user/.profile", "/home/user2/.profile"]),
    ("///etc//////*", ["/etc/server/config", "/etc/server/hosts"]),
    ("/./home//..//home/user2/", ["/home/user2/.profile", "/home/user2/public_html/index.html"]),
    ("/./home//..//home/user2/**/*", ["/home/user2/.profile", "/home/user2/public_html/index.html"]),
    ("/srv*/", ["/srv/messages", "/srv/dmesg", "/srv2/blafasel"]),
    ("/srv*", ["/srv", "/srv/messages", "/srv/dmesg", "/srv2", "/srv2/blafasel"]),
    ("/srv/*", ["/srv/messages", "/srv/dmesg"]),
    ("/srv2/**", ["/srv2", "/srv2/blafasel"]),
    ("/srv2/**/", ["/srv2/blafasel"]),
    ("/home/*/.thumbnails", ["/home/foo/.thumbnails"]),
    ("/home/*/*/.thumbnails", ["/home/foo/bar/.thumbnails"]),
])
def test_patterns_shell(pattern, expected):
    """Run a ShellPattern built from each pattern over a fixed list of paths."""
    candidate_paths = [
        "/etc/server/config", "/etc/server/hosts", "/home", "/home/user/.profile", "/home/user/.bashrc",
        "/home/user2/.profile", "/home/user2/public_html/index.html", "/srv", "/srv/messages", "/srv/dmesg",
        "/srv2", "/srv2/blafasel", "/home/foo/.thumbnails", "/home/foo/bar/.thumbnails",
    ]
    shell_pattern = ShellPattern(pattern)

    check_patterns(candidate_paths, shell_pattern, expected)
@pytest.mark.parametrize("pattern, expected", [
# "None" means all files, i.e. all match the given pattern
("", None),
@ -276,6 +316,7 @@ def _make_test_patterns(pattern):
return [PathPrefixPattern(pattern),
FnmatchPattern(pattern),
RegexPattern("^{}/foo$".format(pattern)),
ShellPattern(pattern),
]
@ -374,6 +415,12 @@ def test_patterns_from_file(tmpdir, lines, expected):
("pp:/", PathPrefixPattern),
("pp:/data/", PathPrefixPattern),
("pp:pp:/data/", PathPrefixPattern),
# Shell-pattern style
("sh:", ShellPattern),
("sh:*", ShellPattern),
("sh:/data/*", ShellPattern),
("sh:sh:/data/*", ShellPattern),
])
def test_parse_pattern(pattern, cls):
assert isinstance(parse_pattern(pattern), cls)

View File

@ -0,0 +1,113 @@
import re
import pytest
from .. import shellpattern
def check(path, pattern):
    """Return True if the regex translated from the shell pattern matches path from its start."""
    regex_source = shellpattern.translate(pattern)
    return re.compile(regex_source).match(path) is not None
@pytest.mark.parametrize("path, patterns", [
    # Literal string
    ("foo/bar", ["foo/bar"]),
    ("foo\\bar", ["foo\\bar"]),

    # Non-ASCII
    ("foo/c/\u0152/e/bar", ["foo/*/\u0152/*/bar", "*/*/\u0152/*/*", "**/\u0152/*/*"]),
    ("\u00e4\u00f6\u00dc", ["???", "*", "\u00e4\u00f6\u00dc", "[\u00e4][\u00f6][\u00dc]"]),

    # Question mark
    ("foo", ["fo?"]),
    ("foo", ["f?o"]),
    ("foo", ["f??"]),
    ("foo", ["?oo"]),
    ("foo", ["?o?"]),
    ("foo", ["??o"]),
    ("foo", ["???"]),

    # Single asterisk
    ("", ["*"]),
    ("foo", ["*", "**", "***"]),
    ("foo", ["foo*"]),
    ("foobar", ["foo*"]),
    ("foobar", ["foo*bar"]),
    ("foobarbaz", ["foo*baz"]),
    ("bar", ["*bar"]),
    ("foobar", ["*bar"]),
    ("foo/bar", ["foo/*bar"]),
    ("foo/bar", ["foo/*ar"]),
    ("foo/bar", ["foo/*r"]),
    ("foo/bar", ["foo/*"]),
    ("foo/bar", ["foo*/bar"]),
    ("foo/bar", ["fo*/bar"]),
    ("foo/bar", ["f*/bar"]),
    ("foo/bar", ["*/bar"]),

    # Double asterisk (matches 0..n directory layers)
    ("foo/bar", ["foo/**/bar"]),
    ("foo/1/bar", ["foo/**/bar"]),
    ("foo/1/22/333/bar", ["foo/**/bar"]),
    ("foo/", ["foo/**/"]),
    ("foo/1/", ["foo/**/"]),
    ("foo/1/22/333/", ["foo/**/"]),
    ("bar", ["**/bar"]),
    ("1/bar", ["**/bar"]),
    ("1/22/333/bar", ["**/bar"]),
    ("foo/bar/baz", ["foo/**/*"]),

    # Set
    ("foo1", ["foo[12]"]),
    ("foo2", ["foo[12]"]),
    ("foo2/bar", ["foo[12]/*"]),
    ("f??f", ["f??f", "f[?][?]f"]),
    ("foo]", ["foo[]]"]),

    # Inverted set; only "!" negates, so "[^!]" is a plain set holding "^" and "!"
    ("foo3", ["foo[!12]"]),
    ("foo^", ["foo[^!]"]),
    ("foo!", ["foo[^!]"]),
])
def test_match(path, patterns):
    """Each pattern in the list has to match the given path."""
    for candidate in patterns:
        assert check(path, candidate)
@pytest.mark.parametrize("path, patterns", [
    ("", ["?", "[]"]),
    ("foo", ["foo?"]),
    ("foo", ["?foo"]),
    ("foo", ["f?oo"]),

    # do not match path separator
    ("foo/ar", ["foo?ar"]),

    # do not match/cross over os.path.sep
    ("foo/bar", ["*"]),
    ("foo/bar", ["foo*bar"]),
    ("foo/bar", ["foo*ar"]),
    ("foo/bar", ["fo*bar"]),
    ("foo/bar", ["fo*ar"]),

    # Double asterisk
    ("foobar", ["foo/**/bar"]),

    # Two asterisks without slash do not match directory separator
    ("foo/bar", ["**"]),

    # Double asterisk not matching filename
    ("foo/bar", ["**/"]),

    # Set
    ("foo3", ["foo[12]"]),

    # Inverted set
    ("foo1", ["foo[!12]"]),
    ("foo2", ["foo[!12]"]),
])
def test_mismatch(path, patterns):
    """None of the patterns in the list may match the given path."""
    for candidate in patterns:
        assert not check(path, candidate)

View File

@ -240,6 +240,10 @@ Examples
$ borg create /mnt/backup::my-files /home \
--exclude 're:^/home/[^/]+/\.thumbnails/'
# Do the same using a shell-style pattern
$ borg create /mnt/backup::my-files /home \
--exclude 'sh:/home/*/.thumbnails'
# Backup the root filesystem into an archive named "root-YYYY-MM-DD"
# use zlib compression (good, but slow) - default is no compression
NAME="root-`date +%Y-%m-%d`"