Add shell-style pattern syntax

The fnmatch module in Python's standard library implements a pattern
format for paths which is similar to shell patterns. However, its “*”
matches any character, including path separators. In the newly
introduced pattern syntax, selected with the prefix “sh”, “*” does not
match the path separator. Instead, “**/” can be used to match zero or
more directory levels.
This commit is contained in:
Michael Hanselmann 2016-01-21 14:24:35 +01:00
parent 382b79212b
commit c7fb598ab9
6 changed files with 268 additions and 11 deletions

View File

@ -642,11 +642,20 @@ class Archiver:
matching any single character specified, including ranges, and '[!...]'
matching any character not specified. For the purpose of these patterns,
the path separator ('\\' for Windows and '/' on other systems) is not
treated specially. For a path to match a pattern, it must completely
match from start to end, or must match from the start to just before
a path separator. Except for the root path, paths will never end in the
path separator when matching is attempted. Thus, if a given pattern ends
in a path separator, a '*' is appended before matching is attempted.
treated specially. Wrap meta-characters in brackets for a literal match
(i.e. `[?]` to match the literal character `?`). For a path to match
a pattern, it must completely match from start to end, or must match from
the start to just before a path separator. Except for the root path,
paths will never end in the path separator when matching is attempted.
Thus, if a given pattern ends in a path separator, a '*' is appended
before matching is attempted.
Shell-style patterns, selector `sh:`
Like fnmatch patterns these are similar to shell patterns. The difference
is that the pattern may include `**/` for matching zero or more directory
levels, `*` for matching zero or more arbitrary characters with the
exception of any path separator.
Regular expressions, selector `re:`
@ -701,6 +710,7 @@ class Archiver:
*.tmp
fm:aa:something/*
re:^/home/[^/]\.tmp/
sh:/home/*/.thumbnails
EOF
$ borg create --exclude-from exclude.txt backup /
''')

View File

@ -30,6 +30,7 @@ from . import __version__ as borg_version
from . import hashindex
from . import chunker
from . import crypto
from . import shellpattern
import msgpack
import msgpack.fallback
@ -332,11 +333,9 @@ class PatternBase:
raise NotImplementedError
# For both PathPrefixPattern and FnmatchPattern, we require that
# the pattern either match the whole path or an initial segment
# of the path up to but not including a path separator. To
# unify the two cases, we add a path separator to the end of
# the path before matching.
# For PathPrefixPattern, FnmatchPattern and ShellPattern, we require that the pattern either match the whole path
# or an initial segment of the path up to but not including a path separator. To unify the two cases, we add a path
# separator to the end of the path before matching.
class PathPrefixPattern(PatternBase):
@ -376,6 +375,27 @@ class FnmatchPattern(PatternBase):
return (self.regex.match(path + os.path.sep) is not None)
class ShellPattern(PatternBase):
    """Exclude paths using shell-style glob patterns (selector "sh").

    A pattern ending in the path separator excludes what is inside a
    directory while leaving the directory itself alone.
    """
    PREFIX = "sh"

    def _prepare(self, pattern):
        # Normalise the pattern, then extend it with "<sep>**<sep>*" so that
        # it also covers everything below the path it names. A trailing
        # separator in the user's pattern is carried over to the very end.
        sep = os.path.sep
        contents_only = pattern.endswith(sep)

        base = os.path.normpath(pattern)
        if contents_only:
            base = base.rstrip(sep)

        tail_parts = ["", "**", "*"]
        if contents_only:
            tail_parts.append("")

        self.pattern = base + sep.join(tail_parts)
        self.regex = re.compile(shellpattern.translate(self.pattern))

    def _match(self, path):
        # The separator is appended so that a pattern matching "up to a path
        # separator" and one matching the whole path are handled the same way.
        candidate = path + os.path.sep
        return self.regex.match(candidate) is not None
class RegexPattern(PatternBase):
"""Regular expression to exclude.
"""
@ -397,6 +417,7 @@ _PATTERN_STYLES = set([
FnmatchPattern,
PathPrefixPattern,
RegexPattern,
ShellPattern,
])
# Lookup table from selector prefix (each class's PREFIX attribute) to the pattern class.
_PATTERN_STYLE_BY_PREFIX = {style.PREFIX: style for style in _PATTERN_STYLES}

62
borg/shellpattern.py Normal file
View File

@ -0,0 +1,62 @@
import re
import os
def translate(pat):
    """Translate a shell-style pattern to a regular expression.

    The pattern may include "**<sep>" (<sep> stands for the platform-specific path separator; "/" on POSIX systems) for
    matching zero or more directory levels and "*" for matching zero or more arbitrary characters with the exception of
    any path separator. "?" matches exactly one character other than the path separator. "[...]" matches one of the
    listed characters and "[!...]" one character that is not listed. Wrap meta-characters in brackets for a literal
    match (i.e. "[?]" to match the literal character "?"). An opening bracket without a matching closing bracket is
    treated as a literal character.

    Note that bracket expressions are copied into the regular expression as they are; a negated set such as "[!a]" can
    therefore match the path separator.

    :param pat: shell-style pattern (str)
    :return: source of the regular expression (str). It is anchored at the end of the input; use it with a
        "match"-style call to anchor it at the start as well.

    This function is derived from the "fnmatch" module distributed with the Python standard library.

    Copyright (C) 2001-2016 Python Software Foundation. All rights reserved.

    TODO: support {alt1,alt2} shell-style alternatives
    """
    sep = os.path.sep
    # Escape the separator once; on Windows it is a backslash, which is special inside and outside character classes.
    sep_re = re.escape(sep)
    n = len(pat)
    i = 0
    res = []

    while i < n:
        c = pat[i]
        i += 1

        if c == "*":
            if i + 1 < n and pat[i] == "*" and pat[i + 1] == sep:
                # **/ == wildcard for 0+ full (relative) directory names with trailing slashes; the forward slash stands
                # for the platform-specific path separator
                res.append("(?:[^%s]*%s)*" % (sep_re, sep_re))
                i += 2
            else:
                # * == wildcard for name parts (does not cross path separator)
                res.append("[^%s]*" % sep_re)
        elif c == "?":
            # ? == any single character excluding path separator
            res.append("[^%s]" % sep_re)
        elif c == "[":
            # Locate the closing bracket. A "]" directly after "[" or "[!" is a member of the set, not its end.
            j = i
            if j < n and pat[j] == "!":
                j += 1
            if j < n and pat[j] == "]":
                j += 1
            while j < n and pat[j] != "]":
                j += 1
            if j >= n:
                # No closing bracket: the "[" is a literal character
                res.append("\\[")
            else:
                stuff = pat[i:j].replace("\\", "\\\\")
                i = j + 1
                if stuff[0] == "!":
                    # Shell negation "[!...]" becomes regex negation "[^...]"
                    stuff = "^" + stuff[1:]
                elif stuff[0] == "^":
                    # A leading "^" is an ordinary set member in shell syntax; escape it for the regex
                    stuff = "\\" + stuff
                res.append("[%s]" % stuff)
        else:
            res.append(re.escape(c))

    # Global inline flags must be placed at the very start of the expression: putting them anywhere else is deprecated
    # since Python 3.6 and an error since Python 3.11.
    return "(?ms)" + "".join(res) + r"\Z"

View File

@ -12,7 +12,8 @@ import msgpack.fallback
from ..helpers import Location, format_file_size, format_timedelta, PathPrefixPattern, FnmatchPattern, make_path_safe, \
prune_within, prune_split, get_cache_dir, Statistics, is_slow_msgpack, yes, RegexPattern, \
StableDict, int_to_bigint, bigint_to_int, parse_timestamp, CompressionSpec, ChunkerParams, \
ProgressIndicatorPercent, ProgressIndicatorEndless, load_excludes, parse_pattern, PatternMatcher
ProgressIndicatorPercent, ProgressIndicatorEndless, load_excludes, parse_pattern, PatternMatcher, \
ShellPattern
from . import BaseTestCase, environment_variable, FakeInputs
@ -234,6 +235,45 @@ def test_patterns_fnmatch(pattern, expected):
check_patterns(files, FnmatchPattern(pattern), expected)
@pytest.mark.parametrize("pattern, expected", [
    # "None" means all files, i.e. all match the given pattern
    ("*", None),
    ("**/*", None),
    ("/**/*", None),
    ("/./*", None),
    ("*/*", None),
    ("*///*", None),
    # A partial directory name selects nothing
    ("/home/u", []),
    ("/home/*",
     ["/home/user/.profile", "/home/user/.bashrc", "/home/user2/.profile", "/home/user2/public_html/index.html",
      "/home/foo/.thumbnails", "/home/foo/bar/.thumbnails"]),
    ("/home/user/*", ["/home/user/.profile", "/home/user/.bashrc"]),
    ("/etc/*/*", ["/etc/server/config", "/etc/server/hosts"]),
    ("/etc/**/*", ["/etc/server/config", "/etc/server/hosts"]),
    ("/etc/**/*/*", ["/etc/server/config", "/etc/server/hosts"]),
    # A single "*" stands for one directory level only, "**/" for any number of levels
    ("*/.pr????e", []),
    ("**/.pr????e", ["/home/user/.profile", "/home/user2/.profile"]),
    # Repeated separators as well as "." and ".." components in the pattern
    ("///etc//////*", ["/etc/server/config", "/etc/server/hosts"]),
    ("/./home//..//home/user2/", ["/home/user2/.profile", "/home/user2/public_html/index.html"]),
    ("/./home//..//home/user2/**/*", ["/home/user2/.profile", "/home/user2/public_html/index.html"]),
    # With a trailing separator only the directory contents are expected, without it the directory itself as well
    ("/srv*/", ["/srv/messages", "/srv/dmesg", "/srv2/blafasel"]),
    ("/srv*", ["/srv", "/srv/messages", "/srv/dmesg", "/srv2", "/srv2/blafasel"]),
    ("/srv/*", ["/srv/messages", "/srv/dmesg"]),
    ("/srv2/**", ["/srv2", "/srv2/blafasel"]),
    ("/srv2/**/", ["/srv2/blafasel"]),
    ("/home/*/.thumbnails", ["/home/foo/.thumbnails"]),
    ("/home/*/*/.thumbnails", ["/home/foo/bar/.thumbnails"]),
])
def test_patterns_shell(pattern, expected):
    """Run a ShellPattern built from `pattern` against a fixed list of paths.

    `expected` names the paths the pattern is supposed to select; None stands for all of them.
    NOTE(review): the comparison itself is done by check_patterns, which is defined outside this view.
    """
    files = [
        "/etc/server/config", "/etc/server/hosts", "/home", "/home/user/.profile", "/home/user/.bashrc",
        "/home/user2/.profile", "/home/user2/public_html/index.html", "/srv", "/srv/messages", "/srv/dmesg",
        "/srv2", "/srv2/blafasel", "/home/foo/.thumbnails", "/home/foo/bar/.thumbnails",
    ]

    check_patterns(files, ShellPattern(pattern), expected)
@pytest.mark.parametrize("pattern, expected", [
# "None" means all files, i.e. all match the given pattern
("", None),
@ -276,6 +316,7 @@ def _make_test_patterns(pattern):
return [PathPrefixPattern(pattern),
FnmatchPattern(pattern),
RegexPattern("^{}/foo$".format(pattern)),
ShellPattern(pattern),
]
@ -374,6 +415,12 @@ def test_patterns_from_file(tmpdir, lines, expected):
("pp:/", PathPrefixPattern),
("pp:/data/", PathPrefixPattern),
("pp:pp:/data/", PathPrefixPattern),
# Shell-pattern style
("sh:", ShellPattern),
("sh:*", ShellPattern),
("sh:/data/*", ShellPattern),
("sh:sh:/data/*", ShellPattern),
])
def test_parse_pattern(pattern, cls):
assert isinstance(parse_pattern(pattern), cls)

View File

@ -0,0 +1,113 @@
import re
import pytest
from .. import shellpattern
def check(path, pattern):
    """Tell whether the shell-style `pattern`, once translated to a regular expression, matches `path`."""
    regex_source = shellpattern.translate(pattern)
    return re.match(regex_source, path) is not None
@pytest.mark.parametrize("path, patterns", [
    # Literal string
    ("foo/bar", ["foo/bar"]),
    ("foo\\bar", ["foo\\bar"]),

    # Non-ASCII
    ("foo/c/\u0152/e/bar", ["foo/*/\u0152/*/bar", "*/*/\u0152/*/*", "**/\u0152/*/*"]),
    ("\u00e4\u00f6\u00dc", ["???", "*", "\u00e4\u00f6\u00dc", "[\u00e4][\u00f6][\u00dc]"]),

    # Question mark
    ("foo", ["fo?"]),
    ("foo", ["f?o"]),
    ("foo", ["f??"]),
    ("foo", ["?oo"]),
    ("foo", ["?o?"]),
    ("foo", ["??o"]),
    ("foo", ["???"]),

    # Single asterisk
    ("", ["*"]),
    ("foo", ["*", "**", "***"]),
    ("foo", ["foo*"]),
    ("foobar", ["foo*"]),
    ("foobar", ["foo*bar"]),
    ("foobarbaz", ["foo*baz"]),
    ("bar", ["*bar"]),
    ("foobar", ["*bar"]),
    ("foo/bar", ["foo/*bar"]),
    ("foo/bar", ["foo/*ar"]),
    ("foo/bar", ["foo/*r"]),
    ("foo/bar", ["foo/*"]),
    ("foo/bar", ["foo*/bar"]),
    ("foo/bar", ["fo*/bar"]),
    ("foo/bar", ["f*/bar"]),
    ("foo/bar", ["*/bar"]),

    # Double asterisk (matches 0..n directory layers)
    ("foo/bar", ["foo/**/bar"]),
    ("foo/1/bar", ["foo/**/bar"]),
    ("foo/1/22/333/bar", ["foo/**/bar"]),
    ("foo/", ["foo/**/"]),
    ("foo/1/", ["foo/**/"]),
    ("foo/1/22/333/", ["foo/**/"]),
    ("bar", ["**/bar"]),
    ("1/bar", ["**/bar"]),
    ("1/22/333/bar", ["**/bar"]),
    ("foo/bar/baz", ["foo/**/*"]),

    # Set
    ("foo1", ["foo[12]"]),
    ("foo2", ["foo[12]"]),
    ("foo2/bar", ["foo[12]/*"]),
    ("f??f", ["f??f", "f[?][?]f"]),
    ("foo]", ["foo[]]"]),

    # Inverted set
    ("foo3", ["foo[!12]"]),
    ("foo^", ["foo[^!]"]),
    ("foo!", ["foo[^!]"]),
])
def test_match(path, patterns):
    """Every pattern listed in `patterns` has to match `path`; the first one that does not fails the test."""
    for p in patterns:
        assert check(path, p)
@pytest.mark.parametrize("path, patterns", [
    ("", ["?", "[]"]),
    ("foo", ["foo?"]),
    ("foo", ["?foo"]),
    ("foo", ["f?oo"]),

    # do not match path separator
    ("foo/ar", ["foo?ar"]),

    # do not match/cross over os.path.sep
    ("foo/bar", ["*"]),
    ("foo/bar", ["foo*bar"]),
    ("foo/bar", ["foo*ar"]),
    ("foo/bar", ["fo*bar"]),
    ("foo/bar", ["fo*ar"]),

    # Double asterisk
    ("foobar", ["foo/**/bar"]),

    # Two asterisks without slash do not match directory separator
    ("foo/bar", ["**"]),

    # Double asterisk not matching filename
    ("foo/bar", ["**/"]),

    # Set
    ("foo3", ["foo[12]"]),

    # Inverted set
    ("foo1", ["foo[!12]"]),
    ("foo2", ["foo[!12]"]),
])
def test_mismatch(path, patterns):
    """None of the patterns listed in `patterns` may match `path`; the first one that does fails the test."""
    for p in patterns:
        assert not check(path, p)

View File

@ -240,6 +240,10 @@ Examples
$ borg create /mnt/backup::my-files /home \
--exclude 're:^/home/[^/]+/\.thumbnails/'
# Do the same using a shell-style pattern
$ borg create /mnt/backup::my-files /home \
--exclude 'sh:/home/*/.thumbnails'
# Backup the root filesystem into an archive named "root-YYYY-MM-DD"
# use zlib compression (good, but slow) - default is no compression
NAME="root-`date +%Y-%m-%d`"