From 2bafece093b6c261f5124f59e9f1c2e53c68bd61 Mon Sep 17 00:00:00 2001
From: Michael Hanselmann <public@hansmi.ch>
Date: Wed, 16 Dec 2015 00:14:02 +0100
Subject: [PATCH] Implement exclusions using regular expressions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The existing option to exclude files and directories, “--exclude”, is
implemented using fnmatch[1]. fnmatch matches the slash (“/”) with “*”
and thus makes it impossible to write patterns where a directory with
a given name should be excluded at a specific depth in the directory
hierarchy, but not anywhere else. Consider this structure:

  home/
  home/aaa
  home/aaa/.thumbnails
  home/user
  home/user/img
  home/user/img/.thumbnails

fnmatch incorrectly excludes “home/user/img/.thumbnails” with a pattern
of “home/*/.thumbnails” when the intention is to exclude “.thumbnails”
in all home directories while retaining directories with the same name
in all other locations.

With this change, regular expressions are introduced as an additional
pattern syntax. The syntax is selected using a prefix on the value of
“--exclude”. “re:” selects regular expressions and “fm:”, the default,
selects fnmatch. Selecting the syntax explicitly is necessary when regular
expressions are desired or when the desired fnmatch pattern starts with
two alphanumeric characters followed by a colon (e.g. “aa:something/*”).
The exclusion described above can be implemented as follows:

  --exclude 're:^home/[^/]+/\.thumbnails$'

The “--exclude-from” option permits loading exclusions from a text file
where the same prefixes can now be used, e.g. “re:\.tmp$”.

The documentation has been extended and now not only describes the two
pattern styles, but also the file format supported by “--exclude-from”.

This change has been discussed in issue #43 and in change request #497.

[1] https://docs.python.org/3/library/fnmatch.html

Signed-off-by: Michael Hanselmann <public@hansmi.ch>
---
 borg/archiver.py           |  68 ++++++++++++++++++-----
 borg/helpers.py            |  42 +++++++++++++-
 borg/testsuite/archiver.py |  73 ++++++++++++++++++++++++
 borg/testsuite/helpers.py  | 111 +++++++++++++++++++++++++++++++++++--
 docs/usage.rst             |   5 ++
 5 files changed, 278 insertions(+), 21 deletions(-)

diff --git a/borg/archiver.py b/borg/archiver.py
index e3abe1026..beee16052 100644
--- a/borg/archiver.py
+++ b/borg/archiver.py
@@ -17,7 +17,7 @@ import traceback
 
 from . import __version__
 from .helpers import Error, location_validator, format_time, format_file_size, \
-    format_file_mode, ExcludePattern, IncludePattern, exclude_path, adjust_patterns, to_localtime, timestamp, \
+    format_file_mode, parse_pattern, IncludePattern, exclude_path, adjust_patterns, to_localtime, timestamp, \
     get_cache_dir, get_keys_dir, prune_within, prune_split, unhexlify, \
     Manifest, remove_surrogates, update_excludes, format_archive, check_extension_modules, Statistics, \
     dir_is_tagged, bigint_to_int, ChunkerParams, CompressionSpec, is_slow_msgpack, yes, sysinfo, \
@@ -598,17 +598,43 @@ class Archiver:
 
     helptext = {}
     helptext['patterns'] = textwrap.dedent('''
-        Exclude patterns use a variant of shell pattern syntax, with '*' matching any
-        number of characters, '?' matching any single character, '[...]' matching any
-        single character specified, including ranges, and '[!...]' matching any
-        character not specified.  For the purpose of these patterns, the path
-        separator ('\\' for Windows and '/' on other systems) is not treated
-        specially.  For a path to match a pattern, it must completely match from
-        start to end, or must match from the start to just before a path separator.
-        Except for the root path, paths will never end in the path separator when
-        matching is attempted.  Thus, if a given pattern ends in a path separator, a
-        '*' is appended before matching is attempted.  Patterns with wildcards should
-        be quoted to protect them from shell expansion.
+        Exclusion patterns support two separate styles, fnmatch and regular
+        expressions. If followed by a colon (':') the first two characters of
+        a pattern are used as a style selector. Explicit style selection is necessary
+        when regular expressions are desired or when the desired fnmatch pattern
+        starts with two alphanumeric characters followed by a colon (e.g.
+        `aa:something/*`).
+
+        `Fnmatch <https://docs.python.org/3/library/fnmatch.html>`_ patterns use
+        a variant of shell pattern syntax, with '*' matching any number of
+        characters, '?' matching any single character, '[...]' matching any single
+        character specified, including ranges, and '[!...]' matching any character
+        not specified. The style selector is `fm`. For the purpose of these patterns,
+        the path separator ('\\' for Windows and '/' on other systems) is not treated
+        specially. For a path to match a pattern, it must completely match from start
+        to end, or must match from the start to just before a path separator. Except
+        for the root path, paths will never end in the path separator when matching
+        is attempted. Thus, if a given pattern ends in a path separator, a '*' is
+        appended before matching is attempted.
+
+        Regular expressions similar to those found in Perl are supported with the
+        selection prefix `re:`. Unlike shell patterns, regular expressions are not
+        required to match the complete path and any substring match is sufficient. It
+        is strongly recommended to anchor patterns to the start ('^'), to the end
+        ('$') or both. Path separators ('\\' for Windows and '/' on other systems) in
+        paths are always normalized to a forward slash ('/') before applying
+        a pattern. The regular expression syntax is described in the `Python
+        documentation for the re module
+        <https://docs.python.org/3/library/re.html>`_.
+
+        Exclusions can be passed via the command line option `--exclude`. When used
+        from within a shell the patterns should be quoted to protect them from
+        expansion.
+
+        The `--exclude-from` option permits loading exclusion patterns from a text
+        file with one pattern per line. Empty lines as well as lines starting with
+        the number sign ('#') are ignored. The optional style selector prefix is
+        also supported for patterns loaded from a file.
 
         Examples:
 
@@ -624,6 +650,20 @@ class Archiver:
 
         # The file '/home/user/cache/important' is *not* backed up:
         $ borg create -e /home/user/cache/ backup / /home/user/cache/important
+
+        # The contents of directories in '/home' are not backed up when their name
+        # ends in '.tmp'
+        $ borg create --exclude 're:^/home/[^/]+\.tmp/' backup /
+
+        # Load exclusions from file
+        $ cat >exclude.txt <<EOF
+        # Comment line
+        /home/*/junk
+        *.tmp
+        fm:aa:something/*
+        re:^/home/[^/]+\.tmp/
+        EOF
+        $ borg create --exclude-from exclude.txt backup /
         ''')
 
     def do_help(self, parser, commands, args):
@@ -812,7 +852,7 @@ class Archiver:
         subparser.add_argument('--filter', dest='output_filter', metavar='STATUSCHARS',
                                help='only display items with the given status characters')
         subparser.add_argument('-e', '--exclude', dest='excludes',
-                               type=ExcludePattern, action='append',
+                               type=parse_pattern, action='append',
                                metavar="PATTERN", help='exclude paths matching PATTERN')
         subparser.add_argument('--exclude-from', dest='exclude_files',
                                type=argparse.FileType('r'), action='append',
@@ -882,7 +922,7 @@ class Archiver:
                                default=False, action='store_true',
                                help='do not actually change any files')
         subparser.add_argument('-e', '--exclude', dest='excludes',
-                               type=ExcludePattern, action='append',
+                               type=parse_pattern, action='append',
                                metavar="PATTERN", help='exclude paths matching PATTERN')
         subparser.add_argument('--exclude-from', dest='exclude_files',
                                type=argparse.FileType('r'), action='append',
diff --git a/borg/helpers.py b/borg/helpers.py
index d994ab25a..23e506f2a 100644
--- a/borg/helpers.py
+++ b/borg/helpers.py
@@ -240,7 +240,7 @@ def load_excludes(fh):
     whitespace is not stripped.
     """
     patterns = (line.rstrip('\r\n') for line in fh if not line.startswith('#'))
-    return [ExcludePattern(pattern) for pattern in patterns if pattern]
+    return [parse_pattern(pattern) for pattern in patterns if pattern]
 
 
 def update_excludes(args):
@@ -266,7 +266,7 @@ def exclude_path(path, patterns):
     """
     for pattern in (patterns or []):
         if pattern.match(path):
-            return isinstance(pattern, ExcludePattern)
+            return isinstance(pattern, (ExcludePattern, ExcludeRegex))
     return False
 
 
@@ -362,6 +362,44 @@ class ExcludePattern(PatternBase):
         return (self.regex.match(path + os.path.sep) is not None)
 
 
+class ExcludeRegex(PatternBase):
+    """Regular expression to exclude.
+    """
+    def _prepare(self, pattern):
+        self.pattern = pattern
+        self.regex = re.compile(pattern)
+
+    def _match(self, path):
+        # Normalize path separators
+        if os.path.sep != '/':
+            path = path.replace(os.path.sep, '/')
+
+        return (self.regex.search(path) is not None)
+
+
+_DEFAULT_PATTERN_STYLE = "fm"
+_PATTERN_STYLES = {
+        "fm": ExcludePattern,
+        "re": ExcludeRegex,
+        }
+
+
+def parse_pattern(pattern):
+    """Read pattern from string and return an instance of the appropriate implementation class.
+    """
+    if len(pattern) > 2 and pattern[2] == ":" and pattern[:2].isalnum():
+        (style, pattern) = (pattern[:2], pattern[3:])
+    else:
+        style = _DEFAULT_PATTERN_STYLE
+
+    cls = _PATTERN_STYLES.get(style, None)
+
+    if cls is None:
+        raise ValueError("Unknown pattern style: {}".format(style))
+
+    return cls(pattern)
+
+
 def timestamp(s):
     """Convert a --timestamp=s argument to a datetime object"""
     try:
diff --git a/borg/testsuite/archiver.py b/borg/testsuite/archiver.py
index adb05e101..58d20c528 100644
--- a/borg/testsuite/archiver.py
+++ b/borg/testsuite/archiver.py
@@ -489,6 +489,79 @@ class ArchiverTestCase(ArchiverTestCaseBase):
             self.cmd('extract', '--exclude-from=' + self.exclude_file_path, self.repository_location + '::test')
         self.assert_equal(sorted(os.listdir('output/input')), ['file1', 'file3'])
 
+    def test_extract_include_exclude_regex(self):
+        self.cmd('init', self.repository_location)
+        self.create_regular_file('file1', size=1024 * 80)
+        self.create_regular_file('file2', size=1024 * 80)
+        self.create_regular_file('file3', size=1024 * 80)
+        self.create_regular_file('file4', size=1024 * 80)
+        self.create_regular_file('file333', size=1024 * 80)
+
+        # Create with regular expression exclusion for file4
+        self.cmd('create', '--exclude=re:input/file4$', self.repository_location + '::test', 'input')
+        with changedir('output'):
+            self.cmd('extract', self.repository_location + '::test')
+        self.assert_equal(sorted(os.listdir('output/input')), ['file1', 'file2', 'file3', 'file333'])
+        shutil.rmtree('output/input')
+
+        # Extract with regular expression exclusion
+        with changedir('output'):
+            self.cmd('extract', '--exclude=re:file3+', self.repository_location + '::test')
+        self.assert_equal(sorted(os.listdir('output/input')), ['file1', 'file2'])
+        shutil.rmtree('output/input')
+
+        # Combine --exclude with fnmatch and regular expression
+        with changedir('output'):
+            self.cmd('extract', '--exclude=input/file2', '--exclude=re:file[01]', self.repository_location + '::test')
+        self.assert_equal(sorted(os.listdir('output/input')), ['file3', 'file333'])
+        shutil.rmtree('output/input')
+
+        # Combine --exclude-from and regular expression exclusion
+        with changedir('output'):
+            self.cmd('extract', '--exclude-from=' + self.exclude_file_path, '--exclude=re:file1',
+                     '--exclude=re:file(\\d)\\1\\1$', self.repository_location + '::test')
+        self.assert_equal(sorted(os.listdir('output/input')), ['file3'])
+
+    def test_extract_include_exclude_regex_from_file(self):
+        self.cmd('init', self.repository_location)
+        self.create_regular_file('file1', size=1024 * 80)
+        self.create_regular_file('file2', size=1024 * 80)
+        self.create_regular_file('file3', size=1024 * 80)
+        self.create_regular_file('file4', size=1024 * 80)
+        self.create_regular_file('file333', size=1024 * 80)
+        self.create_regular_file('aa:something', size=1024 * 80)
+
+        # Create while excluding using mixed pattern styles
+        with open(self.exclude_file_path, 'wb') as fd:
+            fd.write(b're:input/file4$\n')
+            fd.write(b'fm:*aa:*thing\n')
+
+        self.cmd('create', '--exclude-from=' + self.exclude_file_path, self.repository_location + '::test', 'input')
+        with changedir('output'):
+            self.cmd('extract', self.repository_location + '::test')
+        self.assert_equal(sorted(os.listdir('output/input')), ['file1', 'file2', 'file3', 'file333'])
+        shutil.rmtree('output/input')
+
+        # Exclude using regular expression
+        with open(self.exclude_file_path, 'wb') as fd:
+            fd.write(b're:file3+\n')
+
+        with changedir('output'):
+            self.cmd('extract', '--exclude-from=' + self.exclude_file_path, self.repository_location + '::test')
+        self.assert_equal(sorted(os.listdir('output/input')), ['file1', 'file2'])
+        shutil.rmtree('output/input')
+
+        # Mixed exclude pattern styles
+        with open(self.exclude_file_path, 'wb') as fd:
+            fd.write(b're:file(\\d)\\1\\1$\n')
+            fd.write(b'fm:nothingwillmatchthis\n')
+            fd.write(b'*/file1\n')
+            fd.write(b're:file2$\n')
+
+        with changedir('output'):
+            self.cmd('extract', '--exclude-from=' + self.exclude_file_path, self.repository_location + '::test')
+        self.assert_equal(sorted(os.listdir('output/input')), ['file3'])
+
     def test_exclude_caches(self):
         self.cmd('init', self.repository_location)
         self.create_regular_file('file1', size=1024 * 80)
diff --git a/borg/testsuite/helpers.py b/borg/testsuite/helpers.py
index e766ed441..a61bdd28e 100644
--- a/borg/testsuite/helpers.py
+++ b/borg/testsuite/helpers.py
@@ -10,9 +10,9 @@ import msgpack
 import msgpack.fallback
 
 from ..helpers import adjust_patterns, exclude_path, Location, format_file_size, format_timedelta, IncludePattern, ExcludePattern, make_path_safe, \
-    prune_within, prune_split, get_cache_dir, Statistics, is_slow_msgpack, yes, \
+    prune_within, prune_split, get_cache_dir, Statistics, is_slow_msgpack, yes, ExcludeRegex, \
     StableDict, int_to_bigint, bigint_to_int, parse_timestamp, CompressionSpec, ChunkerParams, \
-    ProgressIndicatorPercent, ProgressIndicatorEndless, load_excludes
+    ProgressIndicatorPercent, ProgressIndicatorEndless, load_excludes, parse_pattern
 from . import BaseTestCase, environment_variable, FakeInputs
 
 
@@ -160,6 +160,15 @@ class FormatTimedeltaTestCase(BaseTestCase):
         )
 
 
+def check_patterns(files, paths, excludes, expected):
+    """Utility for testing exclusion patterns.
+    """
+    patterns = adjust_patterns(paths, excludes)
+    included = [path for path in files if not exclude_path(path, patterns)]
+
+    assert included == (files if expected is None else expected)
+
+
 @pytest.mark.parametrize("paths, excludes, expected", [
     # "None" means all files, i.e. none excluded
     ([], [], None),
@@ -184,10 +193,44 @@ def test_patterns(paths, excludes, expected):
         '/var/log/messages', '/var/log/dmesg',
     ]
 
-    patterns = adjust_patterns(paths, [ExcludePattern(p) for p in excludes])
-    included = [path for path in files if not exclude_path(path, patterns)]
+    check_patterns(files, paths, [ExcludePattern(p) for p in excludes], expected)
 
-    assert included == (files if expected is None else expected)
+
+@pytest.mark.parametrize("paths, excludes, expected", [
+    # "None" means all files, i.e. none excluded
+    ([], [], None),
+    (['/'], [], None),
+    (['/'], ['.*'], []),
+    (['/'], ['^/'], []),
+    (['/'], ['^abc$'], None),
+    (['/'], ['^(?!/home/)'],
+     ['/home/user/.profile', '/home/user/.bashrc', '/home/user2/.profile',
+      '/home/user2/public_html/index.html']),
+    ])
+def test_patterns_regex(paths, excludes, expected):
+    files = [
+        '/srv/data', '/foo/bar', '/home',
+        '/home/user/.profile', '/home/user/.bashrc',
+        '/home/user2/.profile', '/home/user2/public_html/index.html',
+        '/opt/log/messages.txt', '/opt/log/dmesg.txt',
+    ]
+
+    patterns = []
+
+    for i in excludes:
+        pat = ExcludeRegex(i)
+        assert str(pat) == i
+        assert pat.pattern == i
+        patterns.append(pat)
+
+    check_patterns(files, paths, patterns, expected)
+
+
+def test_regex_pattern():
+    # The forward slash must match the platform-specific path separator
+    assert ExcludeRegex("^/$").match("/")
+    assert ExcludeRegex("^/$").match(os.path.sep)
+    assert not ExcludeRegex(r"^\\$").match("/")
 
 
 @pytest.mark.skipif(sys.platform in ('darwin',), reason='all but OS X test')
@@ -196,31 +239,40 @@ class PatternNonAsciiTestCase(BaseTestCase):
         pattern = 'b\N{LATIN SMALL LETTER A WITH ACUTE}'
         i = IncludePattern(pattern)
         e = ExcludePattern(pattern)
+        er = ExcludeRegex("^{}/foo$".format(pattern))
 
         assert i.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo")
         assert not i.match("ba\N{COMBINING ACUTE ACCENT}/foo")
         assert e.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo")
         assert not e.match("ba\N{COMBINING ACUTE ACCENT}/foo")
+        assert er.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo")
+        assert not er.match("ba\N{COMBINING ACUTE ACCENT}/foo")
 
     def testDecomposedUnicode(self):
         pattern = 'ba\N{COMBINING ACUTE ACCENT}'
         i = IncludePattern(pattern)
         e = ExcludePattern(pattern)
+        er = ExcludeRegex("^{}/foo$".format(pattern))
 
         assert not i.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo")
         assert i.match("ba\N{COMBINING ACUTE ACCENT}/foo")
         assert not e.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo")
         assert e.match("ba\N{COMBINING ACUTE ACCENT}/foo")
+        assert not er.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo")
+        assert er.match("ba\N{COMBINING ACUTE ACCENT}/foo")
 
     def testInvalidUnicode(self):
         pattern = str(b'ba\x80', 'latin1')
         i = IncludePattern(pattern)
         e = ExcludePattern(pattern)
+        er = ExcludeRegex("^{}/foo$".format(pattern))
 
         assert not i.match("ba/foo")
         assert i.match(str(b"ba\x80/foo", 'latin1'))
         assert not e.match("ba/foo")
         assert e.match(str(b"ba\x80/foo", 'latin1'))
+        assert not er.match("ba/foo")
+        assert er.match(str(b"ba\x80/foo", 'latin1'))
 
 
 @pytest.mark.skipif(sys.platform not in ('darwin',), reason='OS X test')
@@ -229,31 +281,40 @@ class OSXPatternNormalizationTestCase(BaseTestCase):
         pattern = 'b\N{LATIN SMALL LETTER A WITH ACUTE}'
         i = IncludePattern(pattern)
         e = ExcludePattern(pattern)
+        er = ExcludeRegex("^{}/foo$".format(pattern))
 
         assert i.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo")
         assert i.match("ba\N{COMBINING ACUTE ACCENT}/foo")
         assert e.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo")
         assert e.match("ba\N{COMBINING ACUTE ACCENT}/foo")
+        assert er.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo")
+        assert er.match("ba\N{COMBINING ACUTE ACCENT}/foo")
 
     def testDecomposedUnicode(self):
         pattern = 'ba\N{COMBINING ACUTE ACCENT}'
         i = IncludePattern(pattern)
         e = ExcludePattern(pattern)
+        er = ExcludeRegex("^{}/foo$".format(pattern))
 
         assert i.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo")
         assert i.match("ba\N{COMBINING ACUTE ACCENT}/foo")
         assert e.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo")
         assert e.match("ba\N{COMBINING ACUTE ACCENT}/foo")
+        assert er.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo")
+        assert er.match("ba\N{COMBINING ACUTE ACCENT}/foo")
 
     def testInvalidUnicode(self):
         pattern = str(b'ba\x80', 'latin1')
         i = IncludePattern(pattern)
         e = ExcludePattern(pattern)
+        er = ExcludeRegex("^{}/foo$".format(pattern))
 
         assert not i.match("ba/foo")
         assert i.match(str(b"ba\x80/foo", 'latin1'))
         assert not e.match("ba/foo")
         assert e.match(str(b"ba\x80/foo", 'latin1'))
+        assert not er.match("ba/foo")
+        assert er.match(str(b"ba\x80/foo", 'latin1'))
 
 
 @pytest.mark.parametrize("lines, expected", [
@@ -271,6 +332,17 @@ class OSXPatternNormalizationTestCase(BaseTestCase):
       "",
       "# EOF"],
      ["/more/data", "/home"]),
+    (["re:.*"], []),
+    (["re:\s"], ["/data/something00.txt", "/more/data", "/home"]),
+    ([r"re:(.)(\1)"], ["/more/data", "/home", "/whitespace/at/end of filename \t "]),
+    (["", "", "",
+      "# This is a test with mixed pattern styles",
+      # Case-insensitive pattern
+      "re:(?i)BAR|ME$",
+      "",
+      "*whitespace*",
+      "fm:*/something00*"],
+     ["/more/data"]),
     ])
 def test_patterns_from_file(tmpdir, lines, expected):
     files = [
@@ -291,6 +363,35 @@ def test_patterns_from_file(tmpdir, lines, expected):
     assert evaluate(str(exclfile)) == (files if expected is None else expected)
 
 
+@pytest.mark.parametrize("pattern, cls", [
+    ("", ExcludePattern),
+
+    # Default style
+    ("*", ExcludePattern),
+    ("/data/*", ExcludePattern),
+
+    # fnmatch style
+    ("fm:", ExcludePattern),
+    ("fm:*", ExcludePattern),
+    ("fm:/data/*", ExcludePattern),
+    ("fm:fm:/data/*", ExcludePattern),
+
+    # Regular expression
+    ("re:", ExcludeRegex),
+    ("re:.*", ExcludeRegex),
+    ("re:^/something/", ExcludeRegex),
+    ("re:re:^/something/", ExcludeRegex),
+    ])
+def test_parse_pattern(pattern, cls):
+    assert isinstance(parse_pattern(pattern), cls)
+
+
+@pytest.mark.parametrize("pattern", ["aa:", "fo:*", "00:", "x1:abc"])
+def test_parse_pattern_error(pattern):
+    with pytest.raises(ValueError):
+        parse_pattern(pattern)
+
+
 def test_compression_specs():
     with pytest.raises(ValueError):
         CompressionSpec('')
diff --git a/docs/usage.rst b/docs/usage.rst
index a3f4bfa25..891aed17e 100644
--- a/docs/usage.rst
+++ b/docs/usage.rst
@@ -231,6 +231,11 @@ Examples
         ~/src                             \
         --exclude '*.pyc'
 
+    # Backup home directories excluding image thumbnails (i.e. only
+    # /home/*/.thumbnails is excluded, not /home/*/*/.thumbnails)
+    $ borg create /mnt/backup::my-files /home \
+        --exclude 're:^/home/[^/]+/\.thumbnails/'
+
     # Backup the root filesystem into an archive named "root-YYYY-MM-DD"
     # use zlib compression (good, but slow) - default is no compression
     NAME="root-`date +%Y-%m-%d`"