From ebd928795e8a45f82d571a56a202eb6462ada6ca Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Fri, 24 Mar 2017 04:30:03 +0100 Subject: [PATCH 1/3] add PathFullPattern not really a pattern (as in potentially having any variable parts) - it just does a full, precise match, after the usual normalizations. the reason for adding this is mainly for later optimizations, e.g. via set membership check, so that a lot of such PathFullPatterns can be "matched" within O(1) time. --- src/borg/helpers.py | 12 ++++++++++++ src/borg/testsuite/helpers.py | 32 +++++++++++++++++++++++++++++++- 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/src/borg/helpers.py b/src/borg/helpers.py index 43ae2d26a..6252a5e7c 100644 --- a/src/borg/helpers.py +++ b/src/borg/helpers.py @@ -518,6 +518,17 @@ class PatternBase: raise NotImplementedError +class PathFullPattern(PatternBase): + """Full match of a path.""" + PREFIX = "pf" + + def _prepare(self, pattern): + self.pattern = os.path.normpath(pattern) + + def _match(self, path): + return path == self.pattern + + # For PathPrefixPattern, FnmatchPattern and ShellPattern, we require that the pattern either match the whole path # or an initial segment of the path up to but not including a path separator. To unify the two cases, we add a path # separator to the end of the path before matching. 
@@ -600,6 +611,7 @@ class RegexPattern(PatternBase): _PATTERN_STYLES = set([ FnmatchPattern, + PathFullPattern, PathPrefixPattern, RegexPattern, ShellPattern, diff --git a/src/borg/testsuite/helpers.py b/src/borg/testsuite/helpers.py index 727f1628a..19c5e9c51 100644 --- a/src/borg/testsuite/helpers.py +++ b/src/borg/testsuite/helpers.py @@ -25,7 +25,8 @@ from ..helpers import parse_timestamp, ChunkIteratorFileWrapper, ChunkerParams, from ..helpers import ProgressIndicatorPercent, ProgressIndicatorEndless from ..helpers import load_exclude_file, load_pattern_file from ..helpers import CompressionSpec, ComprSpec, CompressionDecider1, CompressionDecider2 -from ..helpers import parse_pattern, PatternMatcher, RegexPattern, PathPrefixPattern, FnmatchPattern, ShellPattern +from ..helpers import parse_pattern, PatternMatcher +from ..helpers import PathFullPattern, PathPrefixPattern, FnmatchPattern, ShellPattern, RegexPattern from ..helpers import swidth_slice from ..helpers import chunkit from ..helpers import safe_ns, safe_s @@ -254,6 +255,35 @@ def check_patterns(files, pattern, expected): assert matched == (files if expected is None else expected) +@pytest.mark.parametrize("pattern, expected", [ + # "None" means all files, i.e. all match the given pattern + ("/", []), + ("/home", ["/home"]), + ("/home///", ["/home"]), + ("/./home", ["/home"]), + ("/home/user", ["/home/user"]), + ("/home/user2", ["/home/user2"]), + ("/home/user/.bashrc", ["/home/user/.bashrc"]), + ]) +def test_patterns_full(pattern, expected): + files = ["/home", "/home/user", "/home/user2", "/home/user/.bashrc", ] + + check_patterns(files, PathFullPattern(pattern), expected) + + +@pytest.mark.parametrize("pattern, expected", [ + # "None" means all files, i.e. 
all match the given pattern + ("", []), + ("relative", []), + ("relative/path/", ["relative/path"]), + ("relative/path", ["relative/path"]), + ]) +def test_patterns_full_relative(pattern, expected): + files = ["relative/path", "relative/path2", ] + + check_patterns(files, PathFullPattern(pattern), expected) + + @pytest.mark.parametrize("pattern, expected", [ # "None" means all files, i.e. all match the given pattern ("/", None), From 93feb754117d1d24aa0db7738b777de966308032 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Fri, 24 Mar 2017 06:06:02 +0100 Subject: [PATCH 2/3] optimize PathFullPattern matching for O(1) time For a borg create run using a patterns file with 15.000 PathFullPattern excludes that excluded almost all files in the input data set: - before this optimization: ~60s - after this optimization: ~1s --- src/borg/helpers.py | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/src/borg/helpers.py b/src/borg/helpers.py index 6252a5e7c..2e343e4e7 100644 --- a/src/borg/helpers.py +++ b/src/borg/helpers.py @@ -451,23 +451,42 @@ class PatternMatcher: # Value to return from match function when none of the patterns match. self.fallback = fallback + # optimizations + self._path_full_patterns = {} # full path -> return value + def empty(self): - return not len(self._items) + return not len(self._items) and not len(self._path_full_patterns) + + def _add(self, pattern, value): + if isinstance(pattern, PathFullPattern): + key = pattern.pattern # full, normalized path + self._path_full_patterns[key] = value + else: + self._items.append((pattern, value)) def add(self, patterns, value): """Add list of patterns to internal list. The given value is returned from the match function when one of the given patterns matches. 
""" - self._items.extend((i, value) for i in patterns) + for pattern in patterns: + self._add(pattern, value) def add_inclexcl(self, patterns): """Add list of patterns (of type InclExclPattern) to internal list. The patterns ptype member is returned from the match function when one of the given patterns matches. """ - self._items.extend(patterns) + for pattern, pattern_type in patterns: + self._add(pattern, pattern_type) def match(self, path): path = normalize_path(path) + # do a fast lookup for full path matches (note: we do not count such matches): + non_existent = object() + value = self._path_full_patterns.get(path, non_existent) + if value is not non_existent: + # we have a full path match! + return value + # this is the slow way, if we have many patterns in self._items: for (pattern, value) in self._items: if pattern.match(path, normalize=False): return value From cb6bfdf4d656d1db37c2c7ffdb0bbbcb345b9dfa Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sun, 26 Mar 2017 00:26:57 +0100 Subject: [PATCH 3/3] add docs for path full-match patterns --- src/borg/archiver.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/src/borg/archiver.py b/src/borg/archiver.py index ff448bd60..15c6dd9b8 100644 --- a/src/borg/archiver.py +++ b/src/borg/archiver.py @@ -1604,11 +1604,27 @@ class Archiver: regular expression syntax is described in the `Python documentation for the re module `_. - Prefix path, selector `pp:` + Path prefix, selector `pp:` This pattern style is useful to match whole sub-directories. The pattern `pp:/data/bar` matches `/data/bar` and everything therein. + Path full-match, selector `pf:` + + This pattern style is useful to match whole paths. + This is kind of a pseudo pattern as it can not have any variable or + unspecified parts - the full, precise path must be given. + `pf:/data/foo.txt` matches `/data/foo.txt` only. 
+ + Implementation note: this is implemented via very time-efficient O(1) + hashtable lookups (this means you can have a huge number of such patterns + without impacting performance much). + Due to that, this kind of pattern does not respect any context or order. + If you use such a pattern to include a file, it will always be included + (if the directory recursion encounters it). + Other include/exclude patterns that would normally match will be ignored. + The same logic applies to excludes. + Exclusions can be passed via the command line option `--exclude`. When used from within a shell the patterns should be quoted to protect them from expansion.