borg/src/borg/patterns.py

416 lines
14 KiB
Python

import argparse
import fnmatch
import os.path
import re
import sys
import unicodedata
from collections import namedtuple
from enum import Enum
from .helpers import clean_lines, shellpattern
from .helpers.errors import Error
def parse_patternfile_line(line, roots, ie_commands, fallback):
    """Parse one pattern-file line and apply the command it represents.

    A root-path command appends its value to *roots*; a pattern-style command
    selects a new fallback pattern class; every other command (some kind of
    include/exclude) is appended to *ie_commands*.

    Returns the fallback pattern class to use for the next line.
    """
    parsed = parse_inclexcl_command(line, fallback=fallback)
    kind = parsed.cmd
    if kind is IECommand.PatternStyle:
        # the value of a style command is the new default pattern class
        return parsed.val
    if kind is IECommand.RootPath:
        roots.append(parsed.val)
    else:
        # it is some kind of include/exclude command
        ie_commands.append(parsed)
    return fallback
def load_pattern_file(fileobj, roots, ie_commands, fallback=None):
    """Feed every line that clean_lines() yields for *fileobj* to parse_patternfile_line().

    *roots* and *ie_commands* are lists that get extended in place. *fallback*
    is the initial default pattern class; ShellPattern is used when it is None.
    A pattern-style line changes the default for the lines that follow it.
    """
    # ShellPattern is defined later in this module, so it is looked up at call time.
    style = ShellPattern if fallback is None else fallback
    for entry in clean_lines(fileobj):
        style = parse_patternfile_line(entry, roots, ie_commands, style)
def load_exclude_file(fileobj, patterns):
    """Append one parsed exclude pattern to *patterns* per line yielded by clean_lines()."""
    patterns.extend(map(parse_exclude_pattern, clean_lines(fileobj)))
class ArgparsePatternAction(argparse.Action):
    """argparse action that handles a single pattern/command given as an option value."""

    def __init__(self, nargs=1, **kwargs):
        # default to exactly one value per option occurrence
        super().__init__(nargs=nargs, **kwargs)

    def __call__(self, parser, args, values, option_string=None):
        # Only the first value is used; command-line patterns always start
        # out with ShellPattern as the default style.
        pattern_line = values[0]
        parse_patternfile_line(pattern_line, args.paths, args.patterns, ShellPattern)
class ArgparsePatternFileAction(argparse.Action):
    """argparse action that reads patterns/commands from the file named by the option value.

    Subclasses can override parse() to interpret the file contents differently.
    """

    def __init__(self, nargs=1, **kw):
        # default to exactly one value (the file name) per option occurrence
        super().__init__(nargs=nargs, **kw)

    def __call__(self, parser, args, values, option_string=None):
        """Load and parse patterns from a file.

        Lines empty or starting with '#' after stripping whitespace on both line ends are ignored.

        Raises Error if the file does not exist.
        """
        filename = values[0]
        try:
            with open(filename) as f:
                self.parse(f, args)
        except FileNotFoundError as e:
            # Chain explicitly so the original OS error stays attached as the cause.
            raise Error(str(e)) from e

    def parse(self, fobj, args):
        """Parse the open file *fobj*, filling args.paths and args.patterns."""
        load_pattern_file(fobj, args.paths, args.patterns)
class ArgparseExcludeFileAction(ArgparsePatternFileAction):
    """Pattern-file action variant that treats each line of the file as an exclude pattern."""

    def parse(self, fobj, args):
        # only args.patterns is filled; root paths are not touched here
        exclude_list = args.patterns
        load_exclude_file(fobj, exclude_list)
class PatternMatcher:
    """Collection of pattern objects that paths can be matched against.

    *fallback* is the value *match()* returns when no pattern matches.
    """

    def __init__(self, fallback=None):
        # (pattern, cmd) pairs that match() checks one by one, in insertion order
        self._items = []
        # result of match() when none of the patterns match
        self.fallback = fallback
        # optimization: full-path patterns are kept in a dict for a fast lookup
        self._path_full_patterns = {}  # full path -> IECommand
        # Tells whether the most recent match() call ended on a pattern for
        # which matching directories should be recursed into. match() sets
        # this to True or False.
        self.recurse_dir = None
        # whether to recurse into directories when no pattern matches
        # TODO: allow modification as a config option?
        self.recurse_dir_default = True
        self.include_patterns = []
        # TODO: move this info to parse_inclexcl_command and store in PatternBase subclass?
        self.is_include_cmd = {
            IECommand.Exclude: False,
            IECommand.ExcludeNoRecurse: False,
            IECommand.Include: True,
        }

    def empty(self):
        """Return True if no pattern of any kind has been added."""
        return not (self._items or self._path_full_patterns)

    def _add(self, pattern, cmd):
        """Register one pattern; *cmd* is an IECommand value."""
        if not isinstance(pattern, PathFullPattern):
            self._items.append((pattern, cmd))
            return
        # pattern.pattern is the full, normalized path
        self._path_full_patterns[pattern.pattern] = cmd

    def add(self, patterns, cmd):
        """Add a list of patterns that all share the same *cmd*.

        *cmd* indicates whether the patterns are include or exclude patterns,
        and whether recursion should be done on excluded folders.
        """
        for item in patterns:
            self._add(item, cmd)

    def add_includepaths(self, include_paths):
        """Used to add inclusion-paths from args.paths (from commandline).

        The fallback becomes True when no include paths were given and
        False otherwise.
        """
        prefix_patterns = [parse_pattern(path, PathPrefixPattern) for path in include_paths]
        self.add(prefix_patterns, IECommand.Include)
        self.include_patterns = prefix_patterns
        self.fallback = not prefix_patterns

    def get_unmatched_include_patterns(self):
        """Return the include patterns that never matched a path.

        Note that this only considers patterns added via *add_includepaths* and
        it won't return PathFullPattern patterns, as matches are not counted
        for them.
        """
        unmatched = []
        for candidate in self.include_patterns:
            if candidate.match_count == 0 and not isinstance(candidate, PathFullPattern):
                unmatched.append(candidate)
        return unmatched

    def add_inclexcl(self, patterns):
        """Add list of patterns (of type CmdTuple) to internal list."""
        for entry in patterns:
            pattern, cmd = entry
            self._add(pattern, cmd)

    def match(self, path):
        """Return True or False depending on whether *path* is matched.

        If no match is found among the patterns in this matcher, then the value
        in self.fallback is returned (defaults to None). As a side effect,
        self.recurse_dir is updated on every call.
        """
        path = normalize_path(path).lstrip(os.path.sep)
        # fast lookup for full path matches (note: such matches are not counted)
        full_path_cmds = self._path_full_patterns
        if path in full_path_cmds:
            cmd = full_path_cmds[path]
            self.recurse_dir = command_recurses_dir(cmd)
            return self.is_include_cmd[cmd]
        # slow way: try the remaining patterns in order, first match wins
        for candidate, cmd in self._items:
            if not candidate.match(path, normalize=False):
                continue
            self.recurse_dir = candidate.recurse_dir
            return self.is_include_cmd[cmd]
        # nothing matched: use the default recursion behavior
        self.recurse_dir = self.recurse_dir_default
        return self.fallback
def normalize_path(path):
    """Normalize paths on macOS (but do nothing on other platforms)."""
    # HFS+ converts paths to a canonical form, so users shouldn't be required to enter an exact match.
    # Windows and Unix filesystems allow different forms, so users always have to enter an exact match.
    if sys.platform != "darwin":
        return path
    return unicodedata.normalize("NFD", path)
class PatternBase:
    """Shared logic for inclusion/exclusion patterns.

    Subclasses implement _prepare() (which must set self.pattern) and _match().
    """

    # two-letter style prefix identifying the subclass; set by each subclass
    PREFIX: str = None

    def __init__(self, pattern, recurse_dir=False):
        """Store the original *pattern* text and prepare its normalized form.

        *recurse_dir* is stored on the instance as given.
        """
        self.pattern_orig = pattern
        # number of successful match() calls
        self.match_count = 0
        pattern = normalize_path(pattern)
        self._prepare(pattern)
        self.recurse_dir = recurse_dir

    def match(self, path, normalize=True):
        """Return a boolean indicating whether *path* is matched by this pattern.

        If normalize is True (default), the path will get normalized using normalize_path(),
        otherwise it is assumed that it already is normalized using that function.
        Every successful match increments self.match_count.
        """
        if normalize:
            path = normalize_path(path)
        matches = self._match(path)
        if matches:
            self.match_count += 1
        return matches

    def __repr__(self):
        # Use the bare class name; interpolating type(self) itself would
        # render as "<class '...'>(pattern)".
        return f"{type(self).__name__}({self.pattern})"

    def __str__(self):
        # the pattern text exactly as it was given to the constructor
        return self.pattern_orig

    def _prepare(self, pattern):
        """Should set the value of self.pattern."""
        raise NotImplementedError

    def _match(self, path):
        """Return whether the already normalized *path* matches; implemented by subclasses."""
        raise NotImplementedError
class PathFullPattern(PatternBase):
    """Matches a path only if it is identical to the (normalized) pattern."""

    PREFIX = "pf"

    def _prepare(self, pattern):
        normalized = os.path.normpath(pattern)
        # sep at beginning is removed
        self.pattern = normalized.lstrip(os.path.sep)

    def _match(self, path):
        is_same_path = path == self.pattern
        return is_same_path
# For PathPrefixPattern, FnmatchPattern and ShellPattern, we require that the pattern either match the whole path
# or an initial segment of the path up to but not including a path separator. To unify the two cases, we add a path
# separator to the end of the path before matching.
class PathPrefixPattern(PatternBase):
    """Literal files or directories listed on the command line
    for some operations (e.g. extract, but not create).

    When a directory is given, every path that starts with that
    directory matches too. A trailing slash makes no difference.
    """

    PREFIX = "pp"

    def _prepare(self, pattern):
        sep = os.path.sep
        # exactly one trailing separator, whether or not the user typed one
        with_trailing_sep = os.path.normpath(pattern).rstrip(sep) + sep
        # sep at beginning is removed
        self.pattern = with_trailing_sep.lstrip(sep)

    def _match(self, path):
        # the appended separator lets a path equal to the pattern match as well
        candidate = path + os.path.sep
        return candidate.startswith(self.pattern)
class FnmatchPattern(PatternBase):
    """Shell glob patterns (fnmatch syntax) to exclude.

    A trailing slash means to exclude the contents of a directory,
    but not the directory itself.
    """

    PREFIX = "fm"

    def _prepare(self, pattern):
        sep = os.path.sep
        normalized = os.path.normpath(pattern)
        if pattern.endswith(sep):
            glob = normalized.rstrip(sep) + sep + "*" + sep
        else:
            glob = normalized + sep + "*"
        # sep at beginning is removed
        self.pattern = glob.lstrip(sep)
        # fnmatch and re.match both cache compiled regular expressions.
        # Nevertheless, this is about 10 times faster.
        self.regex = re.compile(fnmatch.translate(self.pattern))

    def _match(self, path):
        candidate = path + os.path.sep
        return self.regex.match(candidate) is not None
class ShellPattern(PatternBase):
    """Shell glob patterns to exclude, translated via the shellpattern helper.

    A trailing slash means to exclude the contents of a directory,
    but not the directory itself.
    """

    PREFIX = "sh"

    def _prepare(self, pattern):
        sep = os.path.sep
        normalized = os.path.normpath(pattern)
        if pattern.endswith(sep):
            glob = normalized.rstrip(sep) + sep + "**" + sep + "*" + sep
        else:
            glob = normalized + sep + "**" + sep + "*"
        # sep at beginning is removed
        self.pattern = glob.lstrip(sep)
        self.regex = re.compile(shellpattern.translate(self.pattern))

    def _match(self, path):
        candidate = path + os.path.sep
        return self.regex.match(candidate) is not None
class RegexPattern(PatternBase):
    """Regular expression to exclude; it is searched for anywhere in the path."""

    PREFIX = "re"

    def _prepare(self, pattern):
        # sep at beginning is NOT removed
        self.pattern = pattern
        self.regex = re.compile(pattern)

    def _match(self, path):
        # Normalize path separators, so the expression can always use "/"
        sep = os.path.sep
        unified = path if sep == "/" else path.replace(sep, "/")
        return self.regex.search(unified) is not None
# All pattern implementations that can be selected by their two-letter style prefix.
_PATTERN_CLASSES = {FnmatchPattern, PathFullPattern, PathPrefixPattern, RegexPattern, ShellPattern}

# Lookup table: style prefix (e.g. "sh") -> pattern class.
_PATTERN_CLASS_BY_PREFIX = {cls.PREFIX: cls for cls in _PATTERN_CLASSES}

# Result of parsing a pattern/command: *val* is the payload, *cmd* the IECommand.
CmdTuple = namedtuple("CmdTuple", ["val", "cmd"])
class IECommand(Enum):
    """A command that an InclExcl file line can represent."""
    # "R" / "r" prefix: the value is a root path
    RootPath = 1
    # "P" / "p" prefix: the value selects the default pattern style
    PatternStyle = 2
    # "+" prefix: include pattern
    Include = 3
    # "-" prefix: exclude pattern
    Exclude = 4
    # "!" prefix: exclude pattern for which command_recurses_dir() returns False
    ExcludeNoRecurse = 5
def command_recurses_dir(cmd):
    """Return False for IECommand.ExcludeNoRecurse and True for every other *cmd*."""
    # TODO?: raise error or return None if *cmd* is RootPath or PatternStyle
    return cmd is not IECommand.ExcludeNoRecurse
def get_pattern_class(prefix):
    """Return the pattern class registered for the style *prefix*.

    Raises ValueError if no pattern class uses that prefix.
    """
    pattern_cls = _PATTERN_CLASS_BY_PREFIX.get(prefix)
    if pattern_cls is None:
        raise ValueError(f"Unknown pattern style: {prefix}") from None
    return pattern_cls
def parse_pattern(pattern, fallback=FnmatchPattern, recurse_dir=True):
    """Build a pattern object from *pattern*.

    If the string starts with two alphanumeric characters followed by a colon,
    those two characters select the pattern class and the rest is the pattern
    text. Otherwise the whole string is passed to the *fallback* class.
    """
    has_style_prefix = len(pattern) > 2 and pattern[2] == ":" and pattern[:2].isalnum()
    if not has_style_prefix:
        return fallback(pattern, recurse_dir)
    style, text = pattern[:2], pattern[3:]
    return get_pattern_class(style)(text, recurse_dir)
def parse_exclude_pattern(pattern_str, fallback=FnmatchPattern):
    """Parse *pattern_str* as an exclude pattern and return it as a CmdTuple.

    The pattern is created with recurse_dir=False and paired with
    IECommand.ExcludeNoRecurse.
    """
    return CmdTuple(
        val=parse_pattern(pattern_str, fallback, recurse_dir=False),
        cmd=IECommand.ExcludeNoRecurse,
    )
def parse_inclexcl_command(cmd_line_str, fallback=ShellPattern):
    """Read a --patterns-from command from string and return a CmdTuple object.

    The first character selects the command, the rest (with leading whitespace
    removed) is its value. Depending on the command, the returned *val* is:

    - the root path string (RootPath),
    - a pattern class (PatternStyle),
    - a pattern object built by parse_pattern() using *fallback* as the
      default style (include/exclude commands).

    Raises argparse.ArgumentTypeError if the line is empty, starts with an
    unknown command character, has no value part, or names an invalid pattern
    style after "P"/"p". A ValueError raised by parse_pattern() for an unknown
    style prefix inside a pattern is not caught here.
    """
    cmd_prefix_map = {
        "-": IECommand.Exclude,
        "!": IECommand.ExcludeNoRecurse,
        "+": IECommand.Include,
        "R": IECommand.RootPath,
        "r": IECommand.RootPath,
        "P": IECommand.PatternStyle,
        "p": IECommand.PatternStyle,
    }
    if not cmd_line_str:
        raise argparse.ArgumentTypeError("A pattern/command must not be empty.")

    cmd = cmd_prefix_map.get(cmd_line_str[0])
    if cmd is None:
        raise argparse.ArgumentTypeError("A pattern/command must start with any of: %s" % ", ".join(cmd_prefix_map))

    # remaining text on command-line following the command character
    remainder_str = cmd_line_str[1:].lstrip()
    if not remainder_str:
        raise argparse.ArgumentTypeError("A pattern/command must have a value part.")

    if cmd is IECommand.RootPath:
        # TODO: validate string?
        val = remainder_str
    elif cmd is IECommand.PatternStyle:
        # then remainder_str is something like 're' or 'sh'
        try:
            val = get_pattern_class(remainder_str)
        except ValueError:
            # The ValueError carries no extra information, so suppress it as
            # context (the same way get_pattern_class() does for its KeyError).
            raise argparse.ArgumentTypeError(f"Invalid pattern style: {remainder_str}") from None
    else:
        # determine recurse_dir based on command type
        recurse_dir = command_recurses_dir(cmd)
        val = parse_pattern(remainder_str, fallback, recurse_dir)

    return CmdTuple(val, cmd)
def get_regex_from_pattern(pattern: str) -> str:
    """
    Return a regular expression string corresponding to the given pattern string.

    A leading "sh:", "re:" or "id:" selects the pattern style; without such a
    prefix the whole string is treated as "id" (identical match).
    The allowed pattern types are similar to the ones implemented by PatternBase subclasses,
    but here we rather do generic string matching, not specialised filesystem paths matching.
    """
    has_style_prefix = len(pattern) > 2 and pattern[2] == ":" and pattern[:2] in {"sh", "re", "id"}
    if has_style_prefix:
        style, pattern = pattern[:2], pattern[3:]
    else:
        style = "id"  # "identical" match is the default

    if style == "re":
        return pattern
    if style == "id":
        return re.escape(pattern)
    if style == "sh":
        # (?ms) (meaning re.MULTILINE and re.DOTALL) are not desired here.
        return shellpattern.translate(pattern, match_end="").removeprefix("(?ms)")
    raise NotImplementedError