mirror of https://github.com/borgbackup/borg.git
416 lines
14 KiB
Python
416 lines
14 KiB
Python
import argparse
|
|
import fnmatch
|
|
import os.path
|
|
import re
|
|
import sys
|
|
import unicodedata
|
|
from collections import namedtuple
|
|
from enum import Enum
|
|
|
|
from .helpers import clean_lines, shellpattern
|
|
from .helpers.errors import Error
|
|
|
|
|
|
def parse_patternfile_line(line, roots, ie_commands, fallback):
    """Parse one pattern-file line and record its effect.

    A root path command appends its value to *roots*, an include/exclude
    command is appended to *ie_commands* and a pattern style command only
    selects a new fallback pattern style.

    Returns the pattern style to use as fallback for the next line: the newly
    selected style, or *fallback* unchanged for every other command.
    """
    parsed = parse_inclexcl_command(line, fallback=fallback)
    kind = parsed.cmd
    if kind is IECommand.PatternStyle:
        # a style switch is not stored anywhere, it only affects later lines
        return parsed.val
    if kind is IECommand.RootPath:
        roots.append(parsed.val)
    else:
        # it is some kind of include/exclude command
        ie_commands.append(parsed)
    return fallback
|
|
|
|
|
|
def load_pattern_file(fileobj, roots, ie_commands, fallback=None):
    """Feed every line yielded by clean_lines(fileobj) to parse_patternfile_line().

    Root paths are collected in *roots* and include/exclude commands in
    *ie_commands*. The pattern style starts out as *fallback* (ShellPattern
    if None is given) and is replaced by whatever each parsed line returns.
    """
    # ShellPattern is defined later in this module, so it is looked up at call time
    current_style = ShellPattern if fallback is None else fallback
    for entry in clean_lines(fileobj):
        current_style = parse_patternfile_line(entry, roots, ie_commands, current_style)
|
|
|
|
|
|
def load_exclude_file(fileobj, patterns):
    """Append one parsed exclude pattern to *patterns* per line yielded by clean_lines(fileobj)."""
    # map() is lazy, so each line is still parsed right before it is appended
    for parsed in map(parse_exclude_pattern, clean_lines(fileobj)):
        patterns.append(parsed)
|
|
|
|
|
|
class ArgparsePatternAction(argparse.Action):
    """argparse action handling a single pattern/command given as an option value."""

    def __init__(self, nargs=1, **kwargs):
        # same as argparse.Action, except that nargs defaults to 1
        super().__init__(nargs=nargs, **kwargs)

    def __call__(self, parser, args, values, option_string=None):
        """Parse the first value the same way as a line of a pattern file.

        Root paths are appended to args.paths and include/exclude commands to
        args.patterns. ShellPattern is passed as the fallback pattern style.
        """
        command_text = values[0]
        parse_patternfile_line(command_text, args.paths, args.patterns, ShellPattern)
|
|
|
|
|
|
class ArgparsePatternFileAction(argparse.Action):
    """argparse action that reads patterns/commands from the file named by its value."""

    def __init__(self, nargs=1, **kwargs):
        # same as argparse.Action, except that nargs defaults to 1
        super().__init__(nargs=nargs, **kwargs)

    def __call__(self, parser, args, values, option_string=None):
        """Load and parse patterns from a file.

        Lines empty or starting with '#' after stripping whitespace on both line ends are ignored.

        Raises Error (carrying the original message) if the file does not exist.
        """
        path = values[0]
        try:
            with open(path) as fobj:
                self.parse(fobj, args)
        except FileNotFoundError as exc:
            raise Error(str(exc))

    def parse(self, fobj, args):
        """Process the opened file; subclasses override this to read other file formats."""
        load_pattern_file(fobj, args.paths, args.patterns)
|
|
|
|
|
|
class ArgparseExcludeFileAction(ArgparsePatternFileAction):
    """Variant of ArgparsePatternFileAction that loads its file with load_exclude_file()."""

    def parse(self, fobj, args):
        """Append the exclude patterns read from *fobj* to args.patterns."""
        load_exclude_file(fileobj=fobj, patterns=args.patterns)
|
|
|
|
|
|
class PatternMatcher:
    """Ordered collection of pattern objects that paths can be matched against.

    *fallback* is the value that *match()* returns if none of the patterns match.
    """

    def __init__(self, fallback=None):
        # (pattern, cmd) pairs, tried by match() in the order they were added
        self._items = []

        # Value to return from match() when none of the patterns match.
        self.fallback = fallback

        # optimization: full path -> cmd, consulted with one dict lookup before self._items
        self._path_full_patterns = {}

        # Indicates whether the last match() call ended on a pattern for which
        # we should recurse into any matching folder. Will be set to True or
        # False when calling match().
        self.recurse_dir = None

        # whether to recurse into directories when no match is found
        # TODO: allow modification as a config option?
        self.recurse_dir_default = True

        # the patterns most recently created by add_includepaths()
        self.include_patterns = []

        # TODO: move this info to parse_inclexcl_command and store in PatternBase subclass?
        self.is_include_cmd = {
            IECommand.Exclude: False,
            IECommand.ExcludeNoRecurse: False,
            IECommand.Include: True,
        }

    def empty(self):
        """Return True if no pattern of any kind has been added yet."""
        return not (self._items or self._path_full_patterns)

    def _add(self, pattern, cmd):
        """Store a single pattern; *cmd* is an IECommand value."""
        if not isinstance(pattern, PathFullPattern):
            self._items.append((pattern, cmd))
            return
        # Keyed by the full, normalized path; adding the same path again
        # replaces the command stored for it.
        self._path_full_patterns[pattern.pattern] = cmd

    def add(self, patterns, cmd):
        """Add every pattern of *patterns* with the same command.

        *cmd* indicates whether the patterns are include or exclude patterns,
        and whether recursion should be done on excluded folders.
        """
        for item in patterns:
            self._add(item, cmd)

    def add_includepaths(self, include_paths):
        """Used to add inclusion-paths from args.paths (from commandline)."""
        prefix_patterns = [parse_pattern(path, PathPrefixPattern) for path in include_paths]
        self.add(prefix_patterns, IECommand.Include)
        # without any include path, unmatched paths count as matched (fallback True)
        self.fallback = not prefix_patterns
        self.include_patterns = prefix_patterns

    def get_unmatched_include_patterns(self):
        """Return the include patterns that never matched a path.

        Only patterns added via *add_includepaths* are considered, and
        PathFullPattern patterns are never returned because matches against
        them are not counted.
        """
        unmatched = []
        for candidate in self.include_patterns:
            if candidate.match_count == 0 and not isinstance(candidate, PathFullPattern):
                unmatched.append(candidate)
        return unmatched

    def add_inclexcl(self, patterns):
        """Add list of patterns (of type CmdTuple) to internal list."""
        for pat, command in patterns:
            self._add(pat, command)

    def match(self, path):
        """Return True or False depending on whether *path* is matched.

        If no match is found among the patterns in this matcher, then the value
        in self.fallback is returned (defaults to None).

        As a side effect self.recurse_dir is updated on every call.
        """
        path = normalize_path(path).lstrip(os.path.sep)

        # do a fast lookup for full path matches (note: we do not count such matches):
        try:
            full_cmd = self._path_full_patterns[path]
        except KeyError:
            pass
        else:
            # we have a full path match!
            self.recurse_dir = command_recurses_dir(full_cmd)
            return self.is_include_cmd[full_cmd]

        # this is the slow way, if we have many patterns in self._items:
        for pat, command in self._items:
            if pat.match(path, normalize=False):
                self.recurse_dir = pat.recurse_dir
                return self.is_include_cmd[command]

        # by default we will recurse if there is no match
        self.recurse_dir = self.recurse_dir_default
        return self.fallback
|
|
|
|
|
|
def normalize_path(path):
    """Return *path* in Unicode NFD form on macOS and unchanged on any other platform."""
    # HFS+ converts paths to a canonical form, so users shouldn't be required to enter an exact match.
    # Windows and Unix filesystems allow different forms, so users always have to enter an exact match.
    if sys.platform != "darwin":
        return path
    return unicodedata.normalize("NFD", path)
|
|
|
|
|
|
class PatternBase:
    """Shared logic for inclusion/exclusion patterns.

    Subclasses set PREFIX to their pattern style prefix and implement
    _prepare() and _match().
    """

    # pattern style prefix of the concrete subclass; None on this base class
    PREFIX: str = None

    def __init__(self, pattern, recurse_dir=False):
        """Remember the pattern as given and let the subclass prepare it for matching.

        *pattern* is kept unmodified in self.pattern_orig; the form handed to
        _prepare() went through normalize_path(). *recurse_dir* is stored as is.
        """
        self.pattern_orig = pattern
        # number of successful match() calls
        self.match_count = 0
        pattern = normalize_path(pattern)
        self._prepare(pattern)
        self.recurse_dir = recurse_dir

    def match(self, path, normalize=True):
        """Return a boolean indicating whether *path* is matched by this pattern.

        If normalize is True (default), the path will get normalized using normalize_path(),
        otherwise it is assumed that it already is normalized using that function.

        Every successful match increments self.match_count.
        """
        if normalize:
            path = normalize_path(path)
        matches = self._match(path)
        if matches:
            self.match_count += 1
        return matches

    def __repr__(self):
        # Use the bare class name: interpolating type(self) directly would
        # render as "<class '...'>(pattern)".
        return f"{type(self).__name__}({self.pattern})"

    def __str__(self):
        # the pattern exactly as it was given to the constructor
        return self.pattern_orig

    def _prepare(self, pattern):
        """Should set the value of self.pattern."""
        raise NotImplementedError

    def _match(self, path):
        """Should return whether the already normalized *path* matches self.pattern."""
        raise NotImplementedError
|
|
|
|
|
|
class PathFullPattern(PatternBase):
    """Pattern that matches only a path equal to it as a whole."""

    PREFIX = "pf"

    def _prepare(self, pattern):
        normalized = os.path.normpath(pattern)
        # sep at beginning is removed
        self.pattern = normalized.lstrip(os.path.sep)

    def _match(self, path):
        return path == self.pattern
|
|
|
|
|
|
# For PathPrefixPattern, FnmatchPattern and ShellPattern, we require that the pattern either match the whole path
|
|
# or an initial segment of the path up to but not including a path separator. To unify the two cases, we add a path
|
|
# separator to the end of the path before matching.
|
|
|
|
|
|
class PathPrefixPattern(PatternBase):
    """Literal files or directories listed on the command line
    for some operations (e.g. extract, but not create).

    A path matches if it is the given file or directory itself or if it is
    located below the given directory. A trailing slash makes no difference.
    """

    PREFIX = "pp"

    def _prepare(self, pattern):
        sep = os.path.sep
        # exactly one sep at the end, so only whole path components can match
        with_trailing_sep = os.path.normpath(pattern).rstrip(sep) + sep
        # sep at beginning is removed
        self.pattern = with_trailing_sep.lstrip(sep)

    def _match(self, path):
        candidate = path + os.path.sep
        return candidate.startswith(self.pattern)
|
|
|
|
|
|
class FnmatchPattern(PatternBase):
    """Shell glob patterns (fnmatch syntax) to exclude. With a trailing slash
    only the contents of a directory match, not the directory itself.
    """

    PREFIX = "fm"

    def _prepare(self, pattern):
        sep = os.path.sep
        normalized = os.path.normpath(pattern)
        if pattern.endswith(sep):
            glob = f"{normalized.rstrip(sep)}{sep}*{sep}"
        else:
            glob = f"{normalized}{sep}*"

        # sep at beginning is removed
        self.pattern = glob.lstrip(sep)

        # fnmatch and re.match both cache compiled regular expressions.
        # Nevertheless, this is about 10 times faster.
        self.regex = re.compile(fnmatch.translate(self.pattern))

    def _match(self, path):
        # the path gets a sep appended before it is matched against the regex
        found = self.regex.match(path + os.path.sep)
        return found is not None
|
|
|
|
|
|
class ShellPattern(PatternBase):
    """Shell glob patterns (translated by shellpattern) to exclude. With a
    trailing slash only the contents of a directory match, not the directory itself.
    """

    PREFIX = "sh"

    def _prepare(self, pattern):
        sep = os.path.sep
        normalized = os.path.normpath(pattern)
        if pattern.endswith(sep):
            glob = f"{normalized.rstrip(sep)}{sep}**{sep}*{sep}"
        else:
            glob = f"{normalized}{sep}**{sep}*"

        # sep at beginning is removed
        self.pattern = glob.lstrip(sep)
        self.regex = re.compile(shellpattern.translate(self.pattern))

    def _match(self, path):
        # the path gets a sep appended before it is matched against the regex
        found = self.regex.match(path + os.path.sep)
        return found is not None
|
|
|
|
|
|
class RegexPattern(PatternBase):
    """Regular expression to exclude; it may match anywhere in the path."""

    PREFIX = "re"

    def _prepare(self, pattern):
        # sep at beginning is NOT removed
        self.pattern = pattern
        self.regex = re.compile(pattern)

    def _match(self, path):
        # Normalize path separators, so the regex is always searched in a "/"-separated path
        sep = os.path.sep
        subject = path if sep == "/" else path.replace(sep, "/")
        return self.regex.search(subject) is not None
|
|
|
|
|
|
# all pattern implementations that can be selected through a pattern style prefix
_PATTERN_CLASSES = {
    FnmatchPattern,
    PathFullPattern,
    PathPrefixPattern,
    RegexPattern,
    ShellPattern,
}

# lookup table: PREFIX of a pattern class -> the pattern class itself
_PATTERN_CLASS_BY_PREFIX = {cls.PREFIX: cls for cls in _PATTERN_CLASSES}
|
|
|
|
# result of parsing one command: *val* is its value, *cmd* the IECommand it represents
CmdTuple = namedtuple("CmdTuple", ("val", "cmd"))
|
|
|
|
|
|
class IECommand(Enum):
    """A command that an InclExcl file line can represent."""

    # "R" / "r": the value is a root path
    RootPath = 1
    # "P" / "p": the value selects the pattern style used as fallback
    PatternStyle = 2
    # "+": include paths matching the pattern
    Include = 3
    # "-": exclude paths matching the pattern
    Exclude = 4
    # "!": exclude paths matching the pattern and do not recurse into matching directories
    ExcludeNoRecurse = 5
|
|
|
|
|
|
def command_recurses_dir(cmd):
    """Return False for IECommand.ExcludeNoRecurse and True for every other *cmd*."""
    # TODO?: raise error or return None if *cmd* is RootPath or PatternStyle
    return cmd != IECommand.ExcludeNoRecurse
|
|
|
|
|
|
def get_pattern_class(prefix):
    """Return the pattern class registered for the pattern style *prefix*.

    Raises ValueError if no pattern class uses that prefix.
    """
    if prefix not in _PATTERN_CLASS_BY_PREFIX:
        raise ValueError(f"Unknown pattern style: {prefix}") from None
    return _PATTERN_CLASS_BY_PREFIX[prefix]
|
|
|
|
|
|
def parse_pattern(pattern, fallback=FnmatchPattern, recurse_dir=True):
    """Create a pattern object from the string *pattern*.

    A string starting with two alphanumeric characters followed by ":" selects
    the pattern class by that two-character style prefix and the text after
    the colon becomes the pattern. Any other string is handed unmodified to
    the *fallback* class. *recurse_dir* is passed on to the pattern object.
    """
    has_style_prefix = len(pattern) > 2 and pattern[2] == ":" and pattern[:2].isalnum()
    if not has_style_prefix:
        return fallback(pattern, recurse_dir)
    style, body = pattern[:2], pattern[3:]
    pattern_cls = get_pattern_class(style)
    return pattern_cls(body, recurse_dir)
|
|
|
|
|
|
def parse_exclude_pattern(pattern_str, fallback=FnmatchPattern):
    """Parse *pattern_str* as an exclude pattern and return it wrapped in a CmdTuple.

    The pattern object is created by parse_pattern() with recurse_dir=False
    and paired with IECommand.ExcludeNoRecurse.
    """
    exclude_pattern = parse_pattern(pattern_str, fallback, recurse_dir=False)
    return CmdTuple(val=exclude_pattern, cmd=IECommand.ExcludeNoRecurse)
|
|
|
|
|
|
def parse_inclexcl_command(cmd_line_str, fallback=ShellPattern):
    """Read a --patterns-from command from string and return a CmdTuple object.

    The first character selects the command, the rest of the string (without
    leading whitespace) is its value:

    - root path: the value is returned as a string
    - pattern style: the value is the pattern class for the given style prefix
    - include/exclude: the value is a pattern object; *fallback* is the
      pattern class used if the pattern has no style prefix of its own

    Raises argparse.ArgumentTypeError for an empty string, an unknown command
    character, a missing value or an invalid pattern style.
    """
    cmd_prefix_map = {
        "-": IECommand.Exclude,
        "!": IECommand.ExcludeNoRecurse,
        "+": IECommand.Include,
        "R": IECommand.RootPath,
        "r": IECommand.RootPath,
        "P": IECommand.PatternStyle,
        "p": IECommand.PatternStyle,
    }
    if not cmd_line_str:
        raise argparse.ArgumentTypeError("A pattern/command must not be empty.")

    cmd_char, tail = cmd_line_str[0], cmd_line_str[1:]
    cmd = cmd_prefix_map.get(cmd_char)
    if cmd is None:
        raise argparse.ArgumentTypeError("A pattern/command must start with any of: %s" % ", ".join(cmd_prefix_map))

    # remaining text on command-line following the command character
    remainder_str = tail.lstrip()
    if not remainder_str:
        raise argparse.ArgumentTypeError("A pattern/command must have a value part.")

    if cmd is IECommand.RootPath:
        # TODO: validate string?
        return CmdTuple(remainder_str, cmd)

    if cmd is IECommand.PatternStyle:
        # then remainder_str is something like 're' or 'sh'
        try:
            style_cls = get_pattern_class(remainder_str)
        except ValueError:
            raise argparse.ArgumentTypeError(f"Invalid pattern style: {remainder_str}")
        return CmdTuple(style_cls, cmd)

    # include/exclude command: whether to recurse depends on the command type
    pattern_obj = parse_pattern(remainder_str, fallback, command_recurses_dir(cmd))
    return CmdTuple(pattern_obj, cmd)
|
|
|
|
|
|
def get_regex_from_pattern(pattern: str) -> str:
    """
    Return a regular expression string corresponding to the given pattern string.

    A pattern starting with "sh:", "re:" or "id:" selects shell-style,
    regular expression or identical matching; without such a prefix the whole
    string is matched identically.

    The allowed pattern types are similar to the ones implemented by PatternBase subclasses,
    but here we rather do generic string matching, not specialised filesystem paths matching.
    """
    style, colon, body = pattern.partition(":")
    if not colon or style not in ("sh", "re", "id"):
        # "identical" match is the default and uses the complete string
        style, body = "id", pattern

    if style == "re":
        return body
    if style == "id":
        return re.escape(body)
    # style == "sh"
    # (?ms) (meaning re.MULTILINE and re.DOTALL) are not desired here.
    return shellpattern.translate(body, match_end="").removeprefix("(?ms)")
|