implement pattern support for --match-archives, fixes #6504

also:
- rename --glob-archives option to --match-archives (short: -a, unchanged)
- globbing patterns now need sh: prefix
- regex patterns need re: prefix
- "identical" match "patterns" use an id: prefix
- new default style is id: pattern (--glob-archives used sh: glob pattern)
- source code: glob -> match, GLOB -> PATTERN
This commit is contained in:
Thomas Waldmann 2022-09-16 00:37:18 +02:00
parent f5df35b36e
commit 4493d396e6
15 changed files with 121 additions and 44 deletions

View File

@ -1656,18 +1656,18 @@ class ArchiveChecker:
self.possibly_superseded = set()
def check(
self, repository, repair=False, first=0, last=0, sort_by="", glob=None, verify_data=False, save_space=False
self, repository, repair=False, first=0, last=0, sort_by="", match=None, verify_data=False, save_space=False
):
"""Perform a set of checks on 'repository'
:param repair: enable repair mode, write updated or corrected data into repository
:param first/last/sort_by: only check this number of first/last archives ordered by sort_by
:param glob: only check archives matching this glob
:param match: only check archives matching this pattern
:param verify_data: integrity verification of data referenced by archives
:param save_space: Repository.commit(save_space)
"""
logger.info("Starting archive consistency check...")
self.check_all = not any((first, last, glob))
self.check_all = not any((first, last, match))
self.repair = repair
self.repository = repository
self.init_chunks()
@ -1690,7 +1690,7 @@ class ArchiveChecker:
self.error_found = True
del self.chunks[Manifest.MANIFEST_ID]
self.manifest = self.rebuild_manifest()
self.rebuild_refcounts(glob=glob, first=first, last=last, sort_by=sort_by)
self.rebuild_refcounts(match=match, first=first, last=last, sort_by=sort_by)
self.orphan_chunks_check()
self.finish(save_space=save_space)
if self.error_found:
@ -1886,7 +1886,7 @@ class ArchiveChecker:
logger.info("Manifest rebuild complete.")
return manifest
def rebuild_refcounts(self, first=0, last=0, sort_by="", glob=None):
def rebuild_refcounts(self, first=0, last=0, sort_by="", match=None):
"""Rebuild object reference counts by walking the metadata
Missing and/or incorrect data is repaired when detected
@ -2080,10 +2080,10 @@ class ArchiveChecker:
i += 1
sort_by = sort_by.split(",")
if any((first, last, glob)):
archive_infos = self.manifest.archives.list(sort_by=sort_by, glob=glob, first=first, last=last)
if glob and not archive_infos:
logger.warning("--glob-archives %s does not match any archives", glob)
if any((first, last, match)):
archive_infos = self.manifest.archives.list(sort_by=sort_by, match=match, first=first, last=last)
if match and not archive_infos:
logger.warning("--match-archives %s does not match any archives", match)
if first and len(archive_infos) < first:
logger.warning("--first %d archives: only found %d archives", first, len(archive_infos))
if last and len(archive_infos) < last:

View File

@ -410,7 +410,7 @@ class Archiver(
replace_placeholders.override("now", DatetimeWrapper(args.timestamp))
replace_placeholders.override("utcnow", DatetimeWrapper(args.timestamp.astimezone(timezone.utc)))
args.location = args.location.with_timestamp(args.timestamp)
for name in "name", "other_name", "newname", "glob_archives", "comment":
for name in "name", "other_name", "newname", "match_archives", "comment":
value = getattr(args, name, None)
if value is not None:
setattr(args, name, replace_placeholders(value))

View File

@ -360,11 +360,11 @@ def define_archive_filters_group(subparser, *, sort_by=True, first_last=True):
group = filters_group.add_mutually_exclusive_group()
group.add_argument(
"-a",
"--glob-archives",
metavar="GLOB",
dest="glob_archives",
"--match-archives",
metavar="PATTERN",
dest="match_archives",
action=Highlander,
help="only consider archive names matching the glob. " 'sh: rules apply, see "borg help patterns".',
help='only consider archive names matching the pattern. see "borg help match-archives".',
)
if sort_by:

View File

@ -31,9 +31,9 @@ class CheckMixIn:
env_var_override="BORG_CHECK_I_KNOW_WHAT_I_AM_DOING",
):
return EXIT_ERROR
if args.repo_only and any((args.verify_data, args.first, args.last, args.glob_archives)):
if args.repo_only and any((args.verify_data, args.first, args.last, args.match_archives)):
self.print_error(
"--repository-only contradicts --first, --last, -a / --glob-archives " " and --verify-data arguments."
"--repository-only contradicts --first, --last, -a / --match-archives and --verify-data arguments."
)
return EXIT_ERROR
if args.repair and args.max_duration:
@ -55,7 +55,7 @@ class CheckMixIn:
first=args.first,
last=args.last,
sort_by=args.sort_by or "ts",
glob=args.glob_archives,
match=args.match_archives,
verify_data=args.verify_data,
save_space=args.save_space,
):

View File

@ -23,9 +23,9 @@ class DeleteMixIn:
archive_names = tuple(x.name for x in manifest.archives.list_considering(args))
if not archive_names:
return self.exit_code
if args.glob_archives is None and args.first == 0 and args.last == 0:
if args.match_archives is None and args.first == 0 and args.last == 0:
self.print_error(
"Aborting: if you really want to delete all archives, please use -a '*' "
"Aborting: if you really want to delete all archives, please use -a 'sh:*' "
"or just delete the whole repository (might be much faster)."
)
return EXIT_ERROR
@ -114,8 +114,8 @@ class DeleteMixIn:
that is how much your repository will shrink.
Please note that the "All archives" stats refer to the state after deletion.
You can delete multiple archives by specifying a matching shell pattern,
using the ``--glob-archives GLOB`` option (for more info on these patterns,
You can delete multiple archives by specifying a matching pattern,
using the ``--match-archives PATTERN`` option (for more info on these patterns,
see :ref:`borg_patterns`).
Always first use ``--dry-run --list`` to see what would be deleted.

View File

@ -244,9 +244,38 @@ class HelpMixIn:
This allows you to share the same patterns between multiple repositories
without needing to specify them on the command line.\n\n"""
)
helptext["match-archives"] = textwrap.dedent(
"""
The ``--match-archives`` option matches a given pattern against the list of all archive
names in the repository.
It uses pattern styles similar to the ones described by ``borg help patterns``:
Identical match pattern, selector ``id:`` (default)
Simple string match, must fully match exactly as given.
Shell-style patterns, selector ``sh:``
Match like on the shell, wildcards like `*` and `?` work.
`Regular expressions <https://docs.python.org/3/library/re.html>`_, selector ``re:``
Full regular expression support.
This is very powerful, but can also get rather complicated.
Examples::
# id: style
borg delete --match-archives 'id:archive-with-crap'
borg delete -a 'id:archive-with-crap' # same, using short option
borg delete -a 'archive-with-crap' # same, because 'id:' is the default
# sh: style
borg delete -a 'sh:home-kenny-*'
# re: style
borg delete -a 're:pc[123]-home-(user1|user2)-2022-09-.*'\n\n"""
)
helptext["placeholders"] = textwrap.dedent(
"""
Repository URLs, ``--name``, ``-a`` / ``--glob-archives``, ``--comment``
Repository URLs, ``--name``, ``-a`` / ``--match-archives``, ``--comment``
and ``--remote-path`` values support these placeholders:
{hostname}
@ -292,7 +321,7 @@ class HelpMixIn:
borg create /path/to/repo::{hostname}-{user}-{utcnow} ...
borg create /path/to/repo::{hostname}-{now:%Y-%m-%d_%H:%M:%S%z} ...
borg prune -a '{hostname}-*' ...
borg prune -a 'sh:{hostname}-*' ...
.. note::
systemd uses a difficult, non-standard syntax for command lines in unit files (refer to

View File

@ -84,7 +84,7 @@ class PruneMixIn:
return self.exit_code
checkpoint_re = r"\.checkpoint(\.\d+)?"
archives_checkpoints = manifest.archives.list(
glob=args.glob_archives,
match=args.match_archives,
consider_checkpoints=True,
match_end=r"(%s)?\Z" % checkpoint_re,
sort_by=["ts"],
@ -191,7 +191,7 @@ class PruneMixIn:
archive (and thus still needed). Checkpoint archives are not considered when
comparing archive counts against the retention limits (``--keep-X``).
If you use --glob-archives (-a), then only archives that match the GLOB are
If you use --match-archives (-a), then only archives that match the pattern are
considered for deletion and only those archives count towards the totals
specified by the rules.
Otherwise, *all* archives in the repository are candidates for deletion!
@ -200,7 +200,7 @@ class PruneMixIn:
If you have multiple sequences of archives with different data sets (e.g.
from different machines) in one shared repository, use one prune call per
data set that matches only the respective archives using the --glob-archives
data set that matches only the respective archives using the --match-archives
(-a) option.
The ``--keep-within`` option takes an argument of the form "<int><char>",

View File

@ -11,12 +11,12 @@ from .logger import create_logger
logger = create_logger()
from .helpers import shellpattern
from .constants import * # NOQA
from .helpers.datastruct import StableDict
from .helpers.parseformat import bin_to_hex
from .helpers.time import parse_timestamp
from .helpers.errors import Error
from .patterns import get_regex_from_pattern
from .repoobj import RepoObj
@ -74,12 +74,20 @@ class Archives(abc.MutableMapping):
del self._archives[name]
def list(
self, *, glob=None, match_end=r"\Z", sort_by=(), consider_checkpoints=True, first=None, last=None, reverse=False
self,
*,
match=None,
match_end=r"\Z",
sort_by=(),
consider_checkpoints=True,
first=None,
last=None,
reverse=False
):
"""
Return list of ArchiveInfo instances according to the parameters.
First match *glob* (considering *match_end*), then *sort_by*.
First match *match* (considering *match_end*), then *sort_by*.
Apply *first* and *last* filters, and then possibly *reverse* the list.
*sort_by* is a list of sort keys applied in reverse order.
@ -90,7 +98,8 @@ class Archives(abc.MutableMapping):
"""
if isinstance(sort_by, (str, bytes)):
raise TypeError("sort_by must be a sequence of str")
regex = re.compile(shellpattern.translate(glob or "*", match_end=match_end))
regex = get_regex_from_pattern(match or "re:.*")
regex = re.compile(regex + match_end)
archives = [x for x in self.values() if regex.match(x.name) is not None]
if not consider_checkpoints:
archives = [x for x in archives if ".checkpoint" not in x.name]
@ -106,18 +115,18 @@ class Archives(abc.MutableMapping):
def list_considering(self, args):
"""
get a list of archives, considering --first/last/prefix/glob-archives/sort/consider-checkpoints cmdline args
get a list of archives, considering --first/last/prefix/match-archives/sort/consider-checkpoints cmdline args
"""
name = getattr(args, "name", None)
consider_checkpoints = getattr(args, "consider_checkpoints", None)
if name is not None:
raise Error(
"Giving a specific name is incompatible with options --first, --last, -a / --glob-archives, and --consider-checkpoints."
"Giving a specific name is incompatible with options --first, --last, -a / --match-archives, and --consider-checkpoints."
)
return self.list(
sort_by=args.sort_by.split(","),
consider_checkpoints=consider_checkpoints,
glob=args.glob_archives,
match=args.match_archives,
first=args.first,
last=args.last,
)

View File

@ -388,3 +388,26 @@ def parse_inclexcl_command(cmd_line_str, fallback=ShellPattern):
val = parse_pattern(remainder_str, fallback, recurse_dir)
return CmdTuple(val, cmd)
def get_regex_from_pattern(pattern: str) -> str:
    """
    Return a regular expression string corresponding to the given pattern string.

    A pattern may start with a two-letter style selector followed by a colon:

    - ``id:`` - identical match, the remainder is matched literally (this is the default style).
    - ``sh:`` - shell-style pattern, the remainder is translated by shellpattern.translate.
    - ``re:`` - regular expression, the remainder is returned unchanged.

    A pattern that does not start with one of these selectors is treated as a whole
    as an ``id:`` pattern.

    The allowed pattern styles are similar to the ones implemented by PatternBase subclasses,
    but here we rather do generic string matching, not specialised filesystem paths matching.

    :param pattern: pattern string, optionally prefixed by a style selector
    :return: regular expression source (not compiled); no end-of-string anchor is appended
             here, a caller that wants one has to add it itself
    """
    # The known selectors contain no ":", so "first colon directly follows a known selector"
    # is the same condition as "pattern[:2] is a known selector and pattern[2] is a colon".
    selector, colon, remainder = pattern.partition(":")
    if not (colon and selector in ("sh", "re", "id")):
        # no (known) style selector: "identical" match on the whole string is the default
        return re.escape(pattern)
    if selector == "sh":
        # (?ms) (meaning re.MULTILINE and re.DOTALL) are not desired here.
        return shellpattern.translate(remainder, match_end="").removeprefix("(?ms)")
    if selector == "re":
        return remainder
    return re.escape(remainder)  # selector == "id"

View File

@ -39,7 +39,7 @@ class ArchiverCheckTestCase(ArchiverTestCaseBase):
"check",
"-v",
"--archives-only",
"--glob-archives=archive2",
"--match-archives=archive2",
exit_code=0,
)
self.assert_not_in("archive1", output)

View File

@ -19,7 +19,7 @@ class ArchiverTestCase(ArchiverTestCaseBase):
self.cmd(f"--repo={self.repository_location}", "create", "another_test.2", "input")
self.cmd(f"--repo={self.repository_location}", "extract", "test", "--dry-run")
self.cmd(f"--repo={self.repository_location}", "extract", "test.2", "--dry-run")
self.cmd(f"--repo={self.repository_location}", "delete", "--glob-archives", "another_*")
self.cmd(f"--repo={self.repository_location}", "delete", "--match-archives", "sh:another_*")
self.cmd(f"--repo={self.repository_location}", "delete", "--last", "1")
self.cmd(f"--repo={self.repository_location}", "delete", "-a", "test")
self.cmd(f"--repo={self.repository_location}", "extract", "test.2", "--dry-run")

View File

@ -236,13 +236,13 @@ class ArchiverTestCase(ArchiverTestCaseBase):
assert sorted(os.listdir(os.path.join(mountpoint))) == ["arch11", "arch12"]
with self.fuse_mount(self.repository_location, mountpoint, "--last=2", "--sort=name"):
assert sorted(os.listdir(os.path.join(mountpoint))) == ["arch21", "arch22"]
with self.fuse_mount(self.repository_location, mountpoint, "--glob-archives=arch1*"):
with self.fuse_mount(self.repository_location, mountpoint, "--match-archives=sh:arch1*"):
assert sorted(os.listdir(os.path.join(mountpoint))) == ["arch11", "arch12"]
with self.fuse_mount(self.repository_location, mountpoint, "--glob-archives=arch2*"):
with self.fuse_mount(self.repository_location, mountpoint, "--match-archives=sh:arch2*"):
assert sorted(os.listdir(os.path.join(mountpoint))) == ["arch21", "arch22"]
with self.fuse_mount(self.repository_location, mountpoint, "--glob-archives=arch*"):
with self.fuse_mount(self.repository_location, mountpoint, "--match-archives=sh:arch*"):
assert sorted(os.listdir(os.path.join(mountpoint))) == ["arch11", "arch12", "arch21", "arch22"]
with self.fuse_mount(self.repository_location, mountpoint, "--glob-archives=nope"):
with self.fuse_mount(self.repository_location, mountpoint, "--match-archives=nope"):
assert sorted(os.listdir(os.path.join(mountpoint))) == []
@unittest.skipUnless(llfuse, "llfuse not installed")

View File

@ -188,7 +188,7 @@ class ArchiverTestCase(ArchiverTestCaseBase):
"--list",
"--dry-run",
"--keep-daily=1",
"--glob-archives=foo-*",
"--match-archives=sh:foo-*",
)
assert re.search(r"Keeping archive \(rule: daily #1\):\s+foo-2015-08-12-20:00", output)
assert re.search(r"Would prune:\s+foo-2015-08-12-10:00", output)
@ -197,7 +197,7 @@ class ArchiverTestCase(ArchiverTestCaseBase):
self.assert_in("foo-2015-08-12-20:00", output)
self.assert_in("bar-2015-08-12-10:00", output)
self.assert_in("bar-2015-08-12-20:00", output)
self.cmd(f"--repo={self.repository_location}", "prune", "--keep-daily=1", "--glob-archives=foo-*")
self.cmd(f"--repo={self.repository_location}", "prune", "--keep-daily=1", "--match-archives=sh:foo-*")
output = self.cmd(f"--repo={self.repository_location}", "rlist")
self.assert_not_in("foo-2015-08-12-10:00", output)
self.assert_in("foo-2015-08-12-20:00", output)
@ -216,7 +216,7 @@ class ArchiverTestCase(ArchiverTestCaseBase):
"--list",
"--dry-run",
"--keep-daily=1",
"--glob-archives=2015-*-foo",
"--match-archives=sh:2015-*-foo",
)
assert re.search(r"Keeping archive \(rule: daily #1\):\s+2015-08-12-20:00-foo", output)
assert re.search(r"Would prune:\s+2015-08-12-10:00-foo", output)
@ -225,7 +225,7 @@ class ArchiverTestCase(ArchiverTestCaseBase):
self.assert_in("2015-08-12-20:00-foo", output)
self.assert_in("2015-08-12-10:00-bar", output)
self.assert_in("2015-08-12-20:00-bar", output)
self.cmd(f"--repo={self.repository_location}", "prune", "--keep-daily=1", "--glob-archives=2015-*-foo")
self.cmd(f"--repo={self.repository_location}", "prune", "--keep-daily=1", "--match-archives=sh:2015-*-foo")
output = self.cmd(f"--repo={self.repository_location}", "rlist")
self.assert_not_in("2015-08-12-10:00-foo", output)
self.assert_in("2015-08-12-20:00-foo", output)

View File

@ -19,7 +19,7 @@ class ArchiverTestCase(ArchiverTestCaseBase):
self.cmd(f"--repo={self.repository_location}", "create", "test-1", src_dir)
self.cmd(f"--repo={self.repository_location}", "create", "something-else-than-test-1", src_dir)
self.cmd(f"--repo={self.repository_location}", "create", "test-2", src_dir)
output = self.cmd(f"--repo={self.repository_location}", "rlist", "--glob-archives=test-*")
output = self.cmd(f"--repo={self.repository_location}", "rlist", "--match-archives=sh:test-*")
self.assert_in("test-1", output)
self.assert_in("test-2", output)
self.assert_not_in("something-else", output)

View File

@ -8,6 +8,7 @@ import pytest
from ..patterns import PathFullPattern, PathPrefixPattern, FnmatchPattern, ShellPattern, RegexPattern
from ..patterns import load_exclude_file, load_pattern_file
from ..patterns import parse_pattern, PatternMatcher
from ..patterns import get_regex_from_pattern
def check_patterns(files, pattern, expected):
@ -617,3 +618,18 @@ def test_pattern_matcher():
assert pm.match("z") == "B"
assert PatternMatcher(fallback="hey!").fallback == "hey!"
@pytest.mark.parametrize(
    "pattern, regex",
    [
        ("foo.bar", r"foo\.bar"),  # default is id:
        ("id:foo.bar", r"foo\.bar"),
        ("id:foo?", r"foo\?"),
        ("re:foo.bar", r"foo.bar"),
        ("re:.*(fooo?|bar|baz).*", r".*(fooo?|bar|baz).*"),
        ("sh:foo.*", r"foo\.[^\/]*"),
        # edge cases: empty input, empty remainder, nested / unknown selectors
        ("", r""),
        ("id:", r""),
        ("id:re:foo", r"re:foo"),  # only the first selector counts, the rest is literal
        ("xx:foo", r"xx:foo"),  # unknown selector: whole string is an id: pattern
    ],
)
def test_regex_from_pattern(pattern, regex):
    """get_regex_from_pattern must produce exactly the expected regex source for each style."""
    assert get_regex_from_pattern(pattern) == regex