borg/src/borg/archiver/prune_cmd.py

366 lines
15 KiB
Python

import argparse
from collections import OrderedDict
from datetime import datetime, timezone, timedelta
import logging
from operator import attrgetter
import os
import re
from ._common import with_repository, Highlander
from ..archive import Archive, Statistics
from ..cache import Cache
from ..constants import * # NOQA
from ..helpers import ArchiveFormatter, interval, sig_int, log_multi, ProgressIndicatorPercent, CommandError, Error
from ..manifest import Manifest
from ..logger import create_logger
logger = create_logger()
def prune_within(archives, hours, kept_because):
target = datetime.now(timezone.utc) - timedelta(seconds=hours * 3600)
kept_counter = 0
result = []
for a in archives:
if a.ts > target:
kept_counter += 1
kept_because[a.id] = ("within", kept_counter)
result.append(a)
return result
PRUNING_PATTERNS = OrderedDict(
[
("secondly", "%Y-%m-%d %H:%M:%S"),
("minutely", "%Y-%m-%d %H:%M"),
("hourly", "%Y-%m-%d %H"),
("daily", "%Y-%m-%d"),
("weekly", "%G-%V"),
("monthly", "%Y-%m"),
("yearly", "%Y"),
]
)
def prune_split(archives, rule, n, kept_because=None):
last = None
keep = []
pattern = PRUNING_PATTERNS[rule]
if kept_because is None:
kept_because = {}
if n == 0:
return keep
a = None
for a in sorted(archives, key=attrgetter("ts"), reverse=True):
# we compute the pruning in local time zone
period = a.ts.astimezone().strftime(pattern)
if period != last:
last = period
if a.id not in kept_because:
keep.append(a)
kept_because[a.id] = (rule, len(keep))
if len(keep) == n:
break
# Keep oldest archive if we didn't reach the target retention count
if a is not None and len(keep) < n and a.id not in kept_because:
keep.append(a)
kept_because[a.id] = (rule + "[oldest]", len(keep))
return keep
class PruneMixIn:
@with_repository(exclusive=True, compatibility=(Manifest.Operation.DELETE,))
def do_prune(self, args, repository, manifest):
"""Prune repository archives according to specified rules"""
if not any(
(args.secondly, args.minutely, args.hourly, args.daily, args.weekly, args.monthly, args.yearly, args.within)
):
raise CommandError(
'At least one of the "keep-within", "keep-last", '
'"keep-secondly", "keep-minutely", "keep-hourly", "keep-daily", '
'"keep-weekly", "keep-monthly" or "keep-yearly" settings must be specified.'
)
if args.format is not None:
format = args.format
elif args.short:
format = "{archive}"
else:
format = os.environ.get("BORG_PRUNE_FORMAT", "{archive:<36} {time} [{id}]")
formatter = ArchiveFormatter(format, repository, manifest, manifest.key, iec=args.iec)
checkpoint_re = r"\.checkpoint(\.\d+)?"
archives_checkpoints = manifest.archives.list(
match=args.match_archives,
consider_checkpoints=True,
match_end=r"(%s)?\Z" % checkpoint_re,
sort_by=["ts"],
reverse=True,
)
is_checkpoint = re.compile(r"(%s)\Z" % checkpoint_re).search
checkpoints = [arch for arch in archives_checkpoints if is_checkpoint(arch.name)]
# keep the latest checkpoint, if there is no later non-checkpoint archive
if archives_checkpoints and checkpoints and archives_checkpoints[0] is checkpoints[0]:
keep_checkpoints = checkpoints[:1]
else:
keep_checkpoints = []
checkpoints = set(checkpoints)
# ignore all checkpoint archives to avoid keeping one (which is an incomplete backup)
# that is newer than a successfully completed backup - and killing the successful backup.
archives = [arch for arch in archives_checkpoints if arch not in checkpoints]
keep = []
# collect the rule responsible for the keeping of each archive in this dict
# keys are archive ids, values are a tuple
# (<rulename>, <how many archives were kept by this rule so far >)
kept_because = {}
# find archives which need to be kept because of the keep-within rule
if args.within:
keep += prune_within(archives, args.within, kept_because)
# find archives which need to be kept because of the various time period rules
for rule in PRUNING_PATTERNS.keys():
num = getattr(args, rule, None)
if num is not None:
keep += prune_split(archives, rule, num, kept_because)
to_delete = (set(archives) | checkpoints) - (set(keep) | set(keep_checkpoints))
stats = Statistics(iec=args.iec)
with Cache(repository, manifest, lock_wait=self.lock_wait, iec=args.iec) as cache:
def checkpoint_func():
manifest.write()
repository.commit(compact=False)
cache.commit()
list_logger = logging.getLogger("borg.output.list")
# set up counters for the progress display
to_delete_len = len(to_delete)
archives_deleted = 0
uncommitted_deletes = 0
pi = ProgressIndicatorPercent(total=len(to_delete), msg="Pruning archives %3.0f%%", msgid="prune")
for archive in archives_checkpoints:
if sig_int and sig_int.action_done():
break
if archive in to_delete:
pi.show()
if args.dry_run:
log_message = "Would prune:"
else:
archives_deleted += 1
log_message = "Pruning archive (%d/%d):" % (archives_deleted, to_delete_len)
archive = Archive(manifest, archive.name, cache)
archive.delete(stats, forced=args.forced)
checkpointed = self.maybe_checkpoint(
checkpoint_func=checkpoint_func, checkpoint_interval=args.checkpoint_interval
)
uncommitted_deletes = 0 if checkpointed else (uncommitted_deletes + 1)
else:
if is_checkpoint(archive.name):
log_message = "Keeping checkpoint archive:"
else:
log_message = "Keeping archive (rule: {rule} #{num}):".format(
rule=kept_because[archive.id][0], num=kept_because[archive.id][1]
)
if (
args.output_list
or (args.list_pruned and archive in to_delete)
or (args.list_kept and archive not in to_delete)
):
list_logger.info(f"{log_message:<44} {formatter.format_item(archive, jsonline=False)}")
pi.finish()
if sig_int:
# Ctrl-C / SIGINT: do not checkpoint (commit) again, we already have a checkpoint in this case.
raise Error("Got Ctrl-C / SIGINT.")
elif uncommitted_deletes > 0:
checkpoint_func()
if args.stats:
log_multi(str(stats), logger=logging.getLogger("borg.output.stats"))
def build_parser_prune(self, subparsers, common_parser, mid_common_parser):
from ._common import process_epilog
from ._common import define_archive_filters_group
prune_epilog = process_epilog(
"""
The prune command prunes a repository by deleting all archives not matching
any of the specified retention options.
Important: Repository disk space is **not** freed until you run ``borg compact``.
This command is normally used by automated backup scripts wanting to keep a
certain number of historic backups. This retention policy is commonly referred to as
`GFS <https://en.wikipedia.org/wiki/Backup_rotation_scheme#Grandfather-father-son>`_
(Grandfather-father-son) backup rotation scheme.
Also, prune automatically removes checkpoint archives (incomplete archives left
behind by interrupted backup runs) except if the checkpoint is the latest
archive (and thus still needed). Checkpoint archives are not considered when
comparing archive counts against the retention limits (``--keep-X``).
If you use --match-archives (-a), then only archives that match the pattern are
considered for deletion and only those archives count towards the totals
specified by the rules.
Otherwise, *all* archives in the repository are candidates for deletion!
There is no automatic distinction between archives representing different
contents. These need to be distinguished by specifying matching globs.
If you have multiple sequences of archives with different data sets (e.g.
from different machines) in one shared repository, use one prune call per
data set that matches only the respective archives using the --match-archives
(-a) option.
The ``--keep-within`` option takes an argument of the form "<int><char>",
where char is "H", "d", "w", "m", "y". For example, ``--keep-within 2d`` means
to keep all archives that were created within the past 48 hours.
"1m" is taken to mean "31d". The archives kept with this option do not
count towards the totals specified by any other options.
A good procedure is to thin out more and more the older your backups get.
As an example, ``--keep-daily 7`` means to keep the latest backup on each day,
up to 7 most recent days with backups (days without backups do not count).
The rules are applied from secondly to yearly, and backups selected by previous
rules do not count towards those of later rules. The time that each backup
starts is used for pruning purposes. Dates and times are interpreted in the local
timezone of the system where borg prune runs, and weeks go from Monday to Sunday.
Specifying a negative number of archives to keep means that there is no limit.
As of borg 1.2.0, borg will retain the oldest archive if any of the secondly,
minutely, hourly, daily, weekly, monthly, or yearly rules was not otherwise able to
meet its retention target. This enables the first chronological archive to continue
aging until it is replaced by a newer archive that meets the retention criteria.
The ``--keep-last N`` option is doing the same as ``--keep-secondly N`` (and it will
keep the last N archives under the assumption that you do not create more than one
backup archive in the same second).
When using ``--stats``, you will get some statistics about how much data was
deleted - the "Deleted data" deduplicated size there is most interesting as
that is how much your repository will shrink.
Please note that the "All archives" stats refer to the state after pruning.
You can influence how the ``--list`` output is formatted by using the ``--short``
option (less wide output) or by giving a custom format using ``--format`` (see
the ``borg rlist`` description for more details about the format string).
"""
)
subparser = subparsers.add_parser(
"prune",
parents=[common_parser],
add_help=False,
description=self.do_prune.__doc__,
epilog=prune_epilog,
formatter_class=argparse.RawDescriptionHelpFormatter,
help="prune archives",
)
subparser.set_defaults(func=self.do_prune)
subparser.add_argument("-n", "--dry-run", dest="dry_run", action="store_true", help="do not change repository")
subparser.add_argument(
"--force",
dest="forced",
action="store_true",
help="force pruning of corrupted archives, " "use ``--force --force`` in case ``--force`` does not work.",
)
subparser.add_argument(
"-s", "--stats", dest="stats", action="store_true", help="print statistics for the deleted archive"
)
subparser.add_argument(
"--list", dest="output_list", action="store_true", help="output verbose list of archives it keeps/prunes"
)
subparser.add_argument("--short", dest="short", action="store_true", help="use a less wide archive part format")
subparser.add_argument(
"--list-pruned", dest="list_pruned", action="store_true", help="output verbose list of archives it prunes"
)
subparser.add_argument(
"--list-kept", dest="list_kept", action="store_true", help="output verbose list of archives it keeps"
)
subparser.add_argument(
"--format",
metavar="FORMAT",
dest="format",
action=Highlander,
help="specify format for the archive part " '(default: "{archive:<36} {time} [{id}]")',
)
subparser.add_argument(
"--keep-within",
metavar="INTERVAL",
dest="within",
type=interval,
action=Highlander,
help="keep all archives within this time interval",
)
subparser.add_argument(
"--keep-last",
"--keep-secondly",
dest="secondly",
type=int,
default=0,
action=Highlander,
help="number of secondly archives to keep",
)
subparser.add_argument(
"--keep-minutely",
dest="minutely",
type=int,
default=0,
action=Highlander,
help="number of minutely archives to keep",
)
subparser.add_argument(
"-H",
"--keep-hourly",
dest="hourly",
type=int,
default=0,
action=Highlander,
help="number of hourly archives to keep",
)
subparser.add_argument(
"-d",
"--keep-daily",
dest="daily",
type=int,
default=0,
action=Highlander,
help="number of daily archives to keep",
)
subparser.add_argument(
"-w",
"--keep-weekly",
dest="weekly",
type=int,
default=0,
action=Highlander,
help="number of weekly archives to keep",
)
subparser.add_argument(
"-m",
"--keep-monthly",
dest="monthly",
type=int,
default=0,
action=Highlander,
help="number of monthly archives to keep",
)
subparser.add_argument(
"-y",
"--keep-yearly",
dest="yearly",
type=int,
default=0,
action=Highlander,
help="number of yearly archives to keep",
)
define_archive_filters_group(subparser, sort_by=False, first_last=False)
subparser.add_argument(
"-c",
"--checkpoint-interval",
metavar="SECONDS",
dest="checkpoint_interval",
type=int,
default=1800,
action=Highlander,
help="write checkpoint every SECONDS seconds (Default: 1800)",
)