From de439ee8392b3b55a3d7f3777eb859a10682b85a Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sun, 29 Sep 2024 15:54:37 +0200 Subject: [PATCH] analyze: sum up changed chunks per parent directory --- docs/man/borg-analyze.1 | 91 ++++++++++++++ docs/usage.rst | 1 + docs/usage/analyze.rst | 1 + docs/usage/analyze.rst.inc | 84 +++++++++++++ src/borg/archiver/__init__.py | 3 + src/borg/archiver/analyze_cmd.py | 135 +++++++++++++++++++++ src/borg/testsuite/archiver/analyze_cmd.py | 41 +++++++ 7 files changed, 356 insertions(+) create mode 100644 docs/man/borg-analyze.1 create mode 100644 docs/usage/analyze.rst create mode 100644 docs/usage/analyze.rst.inc create mode 100644 src/borg/archiver/analyze_cmd.py create mode 100644 src/borg/testsuite/archiver/analyze_cmd.py diff --git a/docs/man/borg-analyze.1 b/docs/man/borg-analyze.1 new file mode 100644 index 000000000..0cc435411 --- /dev/null +++ b/docs/man/borg-analyze.1 @@ -0,0 +1,91 @@ +.\" Man page generated from reStructuredText. +. +. +.nr rst2man-indent-level 0 +. +.de1 rstReportMargin +\\$1 \\n[an-margin] +level \\n[rst2man-indent-level] +level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] +- +\\n[rst2man-indent0] +\\n[rst2man-indent1] +\\n[rst2man-indent2] +.. +.de1 INDENT +.\" .rstReportMargin pre: +. RS \\$1 +. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] +. nr rst2man-indent-level +1 +.\" .rstReportMargin post: +.. +.de UNINDENT +. RE +.\" indent \\n[an-margin] +.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] +.nr rst2man-indent-level -1 +.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] +.in \\n[rst2man-indent\\n[rst2man-indent-level]]u +.. +.TH "BORG-ANALYZE" 1 "2024-10-02" "" "borg backup tool" +.SH NAME +borg-analyze \- Analyze archives +.SH SYNOPSIS +.sp +borg [common options] analyze [options] +.SH DESCRIPTION +.sp +Analyze archives to find \(dqhot spots\(dq. 
+.sp +Borg analyze relies on the usual archive matching options to select the +archives that should be considered for analysis (e.g. \fB\-a series_name\fP). +Then it iterates over all matching archives, over all contained files and +collects information about chunks stored in all directories it encountered. +.sp +It considers chunk IDs and their plaintext sizes (we don\(aqt have the compressed +size in the repository easily available) and adds up added/removed chunks\(aq +sizes per direct parent directory and outputs a list of \(dqdirectory: size\(dq. +.sp +You can use that list to find directories with a lot of \(dqactivity\(dq \- maybe +some of these are temporary or cache directories you did forget to exclude. +.sp +To not have these unwanted directories in your backups, you could carefully +exclude these in \fBborg create\fP (for future backups) or use \fBborg recreate\fP +to re\-create existing archives without these. +.SH OPTIONS +.sp +See \fIborg\-common(1)\fP for common options of Borg commands. +.SS Archive filters +.INDENT 0.0 +.TP +.BI \-a \ PATTERN\fR,\fB \ \-\-match\-archives \ PATTERN +only consider archives matching all patterns. see \(dqborg help match\-archives\(dq. +.TP +.BI \-\-sort\-by \ KEYS +Comma\-separated list of sorting keys; valid keys are: timestamp, archive, name, id, tags, host, user; default is: timestamp +.TP +.BI \-\-first \ N +consider first N archives after other filters were applied +.TP +.BI \-\-last \ N +consider last N archives after other filters were applied +.TP +.BI \-\-oldest \ TIMESPAN +consider archives between the oldest archive\(aqs timestamp and (oldest + TIMESPAN), e.g. 7d or 12m. +.TP +.BI \-\-newest \ TIMESPAN +consider archives between the newest archive\(aqs timestamp and (newest \- TIMESPAN), e.g. 7d or 12m. +.TP +.BI \-\-older \ TIMESPAN +consider archives older than (now \- TIMESPAN), e.g. 7d or 12m. +.TP +.BI \-\-newer \ TIMESPAN +consider archives newer than (now \- TIMESPAN), e.g. 7d or 12m. 
+.UNINDENT +.SH SEE ALSO +.sp +\fIborg\-common(1)\fP +.SH AUTHOR +The Borg Collective +.\" Generated by docutils manpage writer. +. diff --git a/docs/usage.rst b/docs/usage.rst index 75d0b90de..1cceadb5c 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -57,6 +57,7 @@ Usage usage/delete usage/prune usage/info + usage/analyze usage/mount usage/recreate usage/tar diff --git a/docs/usage/analyze.rst b/docs/usage/analyze.rst new file mode 100644 index 000000000..c3244d917 --- /dev/null +++ b/docs/usage/analyze.rst @@ -0,0 +1 @@ +.. include:: analyze.rst.inc diff --git a/docs/usage/analyze.rst.inc b/docs/usage/analyze.rst.inc new file mode 100644 index 000000000..f8c947438 --- /dev/null +++ b/docs/usage/analyze.rst.inc @@ -0,0 +1,84 @@ +.. IMPORTANT: this file is auto-generated from borg's built-in help, do not edit! + +.. _borg_analyze: + +borg analyze +------------ +.. code-block:: none + + borg [common options] analyze [options] + +.. only:: html + + .. class:: borg-options-table + + +-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ + | .. class:: borg-common-opt-ref | + | | + | :ref:`common_options` | + +-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ + | **Archive filters** — Archive filters can be applied to repository targets. 
| + +-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ + | | ``-a PATTERN``, ``--match-archives PATTERN`` | only consider archives matching all patterns. see "borg help match-archives". | + +-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ + | | ``--sort-by KEYS`` | Comma-separated list of sorting keys; valid keys are: timestamp, archive, name, id, tags, host, user; default is: timestamp | + +-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ + | | ``--first N`` | consider first N archives after other filters were applied | + +-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ + | | ``--last N`` | consider last N archives after other filters were applied | + +-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ + | | ``--oldest TIMESPAN`` | consider archives between the oldest archive's timestamp and (oldest + TIMESPAN), e.g. 7d or 12m. 
| + +-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ + | | ``--newest TIMESPAN`` | consider archives between the newest archive's timestamp and (newest - TIMESPAN), e.g. 7d or 12m. | + +-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ + | | ``--older TIMESPAN`` | consider archives older than (now - TIMESPAN), e.g. 7d or 12m. | + +-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ + | | ``--newer TIMESPAN`` | consider archives newer than (now - TIMESPAN), e.g. 7d or 12m. | + +-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ + + .. raw:: html + + + +.. only:: latex + + + + :ref:`common_options` + | + + Archive filters + -a PATTERN, --match-archives PATTERN only consider archives matching all patterns. see "borg help match-archives". + --sort-by KEYS Comma-separated list of sorting keys; valid keys are: timestamp, archive, name, id, tags, host, user; default is: timestamp + --first N consider first N archives after other filters were applied + --last N consider last N archives after other filters were applied + --oldest TIMESPAN consider archives between the oldest archive's timestamp and (oldest + TIMESPAN), e.g. 7d or 12m. 
+ --newest TIMESPAN consider archives between the newest archive's timestamp and (newest - TIMESPAN), e.g. 7d or 12m. + --older TIMESPAN consider archives older than (now - TIMESPAN), e.g. 7d or 12m. + --newer TIMESPAN consider archives newer than (now - TIMESPAN), e.g. 7d or 12m. + + +Description +~~~~~~~~~~~ + +Analyze archives to find "hot spots". + +Borg analyze relies on the usual archive matching options to select the +archives that should be considered for analysis (e.g. ``-a series_name``). +Then it iterates over all matching archives, over all contained files and +collects information about chunks stored in all directories it encountered. + +It considers chunk IDs and their plaintext sizes (we don't have the compressed +size in the repository easily available) and adds up added/removed chunks' +sizes per direct parent directory and outputs a list of "directory: size". + +You can use that list to find directories with a lot of "activity" - maybe +some of these are temporary or cache directories you did forget to exclude. + +To not have these unwanted directories in your backups, you could carefully +exclude these in ``borg create`` (for future backups) or use ``borg recreate`` +to re-create existing archives without these. 
\ No newline at end of file
diff --git a/src/borg/archiver/__init__.py b/src/borg/archiver/__init__.py
index c9e1236fa..b3f0b308c 100644
--- a/src/borg/archiver/__init__.py
+++ b/src/borg/archiver/__init__.py
@@ -64,6 +64,7 @@ def get_func(args):
     raise Exception("expected func attributes not found")
 
 
+from .analyze_cmd import AnalyzeMixIn
 from .benchmark_cmd import BenchmarkMixIn
 from .check_cmd import CheckMixIn
 from .compact_cmd import CompactMixIn
@@ -94,6 +95,7 @@ def get_func(args):
 
 
 class Archiver(
+    AnalyzeMixIn,
     BenchmarkMixIn,
     CheckMixIn,
     CompactMixIn,
@@ -332,6 +334,7 @@ def build_parser(self):
 
         subparsers = parser.add_subparsers(title="required arguments", metavar="")
 
+        self.build_parser_analyze(subparsers, common_parser, mid_common_parser)
         self.build_parser_benchmarks(subparsers, common_parser, mid_common_parser)
         self.build_parser_check(subparsers, common_parser, mid_common_parser)
         self.build_parser_compact(subparsers, common_parser, mid_common_parser)
diff --git a/src/borg/archiver/analyze_cmd.py b/src/borg/archiver/analyze_cmd.py
new file mode 100644
index 000000000..a378c2de0
--- /dev/null
+++ b/src/borg/archiver/analyze_cmd.py
@@ -0,0 +1,141 @@
+import argparse
+from collections import defaultdict
+import os
+
+from ._common import with_repository, define_archive_filters_group
+from ..archive import Archive
+from ..constants import *  # NOQA
+from ..helpers import bin_to_hex, Error
+from ..helpers import ProgressIndicatorPercent
+from ..manifest import Manifest
+from ..remote import RemoteRepository
+from ..repository import Repository
+
+from ..logger import create_logger
+
+logger = create_logger()
+
+
+class ArchiveAnalyzer:
+    """Sum up, per directory, the plaintext sizes of chunks added/removed between consecutive archives."""
+
+    def __init__(self, args, repository, manifest):
+        self.args = args
+        self.repository = repository
+        assert isinstance(repository, (Repository, RemoteRepository))
+        self.manifest = manifest
+        self.difference_by_path = defaultdict(int)  # directory path -> summed sizes of added/removed chunks
+
+    def analyze(self):
+        """Analyze the selected archives and print the per-directory report."""
+        logger.info("Starting archives analysis...")
+        self.analyze_archives()
+        self.report()
+        logger.info("Finished archives analysis.")
+
+    def analyze_archives(self) -> None:
+        """Analyze all archives matching the given selection criteria.
+
+        Each archive is compared to the one listed directly before it.
+        Raises Error if fewer than 2 archives were selected.
+        """
+        archive_infos = self.manifest.archives.list_considering(self.args)
+        num_archives = len(archive_infos)
+        if num_archives < 2:
+            raise Error("Need at least 2 archives to analyze.")
+
+        pi = ProgressIndicatorPercent(
+            total=num_archives, msg="Analyzing archives %3.1f%%", step=0.1, msgid="analyze.analyze_archives"
+        )
+        base = None  # chunks by directory of the previously analyzed archive
+        for i, info in enumerate(archive_infos):
+            pi.show(i)
+            logger.info(f"Analyzing archive {info.name} {info.ts} {bin_to_hex(info.id)} ({i + 1}/{num_archives})")
+            new = self.analyze_archive(info.id)
+            if base is not None:  # the first archive only serves as the initial base
+                self.analyze_change(base, new)
+            base = new
+        pi.finish()
+
+    def analyze_archive(self, id):
+        """compute the chunks (id -> plaintext size) for each directory in this archive"""
+        archive = Archive(self.manifest, id)
+        chunks_by_path = defaultdict(dict)  # collect all chunk IDs generated from files in this directory path
+        for item in archive.iter_items():
+            if "chunks" in item:
+                item_chunks = dict(item.chunks)  # chunk id -> plaintext size
+                directory_path = os.path.dirname(item.path)
+                chunks_by_path[directory_path].update(item_chunks)
+        return chunks_by_path
+
+    def analyze_change(self, base, new):
+        """for each directory path, sum up the changed (removed or added) chunks' sizes between base and new."""
+
+        def analyze_path_change(path):
+            # use .get(): base and new are defaultdicts, indexing them would insert empty entries
+            base_chunks = base.get(path, {})
+            new_chunks = new.get(path, {})
+            # add up added chunks' sizes
+            for id in new_chunks.keys() - base_chunks.keys():
+                self.difference_by_path[path] += new_chunks[id]
+            # add up removed chunks' sizes
+            for id in base_chunks.keys() - new_chunks.keys():
+                self.difference_by_path[path] += base_chunks[id]
+
+        for directory_path in base:
+            analyze_path_change(directory_path)
+        for directory_path in new:
+            if directory_path not in base:
+                analyze_path_change(directory_path)
+
+    def report(self):
+        """Print "directory: size" lines, largest summed size first."""
+        print()
+        print("chunks added or removed by directory path")
+        print("=========================================")
+        for directory_path in sorted(self.difference_by_path, key=lambda p: self.difference_by_path[p], reverse=True):
+            difference = self.difference_by_path[directory_path]
+            print(f"{directory_path}: {difference}")
+
+
+class AnalyzeMixIn:
+    @with_repository(compatibility=(Manifest.Operation.READ,))
+    def do_analyze(self, args, repository, manifest):
+        """Analyze archives"""
+        ArchiveAnalyzer(args, repository, manifest).analyze()
+
+    def build_parser_analyze(self, subparsers, common_parser, mid_common_parser):
+        from ._common import process_epilog
+
+        analyze_epilog = process_epilog(
+            """
+        Analyze archives to find "hot spots".
+
+        Borg analyze relies on the usual archive matching options to select the
+        archives that should be considered for analysis (e.g. ``-a series_name``).
+        Then it iterates over all matching archives, over all contained files and
+        collects information about chunks stored in all directories it encountered.
+
+        It considers chunk IDs and their plaintext sizes (we don't have the compressed
+        size in the repository easily available) and adds up added/removed chunks'
+        sizes per direct parent directory and outputs a list of "directory: size".
+
+        You can use that list to find directories with a lot of "activity" - maybe
+        some of these are temporary or cache directories you did forget to exclude.
+
+        To not have these unwanted directories in your backups, you could carefully
+        exclude these in ``borg create`` (for future backups) or use ``borg recreate``
+        to re-create existing archives without these.
+        """
+        )
+        subparser = subparsers.add_parser(
+            "analyze",
+            parents=[common_parser],
+            add_help=False,
+            description=self.do_analyze.__doc__,
+            epilog=analyze_epilog,
+            formatter_class=argparse.RawDescriptionHelpFormatter,
+            help="analyze archives",
+        )
+        subparser.set_defaults(func=self.do_analyze)
+        define_archive_filters_group(subparser)
diff --git a/src/borg/testsuite/archiver/analyze_cmd.py b/src/borg/testsuite/archiver/analyze_cmd.py
new file mode 100644
index 000000000..eb6f6463c
--- /dev/null
+++ b/src/borg/testsuite/archiver/analyze_cmd.py
@@ -0,0 +1,41 @@
+import pathlib
+
+from ...constants import *  # NOQA
+from . import cmd, generate_archiver_tests, RK_ENCRYPTION
+
+pytest_generate_tests = lambda metafunc: generate_archiver_tests(metafunc, kinds="local")  # NOQA
+
+
+def test_analyze(archivers, request):
+    def create_archive():
+        cmd(archiver, "create", "archive", archiver.input_path)
+
+    def analyze_archives():
+        return cmd(archiver, "analyze", "-a", "archive")
+
+    archiver = request.getfixturevalue(archivers)
+
+    cmd(archiver, "repo-create", RK_ENCRYPTION)
+    input_path = pathlib.Path(archiver.input_path)
+
+    # 1st archive
+    (input_path / "file1").write_text("1")
+    create_archive()
+
+    # 2nd archive
+    (input_path / "file2").write_text("22")
+    create_archive()
+
+    assert "/input: 2" in analyze_archives()  # 2nd archive added 1 chunk (2 bytes) for input path
+
+    # 3rd archive
+    (input_path / "file3").write_text("333")
+    create_archive()
+
+    assert "/input: 5" in analyze_archives()  # 2nd/3rd archives added 2 chunks (2 + 3 bytes) for input path
+
+    # 4th archive
+    (input_path / "file2").unlink()
+    create_archive()
+
+    assert "/input: 7" in analyze_archives()  # 2nd/3rd archives added 2 + 3 bytes, 4th archive removed 2 bytes