mirror of
https://github.com/borgbackup/borg.git
synced 2025-01-01 12:45:34 +00:00
Merge pull request #8436 from ThomasWaldmann/analyze-cmd
analyze: changed chunks per directory
This commit is contained in:
commit
8cd951f324
7 changed files with 356 additions and 0 deletions
91
docs/man/borg-analyze.1
Normal file
91
docs/man/borg-analyze.1
Normal file
|
@ -0,0 +1,91 @@
|
||||||
|
.\" Man page generated from reStructuredText.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.nr rst2man-indent-level 0
|
||||||
|
.
|
||||||
|
.de1 rstReportMargin
|
||||||
|
\\$1 \\n[an-margin]
|
||||||
|
level \\n[rst2man-indent-level]
|
||||||
|
level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
|
||||||
|
-
|
||||||
|
\\n[rst2man-indent0]
|
||||||
|
\\n[rst2man-indent1]
|
||||||
|
\\n[rst2man-indent2]
|
||||||
|
..
|
||||||
|
.de1 INDENT
|
||||||
|
.\" .rstReportMargin pre:
|
||||||
|
. RS \\$1
|
||||||
|
. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin]
|
||||||
|
. nr rst2man-indent-level +1
|
||||||
|
.\" .rstReportMargin post:
|
||||||
|
..
|
||||||
|
.de UNINDENT
|
||||||
|
. RE
|
||||||
|
.\" indent \\n[an-margin]
|
||||||
|
.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]]
|
||||||
|
.nr rst2man-indent-level -1
|
||||||
|
.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
|
||||||
|
.in \\n[rst2man-indent\\n[rst2man-indent-level]]u
|
||||||
|
..
|
||||||
|
.TH "BORG-ANALYZE" 1 "2024-10-02" "" "borg backup tool"
|
||||||
|
.SH NAME
|
||||||
|
borg-analyze \- Analyze archives
|
||||||
|
.SH SYNOPSIS
|
||||||
|
.sp
|
||||||
|
borg [common options] analyze [options]
|
||||||
|
.SH DESCRIPTION
|
||||||
|
.sp
|
||||||
|
Analyze archives to find \(dqhot spots\(dq.
|
||||||
|
.sp
|
||||||
|
Borg analyze relies on the usual archive matching options to select the
|
||||||
|
archives that should be considered for analysis (e.g. \fB\-a series_name\fP).
|
||||||
|
Then it iterates over all matching archives, over all contained files and
|
||||||
|
collects information about chunks stored in all directories it encountered.
|
||||||
|
.sp
|
||||||
|
It considers chunk IDs and their plaintext sizes (we don\(aqt have the compressed
|
||||||
|
size in the repository easily available) and adds up added/removed chunks\(aq
|
||||||
|
sizes per direct parent directory and outputs a list of \(dqdirectory: size\(dq.
|
||||||
|
.sp
|
||||||
|
You can use that list to find directories with a lot of \(dqactivity\(dq \- maybe
|
||||||
|
some of these are temporary or cache directories you did forget to exclude.
|
||||||
|
.sp
|
||||||
|
To not have these unwanted directories in your backups, you could carefully
|
||||||
|
exclude these in \fBborg create\fP (for future backups) or use \fBborg recreate\fP
|
||||||
|
to re\-create existing archives without these.
|
||||||
|
.SH OPTIONS
|
||||||
|
.sp
|
||||||
|
See \fIborg\-common(1)\fP for common options of Borg commands.
|
||||||
|
.SS Archive filters
|
||||||
|
.INDENT 0.0
|
||||||
|
.TP
|
||||||
|
.BI \-a \ PATTERN\fR,\fB \ \-\-match\-archives \ PATTERN
|
||||||
|
only consider archives matching all patterns. see \(dqborg help match\-archives\(dq.
|
||||||
|
.TP
|
||||||
|
.BI \-\-sort\-by \ KEYS
|
||||||
|
Comma\-separated list of sorting keys; valid keys are: timestamp, archive, name, id, tags, host, user; default is: timestamp
|
||||||
|
.TP
|
||||||
|
.BI \-\-first \ N
|
||||||
|
consider first N archives after other filters were applied
|
||||||
|
.TP
|
||||||
|
.BI \-\-last \ N
|
||||||
|
consider last N archives after other filters were applied
|
||||||
|
.TP
|
||||||
|
.BI \-\-oldest \ TIMESPAN
|
||||||
|
consider archives between the oldest archive\(aqs timestamp and (oldest + TIMESPAN), e.g. 7d or 12m.
|
||||||
|
.TP
|
||||||
|
.BI \-\-newest \ TIMESPAN
|
||||||
|
consider archives between the newest archive\(aqs timestamp and (newest \- TIMESPAN), e.g. 7d or 12m.
|
||||||
|
.TP
|
||||||
|
.BI \-\-older \ TIMESPAN
|
||||||
|
consider archives older than (now \- TIMESPAN), e.g. 7d or 12m.
|
||||||
|
.TP
|
||||||
|
.BI \-\-newer \ TIMESPAN
|
||||||
|
consider archives newer than (now \- TIMESPAN), e.g. 7d or 12m.
|
||||||
|
.UNINDENT
|
||||||
|
.SH SEE ALSO
|
||||||
|
.sp
|
||||||
|
\fIborg\-common(1)\fP
|
||||||
|
.SH AUTHOR
|
||||||
|
The Borg Collective
|
||||||
|
.\" Generated by docutils manpage writer.
|
||||||
|
.
|
|
@ -57,6 +57,7 @@ Usage
|
||||||
usage/delete
|
usage/delete
|
||||||
usage/prune
|
usage/prune
|
||||||
usage/info
|
usage/info
|
||||||
|
usage/analyze
|
||||||
usage/mount
|
usage/mount
|
||||||
usage/recreate
|
usage/recreate
|
||||||
usage/tar
|
usage/tar
|
||||||
|
|
1
docs/usage/analyze.rst
Normal file
1
docs/usage/analyze.rst
Normal file
|
@ -0,0 +1 @@
|
||||||
|
.. include:: analyze.rst.inc
|
84
docs/usage/analyze.rst.inc
Normal file
84
docs/usage/analyze.rst.inc
Normal file
|
@ -0,0 +1,84 @@
|
||||||
|
.. IMPORTANT: this file is auto-generated from borg's built-in help, do not edit!
|
||||||
|
|
||||||
|
.. _borg_analyze:
|
||||||
|
|
||||||
|
borg analyze
|
||||||
|
------------
|
||||||
|
.. code-block:: none
|
||||||
|
|
||||||
|
borg [common options] analyze [options]
|
||||||
|
|
||||||
|
.. only:: html
|
||||||
|
|
||||||
|
.. class:: borg-options-table
|
||||||
|
|
||||||
|
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
| .. class:: borg-common-opt-ref |
|
||||||
|
| |
|
||||||
|
| :ref:`common_options` |
|
||||||
|
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
| **Archive filters** — Archive filters can be applied to repository targets. |
|
||||||
|
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
| | ``-a PATTERN``, ``--match-archives PATTERN`` | only consider archives matching all patterns. see "borg help match-archives". |
|
||||||
|
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
| | ``--sort-by KEYS`` | Comma-separated list of sorting keys; valid keys are: timestamp, archive, name, id, tags, host, user; default is: timestamp |
|
||||||
|
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
| | ``--first N`` | consider first N archives after other filters were applied |
|
||||||
|
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
| | ``--last N`` | consider last N archives after other filters were applied |
|
||||||
|
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
| | ``--oldest TIMESPAN`` | consider archives between the oldest archive's timestamp and (oldest + TIMESPAN), e.g. 7d or 12m. |
|
||||||
|
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
| | ``--newest TIMESPAN`` | consider archives between the newest archive's timestamp and (newest - TIMESPAN), e.g. 7d or 12m. |
|
||||||
|
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
| | ``--older TIMESPAN`` | consider archives older than (now - TIMESPAN), e.g. 7d or 12m. |
|
||||||
|
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
| | ``--newer TIMESPAN`` | consider archives newer than (now - TIMESPAN), e.g. 7d or 12m. |
|
||||||
|
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
|
||||||
|
.. raw:: html
|
||||||
|
|
||||||
|
<script type='text/javascript'>
|
||||||
|
$(document).ready(function () {
|
||||||
|
$('.borg-options-table colgroup').remove();
|
||||||
|
})
|
||||||
|
</script>
|
||||||
|
|
||||||
|
.. only:: latex
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
:ref:`common_options`
|
||||||
|
|
|
||||||
|
|
||||||
|
Archive filters
|
||||||
|
-a PATTERN, --match-archives PATTERN only consider archives matching all patterns. see "borg help match-archives".
|
||||||
|
--sort-by KEYS Comma-separated list of sorting keys; valid keys are: timestamp, archive, name, id, tags, host, user; default is: timestamp
|
||||||
|
--first N consider first N archives after other filters were applied
|
||||||
|
--last N consider last N archives after other filters were applied
|
||||||
|
--oldest TIMESPAN consider archives between the oldest archive's timestamp and (oldest + TIMESPAN), e.g. 7d or 12m.
|
||||||
|
--newest TIMESPAN consider archives between the newest archive's timestamp and (newest - TIMESPAN), e.g. 7d or 12m.
|
||||||
|
--older TIMESPAN consider archives older than (now - TIMESPAN), e.g. 7d or 12m.
|
||||||
|
--newer TIMESPAN consider archives newer than (now - TIMESPAN), e.g. 7d or 12m.
|
||||||
|
|
||||||
|
|
||||||
|
Description
|
||||||
|
~~~~~~~~~~~
|
||||||
|
|
||||||
|
Analyze archives to find "hot spots".
|
||||||
|
|
||||||
|
Borg analyze relies on the usual archive matching options to select the
|
||||||
|
archives that should be considered for analysis (e.g. ``-a series_name``).
|
||||||
|
Then it iterates over all matching archives, over all contained files and
|
||||||
|
collects information about chunks stored in all directories it encountered.
|
||||||
|
|
||||||
|
It considers chunk IDs and their plaintext sizes (we don't have the compressed
|
||||||
|
size in the repository easily available) and adds up added/removed chunks'
|
||||||
|
sizes per direct parent directory and outputs a list of "directory: size".
|
||||||
|
|
||||||
|
You can use that list to find directories with a lot of "activity" - maybe
|
||||||
|
some of these are temporary or cache directories you did forget to exclude.
|
||||||
|
|
||||||
|
To not have these unwanted directories in your backups, you could carefully
|
||||||
|
exclude these in ``borg create`` (for future backups) or use ``borg recreate``
|
||||||
|
to re-create existing archives without these.
|
|
@ -64,6 +64,7 @@ def get_func(args):
|
||||||
raise Exception("expected func attributes not found")
|
raise Exception("expected func attributes not found")
|
||||||
|
|
||||||
|
|
||||||
|
from .analyze_cmd import AnalyzeMixIn
|
||||||
from .benchmark_cmd import BenchmarkMixIn
|
from .benchmark_cmd import BenchmarkMixIn
|
||||||
from .check_cmd import CheckMixIn
|
from .check_cmd import CheckMixIn
|
||||||
from .compact_cmd import CompactMixIn
|
from .compact_cmd import CompactMixIn
|
||||||
|
@ -94,6 +95,7 @@ def get_func(args):
|
||||||
|
|
||||||
|
|
||||||
class Archiver(
|
class Archiver(
|
||||||
|
AnalyzeMixIn,
|
||||||
BenchmarkMixIn,
|
BenchmarkMixIn,
|
||||||
CheckMixIn,
|
CheckMixIn,
|
||||||
CompactMixIn,
|
CompactMixIn,
|
||||||
|
@ -332,6 +334,7 @@ def build_parser(self):
|
||||||
|
|
||||||
subparsers = parser.add_subparsers(title="required arguments", metavar="<command>")
|
subparsers = parser.add_subparsers(title="required arguments", metavar="<command>")
|
||||||
|
|
||||||
|
self.build_parser_analyze(subparsers, common_parser, mid_common_parser)
|
||||||
self.build_parser_benchmarks(subparsers, common_parser, mid_common_parser)
|
self.build_parser_benchmarks(subparsers, common_parser, mid_common_parser)
|
||||||
self.build_parser_check(subparsers, common_parser, mid_common_parser)
|
self.build_parser_check(subparsers, common_parser, mid_common_parser)
|
||||||
self.build_parser_compact(subparsers, common_parser, mid_common_parser)
|
self.build_parser_compact(subparsers, common_parser, mid_common_parser)
|
||||||
|
|
135
src/borg/archiver/analyze_cmd.py
Normal file
135
src/borg/archiver/analyze_cmd.py
Normal file
|
@ -0,0 +1,135 @@
|
||||||
|
import argparse
|
||||||
|
from collections import defaultdict
|
||||||
|
import os
|
||||||
|
|
||||||
|
from ._common import with_repository, define_archive_filters_group
|
||||||
|
from ..archive import Archive
|
||||||
|
from ..constants import * # NOQA
|
||||||
|
from ..helpers import bin_to_hex, Error
|
||||||
|
from ..helpers import ProgressIndicatorPercent
|
||||||
|
from ..manifest import Manifest
|
||||||
|
from ..remote import RemoteRepository
|
||||||
|
from ..repository import Repository
|
||||||
|
|
||||||
|
from ..logger import create_logger
|
||||||
|
|
||||||
|
logger = create_logger()
|
||||||
|
|
||||||
|
|
||||||
|
class ArchiveAnalyzer:
    """Find per-directory "hot spots" across a series of archives.

    For each pair of consecutive archives (in the order produced by the archive
    filters), the plaintext sizes of chunks that were added or removed are summed
    up per direct parent directory.  ``report()`` prints the directories sorted
    by that total, largest first.
    """

    def __init__(self, args, repository, manifest):
        self.args = args
        self.repository = repository
        assert isinstance(repository, (Repository, RemoteRepository))
        self.manifest = manifest
        # directory path -> summed plaintext size of chunks added/removed there
        self.difference_by_path = defaultdict(int)

    def analyze(self):
        """Run the full analysis and print the report."""
        logger.info("Starting archives analysis...")
        self.analyze_archives()
        self.report()
        logger.info("Finished archives analysis.")

    def analyze_archives(self) -> None:
        """Analyze all archives matching the given selection criteria.

        Raises Error if fewer than 2 archives match (nothing to compare).
        """
        archive_infos = self.manifest.archives.list_considering(self.args)
        num_archives = len(archive_infos)
        if num_archives < 2:
            raise Error("Need at least 2 archives to analyze.")

        pi = ProgressIndicatorPercent(
            total=num_archives, msg="Analyzing archives %3.1f%%", step=0.1, msgid="analyze.analyze_archives"
        )
        i = 0
        info = archive_infos[i]
        pi.show(i)
        logger.info(f"Analyzing archive {info.name} {info.ts} {bin_to_hex(info.id)} ({i + 1}/{num_archives})")
        base = self.analyze_archive(info.id)
        # compare each archive against its predecessor and accumulate the differences
        for i, info in enumerate(archive_infos[1:]):
            pi.show(i + 1)
            logger.info(f"Analyzing archive {info.name} {info.ts} {bin_to_hex(info.id)} ({i + 2}/{num_archives})")
            new = self.analyze_archive(info.id)
            self.analyze_change(base, new)
            base = new
        pi.finish()

    def analyze_archive(self, id):
        """Compute the set of chunks for each directory in this archive.

        Returns a mapping: directory path -> {chunk id: plaintext size}.
        """
        archive = Archive(self.manifest, id)
        chunks_by_path = defaultdict(dict)  # collect all chunk IDs generated from files in this directory path
        for item in archive.iter_items():
            if "chunks" in item:
                item_chunks = dict(item.chunks)  # chunk id -> plaintext size
                directory_path = os.path.dirname(item.path)
                chunks_by_path[directory_path].update(item_chunks)
        return chunks_by_path

    def analyze_change(self, base, new):
        """For each directory path, sum up the changed (removed or added) chunks' sizes between base and new."""

        def analyze_path_change(path):
            # Use .get() instead of subscripting: base/new are defaultdicts and
            # a plain lookup would silently insert empty entries for paths that
            # exist in only one of the two archives.
            base_chunks = base.get(path, {})
            new_chunks = new.get(path, {})
            # add up added chunks' sizes
            for chunk_id in new_chunks.keys() - base_chunks.keys():
                self.difference_by_path[path] += new_chunks[chunk_id]
            # add up removed chunks' sizes
            for chunk_id in base_chunks.keys() - new_chunks.keys():
                self.difference_by_path[path] += base_chunks[chunk_id]

        for directory_path in base:
            analyze_path_change(directory_path)
        for directory_path in new:
            if directory_path not in base:
                analyze_path_change(directory_path)

    def report(self):
        """Print directories sorted by total size of changed chunks, largest first."""
        print()
        print("chunks added or removed by directory path")
        print("=========================================")
        for directory_path in sorted(self.difference_by_path, key=lambda p: self.difference_by_path[p], reverse=True):
            difference = self.difference_by_path[directory_path]
            print(f"{directory_path}: {difference}")
|
||||||
|
|
||||||
|
class AnalyzeMixIn:
    """Archiver mixin providing the ``borg analyze`` command."""

    @with_repository(compatibility=(Manifest.Operation.READ,))
    def do_analyze(self, args, repository, manifest):
        """Analyze archives"""
        ArchiveAnalyzer(args, repository, manifest).analyze()

    def build_parser_analyze(self, subparsers, common_parser, mid_common_parser):
        from ._common import process_epilog

        # Long-form help shown below the option list of "borg analyze --help".
        epilog_text = process_epilog(
            """
        Analyze archives to find "hot spots".

        Borg analyze relies on the usual archive matching options to select the
        archives that should be considered for analysis (e.g. ``-a series_name``).
        Then it iterates over all matching archives, over all contained files and
        collects information about chunks stored in all directories it encountered.

        It considers chunk IDs and their plaintext sizes (we don't have the compressed
        size in the repository easily available) and adds up added/removed chunks'
        sizes per direct parent directory and outputs a list of "directory: size".

        You can use that list to find directories with a lot of "activity" - maybe
        some of these are temporary or cache directories you did forget to exclude.

        To not have these unwanted directories in your backups, you could carefully
        exclude these in ``borg create`` (for future backups) or use ``borg recreate``
        to re-create existing archives without these.
        """
        )

        # Register the "analyze" subcommand; description comes from do_analyze's docstring.
        subparser = subparsers.add_parser(
            "analyze",
            parents=[common_parser],
            add_help=False,
            description=self.do_analyze.__doc__,
            epilog=epilog_text,
            formatter_class=argparse.RawDescriptionHelpFormatter,
            help="analyze archives",
        )
        subparser.set_defaults(func=self.do_analyze)
        # archive selection options (-a/--first/--last/--oldest/... ) shared with other commands
        define_archive_filters_group(subparser)
|
41
src/borg/testsuite/archiver/analyze_cmd.py
Normal file
41
src/borg/testsuite/archiver/analyze_cmd.py
Normal file
|
@ -0,0 +1,41 @@
|
||||||
|
import pathlib
|
||||||
|
|
||||||
|
from ...constants import * # NOQA
|
||||||
|
from . import cmd, generate_archiver_tests, RK_ENCRYPTION
|
||||||
|
|
||||||
|
pytest_generate_tests = lambda metafunc: generate_archiver_tests(metafunc, kinds="local") # NOQA
|
||||||
|
|
||||||
|
|
||||||
|
def test_analyze(archivers, request):
    """End-to-end check: the size reported per directory equals the summed sizes
    of chunks added/removed between consecutive archives of the series."""
    archiver = request.getfixturevalue(archivers)

    def _take_snapshot():
        cmd(archiver, "create", "archive", archiver.input_path)

    def _run_analyze():
        return cmd(archiver, "analyze", "-a", "archive")

    cmd(archiver, "repo-create", RK_ENCRYPTION)
    input_path = pathlib.Path(archiver.input_path)

    # 1st archive: baseline only, contributes nothing by itself
    (input_path / "file1").write_text("1")
    _take_snapshot()

    # 2nd archive: adds file2 (2 bytes)
    (input_path / "file2").write_text("22")
    _take_snapshot()
    assert "/input: 2" in _run_analyze()  # 2nd archive added 1 chunk for input path

    # 3rd archive: adds file3 (3 bytes) -> 2 + 3
    (input_path / "file3").write_text("333")
    _take_snapshot()
    assert "/input: 5" in _run_analyze()  # 2nd/3rd archives added 2 chunks for input path

    # 4th archive: removes file2 again (2 bytes) -> 5 + 2
    (input_path / "file2").unlink()
    _take_snapshot()
    assert "/input: 7" in _run_analyze()  # 2nd/3rd archives added 2, 4th archive removed 1
|
Loading…
Reference in a new issue