diff --git a/docs/man/borg-analyze.1 b/docs/man/borg-analyze.1
new file mode 100644
index 000000000..0cc435411
--- /dev/null
+++ b/docs/man/borg-analyze.1
@@ -0,0 +1,91 @@
+.\" Man page generated from reStructuredText.
+.nr rst2man-indent-level 0
+.de1 rstReportMargin
+\\$1 \\n[an-margin]
+level \\n[rst2man-indent-level]
+level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
+.de1 INDENT
+.\" .rstReportMargin pre:
+. RS \\$1
+. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin]
+. nr rst2man-indent-level +1
+.\" .rstReportMargin post:
+. RE
+.\" indent \\n[an-margin]
+.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]]
+.nr rst2man-indent-level -1
+.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
+.in \\n[rst2man-indent\\n[rst2man-indent-level]]u
+.TH "BORG-ANALYZE" 1 "2024-10-02" "" "borg backup tool"
+borg-analyze \- Analyze archives
+borg [common options] analyze [options]
+Analyze archives to find \(dqhot spots\(dq.
+Borg analyze relies on the usual archive matching options to select the
+archives that should be considered for analysis (e.g. \fB\-a series_name\fP).
+Then it iterates over all matching archives, over all contained files and
+collects information about chunks stored in all directories it encountered.
+It considers chunk IDs and their plaintext sizes (we don\(aqt have the compressed
+size in the repository easily available) and adds up added/removed chunks\(aq
+sizes per direct parent directory and outputs a list of \(dqdirectory: size\(dq.
+You can use that list to find directories with a lot of \(dqactivity\(dq \- maybe
+some of these are temporary or cache directories you did forget to exclude.
+To not have these unwanted directories in your backups, you could carefully
+exclude these in \fBborg create\fP (for future backups) or use \fBborg recreate\fP
+to re\-create existing archives without these.
+See \fIborg\-common(1)\fP for common options of Borg commands.
+.SS Archive filters
+.INDENT 0.0
+.BI \-a \ PATTERN\fR,\fB \ \-\-match\-archives \ PATTERN
+only consider archives matching all patterns. see \(dqborg help match\-archives\(dq.
+.BI \-\-sort\-by \ KEYS
+Comma\-separated list of sorting keys; valid keys are: timestamp, archive, name, id, tags, host, user; default is: timestamp
+.BI \-\-first \ N
+consider first N archives after other filters were applied
+.BI \-\-last \ N
+consider last N archives after other filters were applied
+.BI \-\-oldest \ TIMESPAN
+consider archives between the oldest archive\(aqs timestamp and (oldest + TIMESPAN), e.g. 7d or 12m.
+.BI \-\-newest \ TIMESPAN
+consider archives between the newest archive\(aqs timestamp and (newest \- TIMESPAN), e.g. 7d or 12m.
+.BI \-\-older \ TIMESPAN
+consider archives older than (now \- TIMESPAN), e.g. 7d or 12m.
+.BI \-\-newer \ TIMESPAN
+consider archives newer than (now \- TIMESPAN), e.g. 7d or 12m.
+The Borg Collective
+.\" Generated by docutils manpage writer.
diff --git a/docs/usage.rst b/docs/usage.rst
index 75d0b90de..1cceadb5c 100644
--- a/docs/usage.rst
+++ b/docs/usage.rst
@@ -57,6 +57,7 @@ Usage
+ usage/analyze
diff --git a/docs/usage/analyze.rst b/docs/usage/analyze.rst
new file mode 100644
index 000000000..c3244d917
--- /dev/null
+++ b/docs/usage/analyze.rst
@@ -0,0 +1 @@
+.. include:: analyze.rst.inc
diff --git a/docs/usage/analyze.rst.inc b/docs/usage/analyze.rst.inc
new file mode 100644
index 000000000..f8c947438
--- /dev/null
+++ b/docs/usage/analyze.rst.inc
@@ -0,0 +1,84 @@
+.. IMPORTANT: this file is auto-generated from borg's built-in help, do not edit!
+.. _borg_analyze:
+borg analyze
+.. code-block:: none
+ borg [common options] analyze [options]
+.. only:: html
+ .. class:: borg-options-table
+ +-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
+ | .. class:: borg-common-opt-ref |
+ | |
+ | :ref:`common_options` |
+ +-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
+ | **Archive filters** — Archive filters can be applied to repository targets. |
+ +-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
+ | | ``-a PATTERN``, ``--match-archives PATTERN`` | only consider archives matching all patterns. see "borg help match-archives". |
+ +-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
+ | | ``--sort-by KEYS`` | Comma-separated list of sorting keys; valid keys are: timestamp, archive, name, id, tags, host, user; default is: timestamp |
+ +-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
+ | | ``--first N`` | consider first N archives after other filters were applied |
+ +-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
+ | | ``--last N`` | consider last N archives after other filters were applied |
+ +-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
+ | | ``--oldest TIMESPAN`` | consider archives between the oldest archive's timestamp and (oldest + TIMESPAN), e.g. 7d or 12m. |
+ +-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
+ | | ``--newest TIMESPAN`` | consider archives between the newest archive's timestamp and (newest - TIMESPAN), e.g. 7d or 12m. |
+ +-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
+ | | ``--older TIMESPAN`` | consider archives older than (now - TIMESPAN), e.g. 7d or 12m. |
+ +-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
+ | | ``--newer TIMESPAN`` | consider archives newer than (now - TIMESPAN), e.g. 7d or 12m. |
+ +-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
+ .. raw:: html
+.. only:: latex
+ :ref:`common_options`
+ |
+ Archive filters
+ -a PATTERN, --match-archives PATTERN only consider archives matching all patterns. see "borg help match-archives".
+ --sort-by KEYS Comma-separated list of sorting keys; valid keys are: timestamp, archive, name, id, tags, host, user; default is: timestamp
+ --first N consider first N archives after other filters were applied
+ --last N consider last N archives after other filters were applied
+ --oldest TIMESPAN consider archives between the oldest archive's timestamp and (oldest + TIMESPAN), e.g. 7d or 12m.
+ --newest TIMESPAN consider archives between the newest archive's timestamp and (newest - TIMESPAN), e.g. 7d or 12m.
+ --older TIMESPAN consider archives older than (now - TIMESPAN), e.g. 7d or 12m.
+ --newer TIMESPAN consider archives newer than (now - TIMESPAN), e.g. 7d or 12m.
+Analyze archives to find "hot spots".
+Borg analyze relies on the usual archive matching options to select the
+archives that should be considered for analysis (e.g. ``-a series_name``).
+Then it iterates over all matching archives, over all contained files and
+collects information about chunks stored in all directories it encountered.
+It considers chunk IDs and their plaintext sizes (we don't have the compressed
+size in the repository easily available) and adds up added/removed chunks'
+sizes per direct parent directory and outputs a list of "directory: size".
+You can use that list to find directories with a lot of "activity" - maybe
+some of these are temporary or cache directories you did forget to exclude.
+To not have these unwanted directories in your backups, you could carefully
+exclude these in ``borg create`` (for future backups) or use ``borg recreate``
+to re-create existing archives without these.
\ No newline at end of file
diff --git a/src/borg/archiver/__init__.py b/src/borg/archiver/__init__.py
index c9e1236fa..b3f0b308c 100644
--- a/src/borg/archiver/__init__.py
+++ b/src/borg/archiver/__init__.py
@@ -64,6 +64,7 @@ def get_func(args):
raise Exception("expected func attributes not found")
+from .analyze_cmd import AnalyzeMixIn
from .benchmark_cmd import BenchmarkMixIn
from .check_cmd import CheckMixIn
from .compact_cmd import CompactMixIn
@@ -94,6 +95,7 @@ def get_func(args):
class Archiver(
+ AnalyzeMixIn,
@@ -332,6 +334,7 @@ def build_parser(self):
subparsers = parser.add_subparsers(title="required arguments", metavar="")
+ self.build_parser_analyze(subparsers, common_parser, mid_common_parser)
self.build_parser_benchmarks(subparsers, common_parser, mid_common_parser)
self.build_parser_check(subparsers, common_parser, mid_common_parser)
self.build_parser_compact(subparsers, common_parser, mid_common_parser)
diff --git a/src/borg/archiver/analyze_cmd.py b/src/borg/archiver/analyze_cmd.py
new file mode 100644
index 000000000..a378c2de0
--- /dev/null
+++ b/src/borg/archiver/analyze_cmd.py
@@ -0,0 +1,135 @@
+import argparse
+from collections import defaultdict
+import os
+from ._common import with_repository, define_archive_filters_group
+from ..archive import Archive
+from ..constants import * # NOQA
+from ..helpers import bin_to_hex, Error
+from ..helpers import ProgressIndicatorPercent
+from ..manifest import Manifest
+from ..remote import RemoteRepository
+from ..repository import Repository
+from ..logger import create_logger
+logger = create_logger()
+class ArchiveAnalyzer:
+    """Find directories with a lot of chunk churn between consecutive archives.
+
+    Every selected archive is reduced to a mapping
+    ``directory path -> {chunk id: plaintext size}``. Consecutive archives are
+    then compared pairwise and the sizes of all chunks that were added or
+    removed are summed up per directory path.
+    """
+
+    def __init__(self, args, repository, manifest):
+        """Remember the parsed arguments, the repository and its manifest."""
+        self.args = args
+        self.repository = repository
+        assert isinstance(repository, (Repository, RemoteRepository))
+        self.manifest = manifest
+        # directory path -> summed plaintext size of all chunks added or removed
+        # (a sum of sizes, not a count of chunks)
+        self.difference_by_path = defaultdict(int)
+
+    def analyze(self):
+        """Analyze all selected archives, then print the report."""
+        logger.info("Starting archives analysis...")
+        self.analyze_archives()
+        self.report()
+        logger.info("Finished archives analysis.")
+
+    def analyze_archives(self) -> None:
+        """Analyze all archives matching the given selection criteria.
+
+        Archives are processed in the order the manifest returns them; each one
+        is compared against the one processed right before it.
+
+        Raises Error if fewer than 2 archives were selected, as there is nothing
+        to compare then.
+        """
+        archive_infos = self.manifest.archives.list_considering(self.args)
+        num_archives = len(archive_infos)
+        if num_archives < 2:
+            raise Error("Need at least 2 archives to analyze.")
+
+        pi = ProgressIndicatorPercent(
+            total=num_archives, msg="Analyzing archives %3.1f%%", step=0.1, msgid="analyze.analyze_archives"
+        )
+        base = None  # per-directory chunks of the previously analyzed archive
+        for i, info in enumerate(archive_infos):
+            pi.show(i)
+            logger.info(f"Analyzing archive {info.name} {info.ts} {bin_to_hex(info.id)} ({i + 1}/{num_archives})")
+            new = self.analyze_archive(info.id)
+            if base is not None:  # the first archive has no predecessor to diff against
+                self.analyze_change(base, new)
+            base = new
+        pi.finish()
+
+    def analyze_archive(self, id):
+        """Compute the chunks referenced from each directory of the archive ``id``.
+
+        Returns a defaultdict mapping the direct parent directory of every item
+        that has chunks to a dict ``chunk id -> plaintext size``. A chunk shared
+        by several files of the same directory is only recorded once.
+        """
+        archive = Archive(self.manifest, id)
+        chunks_by_path = defaultdict(dict)  # collect all chunk IDs generated from files in this directory path
+        for item in archive.iter_items():
+            if "chunks" in item:
+                item_chunks = dict(item.chunks)  # chunk id -> plaintext size
+                directory_path = os.path.dirname(item.path)
+                chunks_by_path[directory_path].update(item_chunks)
+        return chunks_by_path
+
+    def analyze_change(self, base, new):
+        """For each directory path, sum up the changed (removed or added) chunks' sizes between base and new.
+
+        ``base`` and ``new`` map a directory path to a dict
+        ``chunk id -> plaintext size``. Results are accumulated into
+        ``self.difference_by_path``; neither argument is modified.
+        """
+
+        def analyze_path_change(path):
+            # .get() instead of [] so that a missing directory neither inserts an
+            # empty entry into a defaultdict nor raises KeyError for a plain dict.
+            base_chunks = base.get(path, {})
+            new_chunks = new.get(path, {})
+            # add up added chunks' sizes
+            for chunk_id in new_chunks.keys() - base_chunks.keys():
+                self.difference_by_path[path] += new_chunks[chunk_id]
+            # add up removed chunks' sizes
+            for chunk_id in base_chunks.keys() - new_chunks.keys():
+                self.difference_by_path[path] += base_chunks[chunk_id]
+
+        # directories known to base first, then the ones that only exist in new
+        for directory_path in base:
+            analyze_path_change(directory_path)
+        for directory_path in new:
+            if directory_path not in base:
+                analyze_path_change(directory_path)
+
+    def report(self):
+        """Print one ``directory: size`` line per directory, biggest total first."""
+        print()
+        print("chunks added or removed by directory path")
+        print("=========================================")
+        by_size = sorted(self.difference_by_path.items(), key=lambda entry: entry[1], reverse=True)
+        for directory_path, difference in by_size:
+            print(f"{directory_path}: {difference}")
+class AnalyzeMixIn:
+ # Mix-in contributing the "analyze" subcommand: do_analyze is the command
+ # handler, build_parser_analyze registers the matching argparse subparser.
+ # NOTE(review): with_repository is defined in ._common; presumably it opens the
+ # repository and supplies the repository/manifest arguments - confirm there.
+ @with_repository(compatibility=(Manifest.Operation.READ,))
+ def do_analyze(self, args, repository, manifest):
+ """Analyze archives"""
+ # The docstring above doubles as the subparser description (see
+ # description=self.do_analyze.__doc__ below), so keep it short and user-facing.
+ ArchiveAnalyzer(args, repository, manifest).analyze()
+ def build_parser_analyze(self, subparsers, common_parser, mid_common_parser):
+ # Adds the "analyze" parser to subparsers, with common_parser as its parent.
+ # mid_common_parser is accepted but not used in this method.
+ from ._common import process_epilog
+ # The epilog is the long help text of the command; it is passed through
+ # process_epilog before being handed to argparse.
+ analyze_epilog = process_epilog(
+ """
+ Analyze archives to find "hot spots".
+ Borg analyze relies on the usual archive matching options to select the
+ archives that should be considered for analysis (e.g. ``-a series_name``).
+ Then it iterates over all matching archives, over all contained files and
+ collects information about chunks stored in all directories it encountered.
+ It considers chunk IDs and their plaintext sizes (we don't have the compressed
+ size in the repository easily available) and adds up added/removed chunks'
+ sizes per direct parent directory and outputs a list of "directory: size".
+ You can use that list to find directories with a lot of "activity" - maybe
+ some of these are temporary or cache directories you did forget to exclude.
+ To not have these unwanted directories in your backups, you could carefully
+ exclude these in ``borg create`` (for future backups) or use ``borg recreate``
+ to re-create existing archives without these.
+ """
+ )
+ # RawDescriptionHelpFormatter keeps the description/epilog line breaks as written.
+ subparser = subparsers.add_parser(
+ "analyze",
+ parents=[common_parser],
+ add_help=False,
+ description=self.do_analyze.__doc__,
+ epilog=analyze_epilog,
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ help="analyze archives",
+ )
+ # func is the handler that gets invoked when this subcommand was selected.
+ subparser.set_defaults(func=self.do_analyze)
+ # presumably adds the archive filter options (-a, --sort-by, --first, ...) - see ._common
+ define_archive_filters_group(subparser)
diff --git a/src/borg/testsuite/archiver/analyze_cmd.py b/src/borg/testsuite/archiver/analyze_cmd.py
new file mode 100644
index 000000000..eb6f6463c
--- /dev/null
+++ b/src/borg/testsuite/archiver/analyze_cmd.py
@@ -0,0 +1,41 @@
+import pathlib
+from ...constants import * # NOQA
+from . import cmd, generate_archiver_tests, RK_ENCRYPTION
+# pytest hook: delegate test generation to generate_archiver_tests with kinds="local"
+# (presumably restricts the "archivers" parametrization to the local archiver - confirm in the testsuite package).
+pytest_generate_tests = lambda metafunc: generate_archiver_tests(metafunc, kinds="local") # NOQA
+def test_analyze(archivers, request):
+ # End-to-end check of "borg analyze": create a series of archives all named
+ # "archive", changing one small file in between, and check the summed size
+ # reported for the input directory. The reported number is a sum of plaintext
+ # sizes of added/removed chunks, not a count of chunks.
+ def create_archive():
+ cmd(archiver, "create", "archive", archiver.input_path)
+ def analyze_archives():
+ # returns the output of the analyze command over all archives named "archive"
+ return cmd(archiver, "analyze", "-a", "archive")
+ archiver = request.getfixturevalue(archivers)
+ cmd(archiver, "repo-create", RK_ENCRYPTION)
+ input_path = pathlib.Path(archiver.input_path)
+ # 1st archive
+ (input_path / "file1").write_text("1")
+ create_archive()
+ # 2nd archive
+ (input_path / "file2").write_text("22")
+ create_archive()
+ assert "/input: 2" in analyze_archives() # 2nd archive added file2 (2 bytes) to the input path
+ # 3rd archive
+ (input_path / "file3").write_text("333")
+ create_archive()
+ assert "/input: 5" in analyze_archives() # 2 (file2 added) + 3 (file3 added by the 3rd archive)
+ # 4th archive
+ (input_path / "file2").unlink()
+ create_archive()
+ assert "/input: 7" in analyze_archives() # 2 + 3 as before, plus 2 for file2 removed in the 4th archive