mirror of
https://github.com/borgbackup/borg.git
synced 2025-01-01 12:45:34 +00:00
Merge pull request #8436 from ThomasWaldmann/analyze-cmd
analyze: changed chunks per directory
This commit is contained in:
commit
8cd951f324
7 changed files with 356 additions and 0 deletions
91
docs/man/borg-analyze.1
Normal file
91
docs/man/borg-analyze.1
Normal file
|
@ -0,0 +1,91 @@
|
|||
.\" Man page generated from reStructuredText.
|
||||
.
|
||||
.
|
||||
.nr rst2man-indent-level 0
|
||||
.
|
||||
.de1 rstReportMargin
|
||||
\\$1 \\n[an-margin]
|
||||
level \\n[rst2man-indent-level]
|
||||
level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
|
||||
-
|
||||
\\n[rst2man-indent0]
|
||||
\\n[rst2man-indent1]
|
||||
\\n[rst2man-indent2]
|
||||
..
|
||||
.de1 INDENT
|
||||
.\" .rstReportMargin pre:
|
||||
. RS \\$1
|
||||
. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin]
|
||||
. nr rst2man-indent-level +1
|
||||
.\" .rstReportMargin post:
|
||||
..
|
||||
.de UNINDENT
|
||||
. RE
|
||||
.\" indent \\n[an-margin]
|
||||
.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]]
|
||||
.nr rst2man-indent-level -1
|
||||
.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
|
||||
.in \\n[rst2man-indent\\n[rst2man-indent-level]]u
|
||||
..
|
||||
.TH "BORG-ANALYZE" 1 "2024-10-02" "" "borg backup tool"
|
||||
.SH NAME
|
||||
borg-analyze \- Analyze archives
|
||||
.SH SYNOPSIS
|
||||
.sp
|
||||
borg [common options] analyze [options]
|
||||
.SH DESCRIPTION
|
||||
.sp
|
||||
Analyze archives to find \(dqhot spots\(dq.
|
||||
.sp
|
||||
Borg analyze relies on the usual archive matching options to select the
|
||||
archives that should be considered for analysis (e.g. \fB\-a series_name\fP).
|
||||
Then it iterates over all matching archives, over all contained files and
|
||||
collects information about chunks stored in all directories it encountered.
|
||||
.sp
|
||||
It considers chunk IDs and their plaintext sizes (we don\(aqt have the compressed
|
||||
size in the repository easily available) and adds up added/removed chunks\(aq
|
||||
sizes per direct parent directory and outputs a list of \(dqdirectory: size\(dq.
|
||||
.sp
|
||||
You can use that list to find directories with a lot of \(dqactivity\(dq \- maybe
|
||||
some of these are temporary or cache directories you did forget to exclude.
|
||||
.sp
|
||||
To not have these unwanted directories in your backups, you could carefully
|
||||
exclude these in \fBborg create\fP (for future backups) or use \fBborg recreate\fP
|
||||
to re\-create existing archives without these.
|
||||
.SH OPTIONS
|
||||
.sp
|
||||
See \fIborg\-common(1)\fP for common options of Borg commands.
|
||||
.SS Archive filters
|
||||
.INDENT 0.0
|
||||
.TP
|
||||
.BI \-a \ PATTERN\fR,\fB \ \-\-match\-archives \ PATTERN
|
||||
only consider archives matching all patterns. see \(dqborg help match\-archives\(dq.
|
||||
.TP
|
||||
.BI \-\-sort\-by \ KEYS
|
||||
Comma\-separated list of sorting keys; valid keys are: timestamp, archive, name, id, tags, host, user; default is: timestamp
|
||||
.TP
|
||||
.BI \-\-first \ N
|
||||
consider first N archives after other filters were applied
|
||||
.TP
|
||||
.BI \-\-last \ N
|
||||
consider last N archives after other filters were applied
|
||||
.TP
|
||||
.BI \-\-oldest \ TIMESPAN
|
||||
consider archives between the oldest archive\(aqs timestamp and (oldest + TIMESPAN), e.g. 7d or 12m.
|
||||
.TP
|
||||
.BI \-\-newest \ TIMESPAN
|
||||
consider archives between the newest archive\(aqs timestamp and (newest \- TIMESPAN), e.g. 7d or 12m.
|
||||
.TP
|
||||
.BI \-\-older \ TIMESPAN
|
||||
consider archives older than (now \- TIMESPAN), e.g. 7d or 12m.
|
||||
.TP
|
||||
.BI \-\-newer \ TIMESPAN
|
||||
consider archives newer than (now \- TIMESPAN), e.g. 7d or 12m.
|
||||
.UNINDENT
|
||||
.SH SEE ALSO
|
||||
.sp
|
||||
\fIborg\-common(1)\fP
|
||||
.SH AUTHOR
|
||||
The Borg Collective
|
||||
.\" Generated by docutils manpage writer.
|
||||
.
|
|
@ -57,6 +57,7 @@ Usage
|
|||
usage/delete
|
||||
usage/prune
|
||||
usage/info
|
||||
usage/analyze
|
||||
usage/mount
|
||||
usage/recreate
|
||||
usage/tar
|
||||
|
|
1
docs/usage/analyze.rst
Normal file
1
docs/usage/analyze.rst
Normal file
|
@ -0,0 +1 @@
|
|||
.. include:: analyze.rst.inc
|
84
docs/usage/analyze.rst.inc
Normal file
84
docs/usage/analyze.rst.inc
Normal file
|
@ -0,0 +1,84 @@
|
|||
.. IMPORTANT: this file is auto-generated from borg's built-in help, do not edit!
|
||||
|
||||
.. _borg_analyze:
|
||||
|
||||
borg analyze
|
||||
------------
|
||||
.. code-block:: none
|
||||
|
||||
borg [common options] analyze [options]
|
||||
|
||||
.. only:: html
|
||||
|
||||
.. class:: borg-options-table
|
||||
|
||||
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
|
||||
| .. class:: borg-common-opt-ref |
|
||||
| |
|
||||
| :ref:`common_options` |
|
||||
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
|
||||
| **Archive filters** — Archive filters can be applied to repository targets. |
|
||||
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``-a PATTERN``, ``--match-archives PATTERN`` | only consider archives matching all patterns. see "borg help match-archives". |
|
||||
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``--sort-by KEYS`` | Comma-separated list of sorting keys; valid keys are: timestamp, archive, name, id, tags, host, user; default is: timestamp |
|
||||
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``--first N`` | consider first N archives after other filters were applied |
|
||||
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``--last N`` | consider last N archives after other filters were applied |
|
||||
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``--oldest TIMESPAN`` | consider archives between the oldest archive's timestamp and (oldest + TIMESPAN), e.g. 7d or 12m. |
|
||||
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``--newest TIMESPAN`` | consider archives between the newest archive's timestamp and (newest - TIMESPAN), e.g. 7d or 12m. |
|
||||
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``--older TIMESPAN`` | consider archives older than (now - TIMESPAN), e.g. 7d or 12m. |
|
||||
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``--newer TIMESPAN`` | consider archives newer than (now - TIMESPAN), e.g. 7d or 12m. |
|
||||
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
|
||||
|
||||
.. raw:: html
|
||||
|
||||
<script type='text/javascript'>
|
||||
$(document).ready(function () {
|
||||
$('.borg-options-table colgroup').remove();
|
||||
})
|
||||
</script>
|
||||
|
||||
.. only:: latex
|
||||
|
||||
|
||||
|
||||
:ref:`common_options`
|
||||
|
|
||||
|
||||
Archive filters
|
||||
-a PATTERN, --match-archives PATTERN only consider archives matching all patterns. see "borg help match-archives".
|
||||
--sort-by KEYS Comma-separated list of sorting keys; valid keys are: timestamp, archive, name, id, tags, host, user; default is: timestamp
|
||||
--first N consider first N archives after other filters were applied
|
||||
--last N consider last N archives after other filters were applied
|
||||
--oldest TIMESPAN consider archives between the oldest archive's timestamp and (oldest + TIMESPAN), e.g. 7d or 12m.
|
||||
--newest TIMESPAN consider archives between the newest archive's timestamp and (newest - TIMESPAN), e.g. 7d or 12m.
|
||||
--older TIMESPAN consider archives older than (now - TIMESPAN), e.g. 7d or 12m.
|
||||
--newer TIMESPAN consider archives newer than (now - TIMESPAN), e.g. 7d or 12m.
|
||||
|
||||
|
||||
Description
|
||||
~~~~~~~~~~~
|
||||
|
||||
Analyze archives to find "hot spots".
|
||||
|
||||
Borg analyze relies on the usual archive matching options to select the
|
||||
archives that should be considered for analysis (e.g. ``-a series_name``).
|
||||
Then it iterates over all matching archives, over all contained files and
|
||||
collects information about chunks stored in all directories it encountered.
|
||||
|
||||
It considers chunk IDs and their plaintext sizes (we don't have the compressed
|
||||
size in the repository easily available) and adds up added/removed chunks'
|
||||
sizes per direct parent directory and outputs a list of "directory: size".
|
||||
|
||||
You can use that list to find directories with a lot of "activity" - maybe
|
||||
some of these are temporary or cache directories you did forget to exclude.
|
||||
|
||||
To not have these unwanted directories in your backups, you could carefully
|
||||
exclude these in ``borg create`` (for future backups) or use ``borg recreate``
|
||||
to re-create existing archives without these.
|
|
@ -64,6 +64,7 @@ def get_func(args):
|
|||
raise Exception("expected func attributes not found")
|
||||
|
||||
|
||||
from .analyze_cmd import AnalyzeMixIn
|
||||
from .benchmark_cmd import BenchmarkMixIn
|
||||
from .check_cmd import CheckMixIn
|
||||
from .compact_cmd import CompactMixIn
|
||||
|
@ -94,6 +95,7 @@ def get_func(args):
|
|||
|
||||
|
||||
class Archiver(
|
||||
AnalyzeMixIn,
|
||||
BenchmarkMixIn,
|
||||
CheckMixIn,
|
||||
CompactMixIn,
|
||||
|
@ -332,6 +334,7 @@ def build_parser(self):
|
|||
|
||||
subparsers = parser.add_subparsers(title="required arguments", metavar="<command>")
|
||||
|
||||
self.build_parser_analyze(subparsers, common_parser, mid_common_parser)
|
||||
self.build_parser_benchmarks(subparsers, common_parser, mid_common_parser)
|
||||
self.build_parser_check(subparsers, common_parser, mid_common_parser)
|
||||
self.build_parser_compact(subparsers, common_parser, mid_common_parser)
|
||||
|
|
135
src/borg/archiver/analyze_cmd.py
Normal file
135
src/borg/archiver/analyze_cmd.py
Normal file
|
@ -0,0 +1,135 @@
|
|||
import argparse
|
||||
from collections import defaultdict
|
||||
import os
|
||||
|
||||
from ._common import with_repository, define_archive_filters_group
|
||||
from ..archive import Archive
|
||||
from ..constants import * # NOQA
|
||||
from ..helpers import bin_to_hex, Error
|
||||
from ..helpers import ProgressIndicatorPercent
|
||||
from ..manifest import Manifest
|
||||
from ..remote import RemoteRepository
|
||||
from ..repository import Repository
|
||||
|
||||
from ..logger import create_logger
|
||||
|
||||
# module-level logger; ArchiveAnalyzer uses it for its start/finish and per-archive info messages
logger = create_logger()
|
||||
|
||||
|
||||
class ArchiveAnalyzer:
    """Sum up, per directory, the plaintext sizes of chunks that changed between consecutive archives."""

    def __init__(self, args, repository, manifest):
        """Remember the inputs and set up the (empty) result mapping.

        :param args: parsed command line arguments, handed to the manifest's archive selection.
        :param repository: must be a Repository or RemoteRepository instance.
        :param manifest: manifest used to list the archives and to open each of them.
        """
        self.args = args
        self.repository = repository
        assert isinstance(repository, (Repository, RemoteRepository))
        self.manifest = manifest
        # directory path -> summed plaintext size of all chunks added or removed in that directory
        self.difference_by_path = defaultdict(int)

    def analyze(self):
        """Run the analysis over the selected archives, then print the report."""
        logger.info("Starting archives analysis...")
        self.analyze_archives()
        self.report()
        logger.info("Finished archives analysis.")

    def analyze_archives(self) -> None:
        """Analyze all archives matching the given selection criteria.

        Each archive is compared against the one listed directly before it.

        :raises Error: if fewer than 2 archives were selected (there is nothing to compare then).
        """
        archive_infos = self.manifest.archives.list_considering(self.args)
        num_archives = len(archive_infos)
        if num_archives < 2:
            raise Error("Need at least 2 archives to analyze.")

        pi = ProgressIndicatorPercent(
            total=num_archives, msg="Analyzing archives %3.1f%%", step=0.1, msgid="analyze.analyze_archives"
        )
        base = None  # per-directory chunks of the previously processed archive
        for i, info in enumerate(archive_infos):
            pi.show(i)
            logger.info(f"Analyzing archive {info.name} {info.ts} {bin_to_hex(info.id)} ({i + 1}/{num_archives})")
            new = self.analyze_archive(info.id)
            if base is not None:  # the first archive has no predecessor to diff against
                self.analyze_change(base, new)
            base = new
        pi.finish()

    def analyze_archive(self, id):
        """compute the set of chunks for each directory in this archive

        :param id: id of the archive to open.
        :return: defaultdict mapping directory path -> {chunk id: plaintext size}.
        """
        archive = Archive(self.manifest, id)
        chunks_by_path = defaultdict(dict)  # collect all chunk IDs generated from files in this directory path
        for item in archive.iter_items():
            if "chunks" in item:  # items without content chunks do not contribute
                item_chunks = dict(item.chunks)  # chunk id -> plaintext size
                directory_path = os.path.dirname(item.path)  # direct parent directory only
                chunks_by_path[directory_path].update(item_chunks)
        return chunks_by_path

    def analyze_change(self, base, new):
        """for each directory path, sum up the changed (removed or added) chunks' sizes between base and new.

        :param base: mapping directory path -> {chunk id: size} of the older archive.
        :param new: same kind of mapping for the newer archive.

        The sums are accumulated into self.difference_by_path; base and new are only read.
        """
        # visit base's directories first, then the ones only present in new
        paths = list(base)
        paths.extend(path for path in new if path not in base)
        for path in paths:
            # .get() instead of indexing: indexing a defaultdict would insert an empty entry into the
            # snapshot (which is reused as the next comparison base), and a plain dict would raise.
            base_chunks = base.get(path, {})
            new_chunks = new.get(path, {})
            # add up added chunks' sizes
            for chunk_id in new_chunks.keys() - base_chunks.keys():
                self.difference_by_path[path] += new_chunks[chunk_id]
            # add up removed chunks' sizes
            for chunk_id in base_chunks.keys() - new_chunks.keys():
                self.difference_by_path[path] += base_chunks[chunk_id]

    def report(self):
        """Print "directory: size" lines to stdout, biggest summed size first."""
        print()
        print("chunks added or removed by directory path")
        print("=========================================")
        by_size = sorted(self.difference_by_path.items(), key=lambda entry: entry[1], reverse=True)
        for directory_path, difference in by_size:
            print(f"{directory_path}: {difference}")
|
||||
|
||||
|
||||
class AnalyzeMixIn:
    # Mix-in contributing the "analyze" subcommand: its handler and its argument parser setup.

    @with_repository(compatibility=(Manifest.Operation.READ,))
    def do_analyze(self, args, repository, manifest):
        """Analyze archives"""
        # NOTE: the docstring above doubles as the subcommand's description (see build_parser_analyze).
        analyzer = ArchiveAnalyzer(args, repository, manifest)
        analyzer.analyze()

    def build_parser_analyze(self, subparsers, common_parser, mid_common_parser):
        # Register the "analyze" subcommand on `subparsers` and attach the archive filter options.
        # `mid_common_parser` is accepted but not used by this command.
        from ._common import process_epilog

        epilog_text = process_epilog(
            """
        Analyze archives to find "hot spots".

        Borg analyze relies on the usual archive matching options to select the
        archives that should be considered for analysis (e.g. ``-a series_name``).
        Then it iterates over all matching archives, over all contained files and
        collects information about chunks stored in all directories it encountered.

        It considers chunk IDs and their plaintext sizes (we don't have the compressed
        size in the repository easily available) and adds up added/removed chunks'
        sizes per direct parent directory and outputs a list of "directory: size".

        You can use that list to find directories with a lot of "activity" - maybe
        some of these are temporary or cache directories you did forget to exclude.

        To not have these unwanted directories in your backups, you could carefully
        exclude these in ``borg create`` (for future backups) or use ``borg recreate``
        to re-create existing archives without these.
        """
        )
        analyze_parser = subparsers.add_parser(
            "analyze",
            help="analyze archives",
            description=self.do_analyze.__doc__,
            epilog=epilog_text,
            formatter_class=argparse.RawDescriptionHelpFormatter,
            parents=[common_parser],
            add_help=False,
        )
        # dispatch to do_analyze when this subcommand is selected
        analyze_parser.set_defaults(func=self.do_analyze)
        define_archive_filters_group(analyze_parser)
|
41
src/borg/testsuite/archiver/analyze_cmd.py
Normal file
41
src/borg/testsuite/archiver/analyze_cmd.py
Normal file
|
@ -0,0 +1,41 @@
|
|||
import pathlib
|
||||
|
||||
from ...constants import * # NOQA
|
||||
from . import cmd, generate_archiver_tests, RK_ENCRYPTION
|
||||
|
||||
def pytest_generate_tests(metafunc):
    """pytest hook: delegate test generation to generate_archiver_tests, restricted to kinds="local"."""
    return generate_archiver_tests(metafunc, kinds="local")
|
||||
|
||||
|
||||
def test_analyze(archivers, request):
    """Check the size `analyze` reports for the input directory as files get added and removed."""
    archiver = request.getfixturevalue(archivers)
    cmd(archiver, "repo-create", RK_ENCRYPTION)
    input_dir = pathlib.Path(archiver.input_path)

    def make_archive():
        # every archive gets the same name, so "-a archive" below selects all of them
        cmd(archiver, "create", "archive", archiver.input_path)

    def analysis_output():
        return cmd(archiver, "analyze", "-a", "archive")

    # 1st archive: just file1
    (input_dir / "file1").write_text("1")
    make_archive()

    # 2nd archive: file2 (2 characters) added
    (input_dir / "file2").write_text("22")
    make_archive()
    output = analysis_output()
    assert "/input: 2" in output  # 2nd archive added 1 chunk for input path

    # 3rd archive: file3 (3 characters) added
    (input_dir / "file3").write_text("333")
    make_archive()
    output = analysis_output()
    assert "/input: 5" in output  # 2nd/3rd archives added 2 chunks for input path

    # 4th archive: file2 removed again, a removal is added to the sum as well
    (input_dir / "file2").unlink()
    make_archive()
    output = analysis_output()
    assert "/input: 7" in output  # 2nd/3rd archives added 2, 4th archive removed 1
|
Loading…
Reference in a new issue