mirror of
https://github.com/borgbackup/borg.git
synced 2025-01-01 12:45:34 +00:00
Merge pull request #8436 from ThomasWaldmann/analyze-cmd
analyze: changed chunks per directory
This commit is contained in:
commit
8cd951f324
7 changed files with 356 additions and 0 deletions
91
docs/man/borg-analyze.1
Normal file
91
docs/man/borg-analyze.1
Normal file
|
@ -0,0 +1,91 @@
|
||||||
|
.\" Man page generated from reStructuredText.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.nr rst2man-indent-level 0
|
||||||
|
.
|
||||||
|
.de1 rstReportMargin
|
||||||
|
\\$1 \\n[an-margin]
|
||||||
|
level \\n[rst2man-indent-level]
|
||||||
|
level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
|
||||||
|
-
|
||||||
|
\\n[rst2man-indent0]
|
||||||
|
\\n[rst2man-indent1]
|
||||||
|
\\n[rst2man-indent2]
|
||||||
|
..
|
||||||
|
.de1 INDENT
|
||||||
|
.\" .rstReportMargin pre:
|
||||||
|
. RS \\$1
|
||||||
|
. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin]
|
||||||
|
. nr rst2man-indent-level +1
|
||||||
|
.\" .rstReportMargin post:
|
||||||
|
..
|
||||||
|
.de UNINDENT
|
||||||
|
. RE
|
||||||
|
.\" indent \\n[an-margin]
|
||||||
|
.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]]
|
||||||
|
.nr rst2man-indent-level -1
|
||||||
|
.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
|
||||||
|
.in \\n[rst2man-indent\\n[rst2man-indent-level]]u
|
||||||
|
..
|
||||||
|
.TH "BORG-ANALYZE" 1 "2024-10-02" "" "borg backup tool"
|
||||||
|
.SH NAME
|
||||||
|
borg-analyze \- Analyze archives
|
||||||
|
.SH SYNOPSIS
|
||||||
|
.sp
|
||||||
|
borg [common options] analyze [options]
|
||||||
|
.SH DESCRIPTION
|
||||||
|
.sp
|
||||||
|
Analyze archives to find \(dqhot spots\(dq.
|
||||||
|
.sp
|
||||||
|
Borg analyze relies on the usual archive matching options to select the
|
||||||
|
archives that should be considered for analysis (e.g. \fB\-a series_name\fP).
|
||||||
|
Then it iterates over all matching archives, over all contained files and
|
||||||
|
collects information about chunks stored in all directories it encountered.
|
||||||
|
.sp
|
||||||
|
It considers chunk IDs and their plaintext sizes (we don\(aqt have the compressed
|
||||||
|
size in the repository easily available) and adds up added/removed chunks\(aq
|
||||||
|
sizes per direct parent directory and outputs a list of \(dqdirectory: size\(dq.
|
||||||
|
.sp
|
||||||
|
You can use that list to find directories with a lot of \(dqactivity\(dq \- maybe
|
||||||
|
some of these are temporary or cache directories you did forget to exclude.
|
||||||
|
.sp
|
||||||
|
To not have these unwanted directories in your backups, you could carefully
|
||||||
|
exclude these in \fBborg create\fP (for future backups) or use \fBborg recreate\fP
|
||||||
|
to re\-create existing archives without these.
|
||||||
|
.SH OPTIONS
|
||||||
|
.sp
|
||||||
|
See \fIborg\-common(1)\fP for common options of Borg commands.
|
||||||
|
.SS Archive filters
|
||||||
|
.INDENT 0.0
|
||||||
|
.TP
|
||||||
|
.BI \-a \ PATTERN\fR,\fB \ \-\-match\-archives \ PATTERN
|
||||||
|
only consider archives matching all patterns. see \(dqborg help match\-archives\(dq.
|
||||||
|
.TP
|
||||||
|
.BI \-\-sort\-by \ KEYS
|
||||||
|
Comma\-separated list of sorting keys; valid keys are: timestamp, archive, name, id, tags, host, user; default is: timestamp
|
||||||
|
.TP
|
||||||
|
.BI \-\-first \ N
|
||||||
|
consider first N archives after other filters were applied
|
||||||
|
.TP
|
||||||
|
.BI \-\-last \ N
|
||||||
|
consider last N archives after other filters were applied
|
||||||
|
.TP
|
||||||
|
.BI \-\-oldest \ TIMESPAN
|
||||||
|
consider archives between the oldest archive\(aqs timestamp and (oldest + TIMESPAN), e.g. 7d or 12m.
|
||||||
|
.TP
|
||||||
|
.BI \-\-newest \ TIMESPAN
|
||||||
|
consider archives between the newest archive\(aqs timestamp and (newest \- TIMESPAN), e.g. 7d or 12m.
|
||||||
|
.TP
|
||||||
|
.BI \-\-older \ TIMESPAN
|
||||||
|
consider archives older than (now \- TIMESPAN), e.g. 7d or 12m.
|
||||||
|
.TP
|
||||||
|
.BI \-\-newer \ TIMESPAN
|
||||||
|
consider archives newer than (now \- TIMESPAN), e.g. 7d or 12m.
|
||||||
|
.UNINDENT
|
||||||
|
.SH SEE ALSO
|
||||||
|
.sp
|
||||||
|
\fIborg\-common(1)\fP
|
||||||
|
.SH AUTHOR
|
||||||
|
The Borg Collective
|
||||||
|
.\" Generated by docutils manpage writer.
|
||||||
|
.
|
|
@ -57,6 +57,7 @@ Usage
|
||||||
usage/delete
|
usage/delete
|
||||||
usage/prune
|
usage/prune
|
||||||
usage/info
|
usage/info
|
||||||
|
usage/analyze
|
||||||
usage/mount
|
usage/mount
|
||||||
usage/recreate
|
usage/recreate
|
||||||
usage/tar
|
usage/tar
|
||||||
|
|
1
docs/usage/analyze.rst
Normal file
1
docs/usage/analyze.rst
Normal file
|
@ -0,0 +1 @@
|
||||||
|
.. include:: analyze.rst.inc
|
84
docs/usage/analyze.rst.inc
Normal file
84
docs/usage/analyze.rst.inc
Normal file
|
@ -0,0 +1,84 @@
|
||||||
|
.. IMPORTANT: this file is auto-generated from borg's built-in help, do not edit!
|
||||||
|
|
||||||
|
.. _borg_analyze:
|
||||||
|
|
||||||
|
borg analyze
|
||||||
|
------------
|
||||||
|
.. code-block:: none
|
||||||
|
|
||||||
|
borg [common options] analyze [options]
|
||||||
|
|
||||||
|
.. only:: html
|
||||||
|
|
||||||
|
.. class:: borg-options-table
|
||||||
|
|
||||||
|
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
| .. class:: borg-common-opt-ref |
|
||||||
|
| |
|
||||||
|
| :ref:`common_options` |
|
||||||
|
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
| **Archive filters** — Archive filters can be applied to repository targets. |
|
||||||
|
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
| | ``-a PATTERN``, ``--match-archives PATTERN`` | only consider archives matching all patterns. see "borg help match-archives". |
|
||||||
|
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
| | ``--sort-by KEYS`` | Comma-separated list of sorting keys; valid keys are: timestamp, archive, name, id, tags, host, user; default is: timestamp |
|
||||||
|
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
| | ``--first N`` | consider first N archives after other filters were applied |
|
||||||
|
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
| | ``--last N`` | consider last N archives after other filters were applied |
|
||||||
|
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
| | ``--oldest TIMESPAN`` | consider archives between the oldest archive's timestamp and (oldest + TIMESPAN), e.g. 7d or 12m. |
|
||||||
|
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
| | ``--newest TIMESPAN`` | consider archives between the newest archive's timestamp and (newest - TIMESPAN), e.g. 7d or 12m. |
|
||||||
|
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
| | ``--older TIMESPAN`` | consider archives older than (now - TIMESPAN), e.g. 7d or 12m. |
|
||||||
|
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
| | ``--newer TIMESPAN`` | consider archives newer than (now - TIMESPAN), e.g. 7d or 12m. |
|
||||||
|
+-----------------------------------------------------------------------------+----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
|
||||||
|
.. raw:: html
|
||||||
|
|
||||||
|
<script type='text/javascript'>
|
||||||
|
$(document).ready(function () {
|
||||||
|
$('.borg-options-table colgroup').remove();
|
||||||
|
})
|
||||||
|
</script>
|
||||||
|
|
||||||
|
.. only:: latex
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
:ref:`common_options`
|
||||||
|
|
|
||||||
|
|
||||||
|
Archive filters
|
||||||
|
-a PATTERN, --match-archives PATTERN only consider archives matching all patterns. see "borg help match-archives".
|
||||||
|
--sort-by KEYS Comma-separated list of sorting keys; valid keys are: timestamp, archive, name, id, tags, host, user; default is: timestamp
|
||||||
|
--first N consider first N archives after other filters were applied
|
||||||
|
--last N consider last N archives after other filters were applied
|
||||||
|
--oldest TIMESPAN consider archives between the oldest archive's timestamp and (oldest + TIMESPAN), e.g. 7d or 12m.
|
||||||
|
--newest TIMESPAN consider archives between the newest archive's timestamp and (newest - TIMESPAN), e.g. 7d or 12m.
|
||||||
|
--older TIMESPAN consider archives older than (now - TIMESPAN), e.g. 7d or 12m.
|
||||||
|
--newer TIMESPAN consider archives newer than (now - TIMESPAN), e.g. 7d or 12m.
|
||||||
|
|
||||||
|
|
||||||
|
Description
|
||||||
|
~~~~~~~~~~~
|
||||||
|
|
||||||
|
Analyze archives to find "hot spots".
|
||||||
|
|
||||||
|
Borg analyze relies on the usual archive matching options to select the
|
||||||
|
archives that should be considered for analysis (e.g. ``-a series_name``).
|
||||||
|
Then it iterates over all matching archives, over all contained files and
|
||||||
|
collects information about chunks stored in all directories it encountered.
|
||||||
|
|
||||||
|
It considers chunk IDs and their plaintext sizes (we don't have the compressed
|
||||||
|
size in the repository easily available) and adds up added/removed chunks'
|
||||||
|
sizes per direct parent directory and outputs a list of "directory: size".
|
||||||
|
|
||||||
|
You can use that list to find directories with a lot of "activity" - maybe
|
||||||
|
some of these are temporary or cache directories you did forget to exclude.
|
||||||
|
|
||||||
|
To not have these unwanted directories in your backups, you could carefully
|
||||||
|
exclude these in ``borg create`` (for future backups) or use ``borg recreate``
|
||||||
|
to re-create existing archives without these.
|
|
@ -64,6 +64,7 @@ def get_func(args):
|
||||||
raise Exception("expected func attributes not found")
|
raise Exception("expected func attributes not found")
|
||||||
|
|
||||||
|
|
||||||
|
from .analyze_cmd import AnalyzeMixIn
|
||||||
from .benchmark_cmd import BenchmarkMixIn
|
from .benchmark_cmd import BenchmarkMixIn
|
||||||
from .check_cmd import CheckMixIn
|
from .check_cmd import CheckMixIn
|
||||||
from .compact_cmd import CompactMixIn
|
from .compact_cmd import CompactMixIn
|
||||||
|
@ -94,6 +95,7 @@ def get_func(args):
|
||||||
|
|
||||||
|
|
||||||
class Archiver(
|
class Archiver(
|
||||||
|
AnalyzeMixIn,
|
||||||
BenchmarkMixIn,
|
BenchmarkMixIn,
|
||||||
CheckMixIn,
|
CheckMixIn,
|
||||||
CompactMixIn,
|
CompactMixIn,
|
||||||
|
@ -332,6 +334,7 @@ def build_parser(self):
|
||||||
|
|
||||||
subparsers = parser.add_subparsers(title="required arguments", metavar="<command>")
|
subparsers = parser.add_subparsers(title="required arguments", metavar="<command>")
|
||||||
|
|
||||||
|
self.build_parser_analyze(subparsers, common_parser, mid_common_parser)
|
||||||
self.build_parser_benchmarks(subparsers, common_parser, mid_common_parser)
|
self.build_parser_benchmarks(subparsers, common_parser, mid_common_parser)
|
||||||
self.build_parser_check(subparsers, common_parser, mid_common_parser)
|
self.build_parser_check(subparsers, common_parser, mid_common_parser)
|
||||||
self.build_parser_compact(subparsers, common_parser, mid_common_parser)
|
self.build_parser_compact(subparsers, common_parser, mid_common_parser)
|
||||||
|
|
135
src/borg/archiver/analyze_cmd.py
Normal file
135
src/borg/archiver/analyze_cmd.py
Normal file
|
@ -0,0 +1,135 @@
|
||||||
|
import argparse
|
||||||
|
from collections import defaultdict
|
||||||
|
import os
|
||||||
|
|
||||||
|
from ._common import with_repository, define_archive_filters_group
|
||||||
|
from ..archive import Archive
|
||||||
|
from ..constants import * # NOQA
|
||||||
|
from ..helpers import bin_to_hex, Error
|
||||||
|
from ..helpers import ProgressIndicatorPercent
|
||||||
|
from ..manifest import Manifest
|
||||||
|
from ..remote import RemoteRepository
|
||||||
|
from ..repository import Repository
|
||||||
|
|
||||||
|
from ..logger import create_logger
|
||||||
|
|
||||||
|
logger = create_logger()
|
||||||
|
|
||||||
|
|
||||||
|
class ArchiveAnalyzer:
    """Find per-directory "hot spots" across a series of archives.

    For each pair of consecutive archives (in the order produced by the archive
    filters), the plaintext sizes of chunks that were added or removed are summed
    up per direct parent directory.  ``report()`` prints the directories sorted
    by that total, largest first.
    """

    def __init__(self, args, repository, manifest):
        self.args = args
        self.repository = repository
        assert isinstance(repository, (Repository, RemoteRepository))
        self.manifest = manifest
        # directory path -> summed plaintext size of chunks added/removed there
        self.difference_by_path = defaultdict(int)

    def analyze(self):
        """Run the full analysis and print the report."""
        logger.info("Starting archives analysis...")
        self.analyze_archives()
        self.report()
        logger.info("Finished archives analysis.")

    def analyze_archives(self) -> None:
        """Analyze all archives matching the given selection criteria.

        Raises Error if fewer than 2 archives match (nothing to compare).
        """
        archive_infos = self.manifest.archives.list_considering(self.args)
        num_archives = len(archive_infos)
        if num_archives < 2:
            raise Error("Need at least 2 archives to analyze.")

        pi = ProgressIndicatorPercent(
            total=num_archives, msg="Analyzing archives %3.1f%%", step=0.1, msgid="analyze.analyze_archives"
        )
        i = 0
        info = archive_infos[i]
        pi.show(i)
        logger.info(f"Analyzing archive {info.name} {info.ts} {bin_to_hex(info.id)} ({i + 1}/{num_archives})")
        base = self.analyze_archive(info.id)
        # compare each archive against its predecessor and accumulate the differences
        for i, info in enumerate(archive_infos[1:]):
            pi.show(i + 1)
            logger.info(f"Analyzing archive {info.name} {info.ts} {bin_to_hex(info.id)} ({i + 2}/{num_archives})")
            new = self.analyze_archive(info.id)
            self.analyze_change(base, new)
            base = new
        pi.finish()

    def analyze_archive(self, id):
        """Compute the set of chunks for each directory in this archive.

        Returns a mapping: directory path -> {chunk id: plaintext size}.
        """
        archive = Archive(self.manifest, id)
        chunks_by_path = defaultdict(dict)  # collect all chunk IDs generated from files in this directory path
        for item in archive.iter_items():
            if "chunks" in item:
                item_chunks = dict(item.chunks)  # chunk id -> plaintext size
                directory_path = os.path.dirname(item.path)
                chunks_by_path[directory_path].update(item_chunks)
        return chunks_by_path

    def analyze_change(self, base, new):
        """For each directory path, sum up the changed (removed or added) chunks' sizes between base and new."""

        def analyze_path_change(path):
            # Use .get() instead of subscripting: base/new are defaultdicts and
            # a plain lookup would silently insert empty entries for paths that
            # exist in only one of the two archives.
            base_chunks = base.get(path, {})
            new_chunks = new.get(path, {})
            # add up added chunks' sizes
            for chunk_id in new_chunks.keys() - base_chunks.keys():
                self.difference_by_path[path] += new_chunks[chunk_id]
            # add up removed chunks' sizes
            for chunk_id in base_chunks.keys() - new_chunks.keys():
                self.difference_by_path[path] += base_chunks[chunk_id]

        for directory_path in base:
            analyze_path_change(directory_path)
        for directory_path in new:
            if directory_path not in base:
                analyze_path_change(directory_path)

    def report(self):
        """Print directories sorted by total size of changed chunks, largest first."""
        print()
        print("chunks added or removed by directory path")
        print("=========================================")
        for directory_path in sorted(self.difference_by_path, key=lambda p: self.difference_by_path[p], reverse=True):
            difference = self.difference_by_path[directory_path]
            print(f"{directory_path}: {difference}")
|
||||||
|
|
||||||
|
class AnalyzeMixIn:
    """Archiver mixin providing the ``borg analyze`` command."""

    @with_repository(compatibility=(Manifest.Operation.READ,))
    def do_analyze(self, args, repository, manifest):
        """Analyze archives"""
        ArchiveAnalyzer(args, repository, manifest).analyze()

    def build_parser_analyze(self, subparsers, common_parser, mid_common_parser):
        from ._common import process_epilog

        # Long-form help shown below the option list of "borg analyze --help".
        epilog_text = process_epilog(
            """
        Analyze archives to find "hot spots".

        Borg analyze relies on the usual archive matching options to select the
        archives that should be considered for analysis (e.g. ``-a series_name``).
        Then it iterates over all matching archives, over all contained files and
        collects information about chunks stored in all directories it encountered.

        It considers chunk IDs and their plaintext sizes (we don't have the compressed
        size in the repository easily available) and adds up added/removed chunks'
        sizes per direct parent directory and outputs a list of "directory: size".

        You can use that list to find directories with a lot of "activity" - maybe
        some of these are temporary or cache directories you did forget to exclude.

        To not have these unwanted directories in your backups, you could carefully
        exclude these in ``borg create`` (for future backups) or use ``borg recreate``
        to re-create existing archives without these.
        """
        )

        # Register the "analyze" subcommand; description comes from do_analyze's docstring.
        subparser = subparsers.add_parser(
            "analyze",
            parents=[common_parser],
            add_help=False,
            description=self.do_analyze.__doc__,
            epilog=epilog_text,
            formatter_class=argparse.RawDescriptionHelpFormatter,
            help="analyze archives",
        )
        subparser.set_defaults(func=self.do_analyze)
        # archive selection options (-a/--first/--last/--oldest/... ) shared with other commands
        define_archive_filters_group(subparser)
|
41
src/borg/testsuite/archiver/analyze_cmd.py
Normal file
41
src/borg/testsuite/archiver/analyze_cmd.py
Normal file
|
@ -0,0 +1,41 @@
|
||||||
|
import pathlib
|
||||||
|
|
||||||
|
from ...constants import * # NOQA
|
||||||
|
from . import cmd, generate_archiver_tests, RK_ENCRYPTION
|
||||||
|
|
||||||
|
pytest_generate_tests = lambda metafunc: generate_archiver_tests(metafunc, kinds="local") # NOQA
|
||||||
|
|
||||||
|
|
||||||
|
def test_analyze(archivers, request):
    """End-to-end check: the size reported per directory equals the summed sizes
    of chunks added/removed between consecutive archives of the series."""
    archiver = request.getfixturevalue(archivers)

    def _take_snapshot():
        cmd(archiver, "create", "archive", archiver.input_path)

    def _run_analyze():
        return cmd(archiver, "analyze", "-a", "archive")

    cmd(archiver, "repo-create", RK_ENCRYPTION)
    input_path = pathlib.Path(archiver.input_path)

    # 1st archive: baseline only, contributes nothing by itself
    (input_path / "file1").write_text("1")
    _take_snapshot()

    # 2nd archive: adds file2 (2 bytes)
    (input_path / "file2").write_text("22")
    _take_snapshot()
    assert "/input: 2" in _run_analyze()  # 2nd archive added 1 chunk for input path

    # 3rd archive: adds file3 (3 bytes) -> 2 + 3
    (input_path / "file3").write_text("333")
    _take_snapshot()
    assert "/input: 5" in _run_analyze()  # 2nd/3rd archives added 2 chunks for input path

    # 4th archive: removes file2 again (2 bytes) -> 5 + 2
    (input_path / "file2").unlink()
    _take_snapshot()
    assert "/input: 7" in _run_analyze()  # 2nd/3rd archives added 2, 4th archive removed 1
|
Loading…
Reference in a new issue