
compact: add --stats option

with --stats it will be as slow as before, listing all repo objs.

without --stats, it will be faster by using the cached chunks index.
Thomas Waldmann 2024-11-23 19:41:37 +01:00
parent 4c1e2bc4b0
commit a46131bec5
2 changed files with 73 additions and 28 deletions
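In short, the patch makes the chunk-listing strategy conditional on ``--stats``: with the flag, borg lists every repository object (slow, but stored sizes become known); without it, borg reuses the cached chunks index (fast, but sizes are unknown, so no before/after numbers). Below is a condensed, self-contained sketch of that decision; list_repo_objects and load_cached_chunk_index are hypothetical stand-ins for borg's repo_lister and build_chunkindex_from_repo from the diff that follows.

# Condensed sketch of the new control flow; the helpers below are
# illustrative stubs, NOT borg's real API.

def list_repo_objects():
    # stand-in for repo_lister(): yields (id, stored_size); slow on some repo types
    yield from [(b"id-1", 1024), (b"id-2", 2048)]

def load_cached_chunk_index():
    # stand-in for build_chunkindex_from_repo(): fast, but stored sizes are unknown (0)
    return {b"id-1": 0, b"id-2": 0}

def get_repository_chunks(stats):
    if stats:
        # slow path: enumerate every object, so stored sizes are known
        return dict(list_repo_objects())
    # fast path: reuse the cached chunks index; only the object IDs are reliable
    return load_cached_chunk_index()

def repository_size(chunks, stats):
    # size statistics are only meaningful after the slow, full listing
    return sum(chunks.values()) if stats else None

if __name__ == "__main__":
    for stats in (True, False):
        chunks = get_repository_chunks(stats)
        print(f"stats={stats}: {len(chunks)} objects, size={repository_size(chunks, stats)}")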


@@ -3,7 +3,7 @@
 from ._common import with_repository
 from ..archive import Archive
-from ..cache import write_chunkindex_to_repo_cache
+from ..cache import write_chunkindex_to_repo_cache, build_chunkindex_from_repo
 from ..constants import *  # NOQA
 from ..hashindex import ChunkIndex, ChunkIndexEntry
 from ..helpers import set_ec, EXIT_WARNING, EXIT_ERROR, format_file_size, bin_to_hex
@@ -18,7 +18,7 @@
 class ArchiveGarbageCollector:
-    def __init__(self, repository, manifest):
+    def __init__(self, repository, manifest, *, stats):
         self.repository = repository
         assert isinstance(repository, (Repository, RemoteRepository))
         self.manifest = manifest
@@ -26,17 +26,17 @@ def __init__(self, repository, manifest):
         self.total_files = None  # overall number of source files written to all archives in this repo
         self.total_size = None  # overall size of source file content data written to all archives
         self.archives_count = None  # number of archives
+        self.stats = stats  # compute repo space usage before/after - lists all repo objects, can be slow.

     @property
     def repository_size(self):
-        if self.chunks is None:
+        if self.chunks is None or not self.stats:
             return None
         return sum(entry.size for id, entry in self.chunks.iteritems())  # sum of stored sizes

     def garbage_collect(self):
         """Removes unused chunks from a repository."""
         logger.info("Starting compaction / garbage collection...")
-        logger.info("Getting object IDs present in the repository...")
         self.chunks = self.get_repository_chunks()
         logger.info("Computing object IDs used by archives...")
         (self.missing_chunks, self.reappeared_chunks, self.total_files, self.total_size, self.archives_count) = (
@@ -47,20 +47,30 @@ def garbage_collect(self):
         logger.info("Finished compaction / garbage collection...")

     def get_repository_chunks(self) -> ChunkIndex:
-        """Build a dict id -> size of all chunks present in the repository"""
-        chunks = ChunkIndex()
-        for id, stored_size in repo_lister(self.repository, limit=LIST_SCAN_LIMIT):
-            # we add this id to the chunks index (as unused chunk), because
-            # we do not know yet whether it is actually referenced from some archives.
-            # we "abuse" the size field here. usually there is the plaintext size,
-            # but we use it for the size of the stored object here.
-            chunks[id] = ChunkIndexEntry(flags=ChunkIndex.F_NONE, size=stored_size)
+        """return a chunks index"""
+        if self.stats:  # slow method: build a fresh chunks index, with stored chunk sizes.
+            logger.info("Getting object IDs present in the repository...")
+            chunks = ChunkIndex()
+            for id, stored_size in repo_lister(self.repository, limit=LIST_SCAN_LIMIT):
+                # we add this id to the chunks index (as unused chunk), because
+                # we do not know yet whether it is actually referenced from some archives.
+                # we "abuse" the size field here. usually there is the plaintext size,
+                # but we use it for the size of the stored object here.
+                chunks[id] = ChunkIndexEntry(flags=ChunkIndex.F_NONE, size=stored_size)
+        else:  # faster: rely on existing chunks index (with flags F_NONE and size 0).
+            logger.info("Getting object IDs from cached chunks index...")
+            chunks = build_chunkindex_from_repo(self.repository, cache_immediately=True)
         return chunks

     def save_chunk_index(self):
-        # write_chunkindex_to_repo now removes all flags and size infos.
-        # we need this, as we put the wrong size in there.
-        write_chunkindex_to_repo_cache(self.repository, self.chunks, clear=True, force_write=True, delete_other=True)
+        if self.stats:
+            # write_chunkindex_to_repo now removes all flags and size infos.
+            # we need this, as we put the wrong size in there to support --stats computations.
+            write_chunkindex_to_repo_cache(
+                self.repository, self.chunks, clear=True, force_write=True, delete_other=True
+            )
+        else:
+            self.chunks.clear()  # we already have updated the repo cache in get_repository_chunks
         self.chunks = None  # nothing there (cleared!)

     def analyze_archives(self) -> Tuple[Set, Set, int, int, int]:
@@ -75,7 +85,8 @@ def use_it(id, *, wanted=False):
                     # chunk id is from chunks_healthy list: a lost chunk has re-appeared!
                     reappeared_chunks.add(id)
                 else:
-                    # we do NOT have this chunk in the repository!
+                    # with --stats: we do NOT have this chunk in the repository!
+                    # without --stats: we do not have this chunk or the chunks index is incomplete.
                     missing_chunks.add(id)

         missing_chunks: set[bytes] = set()
@@ -153,15 +164,18 @@ def report_and_delete(self):
         logger.info(
             f"Source data size was {format_file_size(self.total_size, precision=0)} in {self.total_files} files."
         )
-        logger.info(f"Repository size is {format_file_size(repo_size_after, precision=0)} in {count} objects.")
-        logger.info(f"Compaction saved {format_file_size(repo_size_before - repo_size_after, precision=0)}.")
+        if self.stats:
+            logger.info(f"Repository size is {format_file_size(repo_size_after, precision=0)} in {count} objects.")
+            logger.info(f"Compaction saved {format_file_size(repo_size_before - repo_size_after, precision=0)}.")
+        else:
+            logger.info(f"Repository has data stored in {count} objects.")


 class CompactMixIn:
     @with_repository(exclusive=True, compatibility=(Manifest.Operation.DELETE,))
     def do_compact(self, args, repository, manifest):
         """Collect garbage in repository"""
-        ArchiveGarbageCollector(repository, manifest).garbage_collect()
+        ArchiveGarbageCollector(repository, manifest, stats=args.stats).garbage_collect()

     def build_parser_compact(self, subparsers, common_parser, mid_common_parser):
         from ._common import process_epilog
@@ -198,6 +212,16 @@ def build_parser_compact(self, subparsers, common_parser, mid_common_parser):
         might not want to do that unless there are signs of lost archives (e.g. when
         seeing fatal errors when creating backups or when archives are missing in
         ``borg repo-list``).
+
+        When giving the ``--stats`` option, borg will internally list all repository
+        objects to determine their existence AND stored size. It will build a fresh
+        chunks index from that information and cache it in the repository. For some
+        types of repositories, this might be very slow. It will tell you the sum of
+        stored object sizes, before and after compaction.
+
+        Without ``--stats``, borg will rely on the cached chunks index to determine
+        existing object IDs (but there is no stored size information in the index,
+        thus it can't compute before/after compaction size statistics).
         """
         )
         subparser = subparsers.add_parser(
@@ -210,3 +234,7 @@ def build_parser_compact(self, subparsers, common_parser, mid_common_parser):
             help="compact repository",
         )
         subparser.set_defaults(func=self.do_compact)
+        subparser.add_argument(
+            "-s", "--stats", dest="stats", action="store_true", help="print statistics (might be much slower)"
+        )
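Since the new option is a plain store_true argument, ``args.stats`` defaults to False, and either spelling (``-s`` or ``--stats``) enables the slow path. A minimal standalone sketch of the same argparse wiring, detached from borg's subparser machinery:

import argparse

# Minimal reproduction of the added option, outside borg's parser setup.
parser = argparse.ArgumentParser(prog="borg compact")
parser.add_argument(
    "-s", "--stats", dest="stats", action="store_true", help="print statistics (might be much slower)"
)

print(parser.parse_args([]).stats)            # False -> fast path via cached chunks index
print(parser.parse_args(["--stats"]).stats)   # True  -> slow path, full repository listing
print(parser.parse_args(["-s"]).stats)        # True  -> short spelling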


@@ -1,35 +1,48 @@
+import pytest
+
 from ...constants import *  # NOQA
 from . import cmd, create_src_archive, generate_archiver_tests, RK_ENCRYPTION

 pytest_generate_tests = lambda metafunc: generate_archiver_tests(metafunc, kinds="local,remote,binary")  # NOQA


-def test_compact_empty_repository(archivers, request):
+@pytest.mark.parametrize("stats", (True, False))
+def test_compact_empty_repository(archivers, request, stats):
     archiver = request.getfixturevalue(archivers)
     cmd(archiver, "repo-create", RK_ENCRYPTION)
-    output = cmd(archiver, "compact", "-v", exit_code=0)
+    args = ("-v", "--stats") if stats else ("-v",)
+    output = cmd(archiver, "compact", *args, exit_code=0)
     assert "Starting compaction" in output
-    assert "Repository size is 0 B in 0 objects." in output
+    if stats:
+        assert "Repository size is 0 B in 0 objects." in output
+    else:
+        assert "Repository has data stored in 0 objects." in output
     assert "Finished compaction" in output


-def test_compact_after_deleting_all_archives(archivers, request):
+@pytest.mark.parametrize("stats", (True, False))
+def test_compact_after_deleting_all_archives(archivers, request, stats):
     archiver = request.getfixturevalue(archivers)
     cmd(archiver, "repo-create", RK_ENCRYPTION)
     create_src_archive(archiver, "archive")
     cmd(archiver, "delete", "-a", "archive", exit_code=0)
-    output = cmd(archiver, "compact", "-v", exit_code=0)
+    args = ("-v", "--stats") if stats else ("-v",)
+    output = cmd(archiver, "compact", *args, exit_code=0)
     assert "Starting compaction" in output
     assert "Deleting " in output
-    assert "Repository size is 0 B in 0 objects." in output
+    if stats:
+        assert "Repository size is 0 B in 0 objects." in output
+    else:
+        assert "Repository has data stored in 0 objects." in output
     assert "Finished compaction" in output


-def test_compact_after_deleting_some_archives(archivers, request):
+@pytest.mark.parametrize("stats", (True, False))
+def test_compact_after_deleting_some_archives(archivers, request, stats):
     archiver = request.getfixturevalue(archivers)
     cmd(archiver, "repo-create", RK_ENCRYPTION)
@@ -37,8 +50,12 @@ def test_compact_after_deleting_some_archives(archivers, request):
     create_src_archive(archiver, "archive2")
     cmd(archiver, "delete", "-a", "archive1", exit_code=0)
-    output = cmd(archiver, "compact", "-v", exit_code=0)
+    args = ("-v", "--stats") if stats else ("-v",)
+    output = cmd(archiver, "compact", *args, exit_code=0)
     assert "Starting compaction" in output
     assert "Deleting " in output
-    assert "Repository size is 0 B in 0 objects." not in output
+    if stats:
+        assert "Repository size is 0 B in 0 objects." not in output
+    else:
+        assert "Repository has data stored in 0 objects." not in output
     assert "Finished compaction" in output