mirror of
https://github.com/borgbackup/borg.git
synced 2024-12-21 23:33:07 +00:00
compact: add --stats option
With --stats, it will be as slow as before, listing all repo objects. Without --stats, it will be faster by using the cached chunks index.
This commit is contained in:
parent
4c1e2bc4b0
commit
a46131bec5
2 changed files with 73 additions and 28 deletions
|
@ -3,7 +3,7 @@
|
||||||
|
|
||||||
from ._common import with_repository
|
from ._common import with_repository
|
||||||
from ..archive import Archive
|
from ..archive import Archive
|
||||||
from ..cache import write_chunkindex_to_repo_cache
|
from ..cache import write_chunkindex_to_repo_cache, build_chunkindex_from_repo
|
||||||
from ..constants import * # NOQA
|
from ..constants import * # NOQA
|
||||||
from ..hashindex import ChunkIndex, ChunkIndexEntry
|
from ..hashindex import ChunkIndex, ChunkIndexEntry
|
||||||
from ..helpers import set_ec, EXIT_WARNING, EXIT_ERROR, format_file_size, bin_to_hex
|
from ..helpers import set_ec, EXIT_WARNING, EXIT_ERROR, format_file_size, bin_to_hex
|
||||||
|
@ -18,7 +18,7 @@
|
||||||
|
|
||||||
|
|
||||||
class ArchiveGarbageCollector:
|
class ArchiveGarbageCollector:
|
||||||
def __init__(self, repository, manifest):
|
def __init__(self, repository, manifest, *, stats):
|
||||||
self.repository = repository
|
self.repository = repository
|
||||||
assert isinstance(repository, (Repository, RemoteRepository))
|
assert isinstance(repository, (Repository, RemoteRepository))
|
||||||
self.manifest = manifest
|
self.manifest = manifest
|
||||||
|
@ -26,17 +26,17 @@ def __init__(self, repository, manifest):
|
||||||
self.total_files = None # overall number of source files written to all archives in this repo
|
self.total_files = None # overall number of source files written to all archives in this repo
|
||||||
self.total_size = None # overall size of source file content data written to all archives
|
self.total_size = None # overall size of source file content data written to all archives
|
||||||
self.archives_count = None # number of archives
|
self.archives_count = None # number of archives
|
||||||
|
self.stats = stats # compute repo space usage before/after - lists all repo objects, can be slow.
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def repository_size(self):
|
def repository_size(self):
|
||||||
if self.chunks is None:
|
if self.chunks is None or not self.stats:
|
||||||
return None
|
return None
|
||||||
return sum(entry.size for id, entry in self.chunks.iteritems()) # sum of stored sizes
|
return sum(entry.size for id, entry in self.chunks.iteritems()) # sum of stored sizes
|
||||||
|
|
||||||
def garbage_collect(self):
|
def garbage_collect(self):
|
||||||
"""Removes unused chunks from a repository."""
|
"""Removes unused chunks from a repository."""
|
||||||
logger.info("Starting compaction / garbage collection...")
|
logger.info("Starting compaction / garbage collection...")
|
||||||
logger.info("Getting object IDs present in the repository...")
|
|
||||||
self.chunks = self.get_repository_chunks()
|
self.chunks = self.get_repository_chunks()
|
||||||
logger.info("Computing object IDs used by archives...")
|
logger.info("Computing object IDs used by archives...")
|
||||||
(self.missing_chunks, self.reappeared_chunks, self.total_files, self.total_size, self.archives_count) = (
|
(self.missing_chunks, self.reappeared_chunks, self.total_files, self.total_size, self.archives_count) = (
|
||||||
|
@ -47,20 +47,30 @@ def garbage_collect(self):
|
||||||
logger.info("Finished compaction / garbage collection...")
|
logger.info("Finished compaction / garbage collection...")
|
||||||
|
|
||||||
def get_repository_chunks(self) -> ChunkIndex:
|
def get_repository_chunks(self) -> ChunkIndex:
|
||||||
"""Build a dict id -> size of all chunks present in the repository"""
|
"""return a chunks index"""
|
||||||
chunks = ChunkIndex()
|
if self.stats: # slow method: build a fresh chunks index, with stored chunk sizes.
|
||||||
for id, stored_size in repo_lister(self.repository, limit=LIST_SCAN_LIMIT):
|
logger.info("Getting object IDs present in the repository...")
|
||||||
# we add this id to the chunks index (as unused chunk), because
|
chunks = ChunkIndex()
|
||||||
# we do not know yet whether it is actually referenced from some archives.
|
for id, stored_size in repo_lister(self.repository, limit=LIST_SCAN_LIMIT):
|
||||||
# we "abuse" the size field here. usually there is the plaintext size,
|
# we add this id to the chunks index (as unused chunk), because
|
||||||
# but we use it for the size of the stored object here.
|
# we do not know yet whether it is actually referenced from some archives.
|
||||||
chunks[id] = ChunkIndexEntry(flags=ChunkIndex.F_NONE, size=stored_size)
|
# we "abuse" the size field here. usually there is the plaintext size,
|
||||||
|
# but we use it for the size of the stored object here.
|
||||||
|
chunks[id] = ChunkIndexEntry(flags=ChunkIndex.F_NONE, size=stored_size)
|
||||||
|
else: # faster: rely on existing chunks index (with flags F_NONE and size 0).
|
||||||
|
logger.info("Getting object IDs from cached chunks index...")
|
||||||
|
chunks = build_chunkindex_from_repo(self.repository, cache_immediately=True)
|
||||||
return chunks
|
return chunks
|
||||||
|
|
||||||
def save_chunk_index(self):
|
def save_chunk_index(self):
|
||||||
# write_chunkindex_to_repo now removes all flags and size infos.
|
if self.stats:
|
||||||
# we need this, as we put the wrong size in there.
|
# write_chunkindex_to_repo now removes all flags and size infos.
|
||||||
write_chunkindex_to_repo_cache(self.repository, self.chunks, clear=True, force_write=True, delete_other=True)
|
# we need this, as we put the wrong size in there to support --stats computations.
|
||||||
|
write_chunkindex_to_repo_cache(
|
||||||
|
self.repository, self.chunks, clear=True, force_write=True, delete_other=True
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
self.chunks.clear() # we already have updated the repo cache in get_repository_chunks
|
||||||
self.chunks = None # nothing there (cleared!)
|
self.chunks = None # nothing there (cleared!)
|
||||||
|
|
||||||
def analyze_archives(self) -> Tuple[Set, Set, int, int, int]:
|
def analyze_archives(self) -> Tuple[Set, Set, int, int, int]:
|
||||||
|
@ -75,7 +85,8 @@ def use_it(id, *, wanted=False):
|
||||||
# chunk id is from chunks_healthy list: a lost chunk has re-appeared!
|
# chunk id is from chunks_healthy list: a lost chunk has re-appeared!
|
||||||
reappeared_chunks.add(id)
|
reappeared_chunks.add(id)
|
||||||
else:
|
else:
|
||||||
# we do NOT have this chunk in the repository!
|
# with --stats: we do NOT have this chunk in the repository!
|
||||||
|
# without --stats: we do not have this chunk or the chunks index is incomplete.
|
||||||
missing_chunks.add(id)
|
missing_chunks.add(id)
|
||||||
|
|
||||||
missing_chunks: set[bytes] = set()
|
missing_chunks: set[bytes] = set()
|
||||||
|
@ -153,15 +164,18 @@ def report_and_delete(self):
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Source data size was {format_file_size(self.total_size, precision=0)} in {self.total_files} files."
|
f"Source data size was {format_file_size(self.total_size, precision=0)} in {self.total_files} files."
|
||||||
)
|
)
|
||||||
logger.info(f"Repository size is {format_file_size(repo_size_after, precision=0)} in {count} objects.")
|
if self.stats:
|
||||||
logger.info(f"Compaction saved {format_file_size(repo_size_before - repo_size_after, precision=0)}.")
|
logger.info(f"Repository size is {format_file_size(repo_size_after, precision=0)} in {count} objects.")
|
||||||
|
logger.info(f"Compaction saved {format_file_size(repo_size_before - repo_size_after, precision=0)}.")
|
||||||
|
else:
|
||||||
|
logger.info(f"Repository has data stored in {count} objects.")
|
||||||
|
|
||||||
|
|
||||||
class CompactMixIn:
|
class CompactMixIn:
|
||||||
@with_repository(exclusive=True, compatibility=(Manifest.Operation.DELETE,))
|
@with_repository(exclusive=True, compatibility=(Manifest.Operation.DELETE,))
|
||||||
def do_compact(self, args, repository, manifest):
|
def do_compact(self, args, repository, manifest):
|
||||||
"""Collect garbage in repository"""
|
"""Collect garbage in repository"""
|
||||||
ArchiveGarbageCollector(repository, manifest).garbage_collect()
|
ArchiveGarbageCollector(repository, manifest, stats=args.stats).garbage_collect()
|
||||||
|
|
||||||
def build_parser_compact(self, subparsers, common_parser, mid_common_parser):
|
def build_parser_compact(self, subparsers, common_parser, mid_common_parser):
|
||||||
from ._common import process_epilog
|
from ._common import process_epilog
|
||||||
|
@ -198,6 +212,16 @@ def build_parser_compact(self, subparsers, common_parser, mid_common_parser):
|
||||||
might not want to do that unless there are signs of lost archives (e.g. when
|
might not want to do that unless there are signs of lost archives (e.g. when
|
||||||
seeing fatal errors when creating backups or when archives are missing in
|
seeing fatal errors when creating backups or when archives are missing in
|
||||||
``borg repo-list``).
|
``borg repo-list``).
|
||||||
|
|
||||||
|
When giving the ``--stats`` option, borg will internally list all repository
|
||||||
|
objects to determine their existence AND stored size. It will build a fresh
|
||||||
|
chunks index from that information and cache it in the repository. For some
|
||||||
|
types of repositories, this might be very slow. It will tell you the sum of
|
||||||
|
stored object sizes, before and after compaction.
|
||||||
|
|
||||||
|
Without ``--stats``, borg will rely on the cached chunks index to determine
|
||||||
|
existing object IDs (but there is no stored size information in the index,
|
||||||
|
thus it can't compute before/after compaction size statistics).
|
||||||
"""
|
"""
|
||||||
)
|
)
|
||||||
subparser = subparsers.add_parser(
|
subparser = subparsers.add_parser(
|
||||||
|
@ -210,3 +234,7 @@ def build_parser_compact(self, subparsers, common_parser, mid_common_parser):
|
||||||
help="compact repository",
|
help="compact repository",
|
||||||
)
|
)
|
||||||
subparser.set_defaults(func=self.do_compact)
|
subparser.set_defaults(func=self.do_compact)
|
||||||
|
|
||||||
|
subparser.add_argument(
|
||||||
|
"-s", "--stats", dest="stats", action="store_true", help="print statistics (might be much slower)"
|
||||||
|
)
|
||||||
|
|
|
@ -1,35 +1,48 @@
|
||||||
|
import pytest
|
||||||
|
|
||||||
from ...constants import * # NOQA
|
from ...constants import * # NOQA
|
||||||
from . import cmd, create_src_archive, generate_archiver_tests, RK_ENCRYPTION
|
from . import cmd, create_src_archive, generate_archiver_tests, RK_ENCRYPTION
|
||||||
|
|
||||||
pytest_generate_tests = lambda metafunc: generate_archiver_tests(metafunc, kinds="local,remote,binary") # NOQA
|
pytest_generate_tests = lambda metafunc: generate_archiver_tests(metafunc, kinds="local,remote,binary") # NOQA
|
||||||
|
|
||||||
|
|
||||||
def test_compact_empty_repository(archivers, request):
|
@pytest.mark.parametrize("stats", (True, False))
|
||||||
|
def test_compact_empty_repository(archivers, request, stats):
|
||||||
archiver = request.getfixturevalue(archivers)
|
archiver = request.getfixturevalue(archivers)
|
||||||
|
|
||||||
cmd(archiver, "repo-create", RK_ENCRYPTION)
|
cmd(archiver, "repo-create", RK_ENCRYPTION)
|
||||||
|
|
||||||
output = cmd(archiver, "compact", "-v", exit_code=0)
|
args = ("-v", "--stats") if stats else ("-v",)
|
||||||
|
output = cmd(archiver, "compact", *args, exit_code=0)
|
||||||
assert "Starting compaction" in output
|
assert "Starting compaction" in output
|
||||||
assert "Repository size is 0 B in 0 objects." in output
|
if stats:
|
||||||
|
assert "Repository size is 0 B in 0 objects." in output
|
||||||
|
else:
|
||||||
|
assert "Repository has data stored in 0 objects." in output
|
||||||
assert "Finished compaction" in output
|
assert "Finished compaction" in output
|
||||||
|
|
||||||
|
|
||||||
def test_compact_after_deleting_all_archives(archivers, request):
|
@pytest.mark.parametrize("stats", (True, False))
|
||||||
|
def test_compact_after_deleting_all_archives(archivers, request, stats):
|
||||||
archiver = request.getfixturevalue(archivers)
|
archiver = request.getfixturevalue(archivers)
|
||||||
|
|
||||||
cmd(archiver, "repo-create", RK_ENCRYPTION)
|
cmd(archiver, "repo-create", RK_ENCRYPTION)
|
||||||
create_src_archive(archiver, "archive")
|
create_src_archive(archiver, "archive")
|
||||||
cmd(archiver, "delete", "-a", "archive", exit_code=0)
|
cmd(archiver, "delete", "-a", "archive", exit_code=0)
|
||||||
|
|
||||||
output = cmd(archiver, "compact", "-v", exit_code=0)
|
args = ("-v", "--stats") if stats else ("-v",)
|
||||||
|
output = cmd(archiver, "compact", *args, exit_code=0)
|
||||||
assert "Starting compaction" in output
|
assert "Starting compaction" in output
|
||||||
assert "Deleting " in output
|
assert "Deleting " in output
|
||||||
assert "Repository size is 0 B in 0 objects." in output
|
if stats:
|
||||||
|
assert "Repository size is 0 B in 0 objects." in output
|
||||||
|
else:
|
||||||
|
assert "Repository has data stored in 0 objects." in output
|
||||||
assert "Finished compaction" in output
|
assert "Finished compaction" in output
|
||||||
|
|
||||||
|
|
||||||
def test_compact_after_deleting_some_archives(archivers, request):
|
@pytest.mark.parametrize("stats", (True, False))
|
||||||
|
def test_compact_after_deleting_some_archives(archivers, request, stats):
|
||||||
archiver = request.getfixturevalue(archivers)
|
archiver = request.getfixturevalue(archivers)
|
||||||
|
|
||||||
cmd(archiver, "repo-create", RK_ENCRYPTION)
|
cmd(archiver, "repo-create", RK_ENCRYPTION)
|
||||||
|
@ -37,8 +50,12 @@ def test_compact_after_deleting_some_archives(archivers, request):
|
||||||
create_src_archive(archiver, "archive2")
|
create_src_archive(archiver, "archive2")
|
||||||
cmd(archiver, "delete", "-a", "archive1", exit_code=0)
|
cmd(archiver, "delete", "-a", "archive1", exit_code=0)
|
||||||
|
|
||||||
output = cmd(archiver, "compact", "-v", exit_code=0)
|
args = ("-v", "--stats") if stats else ("-v",)
|
||||||
|
output = cmd(archiver, "compact", *args, exit_code=0)
|
||||||
assert "Starting compaction" in output
|
assert "Starting compaction" in output
|
||||||
assert "Deleting " in output
|
assert "Deleting " in output
|
||||||
assert "Repository size is 0 B in 0 objects." not in output
|
if stats:
|
||||||
|
assert "Repository size is 0 B in 0 objects." not in output
|
||||||
|
else:
|
||||||
|
assert "Repository has data stored in 0 objects." not in output
|
||||||
assert "Finished compaction" in output
|
assert "Finished compaction" in output
|
||||||
|
|
Loading…
Reference in a new issue