compute the deduplicated size before compression

so we no longer need the per-chunk compressed size (csize) for it.
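The idea in code: a chunk counts toward the deduplicated size only the first time it is stored, and it counts with its uncompressed size, so the compressed size is not needed for this statistic at all. A minimal self-contained sketch of the accounting (illustrative names, not borg's actual API):

    # Sketch: deduplicated size computed from uncompressed chunk sizes.
    seen = {}          # chunk id -> refcount (stand-in for the chunk index)
    osize = usize = 0  # original size / deduplicated ("unique") size

    def add_chunk(chunk_id, size):
        global osize, usize
        unique = chunk_id not in seen               # first time this chunk is stored?
        seen[chunk_id] = seen.get(chunk_id, 0) + 1
        osize += size                               # every reference counts here
        if unique:
            usize += size                           # only the first store counts here

    add_chunk('a', 100)
    add_chunk('a', 100)  # duplicate: osize grows, usize does not
    add_chunk('b', 50)
    assert (osize, usize) == (250, 150)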
Thomas Waldmann 2022-06-11 22:29:43 +02:00
parent 1fd571a4d0
commit 19dfbe5c5c
4 changed files with 40 additions and 27 deletions

View File

@@ -58,38 +58,45 @@ class Statistics:
     def __init__(self, output_json=False, iec=False):
         self.output_json = output_json
         self.iec = iec
-        self.osize = self.nfiles = 0
-        self.osize_parts = self.nfiles_parts = 0
+        self.osize = self.usize = self.nfiles = 0
+        self.osize_parts = self.usize_parts = self.nfiles_parts = 0
         self.last_progress = 0  # timestamp when last progress was shown

-    def update(self, size, part=False):
+    def update(self, size, unique, part=False):
         if not part:
             self.osize += size
+            if unique:
+                self.usize += size
         else:
             self.osize_parts += size
+            if unique:
+                self.usize_parts += size

     def __add__(self, other):
         if not isinstance(other, Statistics):
             raise TypeError('can only add Statistics objects')
         stats = Statistics(self.output_json, self.iec)
         stats.osize = self.osize + other.osize
+        stats.usize = self.usize + other.usize
         stats.nfiles = self.nfiles + other.nfiles
         stats.osize_parts = self.osize_parts + other.osize_parts
+        stats.usize_parts = self.usize_parts + other.usize_parts
         stats.nfiles_parts = self.nfiles_parts + other.nfiles_parts
         return stats

-    summary = "{label:15} {stats.osize_fmt:>20s}"
+    summary = "{label:15} {stats.osize_fmt:>20s} {stats.usize_fmt:>20s}"

     def __str__(self):
         return self.summary.format(stats=self, label='This archive:')

     def __repr__(self):
-        return "<{cls} object at {hash:#x} ({self.osize})>".format(
+        return "<{cls} object at {hash:#x} ({self.osize}, {self.usize})>".format(
             cls=type(self).__name__, hash=id(self), self=self)

     def as_dict(self):
         return {
             'original_size': FileSize(self.osize, iec=self.iec),
+            'deduplicated_size': FileSize(self.usize, iec=self.iec),
             'nfiles': self.nfiles,
         }
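Since FileSize subclasses int, the new `deduplicated_size` entry serializes naturally in borg's JSON output; an archive's stats dict then looks roughly like this (made-up values):

    {
        'original_size': 4820829184,     # osize: sum of all chunk sizes
        'deduplicated_size': 171239424,  # usize: sum of first-seen chunk sizes
        'nfiles': 11384,
    }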
@@ -114,6 +121,10 @@ class Statistics:
     def osize_fmt(self):
         return format_file_size(self.osize, iec=self.iec)

+    @property
+    def usize_fmt(self):
+        return format_file_size(self.usize, iec=self.iec)
+
     def show_progress(self, item=None, final=False, stream=None, dt=None):
         now = time.monotonic()
         if dt is None or now - self.last_progress > dt:
@@ -134,7 +145,7 @@ class Statistics:
         else:
             columns, lines = get_terminal_size()
         if not final:
-            msg = '{0.osize_fmt} O {0.nfiles} N '.format(self)
+            msg = '{0.osize_fmt} O {0.usize_fmt} U {0.nfiles} N '.format(self)
             path = remove_surrogates(item.path) if item else ''
             space = columns - swidth(msg)
             if space < 12:
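Taken together, the new call convention for Statistics looks like this (a sketch assuming the class above and borg's format helpers are importable):

    stats = Statistics()
    stats.update(100, unique=True)             # new chunk: counts in osize and usize
    stats.update(100, unique=False)            # duplicate chunk: osize only
    stats.update(30, unique=True, part=True)   # checkpoint part: *_parts counters
    assert (stats.osize, stats.usize) == (200, 100)
    assert (stats.osize_parts, stats.usize_parts) == (30, 30)
    print(stats)  # 'This archive:                  200 B                100 B'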

View File

@@ -99,7 +99,7 @@ except BaseException:

 assert EXIT_ERROR == 2, "EXIT_ERROR is not 2, as expected - fix assert AND exception handler right above this line."

-STATS_HEADER = "                       Original size"
+STATS_HEADER = "                       Original size    Deduplicated size"

 PURE_PYTHON_MSGPACK_WARNING = "Using a pure-python msgpack! This will result in lower performance."
@@ -1797,8 +1797,8 @@ class Archiver:
             Command line: {command_line}
             Utilization of maximum supported archive size: {limits[max_archive_size]:.0%}
             ------------------------------------------------------------------------------
-                                   Original size
-            This archive:   {stats[original_size]:>20s}
+                                   Original size    Deduplicated size
+            This archive:   {stats[original_size]:>20s} {stats[deduplicated_size]:>20s}
             {cache}
             """).strip().format(cache=cache, **info))
         if self.exit_code:
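Both values are rendered right-aligned in 20-character fields, and the header labels are spaced so each one ends on the same column as its value. A quick way to check the alignment (header whitespace reconstructed here, sizes made up):

    row = "This archive:   {stats[original_size]:>20s} {stats[deduplicated_size]:>20s}"
    header = "                       Original size    Deduplicated size"
    print(header)
    print(row.format(stats={'original_size': '1.02 MB', 'deduplicated_size': '340 kB'}))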

View File

@@ -406,7 +406,7 @@ class Cache:

 class CacheStatsMixin:
     str_format = """\
-All archives:   {0.total_size:>20s}
+All archives:   {0.total_size:>20s} {0.unique_size:>20s}

                        Unique chunks         Total chunks
 Chunk index:    {0.total_unique_chunks:20d} {0.total_chunks:20d}"""
@@ -440,7 +440,7 @@ Chunk index:    {0.total_unique_chunks:20d} {0.total_chunks:20d}"""

     def format_tuple(self):
         stats = self.stats()
-        for field in ['total_size', ]:
+        for field in ['total_size', 'unique_size']:
             stats[field] = format_file_size(stats[field], iec=self.iec)
         return self.Summary(**stats)
@@ -905,7 +905,7 @@ class LocalCache(CacheStatsMixin):
         data = self.key.encrypt(id, chunk, compress=compress)
         self.repository.put(id, data, wait=wait)
         self.chunks.add(id, 1, size)
-        stats.update(size)
+        stats.update(size, not refcount)
         return ChunkListEntry(id, size)

     def seen_chunk(self, id, size=None):
@@ -921,7 +921,7 @@ class LocalCache(CacheStatsMixin):
         if not self.txn_active:
             self.begin_txn()
         count, _size = self.chunks.incref(id)
-        stats.update(_size, part=part)
+        stats.update(_size, False, part=part)
         return ChunkListEntry(id, _size)

     def chunk_decref(self, id, stats, wait=True, part=False):
@@ -931,9 +931,9 @@ class LocalCache(CacheStatsMixin):
         if count == 0:
             del self.chunks[id]
             self.repository.delete(id, wait=wait)
-            stats.update(-size, part=part)
+            stats.update(-size, True, part=part)
         else:
-            stats.update(-size, part=part)
+            stats.update(-size, False, part=part)

     def file_known_and_unchanged(self, hashed_path, path_hash, st):
         """
@@ -1072,7 +1072,7 @@ Chunk index:    {0.total_unique_chunks:20d}             unknown"""
         data = self.key.encrypt(id, chunk, compress=compress)
         self.repository.put(id, data, wait=wait)
         self.chunks.add(id, 1, size)
-        stats.update(size)
+        stats.update(size, not refcount)
         return ChunkListEntry(id, size)

     def seen_chunk(self, id, size=None):
@@ -1094,7 +1094,7 @@ Chunk index:    {0.total_unique_chunks:20d}             unknown"""
         # size or add_chunk); we can't add references to those (size=0 is invalid) and generally don't try to.
         size = _size or size
         assert size
-        stats.update(size, part=part)
+        stats.update(size, False, part=part)
         return ChunkListEntry(id, size)

     def chunk_decref(self, id, stats, wait=True, part=False):
@@ -1104,9 +1104,9 @@ Chunk index:    {0.total_unique_chunks:20d}             unknown"""
         if count == 0:
             del self.chunks[id]
             self.repository.delete(id, wait=wait)
-            stats.update(-size, part=part)
+            stats.update(-size, True, part=part)
         else:
-            stats.update(-size, part=part)
+            stats.update(-size, False, part=part)

     def commit(self):
         if not self._txn_active:
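All of these call sites follow one rule: a chunk is "unique" only when it crosses between zero and one references. In sketch form (simplified refcounting, not the real cache classes; `stats` is a Statistics instance as above):

    refcounts = {}  # chunk id -> reference count (stand-in for self.chunks)

    def add_chunk(cid, size, stats):
        refcount = refcounts.get(cid, 0)
        if refcount:                          # already stored: just another reference
            return chunk_incref(cid, size, stats)
        refcounts[cid] = 1
        stats.update(size, not refcount)      # first store: unique=True

    def chunk_incref(cid, size, stats):
        refcounts[cid] += 1
        stats.update(size, False)             # re-reference: never unique

    def chunk_decref(cid, size, stats):
        refcounts[cid] -= 1
        if refcounts[cid] == 0:
            del refcounts[cid]                # last reference is gone: the chunk
            stats.update(-size, True)         # itself goes away, usize shrinks too
        else:
            stats.update(-size, False)        # osize shrinks only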

View File

@@ -19,44 +19,46 @@ from ..platform import uid2user, gid2group

 @pytest.fixture()
 def stats():
     stats = Statistics()
-    stats.update(20)
+    stats.update(20, unique=True)
     return stats


 def test_stats_basic(stats):
     assert stats.osize == 20
-    stats.update(20)
+    assert stats.usize == 20
+    stats.update(20, unique=False)
     assert stats.osize == 40
+    assert stats.usize == 20


 def tests_stats_progress(stats, monkeypatch, columns=80):
     monkeypatch.setenv('COLUMNS', str(columns))
     out = StringIO()
     stats.show_progress(stream=out)
-    s = '20 B O 0 N '
+    s = '20 B O 20 B U 0 N '
     buf = ' ' * (columns - len(s))
     assert out.getvalue() == s + buf + "\r"

     out = StringIO()
-    stats.update(10 ** 3)
+    stats.update(10 ** 3, unique=False)
     stats.show_progress(item=Item(path='foo'), final=False, stream=out)
-    s = '1.02 kB O 0 N foo'
+    s = '1.02 kB O 20 B U 0 N foo'
     buf = ' ' * (columns - len(s))
     assert out.getvalue() == s + buf + "\r"

     out = StringIO()
     stats.show_progress(item=Item(path='foo'*40), final=False, stream=out)
-    s = '1.02 kB O 0 N foofoofoofoofoofoofoofoofoofoo...foofoofoofoofoofoofoofoofoofoofoo'
+    s = '1.02 kB O 20 B U 0 N foofoofoofoofoofoofoofoofo...foofoofoofoofoofoofoofoofoofoo'
     buf = ' ' * (columns - len(s))
     assert out.getvalue() == s + buf + "\r"


 def test_stats_format(stats):
     assert str(stats) == """\
-This archive:                   20 B"""
+This archive:                   20 B                 20 B"""
     s = f"{stats.osize_fmt}"
     assert s == "20 B"
     # kind of redundant, but id is variable so we can't match reliably
-    assert repr(stats) == f'<Statistics object at {id(stats):#x} (20)>'
+    assert repr(stats) == f'<Statistics object at {id(stats):#x} (20, 20)>'


 def test_stats_progress_json(stats):