compute the deduplicated size before compression

so we no longer need the per-chunk compressed size (csize) for it.
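The idea in code: a chunk counts toward the deduplicated size only the first time it is stored, and it counts with its uncompressed size, so the compressed size is not needed for this statistic at all. A minimal self-contained sketch of the accounting (illustrative names, not borg's actual API):

    # Sketch: deduplicated size computed from uncompressed chunk sizes.
    seen = {}          # chunk id -> refcount (stand-in for the chunk index)
    osize = usize = 0  # original size / deduplicated ("unique") size

    def add_chunk(chunk_id, size):
        global osize, usize
        unique = chunk_id not in seen               # first time this chunk is stored?
        seen[chunk_id] = seen.get(chunk_id, 0) + 1
        osize += size                               # every reference counts here
        if unique:
            usize += size                           # only the first store counts here

    add_chunk('a', 100)
    add_chunk('a', 100)  # duplicate: osize grows, usize does not
    add_chunk('b', 50)
    assert (osize, usize) == (250, 150)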
Thomas Waldmann 2022-06-11 22:29:43 +02:00
parent 1fd571a4d0
commit 19dfbe5c5c
4 changed files with 40 additions and 27 deletions

View File

@@ -58,38 +58,45 @@ class Statistics:
     def __init__(self, output_json=False, iec=False):
         self.output_json = output_json
         self.iec = iec
-        self.osize = self.nfiles = 0
-        self.osize_parts = self.nfiles_parts = 0
+        self.osize = self.usize = self.nfiles = 0
+        self.osize_parts = self.usize_parts = self.nfiles_parts = 0
         self.last_progress = 0  # timestamp when last progress was shown

-    def update(self, size, part=False):
+    def update(self, size, unique, part=False):
         if not part:
             self.osize += size
+            if unique:
+                self.usize += size
         else:
             self.osize_parts += size
+            if unique:
+                self.usize_parts += size

     def __add__(self, other):
         if not isinstance(other, Statistics):
             raise TypeError('can only add Statistics objects')
         stats = Statistics(self.output_json, self.iec)
         stats.osize = self.osize + other.osize
+        stats.usize = self.usize + other.usize
         stats.nfiles = self.nfiles + other.nfiles
         stats.osize_parts = self.osize_parts + other.osize_parts
+        stats.usize_parts = self.usize_parts + other.usize_parts
         stats.nfiles_parts = self.nfiles_parts + other.nfiles_parts
         return stats

-    summary = "{label:15} {stats.osize_fmt:>20s}"
+    summary = "{label:15} {stats.osize_fmt:>20s} {stats.usize_fmt:>20s}"

     def __str__(self):
         return self.summary.format(stats=self, label='This archive:')

     def __repr__(self):
-        return "<{cls} object at {hash:#x} ({self.osize})>".format(
+        return "<{cls} object at {hash:#x} ({self.osize}, {self.usize})>".format(
             cls=type(self).__name__, hash=id(self), self=self)

     def as_dict(self):
         return {
             'original_size': FileSize(self.osize, iec=self.iec),
+            'deduplicated_size': FileSize(self.usize, iec=self.iec),
             'nfiles': self.nfiles,
         }
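Since FileSize subclasses int, the new `deduplicated_size` entry serializes naturally in borg's JSON output; an archive's stats dict then looks roughly like this (made-up values):

    {
        'original_size': 4820829184,     # osize: sum of all chunk sizes
        'deduplicated_size': 171239424,  # usize: sum of first-seen chunk sizes
        'nfiles': 11384,
    }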
@@ -114,6 +121,10 @@ class Statistics:
     def osize_fmt(self):
         return format_file_size(self.osize, iec=self.iec)

+    @property
+    def usize_fmt(self):
+        return format_file_size(self.usize, iec=self.iec)
+
     def show_progress(self, item=None, final=False, stream=None, dt=None):
         now = time.monotonic()
         if dt is None or now - self.last_progress > dt:
@@ -134,7 +145,7 @@ class Statistics:
         else:
             columns, lines = get_terminal_size()
         if not final:
-            msg = '{0.osize_fmt} O {0.nfiles} N '.format(self)
+            msg = '{0.osize_fmt} O {0.usize_fmt} U {0.nfiles} N '.format(self)
             path = remove_surrogates(item.path) if item else ''
             space = columns - swidth(msg)
             if space < 12:
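Taken together, the new call convention for Statistics looks like this (a sketch assuming the class above and borg's format helpers are importable):

    stats = Statistics()
    stats.update(100, unique=True)             # new chunk: counts in osize and usize
    stats.update(100, unique=False)            # duplicate chunk: osize only
    stats.update(30, unique=True, part=True)   # checkpoint part: *_parts counters
    assert (stats.osize, stats.usize) == (200, 100)
    assert (stats.osize_parts, stats.usize_parts) == (30, 30)
    print(stats)  # 'This archive:                  200 B                100 B'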

View File

@@ -99,7 +99,7 @@ except BaseException:

 assert EXIT_ERROR == 2, "EXIT_ERROR is not 2, as expected - fix assert AND exception handler right above this line."

-STATS_HEADER = "                       Original size"
+STATS_HEADER = "                       Original size    Deduplicated size"

 PURE_PYTHON_MSGPACK_WARNING = "Using a pure-python msgpack! This will result in lower performance."
@@ -1797,8 +1797,8 @@ class Archiver:
             Command line: {command_line}
             Utilization of maximum supported archive size: {limits[max_archive_size]:.0%}
             ------------------------------------------------------------------------------
-                                   Original size
-            This archive:   {stats[original_size]:>20s}
+                                   Original size    Deduplicated size
+            This archive:   {stats[original_size]:>20s} {stats[deduplicated_size]:>20s}
             {cache}
             """).strip().format(cache=cache, **info))
         if self.exit_code:
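Both values are rendered right-aligned in 20-character fields, and the header labels are spaced so each one ends on the same column as its value. A quick way to check the alignment (header whitespace reconstructed here, sizes made up):

    row = "This archive:   {stats[original_size]:>20s} {stats[deduplicated_size]:>20s}"
    header = "                       Original size    Deduplicated size"
    print(header)
    print(row.format(stats={'original_size': '1.02 MB', 'deduplicated_size': '340 kB'}))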

View File

@@ -406,7 +406,7 @@ class Cache:

 class CacheStatsMixin:
     str_format = """\
-All archives:   {0.total_size:>20s}
+All archives:   {0.total_size:>20s} {0.unique_size:>20s}

                        Unique chunks         Total chunks
 Chunk index:    {0.total_unique_chunks:20d} {0.total_chunks:20d}"""
@@ -440,7 +440,7 @@ Chunk index:    {0.total_unique_chunks:20d} {0.total_chunks:20d}"""

     def format_tuple(self):
         stats = self.stats()
-        for field in ['total_size', ]:
+        for field in ['total_size', 'unique_size']:
             stats[field] = format_file_size(stats[field], iec=self.iec)
         return self.Summary(**stats)
@@ -905,7 +905,7 @@ class LocalCache(CacheStatsMixin):
         data = self.key.encrypt(id, chunk, compress=compress)
         self.repository.put(id, data, wait=wait)
         self.chunks.add(id, 1, size)
-        stats.update(size)
+        stats.update(size, not refcount)
         return ChunkListEntry(id, size)

     def seen_chunk(self, id, size=None):
@@ -921,7 +921,7 @@ class LocalCache(CacheStatsMixin):
         if not self.txn_active:
             self.begin_txn()
         count, _size = self.chunks.incref(id)
-        stats.update(_size, part=part)
+        stats.update(_size, False, part=part)
         return ChunkListEntry(id, _size)

     def chunk_decref(self, id, stats, wait=True, part=False):
@@ -931,9 +931,9 @@ class LocalCache(CacheStatsMixin):
         if count == 0:
             del self.chunks[id]
             self.repository.delete(id, wait=wait)
-            stats.update(-size, part=part)
+            stats.update(-size, True, part=part)
         else:
-            stats.update(-size, part=part)
+            stats.update(-size, False, part=part)

     def file_known_and_unchanged(self, hashed_path, path_hash, st):
         """
@@ -1072,7 +1072,7 @@ Chunk index:    {0.total_unique_chunks:20d}             unknown"""
         data = self.key.encrypt(id, chunk, compress=compress)
         self.repository.put(id, data, wait=wait)
         self.chunks.add(id, 1, size)
-        stats.update(size)
+        stats.update(size, not refcount)
         return ChunkListEntry(id, size)

     def seen_chunk(self, id, size=None):
@@ -1094,7 +1094,7 @@ Chunk index:    {0.total_unique_chunks:20d}             unknown"""
         # size or add_chunk); we can't add references to those (size=0 is invalid) and generally don't try to.
         size = _size or size
         assert size
-        stats.update(size, part=part)
+        stats.update(size, False, part=part)
         return ChunkListEntry(id, size)

     def chunk_decref(self, id, stats, wait=True, part=False):
@@ -1104,9 +1104,9 @@ Chunk index:    {0.total_unique_chunks:20d}             unknown"""
         if count == 0:
             del self.chunks[id]
             self.repository.delete(id, wait=wait)
-            stats.update(-size, part=part)
+            stats.update(-size, True, part=part)
         else:
-            stats.update(-size, part=part)
+            stats.update(-size, False, part=part)

     def commit(self):
         if not self._txn_active:
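All of these call sites follow one rule: a chunk is "unique" only when it crosses between zero and one references. In sketch form (simplified refcounting, not the real cache classes; `stats` is a Statistics instance as above):

    refcounts = {}  # chunk id -> reference count (stand-in for self.chunks)

    def add_chunk(cid, size, stats):
        refcount = refcounts.get(cid, 0)
        if refcount:                          # already stored: just another reference
            return chunk_incref(cid, size, stats)
        refcounts[cid] = 1
        stats.update(size, not refcount)      # first store: unique=True

    def chunk_incref(cid, size, stats):
        refcounts[cid] += 1
        stats.update(size, False)             # re-reference: never unique

    def chunk_decref(cid, size, stats):
        refcounts[cid] -= 1
        if refcounts[cid] == 0:
            del refcounts[cid]                # last reference is gone: the chunk
            stats.update(-size, True)         # itself goes away, usize shrinks too
        else:
            stats.update(-size, False)        # osize shrinks only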

View File

@@ -19,44 +19,46 @@ from ..platform import uid2user, gid2group

 @pytest.fixture()
 def stats():
     stats = Statistics()
-    stats.update(20)
+    stats.update(20, unique=True)
     return stats


 def test_stats_basic(stats):
     assert stats.osize == 20
-    stats.update(20)
+    assert stats.usize == 20
+    stats.update(20, unique=False)
     assert stats.osize == 40
+    assert stats.usize == 20


 def tests_stats_progress(stats, monkeypatch, columns=80):
     monkeypatch.setenv('COLUMNS', str(columns))
     out = StringIO()
     stats.show_progress(stream=out)
-    s = '20 B O 0 N '
+    s = '20 B O 20 B U 0 N '
     buf = ' ' * (columns - len(s))
     assert out.getvalue() == s + buf + "\r"

     out = StringIO()
-    stats.update(10 ** 3)
+    stats.update(10 ** 3, unique=False)
     stats.show_progress(item=Item(path='foo'), final=False, stream=out)
-    s = '1.02 kB O 0 N foo'
+    s = '1.02 kB O 20 B U 0 N foo'
     buf = ' ' * (columns - len(s))
     assert out.getvalue() == s + buf + "\r"

     out = StringIO()
     stats.show_progress(item=Item(path='foo'*40), final=False, stream=out)
-    s = '1.02 kB O 0 N foofoofoofoofoofoofoofoofoofoo...foofoofoofoofoofoofoofoofoofoofoo'
+    s = '1.02 kB O 20 B U 0 N foofoofoofoofoofoofoofoofo...foofoofoofoofoofoofoofoofoofoo'
     buf = ' ' * (columns - len(s))
     assert out.getvalue() == s + buf + "\r"


 def test_stats_format(stats):
     assert str(stats) == """\
-This archive:                   20 B"""
+This archive:                   20 B                 20 B"""
     s = f"{stats.osize_fmt}"
     assert s == "20 B"
     # kind of redundant, but id is variable so we can't match reliably
-    assert repr(stats) == f'<Statistics object at {id(stats):#x} (20)>'
+    assert repr(stats) == f'<Statistics object at {id(stats):#x} (20, 20)>'


 def test_stats_progress_json(stats):