diff --git a/src/borg/testsuite/chunker_pytest.py b/src/borg/testsuite/chunker_pytest.py
index 0b7a788a..ba4a2feb 100644
--- a/src/borg/testsuite/chunker_pytest.py
+++ b/src/borg/testsuite/chunker_pytest.py
@@ -5,7 +5,7 @@ import tempfile
 import pytest
 
 from .chunker import cf
-from ..chunker import ChunkerFixed, sparsemap, has_seek_hole, ChunkerFailing
+from ..chunker import Chunker, ChunkerFixed, sparsemap, has_seek_hole, ChunkerFailing
 from ..constants import *  # NOQA
 
 BS = 4096  # fs block size
@@ -151,3 +151,29 @@ def test_chunker_failing():
     assert c1.data == data[:SIZE]
     assert c2.data == data[SIZE : 2 * SIZE]
     assert c3.data == data[2 * SIZE :]
+
+
+def test_buzhash_chunksize_distribution():
+    # Statistical check of the buzhash chunker: cut points should mostly come
+    # from the rolling hash triggering, not from hitting the min/max clamps.
+    data = os.urandom(1048576)
+    min_exp, max_exp, mask = 10, 16, 14  # chunk size target 16kiB, clip at 1kiB and 64kiB
+    chunker = Chunker(0, min_exp, max_exp, mask, 4095)
+    f = BytesIO(data)
+    chunks = cf(chunker.chunkify(f))
+    chunk_sizes = [len(chunk) for chunk in chunks]
+    chunks_count = len(chunks)
+    min_chunksize_observed = min(chunk_sizes)
+    max_chunksize_observed = max(chunk_sizes)
+    min_count = chunk_sizes.count(2**min_exp)
+    max_count = chunk_sizes.count(2**max_exp)
+    print(
+        f"count: {chunks_count} min: {min_chunksize_observed} max: {max_chunksize_observed} "
+        f"min count: {min_count} max count: {max_count}"
+    )
+    # usually there will be about 64 chunks (1 MiB of data / 16 kiB target size)
+    assert 32 < chunks_count < 128
+    # chunks always must be between min and max (clipping must work):
+    assert min_chunksize_observed >= 2**min_exp
+    assert max_chunksize_observed <= 2**max_exp
+    # most chunks should be cut due to buzhash triggering, not due to clipping at min/max size:
+    assert min_count < 10
+    assert max_count < 10