Merge pull request #62 from ThomasWaldmann/chunker-params

Chunker params, fixes #16
TW 2015-06-21 02:11:54 +02:00
commit a487e16c16
8 changed files with 169 additions and 33 deletions

borg/archive.py

@@ -21,12 +21,14 @@ from .helpers import parse_timestamp, Error, uid2user, user2uid, gid2group, grou
     Manifest, Statistics, decode_dict, st_mtime_ns, make_path_safe, StableDict, int_to_bigint, bigint_to_int

 ITEMS_BUFFER = 1024 * 1024
-CHUNK_MIN = 1024
-CHUNK_MAX = 10 * 1024 * 1024
-WINDOW_SIZE = 0xfff
-CHUNK_MASK = 0xffff
-ZEROS = b'\0' * CHUNK_MAX
+CHUNK_MIN_EXP = 10  # 2**10 == 1kiB
+CHUNK_MAX_EXP = 23  # 2**23 == 8MiB
+HASH_WINDOW_SIZE = 0xfff  # 4095B
+HASH_MASK_BITS = 16  # results in ~64kiB chunks statistically
+# defaults, use --chunker-params to override
+CHUNKER_PARAMS = (CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE)

 utime_supports_fd = os.utime in getattr(os, 'supports_fd', {})
 utime_supports_follow_symlinks = os.utime in getattr(os, 'supports_follow_symlinks', {})
@@ -69,12 +71,12 @@ class DownloadPipeline:

 class ChunkBuffer:
     BUFFER_SIZE = 1 * 1024 * 1024

-    def __init__(self, key):
+    def __init__(self, key, chunker_params=CHUNKER_PARAMS):
         self.buffer = BytesIO()
         self.packer = msgpack.Packer(unicode_errors='surrogateescape')
         self.chunks = []
         self.key = key
-        self.chunker = Chunker(WINDOW_SIZE, CHUNK_MASK, CHUNK_MIN, CHUNK_MAX, self.key.chunk_seed)
+        self.chunker = Chunker(self.key.chunk_seed, *chunker_params)

     def add(self, item):
         self.buffer.write(self.packer.pack(StableDict(item)))
@@ -104,8 +106,8 @@ class ChunkBuffer:

 class CacheChunkBuffer(ChunkBuffer):

-    def __init__(self, cache, key, stats):
-        super(CacheChunkBuffer, self).__init__(key)
+    def __init__(self, cache, key, stats, chunker_params=CHUNKER_PARAMS):
+        super(CacheChunkBuffer, self).__init__(key, chunker_params)
         self.cache = cache
         self.stats = stats
@@ -127,7 +129,8 @@ class Archive:

     def __init__(self, repository, key, manifest, name, cache=None, create=False,
-                 checkpoint_interval=300, numeric_owner=False, progress=False):
+                 checkpoint_interval=300, numeric_owner=False, progress=False,
+                 chunker_params=CHUNKER_PARAMS):
         self.cwd = os.getcwd()
         self.key = key
         self.repository = repository
@@ -142,8 +145,8 @@ class Archive:
         self.numeric_owner = numeric_owner
         self.pipeline = DownloadPipeline(self.repository, self.key)
         if create:
-            self.items_buffer = CacheChunkBuffer(self.cache, self.key, self.stats)
-            self.chunker = Chunker(WINDOW_SIZE, CHUNK_MASK, CHUNK_MIN, CHUNK_MAX, self.key.chunk_seed)
+            self.items_buffer = CacheChunkBuffer(self.cache, self.key, self.stats, chunker_params)
+            self.chunker = Chunker(self.key.chunk_seed, *chunker_params)
             if name in manifest.archives:
                 raise self.AlreadyExists(name)
             self.last_checkpoint = time.time()
@@ -158,6 +161,7 @@ class Archive:
                 raise self.DoesNotExist(name)
             info = self.manifest.archives[name]
             self.load(info[b'id'])
+            self.zeros = b'\0' * (1 << chunker_params[1])

     def _load_meta(self, id):
         data = self.key.decrypt(id, self.repository.get(id))
@@ -286,7 +290,7 @@ class Archive:
                 with open(path, 'wb') as fd:
                     ids = [c[0] for c in item[b'chunks']]
                     for data in self.pipeline.fetch_many(ids, is_preloaded=True):
-                        if sparse and ZEROS.startswith(data):
+                        if sparse and self.zeros.startswith(data):
                             # all-zero chunk: create a hole in a sparse file
                             fd.seek(len(data), 1)
                         else:

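Editorial note on the sparse-extract branch above: self.zeros is as long as the
largest possible chunk (2**chunker_params[1] bytes), so self.zeros.startswith(data)
is true exactly when a whole chunk consists of zero bytes. A minimal standalone
sketch of the same idea (write_sparse is a hypothetical helper, not borg's API):

    # Write chunks to fd, turning all-zero chunks into holes (sparse file).
    def write_sparse(fd, chunks, chunk_max_exp=23):
        zeros = b'\0' * (1 << chunk_max_exp)   # longest possible chunk
        for data in chunks:
            if zeros.startswith(data):         # chunk is entirely zero bytes
                fd.seek(len(data), 1)          # seek forward, leaving a hole
            else:
                fd.write(data)
        fd.truncate()                          # give a trailing hole its size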
borg/archiver.py

@@ -13,7 +13,7 @@ import textwrap
 import traceback

 from . import __version__
-from .archive import Archive, ArchiveChecker
+from .archive import Archive, ArchiveChecker, CHUNKER_PARAMS
 from .repository import Repository
 from .cache import Cache
 from .key import key_creator
@@ -21,7 +21,7 @@ from .helpers import Error, location_validator, format_time, format_file_size, \
     format_file_mode, ExcludePattern, exclude_path, adjust_patterns, to_localtime, timestamp, \
     get_cache_dir, get_keys_dir, format_timedelta, prune_within, prune_split, \
     Manifest, remove_surrogates, update_excludes, format_archive, check_extension_modules, Statistics, \
-    is_cachedir, bigint_to_int
+    is_cachedir, bigint_to_int, ChunkerParams
 from .remote import RepositoryServer, RemoteRepository
@@ -104,7 +104,8 @@ Type "Yes I am sure" if you understand this and want to continue.\n""")
         cache = Cache(repository, key, manifest, do_files=args.cache_files)
         archive = Archive(repository, key, manifest, args.archive.archive, cache=cache,
                           create=True, checkpoint_interval=args.checkpoint_interval,
-                          numeric_owner=args.numeric_owner, progress=args.progress)
+                          numeric_owner=args.numeric_owner, progress=args.progress,
+                          chunker_params=args.chunker_params)
         # Add cache dir to inode_skip list
         skip_inodes = set()
         try:
@@ -625,6 +626,10 @@ Type "Yes I am sure" if you understand this and want to continue.\n""")
                                metavar='yyyy-mm-ddThh:mm:ss',
                                help='manually specify the archive creation date/time (UTC). '
                                     'alternatively, give a reference file/directory.')
+        subparser.add_argument('--chunker-params', dest='chunker_params',
+                               type=ChunkerParams, default=CHUNKER_PARAMS,
+                               metavar='CHUNK_MIN_EXP,CHUNK_MAX_EXP,HASH_MASK_BITS,HASH_WINDOW_SIZE',
+                               help='specify the chunker parameters. default: %d,%d,%d,%d' % CHUNKER_PARAMS)
         subparser.add_argument('archive', metavar='ARCHIVE',
                                type=location_validator(archive=True),
                                help='archive to create')

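For context, a tiny self-contained sketch of how this option parses (a simplified
stand-in; borg's real ChunkerParams lives in helpers.py, shown in the next file):

    import argparse

    def ChunkerParams(s):
        # 'min_exp,max_exp,mask_bits,window_size' -> 4-tuple of ints
        return tuple(int(x) for x in s.split(','))

    parser = argparse.ArgumentParser()
    parser.add_argument('--chunker-params', dest='chunker_params',
                        type=ChunkerParams, default=(10, 23, 16, 4095))
    args = parser.parse_args(['--chunker-params', '19,23,21,4095'])
    print(args.chunker_params)  # (19, 23, 21, 4095)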
borg/chunker.pyx

@@ -20,8 +20,11 @@ cdef extern from "_chunker.c":

 cdef class Chunker:
     cdef _Chunker *chunker

-    def __cinit__(self, window_size, chunk_mask, min_size, max_size, seed):
-        self.chunker = chunker_init(window_size, chunk_mask, min_size, max_size, seed & 0xffffffff)
+    def __cinit__(self, seed, chunk_min_exp, chunk_max_exp, hash_mask_bits, hash_window_size):
+        min_size = 1 << chunk_min_exp
+        max_size = 1 << chunk_max_exp
+        hash_mask = (1 << hash_mask_bits) - 1
+        self.chunker = chunker_init(hash_window_size, hash_mask, min_size, max_size, seed & 0xffffffff)

     def chunkify(self, fd, fh=-1):
         """

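In plain Python, the derivation the new __cinit__ performs (the old API took byte
sizes and a bit mask directly; the new one computes them from the exponent-style
parameters):

    chunk_min_exp, chunk_max_exp, hash_mask_bits, hash_window_size = 10, 23, 16, 4095
    min_size = 1 << chunk_min_exp          # 1024 bytes (1kiB)
    max_size = 1 << chunk_max_exp          # 8388608 bytes (8MiB)
    hash_mask = (1 << hash_mask_bits) - 1  # 0xffff, ~64kiB chunks on average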
borg/helpers.py

@@ -313,6 +313,11 @@ def timestamp(s):
         raise ValueError

+def ChunkerParams(s):
+    # same order as the --chunker-params metavar:
+    # CHUNK_MIN_EXP,CHUNK_MAX_EXP,HASH_MASK_BITS,HASH_WINDOW_SIZE
+    chunk_min_exp, chunk_max_exp, hash_mask_bits, hash_window_size = s.split(',')
+    return int(chunk_min_exp), int(chunk_max_exp), int(hash_mask_bits), int(hash_window_size)

 def is_cachedir(path):
     """Determines whether the specified path is a cache directory (and
     therefore should potentially be excluded from the backup) according to

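Example of what the helper returns, in the same order as the metavar:

    >>> ChunkerParams('19,23,21,4095')
    (19, 23, 21, 4095)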
borg/testsuite/archiver.py

@@ -12,7 +12,7 @@ import unittest
 from hashlib import sha256

 from .. import xattr
-from ..archive import Archive, ChunkBuffer, CHUNK_MAX
+from ..archive import Archive, ChunkBuffer, CHUNK_MAX_EXP
 from ..archiver import Archiver
 from ..cache import Cache
 from ..crypto import bytes_to_long, num_aes_blocks
@@ -213,7 +213,7 @@ class ArchiverTestCase(ArchiverTestCaseBase):
         sparse_support = sys.platform != 'darwin'
         filename = os.path.join(self.input_path, 'sparse')
         content = b'foobar'
-        hole_size = 5 * CHUNK_MAX  # 5 full chunker buffers
+        hole_size = 5 * (1 << CHUNK_MAX_EXP)  # 5 full chunker buffers
         with open(filename, 'wb') as fd:
             # create a file that has a hole at the beginning and end (if the
             # OS and filesystem supports sparse files)

View File

@@ -1,27 +1,27 @@
 from io import BytesIO

 from ..chunker import Chunker, buzhash, buzhash_update
-from ..archive import CHUNK_MAX
+from ..archive import CHUNK_MAX_EXP
 from . import BaseTestCase


 class ChunkerTestCase(BaseTestCase):

     def test_chunkify(self):
-        data = b'0' * int(1.5 * CHUNK_MAX) + b'Y'
-        parts = [bytes(c) for c in Chunker(2, 0x3, 2, CHUNK_MAX, 0).chunkify(BytesIO(data))]
+        data = b'0' * int(1.5 * (1 << CHUNK_MAX_EXP)) + b'Y'
+        parts = [bytes(c) for c in Chunker(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(data))]
         self.assert_equal(len(parts), 2)
         self.assert_equal(b''.join(parts), data)
-        self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, CHUNK_MAX, 0).chunkify(BytesIO(b''))], [])
-        self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, CHUNK_MAX, 0).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fooba', b'rboobaz', b'fooba', b'rboobaz', b'fooba', b'rboobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, CHUNK_MAX, 1).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fo', b'obarb', b'oob', b'azf', b'oobarb', b'oob', b'azf', b'oobarb', b'oobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, CHUNK_MAX, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'ar', b'boobazfoob', b'ar', b'boobazfoob', b'ar', b'boobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 3, CHUNK_MAX, 0).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
-        self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 3, CHUNK_MAX, 1).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobar', b'boo', b'bazfo', b'obar', b'boo', b'bazfo', b'obar', b'boobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 3, CHUNK_MAX, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foo', b'barboobaz', b'foo', b'barboobaz', b'foo', b'barboobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 4, CHUNK_MAX, 0).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
-        self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 4, CHUNK_MAX, 1).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobar', b'boobazfo', b'obar', b'boobazfo', b'obar', b'boobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 4, CHUNK_MAX, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'arboobaz', b'foob', b'arboobaz', b'foob', b'arboobaz'])
+        self.assert_equal([bytes(c) for c in Chunker(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b''))], [])
+        self.assert_equal([bytes(c) for c in Chunker(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fooba', b'rboobaz', b'fooba', b'rboobaz', b'fooba', b'rboobaz'])
+        self.assert_equal([bytes(c) for c in Chunker(1, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fo', b'obarb', b'oob', b'azf', b'oobarb', b'oob', b'azf', b'oobarb', b'oobaz'])
+        self.assert_equal([bytes(c) for c in Chunker(2, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'ar', b'boobazfoob', b'ar', b'boobazfoob', b'ar', b'boobaz'])
+        self.assert_equal([bytes(c) for c in Chunker(0, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
+        self.assert_equal([bytes(c) for c in Chunker(1, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobar', b'boobazfo', b'obar', b'boobazfo', b'obar', b'boobaz'])
+        self.assert_equal([bytes(c) for c in Chunker(2, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'arboobaz', b'foob', b'arboobaz', b'foob', b'arboobaz'])
+        self.assert_equal([bytes(c) for c in Chunker(0, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
+        self.assert_equal([bytes(c) for c in Chunker(1, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarbo', b'obazfoobar', b'boobazfo', b'obarboobaz'])
+        self.assert_equal([bytes(c) for c in Chunker(2, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz', b'foobarboobaz', b'foobarboobaz'])

     def test_buzhash(self):
         self.assert_equal(buzhash(b'abcdefghijklmnop', 0), 3795437769)

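Note the changed argument order in the tests above: the seed now comes first.
One of the calls, annotated (the annotations are the editor's reading of the new
positional signature):

    Chunker(0,              # seed
            1,              # chunk_min_exp: minimum chunk size 2**1 == 2 bytes
            CHUNK_MAX_EXP,  # chunk_max_exp: maximum chunk size 2**23 == 8MiB
            2,              # hash_mask_bits: cut when the low 2 hash bits are zero
            2)              # hash_window_size: 2-byte rolling hash window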
docs/misc/create_chunker-params.txt

@@ -0,0 +1,116 @@
About borg create --chunker-params
==================================

--chunker-params CHUNK_MIN_EXP,CHUNK_MAX_EXP,HASH_MASK_BITS,HASH_WINDOW_SIZE

CHUNK_MIN_EXP and CHUNK_MAX_EXP give the exponent N of the 2^N minimum and
maximum chunk size. Required: CHUNK_MIN_EXP < CHUNK_MAX_EXP.

Defaults: 10 (2^10 == 1kiB) minimum, 23 (2^23 == 8MiB) maximum.

HASH_MASK_BITS is the number of least-significant bits of the rolling hash
that need to be zero to trigger a chunk cut.
Recommended: CHUNK_MIN_EXP + X <= HASH_MASK_BITS <= CHUNK_MAX_EXP - X, X >= 2
(this allows the rolling hash some freedom to make its cut at a place
determined by the window's contents rather than by the min/max chunk size).

Default: 16 (statistically, chunks will be about 2^16 == 64kiB in size)

HASH_WINDOW_SIZE: the size of the window used for the rolling hash computation.
Default: 4095B
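Put differently, the chunker cuts wherever the low HASH_MASK_BITS of the rolling
hash are all zero, which happens roughly once every 2^HASH_MASK_BITS input bytes.
A sketch of that decision in Python (the real implementation is the C code in
_chunker.c):

    def is_cut_point(rolling_hash, hash_mask_bits=16):
        hash_mask = (1 << hash_mask_bits) - 1
        # true about once per 2**hash_mask_bits bytes, subject to the
        # CHUNK_MIN_EXP / CHUNK_MAX_EXP limits enforced elsewhere
        return (rolling_hash & hash_mask) == 0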
Trying it out
=============
I backed up a VM directory to demonstrate how different chunker parameters
influence repo size, index size / chunk count, compression and deduplication.

repo-sm: ~64kiB chunks (16 bits chunk mask), min chunk size 1kiB (2^10B)
         (these are attic / borg 0.23 internal defaults)

repo-lg: ~1MiB chunks (20 bits chunk mask), min chunk size 64kiB (2^16B)

repo-xl: 8MiB chunks (2^23B max chunk size), min chunk size 64kiB (2^16B).
         HASH_MASK_BITS was set to 31, so the mask (almost) never triggers.
         This degrades the rolling hash based dedup to a fixed-offset dedup,
         as the cutting point is now (almost) always the end of the buffer
         (at 2^23B == 8MiB).

The repo index size is an indicator for the RAM needs of Borg.
In this special case, the total RAM needs are about 2.1x the repo index size.
You can see that the index size of repo-sm is 16x larger than that of repo-lg,
which corresponds to the ratio of the different target chunk sizes (2^16 vs. 2^20).

Note: RAM needs were not a problem in this specific case (37GB data size).
But just imagine you have 37TB of such data and much less than 42GB RAM,
then you'd definitely want the "lg" chunker params so you only need
2.6GB RAM. Or even bigger chunks than shown for "lg" (see "xl").

You can also see that compression works better for larger chunks, as expected.
Deduplication works worse for larger chunks, also as expected.
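As a rough cross-check of the chunk counts below (pure arithmetic, data size
approximate):

    data = 37e9                          # ~37 GB of input
    for name, chunk in [('sm', 2**16),   # ~64kiB statistical chunk size
                        ('lg', 2**20),   # ~1MiB
                        ('xl', 2**23)]:  # mask (almost) never fires: max size
        print(name, int(data / chunk))
    # sm ~564k, lg ~35k, xl ~4.4k: the same order of magnitude as the
    # "Total chunks" reported below (487316 / 29349 / 4434)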
small chunks
============

$ borg info /extra/repo-sm::1

Command line: /home/tw/w/borg-env/bin/borg create --chunker-params 10,23,16,4095 /extra/repo-sm::1 /home/tw/win
Number of files: 3

                    Original size    Compressed size    Deduplicated size
This archive:            37.12 GB           14.81 GB             12.18 GB
All archives:            37.12 GB           14.81 GB             12.18 GB

                    Unique chunks       Total chunks
Chunk index:               378374             487316

$ ls -l /extra/repo-sm/index*
-rw-rw-r-- 1 tw tw 20971538 Jun 20 23:39 index.2308

$ du -sk /extra/repo-sm
11930840        /extra/repo-sm
large chunks
============

$ borg info /extra/repo-lg::1

Command line: /home/tw/w/borg-env/bin/borg create --chunker-params 16,23,20,4095 /extra/repo-lg::1 /home/tw/win
Number of files: 3

                    Original size    Compressed size    Deduplicated size
This archive:            37.10 GB           14.60 GB             13.38 GB
All archives:            37.10 GB           14.60 GB             13.38 GB

                    Unique chunks       Total chunks
Chunk index:                25889              29349

$ ls -l /extra/repo-lg/index*
-rw-rw-r-- 1 tw tw 1310738 Jun 20 23:10 index.2264

$ du -sk /extra/repo-lg
13073928        /extra/repo-lg
xl chunks
=========

(borg-env)tw@tux:~/w/borg$ borg info /extra/repo-xl::1

Command line: /home/tw/w/borg-env/bin/borg create --chunker-params 16,23,31,4095 /extra/repo-xl::1 /home/tw/win
Number of files: 3

                    Original size    Compressed size    Deduplicated size
This archive:            37.10 GB           14.59 GB             14.59 GB
All archives:            37.10 GB           14.59 GB             14.59 GB

                    Unique chunks       Total chunks
Chunk index:                 4319               4434

$ ls -l /extra/repo-xl/index*
-rw-rw-r-- 1 tw tw 327698 Jun 21 00:52 index.2011

$ du -sk /extra/repo-xl/
14253464        /extra/repo-xl/

docs/usage.rst

@@ -50,6 +50,9 @@ Examples
     NAME="root-`date +%Y-%m-%d`"
     $ borg create /mnt/backup::$NAME / --do-not-cross-mountpoints

+    # Backup huge files with little chunk management overhead
+    $ borg create --chunker-params 19,23,21,4095 /mnt/backup::VMs /srv/VMs

 .. include:: usage/extract.rst.inc
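For the record, the parameters in this example translate to (quick arithmetic,
not part of the original docs):

    min_exp, max_exp, mask_bits, window = 19, 23, 21, 4095
    print(1 << min_exp)    # 524288  -> 512kiB minimum chunk size
    print(1 << max_exp)    # 8388608 -> 8MiB maximum chunk size
    print(1 << mask_bits)  # 2097152 -> ~2MiB statistical chunk size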