mirror of https://github.com/borgbackup/borg.git

Merge pull request #62 from ThomasWaldmann/chunker-params

Chunker params, fixes #16

commit a487e16c16
borg/archive.py

@@ -21,12 +21,14 @@ from .helpers import parse_timestamp, Error, uid2user, user2uid, gid2group, grou
     Manifest, Statistics, decode_dict, st_mtime_ns, make_path_safe, StableDict, int_to_bigint, bigint_to_int

 ITEMS_BUFFER = 1024 * 1024
-CHUNK_MIN = 1024
-CHUNK_MAX = 10 * 1024 * 1024
-WINDOW_SIZE = 0xfff
-CHUNK_MASK = 0xffff
-
-ZEROS = b'\0' * CHUNK_MAX
+
+CHUNK_MIN_EXP = 10  # 2**10 == 1kiB
+CHUNK_MAX_EXP = 23  # 2**23 == 8MiB
+HASH_WINDOW_SIZE = 0xfff  # 4095B
+HASH_MASK_BITS = 16  # results in ~64kiB chunks statistically
+
+# defaults, use --chunker-params to override
+CHUNKER_PARAMS = (CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE)

 utime_supports_fd = os.utime in getattr(os, 'supports_fd', {})
 utime_supports_follow_symlinks = os.utime in getattr(os, 'supports_follow_symlinks', {})
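
A quick gloss of what the exponent-based defaults expand to (a minimal sketch
in plain Python; the constant names mirror the commit, the arithmetic is the
editor's illustration):

    CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE = 10, 23, 16, 0xfff

    min_size = 1 << CHUNK_MIN_EXP          # 1024 bytes == 1kiB minimum chunk size
    max_size = 1 << CHUNK_MAX_EXP          # 8388608 bytes == 8MiB maximum chunk size
    hash_mask = (1 << HASH_MASK_BITS) - 1  # 0xffff: cut when the low 16 hash bits are zero
    avg_size = 1 << HASH_MASK_BITS         # ~64kiB statistically expected chunk size

    print(min_size, max_size, hex(hash_mask), avg_size)  # 1024 8388608 0xffff 65536
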
@@ -69,12 +71,12 @@ class DownloadPipeline:
 class ChunkBuffer:
     BUFFER_SIZE = 1 * 1024 * 1024

-    def __init__(self, key):
+    def __init__(self, key, chunker_params=CHUNKER_PARAMS):
         self.buffer = BytesIO()
         self.packer = msgpack.Packer(unicode_errors='surrogateescape')
         self.chunks = []
         self.key = key
-        self.chunker = Chunker(WINDOW_SIZE, CHUNK_MASK, CHUNK_MIN, CHUNK_MAX, self.key.chunk_seed)
+        self.chunker = Chunker(self.key.chunk_seed, *chunker_params)

     def add(self, item):
         self.buffer.write(self.packer.pack(StableDict(item)))
@@ -104,8 +106,8 @@ class ChunkBuffer:

 class CacheChunkBuffer(ChunkBuffer):

-    def __init__(self, cache, key, stats):
-        super(CacheChunkBuffer, self).__init__(key)
+    def __init__(self, cache, key, stats, chunker_params=CHUNKER_PARAMS):
+        super(CacheChunkBuffer, self).__init__(key, chunker_params)
         self.cache = cache
         self.stats = stats

@@ -127,7 +129,8 @@ class Archive:

     def __init__(self, repository, key, manifest, name, cache=None, create=False,
-                 checkpoint_interval=300, numeric_owner=False, progress=False):
+                 checkpoint_interval=300, numeric_owner=False, progress=False,
+                 chunker_params=CHUNKER_PARAMS):
         self.cwd = os.getcwd()
         self.key = key
         self.repository = repository
@@ -142,8 +145,8 @@ class Archive:
         self.numeric_owner = numeric_owner
         self.pipeline = DownloadPipeline(self.repository, self.key)
         if create:
-            self.items_buffer = CacheChunkBuffer(self.cache, self.key, self.stats)
-            self.chunker = Chunker(WINDOW_SIZE, CHUNK_MASK, CHUNK_MIN, CHUNK_MAX, self.key.chunk_seed)
+            self.items_buffer = CacheChunkBuffer(self.cache, self.key, self.stats, chunker_params)
+            self.chunker = Chunker(self.key.chunk_seed, *chunker_params)
             if name in manifest.archives:
                 raise self.AlreadyExists(name)
             self.last_checkpoint = time.time()
@@ -158,6 +161,7 @@ class Archive:
                 raise self.DoesNotExist(name)
             info = self.manifest.archives[name]
             self.load(info[b'id'])
+        self.zeros = b'\0' * (1 << chunker_params[1])

     def _load_meta(self, id):
         data = self.key.decrypt(id, self.repository.get(id))
@@ -286,7 +290,7 @@ class Archive:
         with open(path, 'wb') as fd:
             ids = [c[0] for c in item[b'chunks']]
             for data in self.pipeline.fetch_many(ids, is_preloaded=True):
-                if sparse and ZEROS.startswith(data):
+                if sparse and self.zeros.startswith(data):
                     # all-zero chunk: create a hole in a sparse file
                     fd.seek(len(data), 1)
                 else:
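
The sparse-extract logic above exploits the fact that an all-zero chunk is a
prefix of a sufficiently large zero buffer: instead of writing it, the file
offset is advanced, leaving a hole. A self-contained sketch of the idea
(write_chunk and ZEROS_BUF are hypothetical names, not borg API):

    ZEROS_BUF = b'\0' * (1 << 23)  # as large as the largest possible chunk

    def write_chunk(fd, data, sparse=True):
        # an all-zero chunk is a prefix of ZEROS_BUF; seeking instead of
        # writing leaves a hole the filesystem can store sparsely
        if sparse and ZEROS_BUF.startswith(data):
            fd.seek(len(data), 1)  # whence=1: seek relative to current position
        else:
            fd.write(data)
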
borg/archiver.py

@@ -13,7 +13,7 @@ import textwrap
 import traceback

 from . import __version__
-from .archive import Archive, ArchiveChecker
+from .archive import Archive, ArchiveChecker, CHUNKER_PARAMS
 from .repository import Repository
 from .cache import Cache
 from .key import key_creator
@@ -21,7 +21,7 @@ from .helpers import Error, location_validator, format_time, format_file_size, \
     format_file_mode, ExcludePattern, exclude_path, adjust_patterns, to_localtime, timestamp, \
     get_cache_dir, get_keys_dir, format_timedelta, prune_within, prune_split, \
     Manifest, remove_surrogates, update_excludes, format_archive, check_extension_modules, Statistics, \
-    is_cachedir, bigint_to_int
+    is_cachedir, bigint_to_int, ChunkerParams
 from .remote import RepositoryServer, RemoteRepository
@@ -104,7 +104,8 @@ Type "Yes I am sure" if you understand this and want to continue.\n""")
         cache = Cache(repository, key, manifest, do_files=args.cache_files)
         archive = Archive(repository, key, manifest, args.archive.archive, cache=cache,
                           create=True, checkpoint_interval=args.checkpoint_interval,
-                          numeric_owner=args.numeric_owner, progress=args.progress)
+                          numeric_owner=args.numeric_owner, progress=args.progress,
+                          chunker_params=args.chunker_params)
         # Add cache dir to inode_skip list
         skip_inodes = set()
         try:
@@ -625,6 +626,10 @@ Type "Yes I am sure" if you understand this and want to continue.\n""")
                                metavar='yyyy-mm-ddThh:mm:ss',
                                help='manually specify the archive creation date/time (UTC). '
                                     'alternatively, give a reference file/directory.')
+        subparser.add_argument('--chunker-params', dest='chunker_params',
+                               type=ChunkerParams, default=CHUNKER_PARAMS,
+                               metavar='CHUNK_MIN_EXP,CHUNK_MAX_EXP,HASH_MASK_BITS,HASH_WINDOW_SIZE',
+                               help='specify the chunker parameters. default: %d,%d,%d,%d' % CHUNKER_PARAMS)
         subparser.add_argument('archive', metavar='ARCHIVE',
                                type=location_validator(archive=True),
                                help='archive to create')
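
For reference, the help text interpolates the defaults tuple, so with the
default CHUNKER_PARAMS the rendered option help reads (a quick check in plain
Python):

    CHUNKER_PARAMS = (10, 23, 16, 4095)
    print('specify the chunker parameters. default: %d,%d,%d,%d' % CHUNKER_PARAMS)
    # -> specify the chunker parameters. default: 10,23,16,4095
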
borg/chunker.pyx

@@ -20,8 +20,11 @@ cdef extern from "_chunker.c":
 cdef class Chunker:
     cdef _Chunker *chunker

-    def __cinit__(self, window_size, chunk_mask, min_size, max_size, seed):
-        self.chunker = chunker_init(window_size, chunk_mask, min_size, max_size, seed & 0xffffffff)
+    def __cinit__(self, seed, chunk_min_exp, chunk_max_exp, hash_mask_bits, hash_window_size):
+        min_size = 1 << chunk_min_exp
+        max_size = 1 << chunk_max_exp
+        hash_mask = (1 << hash_mask_bits) - 1
+        self.chunker = chunker_init(hash_window_size, hash_mask, min_size, max_size, seed & 0xffffffff)

     def chunkify(self, fd, fh=-1):
         """
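
The constructor now takes the seed first, then the four tuple members, so
Chunker(self.key.chunk_seed, *chunker_params) lines up with
CHUNKER_PARAMS = (CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE).
A minimal usage sketch (assuming the compiled borg.chunker extension is
importable):

    from io import BytesIO
    from borg.chunker import Chunker

    # seed=0, min chunk 2^10, max chunk 2^23, ~2^16 average, 4095-byte hash window
    chunker = Chunker(0, 10, 23, 16, 0xfff)
    chunks = [bytes(c) for c in chunker.chunkify(BytesIO(b'x' * 200000))]
    assert b''.join(chunks) == b'x' * 200000  # chunking is lossless
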
borg/helpers.py

@@ -313,6 +313,11 @@ def timestamp(s):
     raise ValueError


+def ChunkerParams(s):
+    chunk_min_exp, chunk_max_exp, hash_mask_bits, hash_window_size = s.split(',')
+    return int(chunk_min_exp), int(chunk_max_exp), int(hash_mask_bits), int(hash_window_size)
+
+
 def is_cachedir(path):
     """Determines whether the specified path is a cache directory (and
     therefore should potentially be excluded from the backup) according to
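
A quick sanity check of the parser (hedged example): the argparse type above
turns the comma-separated option string directly into the tuple that Archive
and Chunker expect:

    assert ChunkerParams('10,23,16,4095') == (10, 23, 16, 4095)
    assert ChunkerParams('19,23,21,4095') == (19, 23, 21, 4095)
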
borg/testsuite/archiver.py

@@ -12,7 +12,7 @@ import unittest
 from hashlib import sha256

 from .. import xattr
-from ..archive import Archive, ChunkBuffer, CHUNK_MAX
+from ..archive import Archive, ChunkBuffer, CHUNK_MAX_EXP
 from ..archiver import Archiver
 from ..cache import Cache
 from ..crypto import bytes_to_long, num_aes_blocks
@@ -213,7 +213,7 @@ class ArchiverTestCase(ArchiverTestCaseBase):
         sparse_support = sys.platform != 'darwin'
         filename = os.path.join(self.input_path, 'sparse')
         content = b'foobar'
-        hole_size = 5 * CHUNK_MAX  # 5 full chunker buffers
+        hole_size = 5 * (1 << CHUNK_MAX_EXP)  # 5 full chunker buffers
         with open(filename, 'wb') as fd:
             # create a file that has a hole at the beginning and end (if the
             # OS and filesystem supports sparse files)
borg/testsuite/chunker.py

@@ -1,27 +1,27 @@
 from io import BytesIO

 from ..chunker import Chunker, buzhash, buzhash_update
-from ..archive import CHUNK_MAX
+from ..archive import CHUNK_MAX_EXP
 from . import BaseTestCase


 class ChunkerTestCase(BaseTestCase):

     def test_chunkify(self):
-        data = b'0' * int(1.5 * CHUNK_MAX) + b'Y'
-        parts = [bytes(c) for c in Chunker(2, 0x3, 2, CHUNK_MAX, 0).chunkify(BytesIO(data))]
+        data = b'0' * int(1.5 * (1 << CHUNK_MAX_EXP)) + b'Y'
+        parts = [bytes(c) for c in Chunker(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(data))]
         self.assert_equal(len(parts), 2)
         self.assert_equal(b''.join(parts), data)
-        self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, CHUNK_MAX, 0).chunkify(BytesIO(b''))], [])
-        self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, CHUNK_MAX, 0).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fooba', b'rboobaz', b'fooba', b'rboobaz', b'fooba', b'rboobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, CHUNK_MAX, 1).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fo', b'obarb', b'oob', b'azf', b'oobarb', b'oob', b'azf', b'oobarb', b'oobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, CHUNK_MAX, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'ar', b'boobazfoob', b'ar', b'boobazfoob', b'ar', b'boobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 3, CHUNK_MAX, 0).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
-        self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 3, CHUNK_MAX, 1).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobar', b'boo', b'bazfo', b'obar', b'boo', b'bazfo', b'obar', b'boobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 3, CHUNK_MAX, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foo', b'barboobaz', b'foo', b'barboobaz', b'foo', b'barboobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 4, CHUNK_MAX, 0).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
-        self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 4, CHUNK_MAX, 1).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobar', b'boobazfo', b'obar', b'boobazfo', b'obar', b'boobaz'])
-        self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 4, CHUNK_MAX, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'arboobaz', b'foob', b'arboobaz', b'foob', b'arboobaz'])
+        self.assert_equal([bytes(c) for c in Chunker(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b''))], [])
+        self.assert_equal([bytes(c) for c in Chunker(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fooba', b'rboobaz', b'fooba', b'rboobaz', b'fooba', b'rboobaz'])
+        self.assert_equal([bytes(c) for c in Chunker(1, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fo', b'obarb', b'oob', b'azf', b'oobarb', b'oob', b'azf', b'oobarb', b'oobaz'])
+        self.assert_equal([bytes(c) for c in Chunker(2, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'ar', b'boobazfoob', b'ar', b'boobazfoob', b'ar', b'boobaz'])
+        self.assert_equal([bytes(c) for c in Chunker(0, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
+        self.assert_equal([bytes(c) for c in Chunker(1, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobar', b'boobazfo', b'obar', b'boobazfo', b'obar', b'boobaz'])
+        self.assert_equal([bytes(c) for c in Chunker(2, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'arboobaz', b'foob', b'arboobaz', b'foob', b'arboobaz'])
+        self.assert_equal([bytes(c) for c in Chunker(0, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
+        self.assert_equal([bytes(c) for c in Chunker(1, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarbo', b'obazfoobar', b'boobazfo', b'obarboobaz'])
+        self.assert_equal([bytes(c) for c in Chunker(2, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz', b'foobarboobaz', b'foobarboobaz'])

     def test_buzhash(self):
         self.assert_equal(buzhash(b'abcdefghijklmnop', 0), 3795437769)
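
To read the rewritten tests, decode the positional arguments of the new
signature; for example (editor's gloss, taken from the first foobarboobaz
case above, assuming the compiled borg.chunker extension is importable):

    from io import BytesIO
    from borg.chunker import Chunker

    # Chunker(0, 1, CHUNK_MAX_EXP, 2, 2) means: seed=0, min chunk 2^1 == 2 bytes,
    # max chunk 2^23, cut when the low 2 hash bits are zero (~4-byte chunks),
    # 2-byte rolling hash window.
    parts = [bytes(c) for c in Chunker(0, 1, 23, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))]
    assert parts == [b'fooba', b'rboobaz', b'fooba', b'rboobaz', b'fooba', b'rboobaz']
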
docs/misc/create_chunker-params.txt

@@ -0,0 +1,116 @@
+About borg create --chunker-params
+==================================
+
+--chunker-params CHUNK_MIN_EXP,CHUNK_MAX_EXP,HASH_MASK_BITS,HASH_WINDOW_SIZE
+
+CHUNK_MIN_EXP and CHUNK_MAX_EXP give the exponent N of the 2^N minimum and
+maximum chunk size. Required: CHUNK_MIN_EXP < CHUNK_MAX_EXP.
+
+Defaults: 10 (2^10 == 1kiB) minimum, 23 (2^23 == 8MiB) maximum.
+
+HASH_MASK_BITS is the number of least-significant bits of the rolling hash
+that need to be zero to trigger a chunk cut.
+Recommended: CHUNK_MIN_EXP + X <= HASH_MASK_BITS <= CHUNK_MAX_EXP - X, X >= 2
+(this allows the rolling hash some freedom to make its cut at a place
+determined by the window's contents rather than by the min/max chunk size).
+
+Default: 16 (statistically, chunks will be about 2^16 == 64kiB in size)
+
+HASH_WINDOW_SIZE: the size of the window used for the rolling hash computation.
+Default: 4095B
+
+
+Trying it out
+=============
+
+I backed up a VM directory to demonstrate how different chunker parameters
+influence repo size, index size / chunk count, compression, and deduplication.
+
+repo-sm: ~64kiB chunks (16 bit chunk mask), min chunk size 1kiB (2^10B)
+         (these are the attic / borg 0.23 internal defaults)
+
+repo-lg: ~1MiB chunks (20 bit chunk mask), min chunk size 64kiB (2^16B)
+
+repo-xl: 8MiB chunks (2^23B max chunk size), min chunk size 64kiB (2^16B).
+         The hash mask bits were set to 31, so the mask (almost) never triggers.
+         This degrades the rolling-hash-based dedup to a fixed-offset dedup,
+         as the cutting point is now (almost) always the end of the buffer
+         (at 2^23B == 8MiB).
+
+The repo index size is an indicator of the RAM needs of Borg.
+In this special case, the total RAM needs are about 2.1x the repo index size.
+You can see that the index of repo-sm is 16x larger than that of repo-lg,
+which corresponds to the ratio of the different target chunk sizes.
+
+Note: RAM needs were not a problem in this specific case (37GB data size).
+But just imagine you had 37TB of such data and much less than 42GB of RAM;
+then you'd definitely want the "lg" chunker params so you only need about
+2.6GB of RAM. Or even bigger chunks than shown for "lg" (see "xl").
+
+You can also see that compression works better for larger chunks, as expected.
+Deduplication works worse for larger chunks, also as expected.
+
+small chunks
+============
+
+$ borg info /extra/repo-sm::1
+
+Command line: /home/tw/w/borg-env/bin/borg create --chunker-params 10,23,16,4095 /extra/repo-sm::1 /home/tw/win
+Number of files: 3
+
+                       Original size      Compressed size    Deduplicated size
+This archive:               37.12 GB             14.81 GB             12.18 GB
+All archives:               37.12 GB             14.81 GB             12.18 GB
+
+                       Unique chunks         Total chunks
+Chunk index:                  378374               487316
+
+$ ls -l /extra/repo-sm/index*
+-rw-rw-r-- 1 tw tw 20971538 Jun 20 23:39 index.2308
+
+$ du -sk /extra/repo-sm
+11930840   /extra/repo-sm
+
+large chunks
+============
+
+$ borg info /extra/repo-lg::1
+
+Command line: /home/tw/w/borg-env/bin/borg create --chunker-params 16,23,20,4095 /extra/repo-lg::1 /home/tw/win
+Number of files: 3
+
+                       Original size      Compressed size    Deduplicated size
+This archive:               37.10 GB             14.60 GB             13.38 GB
+All archives:               37.10 GB             14.60 GB             13.38 GB
+
+                       Unique chunks         Total chunks
+Chunk index:                   25889                29349
+
+$ ls -l /extra/repo-lg/index*
+-rw-rw-r-- 1 tw tw 1310738 Jun 20 23:10 index.2264
+
+$ du -sk /extra/repo-lg
+13073928   /extra/repo-lg
+
+xl chunks
+=========
+
+$ borg info /extra/repo-xl::1
+
+Command line: /home/tw/w/borg-env/bin/borg create --chunker-params 16,23,31,4095 /extra/repo-xl::1 /home/tw/win
+Number of files: 3
+
+                       Original size      Compressed size    Deduplicated size
+This archive:               37.10 GB             14.59 GB             14.59 GB
+All archives:               37.10 GB             14.59 GB             14.59 GB
+
+                       Unique chunks         Total chunks
+Chunk index:                    4319                 4434
+
+$ ls -l /extra/repo-xl/index*
+-rw-rw-r-- 1 tw tw 327698 Jun 21 00:52 index.2011
+
+$ du -sk /extra/repo-xl/
+14253464   /extra/repo-xl/
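
The RAM discussion in "Trying it out" can be cross-checked against the index
sizes listed above (the editor's arithmetic; the 2.1x factor is the one
measured in this experiment):

    ram_factor = 2.1                         # total RAM ~= 2.1x repo index size
    sm_index, lg_index = 20971538, 1310738   # index file sizes in bytes, from ls -l

    print(sm_index / lg_index)               # ~16, matching the 16x chunk size ratio
    print(ram_factor * sm_index / 1e6)       # ~44 MB RAM for repo-sm (37 GB of data)
    print(ram_factor * lg_index / 1e6)       # ~2.8 MB RAM for repo-lg
    # at 1000x the data (37 TB) this scales to roughly 44 GB vs. 2.8 GB,
    # in line with the ~42 GB / 2.6 GB figures quoted in the text
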
docs/usage.rst

@@ -50,6 +50,9 @@ Examples
     NAME="root-`date +%Y-%m-%d`"
     $ borg create /mnt/backup::$NAME / --do-not-cross-mountpoints

+    # Back up huge files with little chunk management overhead
+    $ borg create --chunker-params 19,23,21,4095 /mnt/backup::VMs /srv/VMs
+
 .. include:: usage/extract.rst.inc