
use finer chunker granularity for items metadata stream, fixes #547, fixes #487

the items metadata stream is usually not that big (compared to the file content data) -
it is just file and dir names and other metadata.

if we use too coarse a granularity there (and a big minimum chunk size), we usually get no deduplication.
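
To see why coarse granularity defeats deduplication here, consider a toy model (not borg code: a fixed-size chunker stands in for borg's rolling-hash chunker, and the 512 KiB / 4 KiB sizes are illustrative). If the minimum chunk size exceeds the size of the whole metadata stream, the stream always ends up as a single chunk, so any change anywhere re-uploads all of it:

import hashlib

def chunk_ids(stream: bytes, size: int) -> list:
    # fixed-size stand-in for the real rolling-hash chunker
    return [hashlib.sha256(stream[i:i + size]).hexdigest()[:8]
            for i in range(0, len(stream), size)]

# ~20 KiB of fake items metadata (just path names)
metadata_v1 = b"\x00".join(b"/home/user/file%04d" % i for i in range(1000))
# second backup: one path changed somewhere in the middle
metadata_v2 = metadata_v1.replace(b"file0500", b"file9999")

coarse = set(chunk_ids(metadata_v1, 512 * 1024)) & set(chunk_ids(metadata_v2, 512 * 1024))
fine = set(chunk_ids(metadata_v1, 4 * 1024)) & set(chunk_ids(metadata_v2, 4 * 1024))
print(len(coarse))  # 0 -> the stream is one big chunk, nothing deduplicates
print(len(fine))    # 4 -> 4 of 5 small chunks are unchanged and deduplicate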
Thomas Waldmann 2016-01-15 20:56:21 +01:00
parent d08c51bdfc
commit 888e078382
2 changed files with 11 additions and 6 deletions

borg/archive.py

@@ -34,6 +34,9 @@
 # defaults, use --chunker-params to override
 CHUNKER_PARAMS = (CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE)
+# chunker params for the items metadata stream, finer granularity
+ITEMS_CHUNKER_PARAMS = (12, 16, 14, HASH_WINDOW_SIZE)
+
 utime_supports_fd = os.utime in getattr(os, 'supports_fd', {})
 utime_supports_follow_symlinks = os.utime in getattr(os, 'supports_follow_symlinks', {})
 has_mtime_ns = sys.version >= '3.3'
@@ -75,7 +78,7 @@ def fetch_many(self, ids, is_preloaded=False):
 class ChunkBuffer:
     BUFFER_SIZE = 1 * 1024 * 1024

-    def __init__(self, key, chunker_params=CHUNKER_PARAMS):
+    def __init__(self, key, chunker_params=ITEMS_CHUNKER_PARAMS):
         self.buffer = BytesIO()
         self.packer = msgpack.Packer(unicode_errors='surrogateescape')
         self.chunks = []
@@ -110,7 +113,7 @@ def is_full(self):
 class CacheChunkBuffer(ChunkBuffer):

-    def __init__(self, cache, key, stats, chunker_params=CHUNKER_PARAMS):
+    def __init__(self, cache, key, stats, chunker_params=ITEMS_CHUNKER_PARAMS):
         super().__init__(key, chunker_params)
         self.cache = cache
         self.stats = stats
@@ -150,7 +153,7 @@ def __init__(self, repository, key, manifest, name, cache=None, create=False,
         self.end = end
         self.pipeline = DownloadPipeline(self.repository, self.key)
         if create:
-            self.items_buffer = CacheChunkBuffer(self.cache, self.key, self.stats, chunker_params)
+            self.items_buffer = CacheChunkBuffer(self.cache, self.key, self.stats)
             self.chunker = Chunker(self.key.chunk_seed, *chunker_params)
             if name in manifest.archives:
                 raise self.AlreadyExists(name)

docs/internals.rst

@@ -190,9 +190,11 @@ Each item represents a file, directory or other fs item and is stored as an
 it and it is reset every time an inode's metadata is changed.

 All items are serialized using msgpack and the resulting byte stream
-is fed into the same chunker used for regular file data and turned
-into deduplicated chunks. The reference to these chunks is then added
-to the archive metadata.
+is fed into the same chunker algorithm as used for regular file data
+and turned into deduplicated chunks. The reference to these chunks is then added
+to the archive metadata. To achieve a finer granularity on this metadata
+stream, we use different chunker params for this chunker, which result in
+smaller chunks.

 A chunk is stored as an object as well, of course.
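
As the first archive.py hunk shows, the tuple fields are (CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE), where the first three are powers-of-two exponents. A small sketch decoding the new ITEMS_CHUNKER_PARAMS (the helper is hypothetical, not borg API; HASH_WINDOW_SIZE was 0xfff, i.e. 4095, at the time):

# hypothetical helper, not borg code: decode a chunker-params tuple
def describe_chunker_params(min_exp, max_exp, mask_bits, window_size):
    return {
        'min_chunk_size': 2 ** min_exp,    # smallest chunk emitted
        'max_chunk_size': 2 ** max_exp,    # hard upper bound per chunk
        'avg_chunk_size': 2 ** mask_bits,  # statistical target size (hash mask)
        'hash_window_size': window_size,   # rolling hash window, in bytes
    }

print(describe_chunker_params(12, 16, 14, 4095))
# {'min_chunk_size': 4096, 'max_chunk_size': 65536,
#  'avg_chunk_size': 16384, 'hash_window_size': 4095}

So the items metadata stream is now cut into roughly 16 KiB chunks (4 KiB min, 64 KiB max) instead of file-content-sized ones, which is what lets small metadata changes deduplicate well.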