mirror of
https://github.com/borgbackup/borg.git
synced 2025-01-31 11:42:05 +00:00
The items metadata stream is usually not that big (compared to the file content data) - it is just file and dir names and other metadata. If we use too coarse a granularity there (and a big minimum chunk size), we will usually get no deduplication.
This commit is contained in:
parent
d08c51bdfc
commit
888e078382
2 changed files with 11 additions and 6 deletions
|
@ -34,6 +34,9 @@
|
||||||
# defaults, use --chunker-params to override
|
# defaults, use --chunker-params to override
|
||||||
CHUNKER_PARAMS = (CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE)
|
CHUNKER_PARAMS = (CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE)
|
||||||
|
|
||||||
|
# chunker params for the items metadata stream, finer granularity
|
||||||
|
ITEMS_CHUNKER_PARAMS = (12, 16, 14, HASH_WINDOW_SIZE)
|
||||||
|
|
||||||
utime_supports_fd = os.utime in getattr(os, 'supports_fd', {})
|
utime_supports_fd = os.utime in getattr(os, 'supports_fd', {})
|
||||||
utime_supports_follow_symlinks = os.utime in getattr(os, 'supports_follow_symlinks', {})
|
utime_supports_follow_symlinks = os.utime in getattr(os, 'supports_follow_symlinks', {})
|
||||||
has_mtime_ns = sys.version >= '3.3'
|
has_mtime_ns = sys.version >= '3.3'
|
||||||
|
@ -75,7 +78,7 @@ def fetch_many(self, ids, is_preloaded=False):
|
||||||
class ChunkBuffer:
|
class ChunkBuffer:
|
||||||
BUFFER_SIZE = 1 * 1024 * 1024
|
BUFFER_SIZE = 1 * 1024 * 1024
|
||||||
|
|
||||||
def __init__(self, key, chunker_params=CHUNKER_PARAMS):
|
def __init__(self, key, chunker_params=ITEMS_CHUNKER_PARAMS):
|
||||||
self.buffer = BytesIO()
|
self.buffer = BytesIO()
|
||||||
self.packer = msgpack.Packer(unicode_errors='surrogateescape')
|
self.packer = msgpack.Packer(unicode_errors='surrogateescape')
|
||||||
self.chunks = []
|
self.chunks = []
|
||||||
|
@ -110,7 +113,7 @@ def is_full(self):
|
||||||
|
|
||||||
class CacheChunkBuffer(ChunkBuffer):
|
class CacheChunkBuffer(ChunkBuffer):
|
||||||
|
|
||||||
def __init__(self, cache, key, stats, chunker_params=CHUNKER_PARAMS):
|
def __init__(self, cache, key, stats, chunker_params=ITEMS_CHUNKER_PARAMS):
|
||||||
super().__init__(key, chunker_params)
|
super().__init__(key, chunker_params)
|
||||||
self.cache = cache
|
self.cache = cache
|
||||||
self.stats = stats
|
self.stats = stats
|
||||||
|
@ -150,7 +153,7 @@ def __init__(self, repository, key, manifest, name, cache=None, create=False,
|
||||||
self.end = end
|
self.end = end
|
||||||
self.pipeline = DownloadPipeline(self.repository, self.key)
|
self.pipeline = DownloadPipeline(self.repository, self.key)
|
||||||
if create:
|
if create:
|
||||||
self.items_buffer = CacheChunkBuffer(self.cache, self.key, self.stats, chunker_params)
|
self.items_buffer = CacheChunkBuffer(self.cache, self.key, self.stats)
|
||||||
self.chunker = Chunker(self.key.chunk_seed, *chunker_params)
|
self.chunker = Chunker(self.key.chunk_seed, *chunker_params)
|
||||||
if name in manifest.archives:
|
if name in manifest.archives:
|
||||||
raise self.AlreadyExists(name)
|
raise self.AlreadyExists(name)
|
||||||
|
|
|
@ -190,9 +190,11 @@ Each item represents a file, directory or other fs item and is stored as an
|
||||||
it and it is reset every time an inode's metadata is changed.
|
it and it is reset every time an inode's metadata is changed.
|
||||||
|
|
||||||
All items are serialized using msgpack and the resulting byte stream
|
All items are serialized using msgpack and the resulting byte stream
|
||||||
is fed into the same chunker used for regular file data and turned
|
is fed into the same chunker algorithm as used for regular file data
|
||||||
into deduplicated chunks. The reference to these chunks is then added
|
and turned into deduplicated chunks. The reference to these chunks is then added
|
||||||
to the archive metadata.
|
to the archive metadata. To achieve a finer granularity on this metadata
|
||||||
|
stream, we use different chunker params for this chunker, which result in
|
||||||
|
smaller chunks.
|
||||||
|
|
||||||
A chunk is stored as an object as well, of course.
|
A chunk is stored as an object as well, of course.
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue