Mirror of https://github.com/borgbackup/borg.git (synced 2024-12-21 23:33:07 +00:00)
Merge pull request #8541 from ThomasWaldmann/incremental-chunkindex-cache-updates

enable partial/incremental ChunkIndex cache updates

Commit b6ae924f30, 5 changed files with 111 additions and 31 deletions

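The core idea of the change: the in-memory ChunkIndex marks entries that are not yet present in any cached chunks.* object in the repository (F_NEW), only those entries get serialized and uploaded as a new cache part, and the flags are cleared after a successful write. The sketch below is a simplified, standalone illustration of that pattern, not borg's code: it uses a plain dict instead of borghash, hashlib instead of xxh64, and an in-memory dict standing in for the repository store; all names are hypothetical.

import hashlib

F_USED, F_NEW = 1 << 0, 1 << 24  # user flag / system flag, mirroring the masks used in the diff

class TinyChunkIndex:
    """Toy stand-in for ChunkIndex: marks every newly inserted key as F_NEW."""

    def __init__(self):
        self._d = {}  # key -> flags

    def __setitem__(self, key, flags):
        new = F_NEW if key not in self._d or self._d[key] & F_NEW else 0
        self._d[key] = (flags & 0x00FFFFFF) | new  # user bits from the caller, system bits managed here

    def iteritems(self, only_new=False):
        return [(k, f) for k, f in self._d.items() if not only_new or f & F_NEW]

    def clear_new(self):
        self._d = {k: f & ~F_NEW for k, f in self._d.items()}

def write_incremental(index, store):
    """Serialize only the F_NEW keys, name the object by content hash, then clear F_NEW."""
    data = b"".join(sorted(k for k, _ in index.iteritems(only_new=True)))
    name = "cache/chunks." + hashlib.sha256(data).hexdigest()[:16]
    store[name] = data   # upload one new "part" of the chunk index
    index.clear_new()    # everything present locally is now also cached remotely
    return name

store = {}
idx = TinyChunkIndex()
idx[b"chunk-id-1"] = F_USED
idx[b"chunk-id-2"] = F_USED
print(write_incremental(idx, store))  # writes a part containing 2 keys
idx[b"chunk-id-3"] = F_USED
print(write_incremental(idx, store))  # writes a part containing only the 1 new key
print(len(store))                     # 2 cached parts that a reader would merge
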
@@ -58,13 +58,8 @@ def get_repository_chunks(self) -> ChunkIndex:
        return chunks

    def save_chunk_index(self):
        # first clean up:
        for id, entry in self.chunks.iteritems():
            # we already deleted the unused chunks, so everything left must be used:
            assert entry.flags & ChunkIndex.F_USED
            # as we put the wrong size in there, we need to clean up the size:
            self.chunks[id] = entry._replace(size=0)
        # now self.chunks is an uptodate ChunkIndex, usable for general borg usage!
        # write_chunkindex_to_repo now removes all flags and size infos.
        # we need this, as we put the wrong size in there.
        write_chunkindex_to_repo_cache(self.repository, self.chunks, clear=True, force_write=True, delete_other=True)
        self.chunks = None  # nothing there (cleared!)

@@ -396,9 +396,7 @@ def compress_entry(self, entry):
        assert isinstance(entry, FileCacheEntry)
        compressed_chunks = []
        for id, size in entry.chunks:
            cie = self.chunks.get(id)
            assert cie is not None
            assert cie.flags & ChunkIndex.F_USED
            cie = self.chunks[id]  # may raise KeyError if chunk id is not in repo
            if cie.size == 0:  # size is not known in the chunks index yet
                self.chunks[id] = cie._replace(size=size)
            else:

@@ -418,9 +416,7 @@ def decompress_entry(self, entry_packed):
        for idx in entry.chunks:
            assert isinstance(idx, int), f"{idx} is not an int"
            id = self.chunks.idx_to_k(idx)
            cie = self.chunks.get(id)
            assert cie is not None
            assert cie.flags & ChunkIndex.F_USED
            cie = self.chunks[id]
            assert cie.size > 0
            chunks.append((id, cie.size))
        entry = entry._replace(chunks=chunks)

@@ -485,6 +481,7 @@ def _build_files_cache(self):
                    mtime=int_to_timestamp(mtime_ns),
                    chunks=item.chunks,
                )
                # note: if the repo is in a valid state, next line should not fail with KeyError:
                files[path_hash] = self.compress_entry(entry)
        # deal with special snapshot / timestamp granularity case, see FAQ:
        for path_hash in self._newest_path_hashes:

@@ -529,7 +526,11 @@ def _read_files_cache(self):
                for path_hash, entry in u:
                    entry = FileCacheEntry(*entry)
                    entry = entry._replace(age=entry.age + 1)
                    files[path_hash] = self.compress_entry(entry)
                    try:
                        files[path_hash] = self.compress_entry(entry)
                    except KeyError:
                        # repo is missing a chunk referenced from entry
                        logger.debug(f"compress_entry failed for {entry}, skipping.")
            except (TypeError, ValueError) as exc:
                msg = "The files cache seems invalid. [%s]" % str(exc)
                break

@@ -706,14 +707,23 @@ def delete_chunkindex_cache(repository):
def write_chunkindex_to_repo_cache(
    repository, chunks, *, clear=False, force_write=False, delete_other=False, delete_these=None
):
    cached_hashes = list_chunkindex_hashes(repository)
    # the borghash code has no means to only serialize the F_NEW table entries,
    # thus we copy only the new entries to a temporary table:
    new_chunks = ChunkIndex()
    # for now, we don't want to serialize the flags or the size, just the keys (chunk IDs):
    cleaned_value = ChunkIndexEntry(flags=ChunkIndex.F_NONE, size=0)
    for key, _ in chunks.iteritems(only_new=True):
        new_chunks[key] = cleaned_value
    with io.BytesIO() as f:
        chunks.write(f)
        new_chunks.write(f)
        data = f.getvalue()
    logger.debug(f"caching {len(new_chunks)} new chunks.")
    new_chunks.clear()  # free memory of the temporary table
    if clear:
        # if we don't need the in-memory chunks index anymore:
        chunks.clear()  # free memory, immediately
    new_hash = bin_to_hex(xxh64(data, seed=CHUNKINDEX_HASH_SEED))
    cached_hashes = list_chunkindex_hashes(repository)
    if force_write or new_hash not in cached_hashes:
        # when an updated chunks index is stored into the cache, we also store its hash as part of the name.
        # when a client is loading the chunks index from a cache, it has to compare its xxh64

@@ -725,12 +735,15 @@ def write_chunkindex_to_repo_cache(
        cache_name = f"cache/chunks.{new_hash}"
        logger.debug(f"caching chunks index as {cache_name} in repository...")
        repository.store_store(cache_name, data)
        # we have successfully stored to the repository, so we can clear all F_NEW flags now:
        chunks.clear_new()
    # delete some not needed cached chunk indexes, but never the one we just wrote:
    if delete_other:
        delete_these = cached_hashes
        delete_these = set(cached_hashes) - {new_hash}
    elif delete_these:
        pass
        delete_these = set(delete_these) - {new_hash}
    else:
        delete_these = []
        delete_these = set()
    for hash in delete_these:
        cache_name = f"cache/chunks.{hash}"
        try:

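The pruning logic above boils down to set arithmetic on the hash-named cache objects: build the set of names to delete, always excluding the name that was just written. The following is a hedged, self-contained illustration of that decision, not borg's function; the helper name and its arguments are hypothetical.

def names_to_delete(cached_hashes, new_hash, *, delete_other=False, delete_these=None):
    """Return which cached chunk-index parts to remove, never the one just written."""
    if delete_other:
        doomed = set(cached_hashes)   # replace all previously cached parts
    elif delete_these:
        doomed = set(delete_these)    # caller decided which parts are superseded
    else:
        doomed = set()                # incremental write: keep existing parts
    return doomed - {new_hash}

# after a full rewrite, old parts go away but the fresh part survives:
assert names_to_delete(["a", "b", "new"], "new", delete_other=True) == {"a", "b"}
# after an incremental write, nothing is deleted:
assert names_to_delete(["a", "b"], "new") == set()
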
@@ -783,6 +796,8 @@ def build_chunkindex_from_repo(repository, *, disable_caches=False, cache_immedi
            write_chunkindex_to_repo_cache(
                repository, chunks, clear=False, force_write=True, delete_these=hashes
            )
        else:
            chunks.clear_new()
        return chunks
    # if we didn't get anything from the cache, compute the ChunkIndex the slow way:
    logger.debug("querying the chunk IDs list from the repo...")

@@ -818,6 +833,8 @@ def __init__(self):
        self._chunks = None
        self.last_refresh_dt = datetime.now(timezone.utc)
        self.refresh_td = timedelta(seconds=60)
        self.chunks_cache_last_write = datetime.now(timezone.utc)
        self.chunks_cache_write_td = timedelta(seconds=600)

    @property
    def chunks(self):

@@ -864,6 +881,7 @@ def add_chunk(
        else:
            raise ValueError("when giving compressed data for a chunk, the uncompressed size must be given also")
        now = datetime.now(timezone.utc)
        self._maybe_write_chunks_cache(now)
        exists = self.seen_chunk(id, size)
        if exists:
            # if borg create is processing lots of unchanged files (no content and not metadata changes),

@@ -879,10 +897,10 @@ def add_chunk(
        stats.update(size, not exists)
        return ChunkListEntry(id, size)

    def _write_chunks_cache(self, chunks):
        # this is called from .close, so we can clear here:
        write_chunkindex_to_repo_cache(self.repository, self._chunks, clear=True)
        self._chunks = None  # nothing there (cleared!)
    def _maybe_write_chunks_cache(self, now, force=False, clear=False):
        if force or now > self.chunks_cache_last_write + self.chunks_cache_write_td:
            write_chunkindex_to_repo_cache(self.repository, self._chunks, clear=clear)
            self.chunks_cache_last_write = now

    def refresh_lock(self, now):
        if now > self.last_refresh_dt + self.refresh_td:

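The throttling in _maybe_write_chunks_cache is a plain timestamp comparison: flush at most once per interval unless forced. Below is a minimal sketch of that pattern only, with a hypothetical class name, a placeholder flush callable, and the 600 s interval taken from the diff.

from datetime import datetime, timedelta, timezone

class ThrottledFlusher:
    def __init__(self, flush, interval=timedelta(seconds=600)):
        self.flush = flush                 # callable that persists the cache
        self.interval = interval
        self.last_write = datetime.now(timezone.utc)

    def maybe_flush(self, now=None, force=False):
        now = now or datetime.now(timezone.utc)
        if force or now > self.last_write + self.interval:
            self.flush()
            self.last_write = now
            return True
        return False

f = ThrottledFlusher(lambda: None)
assert not f.maybe_flush()                                        # too soon, nothing written
assert f.maybe_flush(now=f.last_write + timedelta(seconds=601))   # interval elapsed, flushed
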
@@ -980,7 +998,9 @@ def close(self):
            for key, value in sorted(self._chunks.stats.items()):
                logger.debug(f"Chunks index stats: {key}: {value}")
            pi.output("Saving chunks cache")
            self._write_chunks_cache(self._chunks)  # cache/chunks in repo has a different integrity mechanism
            # note: cache/chunks.* in repo has a different integrity mechanism
            self._maybe_write_chunks_cache(self._chunks, force=True, clear=True)
            self._chunks = None  # nothing there (cleared!)
        pi.output("Saving cache config")
        self.cache_config.save(self.manifest)
        self.cache_config.close()

@@ -13,8 +13,12 @@ CIE = Union[Tuple[int, int], Type[ChunkIndexEntry]]
class ChunkIndex:
    F_NONE: int
    F_USED: int
    F_NEW: int
    M_USER: int
    M_SYSTEM: int
    def add(self, key: bytes, size: int) -> None: ...
    def iteritems(self, marker: bytes = ...) -> Iterator: ...
    def iteritems(self, *, only_new: bool = ...) -> Iterator: ...
    def clear_new(self) -> None: ...
    def __contains__(self, key: bytes) -> bool: ...
    def __getitem__(self, key: bytes) -> Type[ChunkIndexEntry]: ...
    def __setitem__(self, key: bytes, value: CIE) -> None: ...

@@ -39,11 +39,16 @@ ChunkIndexEntry = namedtuple('ChunkIndexEntry', 'flags size')

class ChunkIndex(HTProxyMixin, MutableMapping):
    """
    Mapping from key256 to (refcount32, size32) to track chunks in the repository.
    Mapping from key256 to (flags32, size32) to track chunks in the repository.
    """
    # .flags values: 2^0 .. 2^31
    # .flags related values:
    F_NONE = 0  # all flags cleared
    F_USED = 1  # chunk is used/referenced
    M_USER = 0x00ffffff  # mask for user flags
    M_SYSTEM = 0xff000000  # mask for system flags
    # user flags:
    F_USED = 2 ** 0  # chunk is used/referenced
    # system flags (internal use, always 0 to user, not changeable by user):
    F_NEW = 2 ** 24  # a new chunk that is not present in repo/cache/chunks.* yet.

    def __init__(self, capacity=1000, path=None, usable=None):
        if path:

@@ -53,8 +58,15 @@ class ChunkIndex(HTProxyMixin, MutableMapping):
            capacity = usable * 2  # load factor 0.5
        self.ht = HashTableNT(key_size=32, value_format="<II", value_type=ChunkIndexEntry, capacity=capacity)

    def iteritems(self):
        yield from self.ht.items()
    def hide_system_flags(self, value):
        user_flags = value.flags & self.M_USER
        return value._replace(flags=user_flags)

    def iteritems(self, *, only_new=False):
        """iterate items (optionally only new items), hide system flags."""
        for key, value in self.ht.items():
            if not only_new or (value.flags & self.F_NEW):
                yield key, self.hide_system_flags(value)

    def add(self, key, size):
        v = self.get(key)

@@ -65,6 +77,36 @@ class ChunkIndex(HTProxyMixin, MutableMapping):
        assert v.size == 0 or v.size == size
        self[key] = ChunkIndexEntry(flags=flags, size=size)

    def __getitem__(self, key):
        """specialized __getitem__ that hides system flags."""
        value = self.ht[key]
        return self.hide_system_flags(value)

    def __setitem__(self, key, value):
        """specialized __setitem__ that protects system flags, manages F_NEW flag."""
        try:
            prev = self.ht[key]
        except KeyError:
            prev_flags = self.F_NONE
            is_new = True
        else:
            prev_flags = prev.flags
            is_new = bool(prev_flags & self.F_NEW)  # was new? stays new!
        system_flags = prev_flags & self.M_SYSTEM
        if is_new:
            system_flags |= self.F_NEW
        else:
            system_flags &= ~self.F_NEW
        user_flags = value.flags & self.M_USER
        self.ht[key] = value._replace(flags=system_flags | user_flags)

    def clear_new(self):
        """clear F_NEW flag of all items"""
        for key, value in self.ht.items():
            if value.flags & self.F_NEW:
                flags = value.flags & ~self.F_NEW
                self.ht[key] = value._replace(flags=flags)

    @classmethod
    def read(cls, path):
        return cls(path=path)

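The 32-bit flags word is partitioned by the two masks: the low 24 bits (M_USER = 0x00FFFFFF) belong to the caller, the high 8 bits (M_SYSTEM = 0xFF000000) are reserved for internal bookkeeping such as F_NEW. A quick, hedged demonstration of the masking arithmetic that __setitem__ and hide_system_flags rely on, using plain integers rather than the real class:

M_USER, M_SYSTEM = 0x00FFFFFF, 0xFF000000
F_USED, F_NEW = 1 << 0, 1 << 24

stored = F_NEW | F_USED             # entry is used and not yet cached in the repo
visible = stored & M_USER           # what a read shows the caller
assert visible == F_USED

# the caller writes new user flags; system bits from the previous value survive:
caller_value = 0xDEADBEEF & M_USER  # anything the caller puts in the high byte is ignored
updated = (stored & M_SYSTEM) | caller_value
assert updated & F_NEW              # still marked as new
assert updated & M_SYSTEM == F_NEW
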
@@ -35,4 +35,23 @@ def test_keyerror():
    with pytest.raises(KeyError):
        chunks[x]
    with pytest.raises(struct.error):
        chunks[x] = ChunkIndexEntry(flags=2**33, size=0)
        chunks[x] = ChunkIndexEntry(flags=ChunkIndex.F_NONE, size=2**33)


def test_new():
    def new_chunks():
        return list(chunks.iteritems(only_new=True))

    chunks = ChunkIndex()
    key1, value1a = H2(1), ChunkIndexEntry(flags=ChunkIndex.F_USED, size=23)
    key2, value2a = H2(2), ChunkIndexEntry(flags=ChunkIndex.F_USED, size=42)
    # tracking of new entries
    assert new_chunks() == []
    chunks[key1] = value1a
    assert new_chunks() == [(key1, value1a)]
    chunks.clear_new()
    assert new_chunks() == []
    chunks[key2] = value2a
    assert new_chunks() == [(key2, value2a)]
    chunks.clear_new()
    assert new_chunks() == []