1
0
Fork 0
mirror of https://github.com/borgbackup/borg.git synced 2024-12-21 23:33:07 +00:00

ChunkIndex: enable partial index updates

- ChunkIndex: implement system flags
- ChunkIndex: F_NEW flag as 1st system flag for newly added chunks
- incrementally write only NEW chunks to repo/cache/chunks.*
- merge all chunks.* when loading the ChunkIndex from the repo

Also: the cached ChunkIndex only has the chunk IDs. All values are just dummies.
The ChunkIndexEntry value can be used to set flags and track size, but we
intentionally do not persist flags and size to the cache.

The size information gets set when borg loads the files cache and "compresses"
the chunks lists in the files cache entries. After that, all chunks referenced
by the files cache will have a valid size as long as the ChunkIndex is in memory.
This is needed so that "uncompress" can work.
This commit is contained in:
Thomas Waldmann 2024-11-13 01:16:55 +01:00
parent 4ed03c17e6
commit d5d49e8a15
No known key found for this signature in database
GPG key ID: 243ACFA951F78E01
5 changed files with 101 additions and 26 deletions

View file

@ -58,13 +58,8 @@ def get_repository_chunks(self) -> ChunkIndex:
return chunks
def save_chunk_index(self):
# first clean up:
for id, entry in self.chunks.iteritems():
# we already deleted the unused chunks, so everything left must be used:
assert entry.flags & ChunkIndex.F_USED
# as we put the wrong size in there, we need to clean up the size:
self.chunks[id] = entry._replace(size=0)
# now self.chunks is an uptodate ChunkIndex, usable for general borg usage!
# write_chunkindex_to_repo now removes all flags and size infos.
# we need this, as we put the wrong size in there.
write_chunkindex_to_repo_cache(self.repository, self.chunks, clear=True, force_write=True, delete_other=True)
self.chunks = None # nothing there (cleared!)

View file

@ -396,9 +396,7 @@ def compress_entry(self, entry):
assert isinstance(entry, FileCacheEntry)
compressed_chunks = []
for id, size in entry.chunks:
cie = self.chunks.get(id)
assert cie is not None
assert cie.flags & ChunkIndex.F_USED
cie = self.chunks[id] # may raise KeyError if chunk id is not in repo
if cie.size == 0: # size is not known in the chunks index yet
self.chunks[id] = cie._replace(size=size)
else:
@ -418,9 +416,7 @@ def decompress_entry(self, entry_packed):
for idx in entry.chunks:
assert isinstance(idx, int), f"{idx} is not an int"
id = self.chunks.idx_to_k(idx)
cie = self.chunks.get(id)
assert cie is not None
assert cie.flags & ChunkIndex.F_USED
cie = self.chunks[id]
assert cie.size > 0
chunks.append((id, cie.size))
entry = entry._replace(chunks=chunks)
@ -485,6 +481,7 @@ def _build_files_cache(self):
mtime=int_to_timestamp(mtime_ns),
chunks=item.chunks,
)
# note: if the repo is in a valid state, the next line should not fail with KeyError:
files[path_hash] = self.compress_entry(entry)
# deal with special snapshot / timestamp granularity case, see FAQ:
for path_hash in self._newest_path_hashes:
@ -529,7 +526,11 @@ def _read_files_cache(self):
for path_hash, entry in u:
entry = FileCacheEntry(*entry)
entry = entry._replace(age=entry.age + 1)
files[path_hash] = self.compress_entry(entry)
try:
files[path_hash] = self.compress_entry(entry)
except KeyError:
# repo is missing a chunk referenced from entry
logger.debug(f"compress_entry failed for {entry}, skipping.")
except (TypeError, ValueError) as exc:
msg = "The files cache seems invalid. [%s]" % str(exc)
break
@ -706,14 +707,23 @@ def delete_chunkindex_cache(repository):
def write_chunkindex_to_repo_cache(
repository, chunks, *, clear=False, force_write=False, delete_other=False, delete_these=None
):
cached_hashes = list_chunkindex_hashes(repository)
# the borghash code has no means to only serialize the F_NEW table entries,
# thus we copy only the new entries to a temporary table:
new_chunks = ChunkIndex()
# for now, we don't want to serialize the flags or the size, just the keys (chunk IDs):
cleaned_value = ChunkIndexEntry(flags=ChunkIndex.F_NONE, size=0)
for key, _ in chunks.iteritems(only_new=True):
new_chunks[key] = cleaned_value
with io.BytesIO() as f:
chunks.write(f)
new_chunks.write(f)
data = f.getvalue()
logger.debug(f"caching {len(new_chunks)} new chunks.")
new_chunks.clear() # free memory of the temporary table
if clear:
# if we don't need the in-memory chunks index anymore:
chunks.clear() # free memory, immediately
new_hash = bin_to_hex(xxh64(data, seed=CHUNKINDEX_HASH_SEED))
cached_hashes = list_chunkindex_hashes(repository)
if force_write or new_hash not in cached_hashes:
# when an updated chunks index is stored into the cache, we also store its hash as part of the name.
# when a client is loading the chunks index from a cache, it has to compare its xxh64
@ -725,12 +735,15 @@ def write_chunkindex_to_repo_cache(
cache_name = f"cache/chunks.{new_hash}"
logger.debug(f"caching chunks index as {cache_name} in repository...")
repository.store_store(cache_name, data)
# we have successfully stored to the repository, so we can clear all F_NEW flags now:
chunks.clear_new()
# delete some not needed cached chunk indexes, but never the one we just wrote:
if delete_other:
delete_these = cached_hashes
delete_these = set(cached_hashes) - {new_hash}
elif delete_these:
pass
delete_these = set(delete_these) - {new_hash}
else:
delete_these = []
delete_these = set()
for hash in delete_these:
cache_name = f"cache/chunks.{hash}"
try:
@ -783,6 +796,8 @@ def build_chunkindex_from_repo(repository, *, disable_caches=False, cache_immedi
write_chunkindex_to_repo_cache(
repository, chunks, clear=False, force_write=True, delete_these=hashes
)
else:
chunks.clear_new()
return chunks
# if we didn't get anything from the cache, compute the ChunkIndex the slow way:
logger.debug("querying the chunk IDs list from the repo...")

View file

@ -13,8 +13,12 @@ CIE = Union[Tuple[int, int], Type[ChunkIndexEntry]]
class ChunkIndex:
F_NONE: int
F_USED: int
F_NEW: int
M_USER: int
M_SYSTEM: int
def add(self, key: bytes, size: int) -> None: ...
def iteritems(self, marker: bytes = ...) -> Iterator: ...
def iteritems(self, *, only_new: bool = ...) -> Iterator: ...
def clear_new(self) -> None: ...
def __contains__(self, key: bytes) -> bool: ...
def __getitem__(self, key: bytes) -> Type[ChunkIndexEntry]: ...
def __setitem__(self, key: bytes, value: CIE) -> None: ...

View file

@ -39,11 +39,16 @@ ChunkIndexEntry = namedtuple('ChunkIndexEntry', 'flags size')
class ChunkIndex(HTProxyMixin, MutableMapping):
"""
Mapping from key256 to (refcount32, size32) to track chunks in the repository.
Mapping from key256 to (flags32, size32) to track chunks in the repository.
"""
# .flags values: 2^0 .. 2^31
# .flags related values:
F_NONE = 0 # all flags cleared
F_USED = 1 # chunk is used/referenced
M_USER = 0x00ffffff # mask for user flags
M_SYSTEM = 0xff000000 # mask for system flags
# user flags:
F_USED = 2 ** 0 # chunk is used/referenced
# system flags (internal use, always 0 to user, not changeable by user):
F_NEW = 2 ** 24 # a new chunk that is not present in repo/cache/chunks.* yet.
def __init__(self, capacity=1000, path=None, usable=None):
if path:
@ -53,8 +58,15 @@ class ChunkIndex(HTProxyMixin, MutableMapping):
capacity = usable * 2 # load factor 0.5
self.ht = HashTableNT(key_size=32, value_format="<II", value_type=ChunkIndexEntry, capacity=capacity)
def iteritems(self):
yield from self.ht.items()
def hide_system_flags(self, value):
    """return a copy of value whose flags only contain the bits selected by M_USER."""
    return value._replace(flags=value.flags & self.M_USER)
def iteritems(self, *, only_new=False):
    """
    yield (key, value) pairs from the underlying table, with system flags hidden.

    if only_new is true, entries that do not have the F_NEW flag set are skipped.
    """
    for chunk_id, entry in self.ht.items():
        if only_new and not (entry.flags & self.F_NEW):
            continue  # caller only wants new entries and this one is not new
        yield chunk_id, self.hide_system_flags(entry)
def add(self, key, size):
v = self.get(key)
@ -65,6 +77,36 @@ class ChunkIndex(HTProxyMixin, MutableMapping):
assert v.size == 0 or v.size == size
self[key] = ChunkIndexEntry(flags=flags, size=size)
def __getitem__(self, key):
    """look up key in the underlying table and return its value with system flags hidden."""
    # a missing key propagates whatever the table lookup raises
    return self.hide_system_flags(self.ht[key])
def __setitem__(self, key, value):
    """
    store value under key; system flags are taken from the stored entry, not from value.

    - a key that is not in the table yet gets the F_NEW flag set.
    - a key that is already flagged F_NEW keeps that flag.
    - only the M_USER bits of value.flags are taken over from the caller.
    """
    try:
        stored = self.ht[key]
    except KeyError:
        # first time we see this key: no previous flags, entry is new
        stored_flags = self.F_NONE
        keep_new = True
    else:
        stored_flags = stored.flags
        keep_new = (stored_flags & self.F_NEW) != 0  # was new? stays new!
    # system bits of the stored entry, with F_NEW decided separately below
    system_flags = stored_flags & self.M_SYSTEM & ~self.F_NEW
    if keep_new:
        system_flags |= self.F_NEW
    user_flags = value.flags & self.M_USER
    self.ht[key] = value._replace(flags=system_flags | user_flags)
def clear_new(self):
    """remove the F_NEW flag from every entry that currently has it set."""
    keep_mask = ~self.F_NEW
    for chunk_id, entry in self.ht.items():
        if not (entry.flags & self.F_NEW):
            continue  # not flagged as new, leave the entry untouched
        self.ht[chunk_id] = entry._replace(flags=entry.flags & keep_mask)
@classmethod
def read(cls, path):
return cls(path=path)

View file

@ -35,4 +35,23 @@ def test_keyerror():
with pytest.raises(KeyError):
chunks[x]
with pytest.raises(struct.error):
chunks[x] = ChunkIndexEntry(flags=2**33, size=0)
chunks[x] = ChunkIndexEntry(flags=ChunkIndex.F_NONE, size=2**33)
def test_new():
    """check that only_new iteration tracks entries added since the last clear_new()."""
    chunks = ChunkIndex()

    def pending():
        return list(chunks.iteritems(only_new=True))

    first = (H2(1), ChunkIndexEntry(flags=ChunkIndex.F_USED, size=23))
    second = (H2(2), ChunkIndexEntry(flags=ChunkIndex.F_USED, size=42))
    # tracking of new entries: a fresh index has none
    assert pending() == []
    for key, value in (first, second):
        chunks[key] = value
        # only the entry added since the last clear_new() is reported
        assert pending() == [(key, value)]
        chunks.clear_new()
        assert pending() == []