files cache: add chunk size information

The files cache used to store only the chunk ids, so it had to
rely on the chunks index for the size information - which is
problematic with e.g. the AdHocCache (it has size==0 for all
chunks that are not new) and blocked using the files cache there.
Thomas Waldmann 2023-09-19 22:47:15 +02:00
parent 411c763fb8
commit c5e130d03d
2 changed files with 20 additions and 17 deletions
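
To illustrate the change, here is a minimal sketch (not borg's actual code; the namedtuple layouts follow the diff below, and ChunkListEntry is assumed to be a plain (id, size) pair):

from collections import namedtuple

# assumed shape of a chunk list entry, as used by the diff below
ChunkListEntry = namedtuple("ChunkListEntry", "id size")

# before: only chunk ids were memorized, so chunk sizes had to come from the
# chunks index - which the AdHocCache fills with size == 0 for chunks it did
# not add itself.
OldFileCacheEntry = namedtuple("FileCacheEntry", "age inode size cmtime chunk_ids")

# after: full (id, size) entries are memorized, so no index lookup is needed.
NewFileCacheEntry = namedtuple("FileCacheEntry", "age inode size cmtime chunks")

old = OldFileCacheEntry(age=0, inode=1234, size=4096, cmtime=0, chunk_ids=[b"id0", b"id1"])
new = NewFileCacheEntry(
    age=0, inode=1234, size=4096, cmtime=0,
    chunks=[ChunkListEntry(b"id0", 1024), ChunkListEntry(b"id1", 3072)],
)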


@@ -1552,25 +1552,27 @@ class FilesystemObjectProcessors:
                             started_hashing = time.monotonic()
                             path_hash = self.key.id_hash(hashed_path)
                             self.stats.hashing_time += time.monotonic() - started_hashing
-                            known, ids = cache.file_known_and_unchanged(hashed_path, path_hash, st)
+                            known, chunks = cache.file_known_and_unchanged(hashed_path, path_hash, st)
                         else:
                             # in --read-special mode, we may be called for special files.
                             # there should be no information in the cache about special files processed in
                             # read-special mode, but we better play safe as this was wrong in the past:
                             hashed_path = path_hash = None
-                            known, ids = False, None
-                        if ids is not None:
+                            known, chunks = False, None
+                        if chunks is not None:
                             # Make sure all ids are available
-                            for id_ in ids:
-                                if not cache.seen_chunk(id_):
+                            for chunk in chunks:
+                                if not cache.seen_chunk(chunk.id):
                                     # cache said it is unmodified, but we lost a chunk: process file like modified
                                     status = "M"
                                     break
                             else:
                                 item.chunks = []
-                                for chunk_id in ids:
+                                for chunk in chunks:
                                     # process one-by-one, so we will know in item.chunks how far we got
-                                    chunk_entry = cache.chunk_incref(chunk_id, self.stats)
+                                    chunk_entry = cache.chunk_incref(chunk.id, self.stats)
+                                    # chunk.size is from files cache, chunk_entry.size from index:
+                                    assert chunk == chunk_entry
                                     item.chunks.append(chunk_entry)
                                 status = "U"  # regular file, unchanged
                         else:
@@ -1606,7 +1608,7 @@ class FilesystemObjectProcessors:
                             # block or char device will change without its mtime/size/inode changing.
                             # also, we must not memorize a potentially inconsistent/corrupt file that
                             # changed while we backed it up.
-                            cache.memorize_file(hashed_path, path_hash, st, [c.id for c in item.chunks])
+                            cache.memorize_file(hashed_path, path_hash, st, item.chunks)
                         self.stats.files_stats[status] += 1  # must be done late
                         if not changed_while_backup:
                             status = None  # we already called print_file_status

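A side note on the assert added above, as a sketch under the assumption that ChunkListEntry is a plain (id, size) namedtuple: namedtuple equality compares all fields, so comparing the entry from the files cache with the entry returned by chunk_incref cross-checks both the chunk id and the size recorded in the chunks index.

from collections import namedtuple

ChunkListEntry = namedtuple("ChunkListEntry", "id size")

from_files_cache = ChunkListEntry(id=b"\x01" * 32, size=4096)   # size as memorized
from_chunks_index = ChunkListEntry(id=b"\x01" * 32, size=4096)  # size as indexed
assert from_files_cache == from_chunks_index  # id and size agree -> check passes

mismatch = ChunkListEntry(id=b"\x01" * 32, size=0)  # e.g. an unknown/stale size
assert from_files_cache != mismatch  # any field difference is detected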

@@ -35,8 +35,8 @@ from .platform import SaveFile
 from .remote import cache_if_remote
 from .repository import LIST_SCAN_LIMIT
 
-# note: cmtime might me either a ctime or a mtime timestamp
-FileCacheEntry = namedtuple("FileCacheEntry", "age inode size cmtime chunk_ids")
+# note: cmtime might be either a ctime or a mtime timestamp, chunks is a list of ChunkListEntry
+FileCacheEntry = namedtuple("FileCacheEntry", "age inode size cmtime chunks")
 
 
 class SecurityManager:
@@ -1016,8 +1016,8 @@ class LocalCache(CacheStatsMixin):
         :param hashed_path: the file's path as we gave it to hash(hashed_path)
         :param path_hash: hash(hashed_path), to save some memory in the files cache
         :param st: the file's stat() result
-        :return: known, ids (known is True if we have infos about this file in the cache,
-                             ids is the list of chunk ids IF the file has not changed, otherwise None).
+        :return: known, chunks (known is True if we have infos about this file in the cache,
+                                chunks is a list[ChunkListEntry] IF the file has not changed, otherwise None).
         """
         if not stat.S_ISREG(st.st_mode):
             return False, None
@@ -1058,9 +1058,10 @@
             # again at that time), we need to update the inode number in the cache with what
             # we see in the filesystem.
             self.files[path_hash] = msgpack.packb(entry._replace(inode=st.st_ino, age=0))
-        return True, entry.chunk_ids
+        chunks = [ChunkListEntry(*chunk) for chunk in entry.chunks]  # convert to list of namedtuple
+        return True, chunks
 
-    def memorize_file(self, hashed_path, path_hash, st, ids):
+    def memorize_file(self, hashed_path, path_hash, st, chunks):
         if not stat.S_ISREG(st.st_mode):
             return
         cache_mode = self.cache_mode
@@ -1078,13 +1079,13 @@
             cmtime_type = "ctime"
             cmtime_ns = safe_ns(st.st_ctime_ns)
         entry = FileCacheEntry(
-            age=0, inode=st.st_ino, size=st.st_size, cmtime=int_to_timestamp(cmtime_ns), chunk_ids=ids
+            age=0, inode=st.st_ino, size=st.st_size, cmtime=int_to_timestamp(cmtime_ns), chunks=chunks
         )
         self.files[path_hash] = msgpack.packb(entry)
         self._newest_cmtime = max(self._newest_cmtime or 0, cmtime_ns)
         files_cache_logger.debug(
             "FILES-CACHE-UPDATE: put %r [has %s] <- %r",
-            entry._replace(chunk_ids="[%d entries]" % len(entry.chunk_ids)),
+            entry._replace(chunks="[%d entries]" % len(entry.chunks)),
             cmtime_type,
             hashed_path,
         )
@@ -1135,7 +1136,7 @@ Chunk index: {0.total_unique_chunks:20d} unknown"""
         files_cache_logger.debug("UNKNOWN: files cache not implemented")
         return False, None
 
-    def memorize_file(self, hashed_path, path_hash, st, ids):
+    def memorize_file(self, hashed_path, path_hash, st, chunks):
         pass
 
     def add_chunk(self, id, meta, data, *, stats, wait=True, compress=True, size=None, ro_type=ROBJ_FILE_STREAM):
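
Why file_known_and_unchanged rebuilds ChunkListEntry objects after reading the files cache: FileCacheEntry values are stored msgpack-serialized (see the packb calls above), and msgpack gives back the nested chunk entries as plain sequences, not namedtuples. A rough sketch of the round trip, not borg's actual cache code (it uses an ordinary int where borg uses its timestamp type):

import msgpack  # the same serialization used by the cache code above
from collections import namedtuple

ChunkListEntry = namedtuple("ChunkListEntry", "id size")
FileCacheEntry = namedtuple("FileCacheEntry", "age inode size cmtime chunks")

entry = FileCacheEntry(
    age=0, inode=42, size=4096, cmtime=1695156435000000000,  # plain int for the sketch
    chunks=[ChunkListEntry(b"\x01" * 32, 4096)],
)
packed = msgpack.packb(entry)  # namedtuples are packed like plain arrays

unpacked = FileCacheEntry(*msgpack.unpackb(packed))
raw = unpacked.chunks[0]  # just a plain sequence now, no .id / .size attributes
chunks = [ChunkListEntry(*chunk) for chunk in unpacked.chunks]  # re-wrap as namedtuples
assert chunks[0].id == b"\x01" * 32 and chunks[0].size == 4096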