files cache: add chunk size information

The files cache used to store only the chunk ids, so it had to
rely on the chunks index for the size information - which is
problematic with e.g. the AdHocCache (it has size==0 for all
chunks that are not new) and blocked using the files cache there.
Thomas Waldmann 2023-09-19 22:47:15 +02:00
parent 411c763fb8
commit c5e130d03d
2 changed files with 20 additions and 17 deletions
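
To illustrate the change, here is a minimal sketch (not borg's actual code; the namedtuple layouts follow the diff below, and ChunkListEntry is assumed to be a plain (id, size) pair):

from collections import namedtuple

# assumed shape of a chunk list entry, as used by the diff below
ChunkListEntry = namedtuple("ChunkListEntry", "id size")

# before: only chunk ids were memorized, so chunk sizes had to come from the
# chunks index - which the AdHocCache fills with size == 0 for chunks it did
# not add itself.
OldFileCacheEntry = namedtuple("FileCacheEntry", "age inode size cmtime chunk_ids")

# after: full (id, size) entries are memorized, so no index lookup is needed.
NewFileCacheEntry = namedtuple("FileCacheEntry", "age inode size cmtime chunks")

old = OldFileCacheEntry(age=0, inode=1234, size=4096, cmtime=0, chunk_ids=[b"id0", b"id1"])
new = NewFileCacheEntry(
    age=0, inode=1234, size=4096, cmtime=0,
    chunks=[ChunkListEntry(b"id0", 1024), ChunkListEntry(b"id1", 3072)],
)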


@@ -1552,25 +1552,27 @@ class FilesystemObjectProcessors:
                             started_hashing = time.monotonic()
                             path_hash = self.key.id_hash(hashed_path)
                             self.stats.hashing_time += time.monotonic() - started_hashing
-                            known, ids = cache.file_known_and_unchanged(hashed_path, path_hash, st)
+                            known, chunks = cache.file_known_and_unchanged(hashed_path, path_hash, st)
                         else:
                             # in --read-special mode, we may be called for special files.
                             # there should be no information in the cache about special files processed in
                             # read-special mode, but we better play safe as this was wrong in the past:
                             hashed_path = path_hash = None
-                            known, ids = False, None
-                        if ids is not None:
+                            known, chunks = False, None
+                        if chunks is not None:
                             # Make sure all ids are available
-                            for id_ in ids:
-                                if not cache.seen_chunk(id_):
+                            for chunk in chunks:
+                                if not cache.seen_chunk(chunk.id):
                                     # cache said it is unmodified, but we lost a chunk: process file like modified
                                     status = "M"
                                     break
                             else:
                                 item.chunks = []
-                                for chunk_id in ids:
+                                for chunk in chunks:
                                     # process one-by-one, so we will know in item.chunks how far we got
-                                    chunk_entry = cache.chunk_incref(chunk_id, self.stats)
+                                    chunk_entry = cache.chunk_incref(chunk.id, self.stats)
+                                    # chunk.size is from files cache, chunk_entry.size from index:
+                                    assert chunk == chunk_entry
                                     item.chunks.append(chunk_entry)
                                 status = "U"  # regular file, unchanged
                         else:
@@ -1606,7 +1608,7 @@ class FilesystemObjectProcessors:
                             # block or char device will change without its mtime/size/inode changing.
                             # also, we must not memorize a potentially inconsistent/corrupt file that
                             # changed while we backed it up.
-                            cache.memorize_file(hashed_path, path_hash, st, [c.id for c in item.chunks])
+                            cache.memorize_file(hashed_path, path_hash, st, item.chunks)
                         self.stats.files_stats[status] += 1  # must be done late
                         if not changed_while_backup:
                             status = None  # we already called print_file_status

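A side note on the assert added above, as a sketch under the assumption that ChunkListEntry is a plain (id, size) namedtuple: namedtuple equality compares all fields, so comparing the entry from the files cache with the entry returned by chunk_incref cross-checks both the chunk id and the size recorded in the chunks index.

from collections import namedtuple

ChunkListEntry = namedtuple("ChunkListEntry", "id size")

from_files_cache = ChunkListEntry(id=b"\x01" * 32, size=4096)   # size as memorized
from_chunks_index = ChunkListEntry(id=b"\x01" * 32, size=4096)  # size as indexed
assert from_files_cache == from_chunks_index  # id and size agree -> check passes

mismatch = ChunkListEntry(id=b"\x01" * 32, size=0)  # e.g. an unknown/stale size
assert from_files_cache != mismatch  # any field difference is detected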

@@ -35,8 +35,8 @@ from .platform import SaveFile
 from .remote import cache_if_remote
 from .repository import LIST_SCAN_LIMIT
 
-# note: cmtime might me either a ctime or a mtime timestamp
-FileCacheEntry = namedtuple("FileCacheEntry", "age inode size cmtime chunk_ids")
+# note: cmtime might be either a ctime or a mtime timestamp, chunks is a list of ChunkListEntry
+FileCacheEntry = namedtuple("FileCacheEntry", "age inode size cmtime chunks")
 
 
 class SecurityManager:
@@ -1016,8 +1016,8 @@ class LocalCache(CacheStatsMixin):
         :param hashed_path: the file's path as we gave it to hash(hashed_path)
         :param path_hash: hash(hashed_path), to save some memory in the files cache
         :param st: the file's stat() result
-        :return: known, ids (known is True if we have infos about this file in the cache,
-                             ids is the list of chunk ids IF the file has not changed, otherwise None).
+        :return: known, chunks (known is True if we have infos about this file in the cache,
+                                chunks is a list[ChunkListEntry] IF the file has not changed, otherwise None).
         """
         if not stat.S_ISREG(st.st_mode):
             return False, None
@@ -1058,9 +1058,10 @@
             # again at that time), we need to update the inode number in the cache with what
             # we see in the filesystem.
             self.files[path_hash] = msgpack.packb(entry._replace(inode=st.st_ino, age=0))
-        return True, entry.chunk_ids
+        chunks = [ChunkListEntry(*chunk) for chunk in entry.chunks]  # convert to list of namedtuple
+        return True, chunks
 
-    def memorize_file(self, hashed_path, path_hash, st, ids):
+    def memorize_file(self, hashed_path, path_hash, st, chunks):
         if not stat.S_ISREG(st.st_mode):
             return
         cache_mode = self.cache_mode
@@ -1078,13 +1079,13 @@
             cmtime_type = "ctime"
             cmtime_ns = safe_ns(st.st_ctime_ns)
         entry = FileCacheEntry(
-            age=0, inode=st.st_ino, size=st.st_size, cmtime=int_to_timestamp(cmtime_ns), chunk_ids=ids
+            age=0, inode=st.st_ino, size=st.st_size, cmtime=int_to_timestamp(cmtime_ns), chunks=chunks
         )
         self.files[path_hash] = msgpack.packb(entry)
         self._newest_cmtime = max(self._newest_cmtime or 0, cmtime_ns)
         files_cache_logger.debug(
             "FILES-CACHE-UPDATE: put %r [has %s] <- %r",
-            entry._replace(chunk_ids="[%d entries]" % len(entry.chunk_ids)),
+            entry._replace(chunks="[%d entries]" % len(entry.chunks)),
             cmtime_type,
             hashed_path,
         )
@@ -1135,7 +1136,7 @@ Chunk index: {0.total_unique_chunks:20d} unknown"""
         files_cache_logger.debug("UNKNOWN: files cache not implemented")
         return False, None
 
-    def memorize_file(self, hashed_path, path_hash, st, ids):
+    def memorize_file(self, hashed_path, path_hash, st, chunks):
         pass
 
     def add_chunk(self, id, meta, data, *, stats, wait=True, compress=True, size=None, ro_type=ROBJ_FILE_STREAM):
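
Why file_known_and_unchanged rebuilds ChunkListEntry objects after reading the files cache: FileCacheEntry values are stored msgpack-serialized (see the packb calls above), and msgpack gives back the nested chunk entries as plain sequences, not namedtuples. A rough sketch of the round trip, not borg's actual cache code (it uses an ordinary int where borg uses its timestamp type):

import msgpack  # the same serialization used by the cache code above
from collections import namedtuple

ChunkListEntry = namedtuple("ChunkListEntry", "id size")
FileCacheEntry = namedtuple("FileCacheEntry", "age inode size cmtime chunks")

entry = FileCacheEntry(
    age=0, inode=42, size=4096, cmtime=1695156435000000000,  # plain int for the sketch
    chunks=[ChunkListEntry(b"\x01" * 32, 4096)],
)
packed = msgpack.packb(entry)  # namedtuples are packed like plain arrays

unpacked = FileCacheEntry(*msgpack.unpackb(packed))
raw = unpacked.chunks[0]  # just a plain sequence now, no .id / .size attributes
chunks = [ChunkListEntry(*chunk) for chunk in unpacked.chunks]  # re-wrap as namedtuples
assert chunks[0].id == b"\x01" * 32 and chunks[0].size == 4096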