fix borg create never showing M status

the problem was that the upper layer code did not have enough information about the file, whether it is known or not - and thus, could not decide correctly whether status should be M)odified or A)dded. now, file_known_and_unchanged method returns an additional "known" boolean to fix this. also: add comment about files cache loading in cache_mode='r'
2024-12-26 09:47:58 +00:00 · 2018-02-23 14:48:24 +01:00 · 2018-02-23 14:48:24 +01:00 · 4e0f369d0a
commit 4e0f369d0a
parent 1922ae3242
4 changed files with 32 additions and 15 deletions
--- a/src/borg/archive.py
+++ b/src/borg/archive.py
@ -1132,12 +1132,13 @@ def process_file(self, path, st, cache, ignore_inode=False, files_cache_mode=DEF
            if not hardlinked or hardlink_master:
                if not is_special_file:
                    path_hash = self.key.id_hash(safe_encode(os.path.join(self.cwd, path)))
-                    ids = cache.file_known_and_unchanged(path_hash, st, ignore_inode, files_cache_mode)
+                    known, ids = cache.file_known_and_unchanged(path_hash, st, ignore_inode, files_cache_mode)
                else:
                    # in --read-special mode, we may be called for special files.
                    # there should be no information in the cache about special files processed in
                    # read-special mode, but we better play safe as this was wrong in the past:
-                    path_hash = ids = None
+                    path_hash = None
+                    known, ids = False, None
                first_run = not cache.files and cache.do_files
                if first_run:
                    logger.debug('Processing files ...')
@ -1146,12 +1147,13 @@ def process_file(self, path, st, cache, ignore_inode=False, files_cache_mode=DEF
                    # Make sure all ids are available
                    for id_ in ids:
                        if not cache.seen_chunk(id_):
+                            status = 'M'  # cache said it is unmodified, but we lost a chunk: process file like modified
                            break
                    else:
                        chunks = [cache.chunk_incref(id_, self.stats) for id_ in ids]
                        status = 'U'  # regular file, unchanged
                else:
-                    status = 'A'  # regular file, added
+                    status = 'M' if known else 'A'  # regular file, modified or added
                item.hardlink_master = hardlinked
                item.update(self.metadata_collector.stat_simple_attrs(st))
                # Only chunkify the file if needed
@ -1166,7 +1168,6 @@ def process_file(self, path, st, cache, ignore_inode=False, files_cache_mode=DEF
                        # we must not memorize special files, because the contents of e.g. a
                        # block or char device will change without its mtime/size/inode changing.
                        cache.memorize_file(path_hash, st, [c.id for c in item.chunks], files_cache_mode)
-                    status = status or 'M'  # regular file, modified (if not 'A' already)
                self.stats.nfiles += 1
            item.update(self.metadata_collector.stat_attrs(st, path))
            item.get_size(memorize=True)
--- a/src/borg/cache.py
+++ b/src/borg/cache.py
@ -918,24 +918,40 @@ def chunk_decref(self, id, stats, wait=True):
            stats.update(-size, -csize, False)

    def file_known_and_unchanged(self, path_hash, st, ignore_inode=False, cache_mode=DEFAULT_FILES_CACHE_MODE):
+        """
+        Check if we know the file that has this path_hash (know == it is in our files cache) and
+        whether it is unchanged (the size/inode number/cmtime is same for stuff we check in this cache_mode).
+
+        :param path_hash: hash(file_path), to save some memory in the files cache
+        :param st: the file's stat() result
+        :param ignore_inode: whether the inode number shall be ignored
+        :param cache_mode: what shall be compared in the file stat infos vs. cached stat infos comparison
+        :return: known, ids (known is True if we have infos about this file in the cache,
+                             ids is the list of chunk ids IF the file has not changed, otherwise None).
+        """
        if 'd' in cache_mode or not self.do_files or not stat.S_ISREG(st.st_mode):  # d(isabled)
-            return None
+            return False, None
        if self.files is None:
            self._read_files()
+        # note: r(echunk) does not need the files cache in this method, but the files cache will
+        # be updated and saved to disk to memorize the files. To preserve previous generations in
+        # the cache, this means that it also needs to get loaded from disk first, so keep
+        # _read_files() above here.
        if 'r' in cache_mode:  # r(echunk)
-            return None
+            return False, None
        entry = self.files.get(path_hash)
        if not entry:
-            return None
+            return False, None
+        # we know the file!
        entry = FileCacheEntry(*msgpack.unpackb(entry))
        if 's' in cache_mode and entry.size != st.st_size:
-            return None
+            return True, None
        if 'i' in cache_mode and not ignore_inode and entry.inode != st.st_ino:
-            return None
+            return True, None
        if 'c' in cache_mode and bigint_to_int(entry.cmtime) != st.st_ctime_ns:
-            return None
+            return True, None
        elif 'm' in cache_mode and bigint_to_int(entry.cmtime) != st.st_mtime_ns:
-            return None
+            return True, None
        # we ignored the inode number in the comparison above or it is still same.
        # if it is still the same, replacing it in the tuple doesn't change it.
        # if we ignored it, a reason for doing that is that files were moved to a new
@ -945,7 +961,7 @@ def file_known_and_unchanged(self, path_hash, st, ignore_inode=False, cache_mode
        # again at that time), we need to update the inode number in the cache with what
        # we see in the filesystem.
        self.files[path_hash] = msgpack.packb(entry._replace(inode=st.st_ino, age=0))
-        return entry.chunk_ids
+        return True, entry.chunk_ids

    def memorize_file(self, path_hash, st, ids, cache_mode=DEFAULT_FILES_CACHE_MODE):
        # note: r(echunk) modes will update the files cache, d(isabled) mode won't
@ -999,7 +1015,7 @@ def __exit__(self, exc_type, exc_val, exc_tb):
    do_files = False

    def file_known_and_unchanged(self, path_hash, st, ignore_inode=False, cache_mode=DEFAULT_FILES_CACHE_MODE):
-        return None
+        return False, None

    def memorize_file(self, path_hash, st, ids, cache_mode=DEFAULT_FILES_CACHE_MODE):
        pass
--- a/src/borg/testsuite/archiver.py
+++ b/src/borg/testsuite/archiver.py
@ -1678,7 +1678,7 @@ def test_file_status_cs_cache_mode(self):
        os.utime('input/file1', ns=(st.st_atime_ns, st.st_mtime_ns))
        # this mode uses ctime for change detection, so it should find file1 as modified
        output = self.cmd('create', '--list', '--files-cache=ctime,size', self.repository_location + '::test2', 'input')
-        self.assert_in("A input/file1", output)
+        self.assert_in("M input/file1", output)

    def test_file_status_ms_cache_mode(self):
        """test that a chmod'ed file with no content changes does not get chunked again in mtime,size cache_mode"""
--- a/src/borg/testsuite/cache.py
+++ b/src/borg/testsuite/cache.py
@ -256,7 +256,7 @@ def test_deletes_chunks_during_lifetime(self, cache, repository):
            repository.get(H(5))

    def test_files_cache(self, cache):
-        assert cache.file_known_and_unchanged(bytes(32), None) is None
+        assert cache.file_known_and_unchanged(bytes(32), None) == (False, None)
        assert not cache.do_files
        assert cache.files is None