mirror of https://github.com/borgbackup/borg.git

read files cache early, init checkpoint timer after that, see #3394

reading the files cache can take a considerable amount of time (a user
reported 1h 42min for a 700MB files cache for a repo with 8M files and
15TB total), so we must init the checkpoint timer after that, or borg
will create the first checkpoint too early.

creating a checkpoint means (among other things) saving the files cache,
which will also take a lot of time in such a case, one time too many.
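
To make the required ordering concrete, here is a minimal sketch (not borg's actual code; load_files_cache, process_items and write_checkpoint are hypothetical helpers) of starting the checkpoint clock only after the slow files cache read:

# minimal sketch with hypothetical names, not borg's real API: the checkpoint
# timer must be initialized only after the (possibly very slow) files cache load.
import time

def create_archive(checkpoint_interval, load_files_cache, process_items, write_checkpoint):
    files_cache = load_files_cache()        # may take a long time (user report: 1h 42min)
    last_checkpoint = time.monotonic()      # init the checkpoint timer only now
    for item in process_items(files_cache):
        # (processing of item elided in this sketch)
        if time.monotonic() - last_checkpoint > checkpoint_interval:
            write_checkpoint()              # a checkpoint also saves the files cache
            last_checkpoint = time.monotonic()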

doing this in a clean way required some refactoring (see the sketch after this list):
- cache_mode is now given to Cache initializer and stored in instance
- the files cache is loaded early in _do_open (if needed)
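
For illustration, a condensed sketch of the resulting shape (heavily simplified from the diff below; the default mode value and the stub bodies are assumptions, not borg's real constants or implementation):

DEFAULT_FILES_CACHE_MODE = 'cis'   # assumed placeholder (ctime/inode/size flags), not borg's real constant

class LocalCache:
    def __init__(self, do_files=False, cache_mode=DEFAULT_FILES_CACHE_MODE):
        self.do_files = do_files
        self.cache_mode = cache_mode   # stored once; no longer passed into every call
        self.files = None

    def _read_files(self):
        self.files = {}                # stub; the real method loads the on-disk files cache

    def _do_open(self):
        # chunk index loading etc. elided ...
        if 'd' in self.cache_mode or not self.do_files:   # d(isabled)
            self.files = None
        else:
            self._read_files()         # files cache is read early, so the caller can start
                                       # its checkpoint timer after opening the cache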
Thomas Waldmann 2018-03-08 03:20:56 +01:00 committed by Milkey Mouse
parent 5b824f54dd
commit 91e5e231f1
3 changed files with 24 additions and 20 deletions

src/borg/archive.py

@@ -1131,13 +1131,13 @@ def process_stdin(self, path, cache):
self.add_item(item, stats=self.stats)
return 'i' # stdin
-def process_file(self, path, st, cache, ignore_inode=False, files_cache_mode=DEFAULT_FILES_CACHE_MODE):
+def process_file(self, path, st, cache, ignore_inode=False):
with self.create_helper(path, st, None) as (item, status, hardlinked, hardlink_master): # no status yet
is_special_file = is_special(st.st_mode)
if not hardlinked or hardlink_master:
if not is_special_file:
path_hash = self.key.id_hash(safe_encode(os.path.join(self.cwd, path)))
-known, ids = cache.file_known_and_unchanged(path_hash, st, ignore_inode, files_cache_mode)
+known, ids = cache.file_known_and_unchanged(path_hash, st, ignore_inode)
else:
# in --read-special mode, we may be called for special files.
# there should be no information in the cache about special files processed in
@@ -1172,7 +1172,7 @@ def process_file(self, path, st, cache, ignore_inode=False, files_cache_mode=DEF
if not is_special_file:
# we must not memorize special files, because the contents of e.g. a
# block or char device will change without its mtime/size/inode changing.
-cache.memorize_file(path_hash, st, [c.id for c in item.chunks], files_cache_mode)
+cache.memorize_file(path_hash, st, [c.id for c in item.chunks])
self.stats.nfiles += 1
item.update(self.metadata_collector.stat_attrs(st, path))
item.get_size(memorize=True)

src/borg/archiver.py

@@ -144,7 +144,8 @@ def wrapper(self, args, **kwargs):
if cache:
with Cache(repository, kwargs['key'], kwargs['manifest'],
do_files=getattr(args, 'cache_files', False),
-progress=getattr(args, 'progress', False), lock_wait=self.lock_wait) as cache_:
+progress=getattr(args, 'progress', False), lock_wait=self.lock_wait,
+cache_mode=getattr(args, 'files_cache_mode', DEFAULT_FILES_CACHE_MODE)) as cache_:
return method(self, args, repository=repository, cache=cache_, **kwargs)
else:
return method(self, args, repository=repository, **kwargs)
@@ -504,13 +505,13 @@ def create_inner(archive, cache, fso):
self.ignore_inode = args.ignore_inode
self.nobsdflags = args.nobsdflags
self.exclude_nodump = args.exclude_nodump
-self.files_cache_mode = args.files_cache_mode
dry_run = args.dry_run
t0 = datetime.utcnow()
t0_monotonic = time.monotonic()
if not dry_run:
with Cache(repository, key, manifest, do_files=args.cache_files, progress=args.progress,
-lock_wait=self.lock_wait, permit_adhoc_cache=args.no_cache_sync) as cache:
+lock_wait=self.lock_wait, permit_adhoc_cache=args.no_cache_sync,
+cache_mode=args.files_cache_mode) as cache:
archive = Archive(repository, key, manifest, args.location.archive, cache=cache,
create=True, checkpoint_interval=args.checkpoint_interval,
numeric_owner=args.numeric_owner, noatime=args.noatime, noctime=args.noctime,
@@ -576,7 +577,7 @@ def _process(self, fso, cache, matcher, exclude_caches, exclude_if_present,
return
if stat.S_ISREG(st.st_mode):
if not dry_run:
-status = fso.process_file(path, st, cache, self.ignore_inode, self.files_cache_mode)
+status = fso.process_file(path, st, cache, self.ignore_inode)
elif stat.S_ISDIR(st.st_mode):
if recurse:
tag_paths = dir_is_tagged(path, exclude_caches, exclude_if_present)

src/borg/cache.py

@@ -359,11 +359,11 @@ def destroy(repository, path=None):
shutil.rmtree(path)
def __new__(cls, repository, key, manifest, path=None, sync=True, do_files=False, warn_if_unencrypted=True,
-progress=False, lock_wait=None, permit_adhoc_cache=False):
+progress=False, lock_wait=None, permit_adhoc_cache=False, cache_mode=DEFAULT_FILES_CACHE_MODE):
def local():
return LocalCache(repository=repository, key=key, manifest=manifest, path=path, sync=sync,
do_files=do_files, warn_if_unencrypted=warn_if_unencrypted, progress=progress,
-lock_wait=lock_wait)
+lock_wait=lock_wait, cache_mode=cache_mode)
def adhoc():
return AdHocCache(repository=repository, key=key, manifest=manifest)
@@ -422,18 +422,20 @@ class LocalCache(CacheStatsMixin):
"""
def __init__(self, repository, key, manifest, path=None, sync=True, do_files=False, warn_if_unencrypted=True,
-progress=False, lock_wait=None):
+progress=False, lock_wait=None, cache_mode=DEFAULT_FILES_CACHE_MODE):
"""
:param do_files: use file metadata cache
:param warn_if_unencrypted: print warning if accessing unknown unencrypted repository
:param lock_wait: timeout for lock acquisition (None: return immediately if lock unavailable)
:param sync: do :meth:`.sync`
+:param cache_mode: what shall be compared in the file stat infos vs. cached stat infos comparison
"""
self.repository = repository
self.key = key
self.manifest = manifest
self.progress = progress
self.do_files = do_files
+self.cache_mode = cache_mode
self.timestamp = None
self.txn_active = False
@@ -485,7 +487,10 @@ def _do_open(self):
with IntegrityCheckedFile(path=os.path.join(self.path, 'chunks'), write=False,
integrity_data=self.cache_config.integrity.get('chunks')) as fd:
self.chunks = ChunkIndex.read(fd)
-self.files = None
+if 'd' in self.cache_mode or not self.do_files: # d(isabled)
+    self.files = None
+else:
+    self._read_files()
def open(self):
if not os.path.isdir(self.path):
@@ -917,7 +922,7 @@ def chunk_decref(self, id, stats, wait=True):
else:
stats.update(-size, -csize, False)
-def file_known_and_unchanged(self, path_hash, st, ignore_inode=False, cache_mode=DEFAULT_FILES_CACHE_MODE):
+def file_known_and_unchanged(self, path_hash, st, ignore_inode=False):
"""
Check if we know the file that has this path_hash (know == it is in our files cache) and
whether it is unchanged (the size/inode number/cmtime is same for stuff we check in this cache_mode).
@@ -925,18 +930,15 @@ def file_known_and_unchanged(self, path_hash, st, ignore_inode=False, cache_mode
:param path_hash: hash(file_path), to save some memory in the files cache
:param st: the file's stat() result
:param ignore_inode: whether the inode number shall be ignored
-:param cache_mode: what shall be compared in the file stat infos vs. cached stat infos comparison
:return: known, ids (known is True if we have infos about this file in the cache,
ids is the list of chunk ids IF the file has not changed, otherwise None).
"""
+cache_mode = self.cache_mode
if 'd' in cache_mode or not self.do_files or not stat.S_ISREG(st.st_mode): # d(isabled)
return False, None
-if self.files is None:
-    self._read_files()
# note: r(echunk) does not need the files cache in this method, but the files cache will
# be updated and saved to disk to memorize the files. To preserve previous generations in
-# the cache, this means that it also needs to get loaded from disk first, so keep
-# _read_files() above here.
+# the cache, this means that it also needs to get loaded from disk first.
if 'r' in cache_mode: # r(echunk)
return False, None
entry = self.files.get(path_hash)
@@ -963,7 +965,8 @@ def file_known_and_unchanged(self, path_hash, st, ignore_inode=False, cache_mode
self.files[path_hash] = msgpack.packb(entry._replace(inode=st.st_ino, age=0))
return True, entry.chunk_ids
-def memorize_file(self, path_hash, st, ids, cache_mode=DEFAULT_FILES_CACHE_MODE):
+def memorize_file(self, path_hash, st, ids):
+    cache_mode = self.cache_mode
# note: r(echunk) modes will update the files cache, d(isabled) mode won't
if 'd' in cache_mode or not self.do_files or not stat.S_ISREG(st.st_mode):
return
@@ -1014,10 +1017,10 @@ def __exit__(self, exc_type, exc_val, exc_tb):
files = None
do_files = False
-def file_known_and_unchanged(self, path_hash, st, ignore_inode=False, cache_mode=DEFAULT_FILES_CACHE_MODE):
+def file_known_and_unchanged(self, path_hash, st, ignore_inode=False):
return False, None
-def memorize_file(self, path_hash, st, ids, cache_mode=DEFAULT_FILES_CACHE_MODE):
+def memorize_file(self, path_hash, st, ids):
pass
def add_chunk(self, id, chunk, stats, overwrite=False, wait=True):