From e9af3c6ab361decee02e83981253b28f5871ed98 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Thu, 8 Mar 2018 03:20:56 +0100 Subject: [PATCH 1/4] read files cache early, init checkpoint timer after that, see #3394 reading the files cache can take considerable amount of time (a user reported 1h 42min for a 700MB files cache for a repo with 8M files and 15TB total), so we must init the checkpoint timer after that or borg will create the checkpoint too early. creating a checkpoint means (among other stuff) saving the files cache, which will also take a lot of time in such a case, one time too much. doing this in a clean way required some refactoring: - cache_mode is now given to Cache initializer and stored in instance - the files cache is loaded early in _do_open (if needed) --- src/borg/archive.py | 6 +++--- src/borg/archiver.py | 9 +++++---- src/borg/cache.py | 29 ++++++++++++++++------------- 3 files changed, 24 insertions(+), 20 deletions(-) diff --git a/src/borg/archive.py b/src/borg/archive.py index 5e17431e6..db261325c 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -980,13 +980,13 @@ Utilization of max. archive size: {csize_max:.0%} self.add_item(item) return 'i' # stdin - def process_file(self, path, st, cache, ignore_inode=False, files_cache_mode=DEFAULT_FILES_CACHE_MODE): + def process_file(self, path, st, cache, ignore_inode=False): with self.create_helper(path, st, None) as (item, status, hardlinked, hardlink_master): # no status yet is_special_file = is_special(st.st_mode) if not hardlinked or hardlink_master: if not is_special_file: path_hash = self.key.id_hash(safe_encode(os.path.join(self.cwd, path))) - known, ids = cache.file_known_and_unchanged(path_hash, st, ignore_inode, files_cache_mode) + known, ids = cache.file_known_and_unchanged(path_hash, st, ignore_inode) else: # in --read-special mode, we may be called for special files. # there should be no information in the cache about special files processed in @@ -1021,7 +1021,7 @@ Utilization of max. archive size: {csize_max:.0%} if not is_special_file: # we must not memorize special files, because the contents of e.g. a # block or char device will change without its mtime/size/inode changing. - cache.memorize_file(path_hash, st, [c.id for c in item.chunks], files_cache_mode) + cache.memorize_file(path_hash, st, [c.id for c in item.chunks]) self.stats.nfiles += 1 item.update(self.stat_attrs(st, path)) item.get_size(memorize=True) diff --git a/src/borg/archiver.py b/src/borg/archiver.py index 7a5cedb20..4f4d63a45 100644 --- a/src/borg/archiver.py +++ b/src/borg/archiver.py @@ -144,7 +144,8 @@ def with_repository(fake=False, invert_fake=False, create=False, lock=True, if cache: with Cache(repository, kwargs['key'], kwargs['manifest'], do_files=getattr(args, 'cache_files', False), - progress=getattr(args, 'progress', False), lock_wait=self.lock_wait) as cache_: + progress=getattr(args, 'progress', False), lock_wait=self.lock_wait, + cache_mode=getattr(args, 'files_cache_mode', DEFAULT_FILES_CACHE_MODE)) as cache_: return method(self, args, repository=repository, cache=cache_, **kwargs) else: return method(self, args, repository=repository, **kwargs) @@ -529,13 +530,13 @@ class Archiver: self.output_list = args.output_list self.ignore_inode = args.ignore_inode self.exclude_nodump = args.exclude_nodump - self.files_cache_mode = args.files_cache_mode dry_run = args.dry_run t0 = datetime.utcnow() t0_monotonic = time.monotonic() if not dry_run: with Cache(repository, key, manifest, do_files=args.cache_files, progress=args.progress, - lock_wait=self.lock_wait, permit_adhoc_cache=args.no_cache_sync) as cache: + lock_wait=self.lock_wait, permit_adhoc_cache=args.no_cache_sync, + cache_mode=args.files_cache_mode) as cache: archive = Archive(repository, key, manifest, args.location.archive, cache=cache, create=True, checkpoint_interval=args.checkpoint_interval, numeric_owner=args.numeric_owner, noatime=args.noatime, noctime=args.noctime, nobirthtime=args.nobirthtime, @@ -593,7 +594,7 @@ class Archiver: return if stat.S_ISREG(st.st_mode): if not dry_run: - status = archive.process_file(path, st, cache, self.ignore_inode, self.files_cache_mode) + status = archive.process_file(path, st, cache, self.ignore_inode) elif stat.S_ISDIR(st.st_mode): if recurse: tag_paths = dir_is_tagged(path, exclude_caches, exclude_if_present) diff --git a/src/borg/cache.py b/src/borg/cache.py index 062829f74..c9c2b65d9 100644 --- a/src/borg/cache.py +++ b/src/borg/cache.py @@ -359,11 +359,11 @@ class Cache: shutil.rmtree(path) def __new__(cls, repository, key, manifest, path=None, sync=True, do_files=False, warn_if_unencrypted=True, - progress=False, lock_wait=None, permit_adhoc_cache=False): + progress=False, lock_wait=None, permit_adhoc_cache=False, cache_mode=DEFAULT_FILES_CACHE_MODE): def local(): return LocalCache(repository=repository, key=key, manifest=manifest, path=path, sync=sync, do_files=do_files, warn_if_unencrypted=warn_if_unencrypted, progress=progress, - lock_wait=lock_wait) + lock_wait=lock_wait, cache_mode=cache_mode) def adhoc(): return AdHocCache(repository=repository, key=key, manifest=manifest) @@ -422,18 +422,20 @@ class LocalCache(CacheStatsMixin): """ def __init__(self, repository, key, manifest, path=None, sync=True, do_files=False, warn_if_unencrypted=True, - progress=False, lock_wait=None): + progress=False, lock_wait=None, cache_mode=DEFAULT_FILES_CACHE_MODE): """ :param do_files: use file metadata cache :param warn_if_unencrypted: print warning if accessing unknown unencrypted repository :param lock_wait: timeout for lock acquisition (None: return immediately if lock unavailable) :param sync: do :meth:`.sync` + :param cache_mode: what shall be compared in the file stat infos vs. cached stat infos comparison """ self.repository = repository self.key = key self.manifest = manifest self.progress = progress self.do_files = do_files + self.cache_mode = cache_mode self.timestamp = None self.txn_active = False @@ -485,7 +487,10 @@ class LocalCache(CacheStatsMixin): with IntegrityCheckedFile(path=os.path.join(self.path, 'chunks'), write=False, integrity_data=self.cache_config.integrity.get('chunks')) as fd: self.chunks = ChunkIndex.read(fd) - self.files = None + if 'd' in self.cache_mode or not self.do_files: # d(isabled) + self.files = None + else: + self._read_files() def open(self): if not os.path.isdir(self.path): @@ -917,7 +922,7 @@ class LocalCache(CacheStatsMixin): else: stats.update(-size, -csize, False) - def file_known_and_unchanged(self, path_hash, st, ignore_inode=False, cache_mode=DEFAULT_FILES_CACHE_MODE): + def file_known_and_unchanged(self, path_hash, st, ignore_inode=False): """ Check if we know the file that has this path_hash (know == it is in our files cache) and whether it is unchanged (the size/inode number/cmtime is same for stuff we check in this cache_mode). @@ -925,18 +930,15 @@ class LocalCache(CacheStatsMixin): :param path_hash: hash(file_path), to save some memory in the files cache :param st: the file's stat() result :param ignore_inode: whether the inode number shall be ignored - :param cache_mode: what shall be compared in the file stat infos vs. cached stat infos comparison :return: known, ids (known is True if we have infos about this file in the cache, ids is the list of chunk ids IF the file has not changed, otherwise None). """ + cache_mode = self.cache_mode if 'd' in cache_mode or not self.do_files or not stat.S_ISREG(st.st_mode): # d(isabled) return False, None - if self.files is None: - self._read_files() # note: r(echunk) does not need the files cache in this method, but the files cache will # be updated and saved to disk to memorize the files. To preserve previous generations in - # the cache, this means that it also needs to get loaded from disk first, so keep - # _read_files() above here. + # the cache, this means that it also needs to get loaded from disk first. if 'r' in cache_mode: # r(echunk) return False, None entry = self.files.get(path_hash) @@ -963,7 +965,8 @@ class LocalCache(CacheStatsMixin): self.files[path_hash] = msgpack.packb(entry._replace(inode=st.st_ino, age=0)) return True, entry.chunk_ids - def memorize_file(self, path_hash, st, ids, cache_mode=DEFAULT_FILES_CACHE_MODE): + def memorize_file(self, path_hash, st, ids): + cache_mode = self.cache_mode # note: r(echunk) modes will update the files cache, d(isabled) mode won't if 'd' in cache_mode or not self.do_files or not stat.S_ISREG(st.st_mode): return @@ -1014,10 +1017,10 @@ Chunk index: {0.total_unique_chunks:20d} unknown""" files = None do_files = False - def file_known_and_unchanged(self, path_hash, st, ignore_inode=False, cache_mode=DEFAULT_FILES_CACHE_MODE): + def file_known_and_unchanged(self, path_hash, st, ignore_inode=False): return False, None - def memorize_file(self, path_hash, st, ids, cache_mode=DEFAULT_FILES_CACHE_MODE): + def memorize_file(self, path_hash, st, ids): pass def add_chunk(self, id, chunk, stats, overwrite=False, wait=True): From c27c98ced03d8fece120cdf18fbe2ebe67a87bb3 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Thu, 8 Mar 2018 03:39:38 +0100 Subject: [PATCH 2/4] cleanup: get rid of Cache.do_files, replace with cache_mode not do_files == (cache_mode == 'd') # d)isabled --- src/borg/archive.py | 2 +- src/borg/archiver.py | 2 +- src/borg/cache.py | 18 ++++++++++-------- src/borg/testsuite/cache.py | 2 +- 4 files changed, 13 insertions(+), 11 deletions(-) diff --git a/src/borg/archive.py b/src/borg/archive.py index db261325c..dea3bdfe7 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -993,7 +993,7 @@ Utilization of max. archive size: {csize_max:.0%} # read-special mode, but we better play safe as this was wrong in the past: path_hash = None known, ids = False, None - first_run = not cache.files and cache.do_files + first_run = not cache.files and 'd' not in cache.cache_mode if first_run: logger.debug('Processing files ...') chunks = None diff --git a/src/borg/archiver.py b/src/borg/archiver.py index 4f4d63a45..18e5fc1c8 100644 --- a/src/borg/archiver.py +++ b/src/borg/archiver.py @@ -1543,7 +1543,7 @@ class Archiver: keep += prune_split(archives, '%Y', args.yearly, keep) to_delete = (set(archives) | checkpoints) - (set(keep) | set(keep_checkpoints)) stats = Statistics() - with Cache(repository, key, manifest, do_files=False, lock_wait=self.lock_wait) as cache: + with Cache(repository, key, manifest, lock_wait=self.lock_wait) as cache: list_logger = logging.getLogger('borg.output.list') if args.output_list: # set up counters for the progress display diff --git a/src/borg/cache.py b/src/borg/cache.py index c9c2b65d9..63bfe0deb 100644 --- a/src/borg/cache.py +++ b/src/borg/cache.py @@ -360,9 +360,13 @@ class Cache: def __new__(cls, repository, key, manifest, path=None, sync=True, do_files=False, warn_if_unencrypted=True, progress=False, lock_wait=None, permit_adhoc_cache=False, cache_mode=DEFAULT_FILES_CACHE_MODE): + + if not do_files and 'd' not in cache_mode: + cache_mode = 'd' + def local(): return LocalCache(repository=repository, key=key, manifest=manifest, path=path, sync=sync, - do_files=do_files, warn_if_unencrypted=warn_if_unencrypted, progress=progress, + warn_if_unencrypted=warn_if_unencrypted, progress=progress, lock_wait=lock_wait, cache_mode=cache_mode) def adhoc(): @@ -421,10 +425,9 @@ class LocalCache(CacheStatsMixin): Persistent, local (client-side) cache. """ - def __init__(self, repository, key, manifest, path=None, sync=True, do_files=False, warn_if_unencrypted=True, + def __init__(self, repository, key, manifest, path=None, sync=True, warn_if_unencrypted=True, progress=False, lock_wait=None, cache_mode=DEFAULT_FILES_CACHE_MODE): """ - :param do_files: use file metadata cache :param warn_if_unencrypted: print warning if accessing unknown unencrypted repository :param lock_wait: timeout for lock acquisition (None: return immediately if lock unavailable) :param sync: do :meth:`.sync` @@ -434,7 +437,6 @@ class LocalCache(CacheStatsMixin): self.key = key self.manifest = manifest self.progress = progress - self.do_files = do_files self.cache_mode = cache_mode self.timestamp = None self.txn_active = False @@ -487,7 +489,7 @@ class LocalCache(CacheStatsMixin): with IntegrityCheckedFile(path=os.path.join(self.path, 'chunks'), write=False, integrity_data=self.cache_config.integrity.get('chunks')) as fd: self.chunks = ChunkIndex.read(fd) - if 'd' in self.cache_mode or not self.do_files: # d(isabled) + if 'd' in self.cache_mode: # d(isabled) self.files = None else: self._read_files() @@ -934,7 +936,7 @@ class LocalCache(CacheStatsMixin): ids is the list of chunk ids IF the file has not changed, otherwise None). """ cache_mode = self.cache_mode - if 'd' in cache_mode or not self.do_files or not stat.S_ISREG(st.st_mode): # d(isabled) + if 'd' in cache_mode or not stat.S_ISREG(st.st_mode): # d(isabled) return False, None # note: r(echunk) does not need the files cache in this method, but the files cache will # be updated and saved to disk to memorize the files. To preserve previous generations in @@ -968,7 +970,7 @@ class LocalCache(CacheStatsMixin): def memorize_file(self, path_hash, st, ids): cache_mode = self.cache_mode # note: r(echunk) modes will update the files cache, d(isabled) mode won't - if 'd' in cache_mode or not self.do_files or not stat.S_ISREG(st.st_mode): + if 'd' in cache_mode or not stat.S_ISREG(st.st_mode): return if 'c' in cache_mode: cmtime_ns = safe_ns(st.st_ctime_ns) @@ -1015,7 +1017,7 @@ Chunk index: {0.total_unique_chunks:20d} unknown""" pass files = None - do_files = False + cache_mode = 'd' def file_known_and_unchanged(self, path_hash, st, ignore_inode=False): return False, None diff --git a/src/borg/testsuite/cache.py b/src/borg/testsuite/cache.py index b25e72b6f..31ebf55af 100644 --- a/src/borg/testsuite/cache.py +++ b/src/borg/testsuite/cache.py @@ -257,7 +257,7 @@ class TestAdHocCache: def test_files_cache(self, cache): assert cache.file_known_and_unchanged(bytes(32), None) == (False, None) - assert not cache.do_files + assert cache.cache_mode == 'd' assert cache.files is None def test_txn(self, cache): From e4125b4b64e6022135031ebe6ea6266361a59c19 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Thu, 8 Mar 2018 04:10:43 +0100 Subject: [PATCH 3/4] cleanup: get rid of ignore_inode, replace with cache_mode ignore_inode == ('i' not in cache_mode) # i)node --- src/borg/archive.py | 4 ++-- src/borg/archiver.py | 6 +++--- src/borg/cache.py | 12 +++++++----- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/src/borg/archive.py b/src/borg/archive.py index dea3bdfe7..514de1aff 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -980,13 +980,13 @@ Utilization of max. archive size: {csize_max:.0%} self.add_item(item) return 'i' # stdin - def process_file(self, path, st, cache, ignore_inode=False): + def process_file(self, path, st, cache): with self.create_helper(path, st, None) as (item, status, hardlinked, hardlink_master): # no status yet is_special_file = is_special(st.st_mode) if not hardlinked or hardlink_master: if not is_special_file: path_hash = self.key.id_hash(safe_encode(os.path.join(self.cwd, path))) - known, ids = cache.file_known_and_unchanged(path_hash, st, ignore_inode) + known, ids = cache.file_known_and_unchanged(path_hash, st) else: # in --read-special mode, we may be called for special files. # there should be no information in the cache about special files processed in diff --git a/src/borg/archiver.py b/src/borg/archiver.py index 18e5fc1c8..ca3c27be4 100644 --- a/src/borg/archiver.py +++ b/src/borg/archiver.py @@ -144,6 +144,7 @@ def with_repository(fake=False, invert_fake=False, create=False, lock=True, if cache: with Cache(repository, kwargs['key'], kwargs['manifest'], do_files=getattr(args, 'cache_files', False), + ignore_inode=getattr(args, 'ignore_inode', False), progress=getattr(args, 'progress', False), lock_wait=self.lock_wait, cache_mode=getattr(args, 'files_cache_mode', DEFAULT_FILES_CACHE_MODE)) as cache_: return method(self, args, repository=repository, cache=cache_, **kwargs) @@ -528,7 +529,6 @@ class Archiver: self.output_filter = args.output_filter self.output_list = args.output_list - self.ignore_inode = args.ignore_inode self.exclude_nodump = args.exclude_nodump dry_run = args.dry_run t0 = datetime.utcnow() @@ -536,7 +536,7 @@ class Archiver: if not dry_run: with Cache(repository, key, manifest, do_files=args.cache_files, progress=args.progress, lock_wait=self.lock_wait, permit_adhoc_cache=args.no_cache_sync, - cache_mode=args.files_cache_mode) as cache: + cache_mode=args.files_cache_mode, ignore_inode=args.ignore_inode) as cache: archive = Archive(repository, key, manifest, args.location.archive, cache=cache, create=True, checkpoint_interval=args.checkpoint_interval, numeric_owner=args.numeric_owner, noatime=args.noatime, noctime=args.noctime, nobirthtime=args.nobirthtime, @@ -594,7 +594,7 @@ class Archiver: return if stat.S_ISREG(st.st_mode): if not dry_run: - status = archive.process_file(path, st, cache, self.ignore_inode) + status = archive.process_file(path, st, cache) elif stat.S_ISDIR(st.st_mode): if recurse: tag_paths = dir_is_tagged(path, exclude_caches, exclude_if_present) diff --git a/src/borg/cache.py b/src/borg/cache.py index 63bfe0deb..5ab3d5f12 100644 --- a/src/borg/cache.py +++ b/src/borg/cache.py @@ -359,10 +359,13 @@ class Cache: shutil.rmtree(path) def __new__(cls, repository, key, manifest, path=None, sync=True, do_files=False, warn_if_unencrypted=True, - progress=False, lock_wait=None, permit_adhoc_cache=False, cache_mode=DEFAULT_FILES_CACHE_MODE): + progress=False, lock_wait=None, permit_adhoc_cache=False, cache_mode=DEFAULT_FILES_CACHE_MODE, + ignore_inode=False): if not do_files and 'd' not in cache_mode: cache_mode = 'd' + elif ignore_inode and 'i' in cache_mode: + cache_mode = ''.join(set(cache_mode) - set('i')) def local(): return LocalCache(repository=repository, key=key, manifest=manifest, path=path, sync=sync, @@ -924,14 +927,13 @@ class LocalCache(CacheStatsMixin): else: stats.update(-size, -csize, False) - def file_known_and_unchanged(self, path_hash, st, ignore_inode=False): + def file_known_and_unchanged(self, path_hash, st): """ Check if we know the file that has this path_hash (know == it is in our files cache) and whether it is unchanged (the size/inode number/cmtime is same for stuff we check in this cache_mode). :param path_hash: hash(file_path), to save some memory in the files cache :param st: the file's stat() result - :param ignore_inode: whether the inode number shall be ignored :return: known, ids (known is True if we have infos about this file in the cache, ids is the list of chunk ids IF the file has not changed, otherwise None). """ @@ -950,7 +952,7 @@ class LocalCache(CacheStatsMixin): entry = FileCacheEntry(*msgpack.unpackb(entry)) if 's' in cache_mode and entry.size != st.st_size: return True, None - if 'i' in cache_mode and not ignore_inode and entry.inode != st.st_ino: + if 'i' in cache_mode and entry.inode != st.st_ino: return True, None if 'c' in cache_mode and bigint_to_int(entry.cmtime) != st.st_ctime_ns: return True, None @@ -1019,7 +1021,7 @@ Chunk index: {0.total_unique_chunks:20d} unknown""" files = None cache_mode = 'd' - def file_known_and_unchanged(self, path_hash, st, ignore_inode=False): + def file_known_and_unchanged(self, path_hash, st): return False, None def memorize_file(self, path_hash, st, ids): From d3274cd2c94f8d39ea8ecaea700aa05a1a24ecb8 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Thu, 8 Mar 2018 04:21:58 +0100 Subject: [PATCH 4/4] cleanup: move "processing files" message to expected place (now possible as we do not lazy load the files cache any more) --- src/borg/archive.py | 3 --- src/borg/archiver.py | 1 + 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/borg/archive.py b/src/borg/archive.py index 514de1aff..cc828b7dc 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -993,9 +993,6 @@ Utilization of max. archive size: {csize_max:.0%} # read-special mode, but we better play safe as this was wrong in the past: path_hash = None known, ids = False, None - first_run = not cache.files and 'd' not in cache.cache_mode - if first_run: - logger.debug('Processing files ...') chunks = None if ids is not None: # Make sure all ids are available diff --git a/src/borg/archiver.py b/src/borg/archiver.py index ca3c27be4..d5fc9f38f 100644 --- a/src/borg/archiver.py +++ b/src/borg/archiver.py @@ -482,6 +482,7 @@ class Archiver: skip_inodes.add((st.st_ino, st.st_dev)) except OSError: pass + logger.debug('Processing files ...') for path in args.paths: if path == '-': # stdin path = 'stdin'