mirror of
https://github.com/borgbackup/borg.git
synced 2025-02-22 06:01:54 +00:00
read files cache early, init checkpoint timer after that, see #3394
Reading the files cache can take a considerable amount of time (a user reported 1h 42min for a 700MB files cache for a repo with 8M files and 15TB total), so we must init the checkpoint timer after that, or borg will create the checkpoint too early. Creating a checkpoint means (among other things) saving the files cache, which will also take a lot of time in such a case - one time too many. Doing this in a clean way required some refactoring: - cache_mode is now given to the Cache initializer and stored in the instance - the files cache is loaded early in _do_open (if needed)
This commit is contained in:
parent
5b824f54dd
commit
91e5e231f1
3 changed files with 24 additions and 20 deletions
|
@ -1131,13 +1131,13 @@ def process_stdin(self, path, cache):
|
|||
self.add_item(item, stats=self.stats)
|
||||
return 'i' # stdin
|
||||
|
||||
def process_file(self, path, st, cache, ignore_inode=False, files_cache_mode=DEFAULT_FILES_CACHE_MODE):
|
||||
def process_file(self, path, st, cache, ignore_inode=False):
|
||||
with self.create_helper(path, st, None) as (item, status, hardlinked, hardlink_master): # no status yet
|
||||
is_special_file = is_special(st.st_mode)
|
||||
if not hardlinked or hardlink_master:
|
||||
if not is_special_file:
|
||||
path_hash = self.key.id_hash(safe_encode(os.path.join(self.cwd, path)))
|
||||
known, ids = cache.file_known_and_unchanged(path_hash, st, ignore_inode, files_cache_mode)
|
||||
known, ids = cache.file_known_and_unchanged(path_hash, st, ignore_inode)
|
||||
else:
|
||||
# in --read-special mode, we may be called for special files.
|
||||
# there should be no information in the cache about special files processed in
|
||||
|
@ -1172,7 +1172,7 @@ def process_file(self, path, st, cache, ignore_inode=False, files_cache_mode=DEF
|
|||
if not is_special_file:
|
||||
# we must not memorize special files, because the contents of e.g. a
|
||||
# block or char device will change without its mtime/size/inode changing.
|
||||
cache.memorize_file(path_hash, st, [c.id for c in item.chunks], files_cache_mode)
|
||||
cache.memorize_file(path_hash, st, [c.id for c in item.chunks])
|
||||
self.stats.nfiles += 1
|
||||
item.update(self.metadata_collector.stat_attrs(st, path))
|
||||
item.get_size(memorize=True)
|
||||
|
|
|
@ -144,7 +144,8 @@ def wrapper(self, args, **kwargs):
|
|||
if cache:
|
||||
with Cache(repository, kwargs['key'], kwargs['manifest'],
|
||||
do_files=getattr(args, 'cache_files', False),
|
||||
progress=getattr(args, 'progress', False), lock_wait=self.lock_wait) as cache_:
|
||||
progress=getattr(args, 'progress', False), lock_wait=self.lock_wait,
|
||||
cache_mode=getattr(args, 'files_cache_mode', DEFAULT_FILES_CACHE_MODE)) as cache_:
|
||||
return method(self, args, repository=repository, cache=cache_, **kwargs)
|
||||
else:
|
||||
return method(self, args, repository=repository, **kwargs)
|
||||
|
@ -504,13 +505,13 @@ def create_inner(archive, cache, fso):
|
|||
self.ignore_inode = args.ignore_inode
|
||||
self.nobsdflags = args.nobsdflags
|
||||
self.exclude_nodump = args.exclude_nodump
|
||||
self.files_cache_mode = args.files_cache_mode
|
||||
dry_run = args.dry_run
|
||||
t0 = datetime.utcnow()
|
||||
t0_monotonic = time.monotonic()
|
||||
if not dry_run:
|
||||
with Cache(repository, key, manifest, do_files=args.cache_files, progress=args.progress,
|
||||
lock_wait=self.lock_wait, permit_adhoc_cache=args.no_cache_sync) as cache:
|
||||
lock_wait=self.lock_wait, permit_adhoc_cache=args.no_cache_sync,
|
||||
cache_mode=args.files_cache_mode) as cache:
|
||||
archive = Archive(repository, key, manifest, args.location.archive, cache=cache,
|
||||
create=True, checkpoint_interval=args.checkpoint_interval,
|
||||
numeric_owner=args.numeric_owner, noatime=args.noatime, noctime=args.noctime,
|
||||
|
@ -576,7 +577,7 @@ def _process(self, fso, cache, matcher, exclude_caches, exclude_if_present,
|
|||
return
|
||||
if stat.S_ISREG(st.st_mode):
|
||||
if not dry_run:
|
||||
status = fso.process_file(path, st, cache, self.ignore_inode, self.files_cache_mode)
|
||||
status = fso.process_file(path, st, cache, self.ignore_inode)
|
||||
elif stat.S_ISDIR(st.st_mode):
|
||||
if recurse:
|
||||
tag_paths = dir_is_tagged(path, exclude_caches, exclude_if_present)
|
||||
|
|
|
@ -359,11 +359,11 @@ def destroy(repository, path=None):
|
|||
shutil.rmtree(path)
|
||||
|
||||
def __new__(cls, repository, key, manifest, path=None, sync=True, do_files=False, warn_if_unencrypted=True,
|
||||
progress=False, lock_wait=None, permit_adhoc_cache=False):
|
||||
progress=False, lock_wait=None, permit_adhoc_cache=False, cache_mode=DEFAULT_FILES_CACHE_MODE):
|
||||
def local():
|
||||
return LocalCache(repository=repository, key=key, manifest=manifest, path=path, sync=sync,
|
||||
do_files=do_files, warn_if_unencrypted=warn_if_unencrypted, progress=progress,
|
||||
lock_wait=lock_wait)
|
||||
lock_wait=lock_wait, cache_mode=cache_mode)
|
||||
|
||||
def adhoc():
|
||||
return AdHocCache(repository=repository, key=key, manifest=manifest)
|
||||
|
@ -422,18 +422,20 @@ class LocalCache(CacheStatsMixin):
|
|||
"""
|
||||
|
||||
def __init__(self, repository, key, manifest, path=None, sync=True, do_files=False, warn_if_unencrypted=True,
|
||||
progress=False, lock_wait=None):
|
||||
progress=False, lock_wait=None, cache_mode=DEFAULT_FILES_CACHE_MODE):
|
||||
"""
|
||||
:param do_files: use file metadata cache
|
||||
:param warn_if_unencrypted: print warning if accessing unknown unencrypted repository
|
||||
:param lock_wait: timeout for lock acquisition (None: return immediately if lock unavailable)
|
||||
:param sync: do :meth:`.sync`
|
||||
:param cache_mode: what shall be compared in the file stat infos vs. cached stat infos comparison
|
||||
"""
|
||||
self.repository = repository
|
||||
self.key = key
|
||||
self.manifest = manifest
|
||||
self.progress = progress
|
||||
self.do_files = do_files
|
||||
self.cache_mode = cache_mode
|
||||
self.timestamp = None
|
||||
self.txn_active = False
|
||||
|
||||
|
@ -485,7 +487,10 @@ def _do_open(self):
|
|||
with IntegrityCheckedFile(path=os.path.join(self.path, 'chunks'), write=False,
|
||||
integrity_data=self.cache_config.integrity.get('chunks')) as fd:
|
||||
self.chunks = ChunkIndex.read(fd)
|
||||
self.files = None
|
||||
if 'd' in self.cache_mode or not self.do_files: # d(isabled)
|
||||
self.files = None
|
||||
else:
|
||||
self._read_files()
|
||||
|
||||
def open(self):
|
||||
if not os.path.isdir(self.path):
|
||||
|
@ -917,7 +922,7 @@ def chunk_decref(self, id, stats, wait=True):
|
|||
else:
|
||||
stats.update(-size, -csize, False)
|
||||
|
||||
def file_known_and_unchanged(self, path_hash, st, ignore_inode=False, cache_mode=DEFAULT_FILES_CACHE_MODE):
|
||||
def file_known_and_unchanged(self, path_hash, st, ignore_inode=False):
|
||||
"""
|
||||
Check if we know the file that has this path_hash (know == it is in our files cache) and
|
||||
whether it is unchanged (the size/inode number/cmtime is same for stuff we check in this cache_mode).
|
||||
|
@ -925,18 +930,15 @@ def file_known_and_unchanged(self, path_hash, st, ignore_inode=False, cache_mode
|
|||
:param path_hash: hash(file_path), to save some memory in the files cache
|
||||
:param st: the file's stat() result
|
||||
:param ignore_inode: whether the inode number shall be ignored
|
||||
:param cache_mode: what shall be compared in the file stat infos vs. cached stat infos comparison
|
||||
:return: known, ids (known is True if we have infos about this file in the cache,
|
||||
ids is the list of chunk ids IF the file has not changed, otherwise None).
|
||||
"""
|
||||
cache_mode = self.cache_mode
|
||||
if 'd' in cache_mode or not self.do_files or not stat.S_ISREG(st.st_mode): # d(isabled)
|
||||
return False, None
|
||||
if self.files is None:
|
||||
self._read_files()
|
||||
# note: r(echunk) does not need the files cache in this method, but the files cache will
|
||||
# be updated and saved to disk to memorize the files. To preserve previous generations in
|
||||
# the cache, this means that it also needs to get loaded from disk first, so keep
|
||||
# _read_files() above here.
|
||||
# the cache, this means that it also needs to get loaded from disk first.
|
||||
if 'r' in cache_mode: # r(echunk)
|
||||
return False, None
|
||||
entry = self.files.get(path_hash)
|
||||
|
@ -963,7 +965,8 @@ def file_known_and_unchanged(self, path_hash, st, ignore_inode=False, cache_mode
|
|||
self.files[path_hash] = msgpack.packb(entry._replace(inode=st.st_ino, age=0))
|
||||
return True, entry.chunk_ids
|
||||
|
||||
def memorize_file(self, path_hash, st, ids, cache_mode=DEFAULT_FILES_CACHE_MODE):
|
||||
def memorize_file(self, path_hash, st, ids):
|
||||
cache_mode = self.cache_mode
|
||||
# note: r(echunk) modes will update the files cache, d(isabled) mode won't
|
||||
if 'd' in cache_mode or not self.do_files or not stat.S_ISREG(st.st_mode):
|
||||
return
|
||||
|
@ -1014,10 +1017,10 @@ def __exit__(self, exc_type, exc_val, exc_tb):
|
|||
files = None
|
||||
do_files = False
|
||||
|
||||
def file_known_and_unchanged(self, path_hash, st, ignore_inode=False, cache_mode=DEFAULT_FILES_CACHE_MODE):
|
||||
def file_known_and_unchanged(self, path_hash, st, ignore_inode=False):
|
||||
return False, None
|
||||
|
||||
def memorize_file(self, path_hash, st, ids, cache_mode=DEFAULT_FILES_CACHE_MODE):
|
||||
def memorize_file(self, path_hash, st, ids):
|
||||
pass
|
||||
|
||||
def add_chunk(self, id, chunk, stats, overwrite=False, wait=True):
|
||||
|
|
Loading…
Reference in a new issue