From 5f7b466969e3e66bb2ea9bb6d6216026d5f3a624 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sun, 31 Jul 2016 01:33:46 +0200 Subject: [PATCH] implement BORG_FILES_CACHE_TTL, update FAQ raise default ttl to 20 (previously: 10). --- borg/cache.py | 3 ++- docs/faq.rst | 24 ++++++++++++++++++++++++ docs/usage.rst | 3 +++ 3 files changed, 29 insertions(+), 1 deletion(-) diff --git a/borg/cache.py b/borg/cache.py index 27826b46e..29d0c8a14 100644 --- a/borg/cache.py +++ b/borg/cache.py @@ -193,12 +193,13 @@ Chunk index: {0.total_unique_chunks:20d} {0.total_chunks:20d}""" if not self.txn_active: return if self.files is not None: + ttl = int(os.environ.get('BORG_FILES_CACHE_TTL', 20)) with open(os.path.join(self.path, 'files'), 'wb') as fd: for path_hash, item in self.files.items(): # Discard cached files with the newest mtime to avoid # issues with filesystem snapshots and mtime precision item = msgpack.unpackb(item) - if item[0] < 10 and bigint_to_int(item[3]) < self._newest_mtime: + if item[0] < ttl and bigint_to_int(item[3]) < self._newest_mtime: msgpack.pack((path_hash, item), fd) self.config.set('cache', 'manifest', hexlify(self.manifest.id).decode('ascii')) self.config.set('cache', 'timestamp', self.manifest.timestamp) diff --git a/docs/faq.rst b/docs/faq.rst index 0ea3f0798..5a2d1989d 100644 --- a/docs/faq.rst +++ b/docs/faq.rst @@ -345,6 +345,30 @@ those files are reported as being added when, really, chunks are already used. +It always chunks all my files, even unchanged ones! +--------------------------------------------------- + +|project_name| maintains a files cache where it remembers the mtime, size and +inode of files. When |project_name| does a new backup and starts processing a +file, it first looks whether the file has changed (compared to the values +stored in the files cache). If the values are the same, the file is assumed +unchanged and thus its contents won't get chunked (again). 
+ +|project_name| can't keep an infinite history of files of course, thus entries +in the files cache have a "maximum time to live" which is set via the +environment variable BORG_FILES_CACHE_TTL (and defaults to 20). +Every time you do a backup (on the same machine, using the same user), the +cache entries' ttl values of files that were not "seen" are incremented by 1 +and once an entry's value reaches BORG_FILES_CACHE_TTL, that entry is removed from the cache. + +So, for example, if you do daily backups of 26 different data sets A, B, +C, ..., Z on one machine (using the default TTL), the files from A will +already be forgotten when you repeat the same backups on the next day and it +will be slow because it has to chunk all the files again each time. If you set +BORG_FILES_CACHE_TTL to at least 26 (or maybe even a small multiple of that), +it will be much faster. + + Is there a way to limit bandwidth with |project_name|? ------------------------------------------------------ diff --git a/docs/usage.rst b/docs/usage.rst index fd3027526..82da1f978 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -86,6 +86,9 @@ General: BORG_REMOTE_PATH When set, use the given path/filename as remote path (default is "borg"). Using ``--remote-path PATH`` commandline option overrides the environment variable. + BORG_FILES_CACHE_TTL + When set to an integer value, this determines the maximum "time to live" for the files cache + entries (default: 20). The files cache is used to quickly determine whether a file is unchanged. TMPDIR where temporary files are stored (might need a lot of temporary space for some operations)