2015-10-02 16:11:10 +00:00
import configparser
2010-10-20 19:08:46 +00:00
import os
2010-12-21 20:29:09 +00:00
import shutil
2017-05-02 17:05:27 +00:00
import stat
2016-05-30 23:18:03 +00:00
from binascii import unhexlify
from collections import namedtuple
2017-06-10 15:59:41 +00:00
from time import perf_counter
2016-05-30 23:18:03 +00:00
2015-10-06 16:33:55 +00:00
from . logger import create_logger
2017-05-02 17:05:27 +00:00
2015-10-06 16:33:55 +00:00
logger = create_logger ( )
2016-05-30 23:18:03 +00:00
2021-02-23 21:56:38 +00:00
files_cache_logger = create_logger ( " borg.debug.files_cache " )
2023-09-15 20:19:29 +00:00
from . constants import CACHE_README , FILES_CACHE_MODE_DISABLED , ROBJ_FILE_STREAM
2017-03-07 14:13:59 +00:00
from . hashindex import ChunkIndex , ChunkIndexEntry , CacheSynchronizer
2016-10-31 04:53:01 +00:00
from . helpers import Location
2016-05-30 22:33:13 +00:00
from . helpers import Error
2016-11-27 11:39:49 +00:00
from . helpers import get_cache_dir , get_security_dir
2022-05-04 08:34:33 +00:00
from . helpers import bin_to_hex , parse_stringified_list
2016-05-30 22:33:13 +00:00
from . helpers import format_file_size
2017-03-15 17:54:34 +00:00
from . helpers import safe_ns
2019-03-03 14:51:40 +00:00
from . helpers import yes
2016-11-26 20:15:59 +00:00
from . helpers import remove_surrogates
from . helpers import ProgressIndicatorPercent , ProgressIndicatorMessage
2017-05-25 13:54:38 +00:00
from . helpers import set_ec , EXIT_WARNING
2022-02-15 18:39:58 +00:00
from . helpers import safe_unlink
2018-07-01 00:34:48 +00:00
from . helpers import msgpack
2022-05-04 08:34:33 +00:00
from . helpers . msgpack import int_to_timestamp , timestamp_to_int
2017-05-02 17:05:27 +00:00
from . item import ArchiveItem , ChunkListEntry
from . crypto . key import PlaintextKey
2017-05-25 12:00:03 +00:00
from . crypto . file_integrity import IntegrityCheckedFile , DetachedIntegrityCheckedFile , FileIntegrityError
2016-07-23 11:56:06 +00:00
from . locking import Lock
2022-08-13 19:55:12 +00:00
from . manifest import Manifest
2016-07-09 19:10:46 +00:00
from . platform import SaveFile
2016-05-30 23:18:03 +00:00
from . remote import cache_if_remote
2017-06-10 15:59:41 +00:00
from . repository import LIST_SCAN_LIMIT
2015-10-08 21:03:35 +00:00
# (historical changelog note, kept for reference)
# implement files cache mode control, fixes #911
# You can now control the files cache mode using this option:
#     --files-cache={ctime,mtime,size,inode,rechunk,disabled}*
# (only some combinations are supported)
# Previously, only these modes were supported:
# - mtime,size,inode (default of borg < 1.1.0rc4)
# - mtime,size (by using --ignore-inode)
# - disabled (by using --no-files-cache)
# Now, you additionally get:
# - ctime alternatively to mtime (safer), e.g.:
#   ctime,size,inode (this is the new default of borg >= 1.1.0rc4)
# - rechunk (consider all files as changed, rechunk them)
# Deprecated:
# - --ignore-inodes (use modes without "inode")
# - --no-files-cache (use "disabled" mode)
# The tests needed some changes:
# - previously, we used os.utime() to set a file's mtime (atime) to specific
#   values, but that does not work for ctime.
# - now we use time.sleep() to create the "latest file" that usually does
#   not end up in the files cache (see FAQ)
2017-09-11 00:54:52 +00:00
# note: cmtime might me either a ctime or a mtime timestamp
FileCacheEntry = namedtuple ( " FileCacheEntry " , " age inode size cmtime chunk_ids " )
2016-04-16 15:48:47 +00:00
2010-03-06 17:25:35 +00:00
2016-11-27 11:39:49 +00:00
class SecurityManager:
    """
    Tracks repositories. Ensures that nothing bad happens (repository swaps,
    replay attacks, unknown repositories etc.).

    This is complicated by the Cache being initially used for this, while
    only some commands actually use the Cache, which meant that other commands
    did not perform these checks.

    Further complications were created by the Cache being a cache, so it
    could be legitimately deleted, which is annoying because Borg didn't
    recognize repositories after that.

    Therefore a second location, the security database (see get_security_dir),
    was introduced which stores this information. However, this means that
    the code has to deal with a cache existing but no security DB entry,
    or inconsistencies between the security DB and the cache which have to
    be reconciled, and also with no cache existing but a security DB entry.
    """

    def __init__(self, repository):
        self.repository = repository
        self.dir = get_security_dir(repository.id_str, legacy=(repository.version == 1))
        self.cache_dir = cache_dir(repository)
        # the three state files that together make a repository "known" (see known()):
        self.key_type_file = os.path.join(self.dir, "key-type")
        self.location_file = os.path.join(self.dir, "location")
        self.manifest_ts_file = os.path.join(self.dir, "manifest-timestamp")

    @staticmethod
    def destroy(repository, path=None):
        """destroy the security dir for ``repository`` or at ``path``"""
        path = path or get_security_dir(repository.id_str, legacy=(repository.version == 1))
        if os.path.exists(path):
            shutil.rmtree(path)

    def known(self):
        """Return True if all three security state files exist for this repository."""
        return all(os.path.exists(f) for f in (self.key_type_file, self.location_file, self.manifest_ts_file))

    def key_matches(self, key):
        """Return True if the stored key type matches the type of *key*."""
        if not self.known():
            return False
        try:
            with open(self.key_type_file) as fd:
                stored_key_type = fd.read()
            return stored_key_type == str(key.TYPE)
        except OSError as exc:
            logger.warning("Could not read/parse key type file: %s", exc)
            # fix: previously fell through and implicitly returned None;
            # make the "no match" result explicit.
            return False

    def save(self, manifest, key):
        """Persist current repository location, key type and manifest timestamp."""
        logger.debug("security: saving state for %s to %s", self.repository.id_str, self.dir)
        current_location = self.repository._location.canonical_path()
        logger.debug("security: current location %s", current_location)
        logger.debug("security: key type %s", str(key.TYPE))
        logger.debug("security: manifest timestamp %s", manifest.timestamp)
        with SaveFile(self.location_file) as fd:
            fd.write(current_location)
        with SaveFile(self.key_type_file) as fd:
            fd.write(str(key.TYPE))
        with SaveFile(self.manifest_ts_file) as fd:
            fd.write(manifest.timestamp)

    def assert_location_matches(self, cache_config=None):
        """Warn (and ask) the user before sending data to a relocated repository."""
        # Warn user before sending data to a relocated repository
        try:
            with open(self.location_file) as fd:
                previous_location = fd.read()
            logger.debug("security: read previous location %r", previous_location)
        except FileNotFoundError:
            logger.debug("security: previous location file %s not found", self.location_file)
            previous_location = None
        except OSError as exc:
            logger.warning("Could not read previous location file: %s", exc)
            previous_location = None
        if cache_config and cache_config.previous_location and previous_location != cache_config.previous_location:
            # Reconcile cache and security dir; we take the cache location.
            previous_location = cache_config.previous_location
            logger.debug("security: using previous_location of cache: %r", previous_location)

        repository_location = self.repository._location.canonical_path()
        if previous_location and previous_location != repository_location:
            msg = (
                "Warning: The repository at location {} was previously located at {}\n".format(
                    repository_location, previous_location
                )
                + "Do you want to continue? [yN] "
            )
            if not yes(
                msg,
                false_msg="Aborting.",
                invalid_msg="Invalid answer, aborting.",
                retry=False,
                env_var_override="BORG_RELOCATED_REPO_ACCESS_IS_OK",
            ):
                raise Cache.RepositoryAccessAborted()
            # adapt on-disk config immediately if the new location was accepted
            logger.debug("security: updating location stored in cache and security dir")
            with SaveFile(self.location_file) as fd:
                fd.write(repository_location)
            if cache_config:
                cache_config.save()

    def assert_no_manifest_replay(self, manifest, key, cache_config=None):
        """Detect a repository whose manifest is older than previously seen (replay attack)."""
        try:
            with open(self.manifest_ts_file) as fd:
                timestamp = fd.read()
            logger.debug("security: read manifest timestamp %r", timestamp)
        except FileNotFoundError:
            logger.debug("security: manifest timestamp file %s not found", self.manifest_ts_file)
            timestamp = ""
        except OSError as exc:
            # fix: this warning previously (incorrectly) said "previous location file",
            # a copy-paste from assert_location_matches - it reads the manifest ts file.
            logger.warning("Could not read manifest timestamp file: %s", exc)
            timestamp = ""
        if cache_config:
            # use the newest timestamp we know about (security dir vs. cache config)
            timestamp = max(timestamp, cache_config.timestamp or "")
        logger.debug("security: determined newest manifest timestamp as %s", timestamp)
        # If repository is older than the cache or security dir something fishy is going on
        if timestamp and timestamp > manifest.timestamp:
            if isinstance(key, PlaintextKey):
                raise Cache.RepositoryIDNotUnique()
            else:
                raise Cache.RepositoryReplay()

    def assert_key_type(self, key, cache_config=None):
        """Make sure an encrypted repository has not been swapped for an unencrypted repository."""
        if cache_config and cache_config.key_type is not None and cache_config.key_type != str(key.TYPE):
            raise Cache.EncryptionMethodMismatch()
        if self.known() and not self.key_matches(key):
            raise Cache.EncryptionMethodMismatch()

    def assert_secure(self, manifest, key, *, cache_config=None, warn_if_unencrypted=True, lock_wait=None):
        """Run all security checks; remember the repository if everything is ok."""
        # warn_if_unencrypted=False is only used for initializing a new repository.
        # Thus, avoiding asking about a repository that's currently initializing.
        self.assert_access_unknown(warn_if_unencrypted, manifest, key)
        if cache_config:
            self._assert_secure(manifest, key, cache_config)
        else:
            cache_config = CacheConfig(self.repository, lock_wait=lock_wait)
            if cache_config.exists():
                with cache_config:
                    self._assert_secure(manifest, key, cache_config)
            else:
                self._assert_secure(manifest, key)
        logger.debug("security: repository checks ok, allowing access")

    def _assert_secure(self, manifest, key, cache_config=None):
        self.assert_location_matches(cache_config)
        self.assert_key_type(key, cache_config)
        self.assert_no_manifest_replay(manifest, key, cache_config)
        if not self.known():
            logger.debug("security: remembering previously unknown repository")
            self.save(manifest, key)

    def assert_access_unknown(self, warn_if_unencrypted, manifest, key):
        """Ask the user before accessing a previously unknown, unencrypted repository."""
        # warn_if_unencrypted=False is only used for initializing a new repository.
        # Thus, avoiding asking about a repository that's currently initializing.
        if not key.logically_encrypted and not self.known():
            msg = (
                "Warning: Attempting to access a previously unknown unencrypted repository!\n"
                + "Do you want to continue? [yN] "
            )
            allow_access = not warn_if_unencrypted or yes(
                msg,
                false_msg="Aborting.",
                invalid_msg="Invalid answer, aborting.",
                retry=False,
                env_var_override="BORG_UNKNOWN_UNENCRYPTED_REPO_ACCESS_IS_OK",
            )
            if allow_access:
                if warn_if_unencrypted:
                    logger.debug("security: remembering unknown unencrypted repository (explicitly allowed)")
                else:
                    logger.debug("security: initializing unencrypted repository")
                self.save(manifest, key)
            else:
                raise Cache.CacheInitAbortedError()
2018-07-15 08:46:14 +00:00
def assert_secure(repository, manifest, lock_wait):
    """Module-level convenience wrapper: run all security checks for *repository*."""
    manager = SecurityManager(repository)
    manager.assert_secure(manifest, manifest.key, lock_wait=lock_wait)
2017-05-10 13:30:51 +00:00
def recanonicalize_relative_location(cache_location, repository):
    """Return the repo's canonical location if *cache_location* only differs by
    the old (borg < 1.0.8rc1) relative-path canonicalization, else return
    *cache_location* unchanged.
    """
    # borg < 1.0.8rc1 had different canonicalization for the repo location (see #1655 and #1741).
    repo_location = repository._location.canonical_path()
    rl = Location(repo_location)
    cl = Location(cache_location)
    endpoint_same = cl.proto == rl.proto and cl.user == rl.user and cl.host == rl.host and cl.port == rl.port
    paths_equivalent = (
        bool(cl.path)
        and bool(rl.path)
        and cl.path.startswith("/~/")
        and rl.path.startswith("/./")
        and cl.path[3:] == rl.path[3:]
    )
    if endpoint_same and paths_equivalent:
        # everything is the same except the expected change in relative path canonicalization,
        # update previous_location to avoid warning / user query about changed location:
        return repo_location
    return cache_location
def cache_dir(repository, path=None):
    """Return the cache directory for *repository*; *path* overrides if given."""
    if path:
        return path
    return os.path.join(get_cache_dir(), repository.id_str)
2020-10-21 18:33:14 +00:00
def files_cache_name():
    """Return the files cache file name, honoring $BORG_FILES_CACHE_SUFFIX."""
    suffix = os.environ.get("BORG_FILES_CACHE_SUFFIX", "")
    if suffix:
        return "files." + suffix
    return "files"
2022-04-18 07:03:37 +00:00
def discover_files_cache_name(path):
    """Return the name of the files cache file found in *path*.

    Raises IndexError if no "files" / "files.*" entry exists there.
    """
    candidates = [fn for fn in os.listdir(path) if fn == "files" or fn.startswith("files.")]
    return candidates[0]
2017-05-10 13:30:51 +00:00
class CacheConfig:
    """
    Manages the "config" file of a local repository cache, including the
    cache lock. Usable as a context manager (acquires the lock and loads
    the config on enter, releases the lock on exit).
    """

    def __init__(self, repository, path=None, lock_wait=None):
        """
        :param repository: the Repository this cache belongs to
        :param path: cache directory override (default: derived via cache_dir())
        :param lock_wait: timeout for lock acquisition (int [s] or None [wait forever])
        """
        self.repository = repository
        self.path = cache_dir(repository, path)
        logger.debug("Using %s as cache", self.path)
        self.config_path = os.path.join(self.path, "config")
        self.lock = None
        self.lock_wait = lock_wait

    def __enter__(self):
        self.open()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def exists(self):
        # the cache exists iff its config file exists
        return os.path.exists(self.config_path)

    def create(self):
        """Create a new, empty cache config file (must not exist yet)."""
        assert not self.exists()
        config = configparser.ConfigParser(interpolation=None)
        config.add_section("cache")
        config.set("cache", "version", "1")
        config.set("cache", "repository", self.repository.id_str)
        config.set("cache", "manifest", "")
        config.add_section("integrity")
        config.set("integrity", "manifest", "")
        with SaveFile(self.config_path) as fd:
            config.write(fd)

    def open(self):
        """Acquire the cache lock exclusively, then load the config."""
        self.lock = Lock(os.path.join(self.path, "lock"), exclusive=True, timeout=self.lock_wait).acquire()
        self.load()

    def load(self):
        """Parse the config file and populate id / manifest_id / features / integrity state."""
        self._config = configparser.ConfigParser(interpolation=None)
        with open(self.config_path) as fd:
            self._config.read_file(fd)
        self._check_upgrade(self.config_path)
        self.id = self._config.get("cache", "repository")
        self.manifest_id = unhexlify(self._config.get("cache", "manifest"))
        self.timestamp = self._config.get("cache", "timestamp", fallback=None)
        self.key_type = self._config.get("cache", "key_type", fallback=None)
        self.ignored_features = set(parse_stringified_list(self._config.get("cache", "ignored_features", fallback="")))
        self.mandatory_features = set(
            parse_stringified_list(self._config.get("cache", "mandatory_features", fallback=""))
        )
        try:
            self.integrity = dict(self._config.items("integrity"))
            if self._config.get("cache", "manifest") != self.integrity.pop("manifest"):
                # The cache config file is updated (parsed with ConfigParser, the state of the ConfigParser
                # is modified and then written out.), not re-created.
                # Thus, older versions will leave our [integrity] section alone, making the section's data invalid.
                # Therefore, we also add the manifest ID to this section and
                # can discern whether an older version interfered by comparing the manifest IDs of this section
                # and the main [cache] section.
                self.integrity = {}
                logger.warning("Cache integrity data not available: old Borg version modified the cache.")
        except configparser.NoSectionError:
            logger.debug("Cache integrity: No integrity data found (files, chunks). Cache is from old version.")
            self.integrity = {}
        previous_location = self._config.get("cache", "previous_location", fallback=None)
        if previous_location:
            self.previous_location = recanonicalize_relative_location(previous_location, self.repository)
        else:
            self.previous_location = None
        # remember the current location for the next run (written out by save()):
        self._config.set("cache", "previous_location", self.repository._location.canonical_path())

    def save(self, manifest=None, key=None):
        """Write the config back to disk, optionally updating manifest/key-derived fields."""
        if manifest:
            self._config.set("cache", "manifest", manifest.id_str)
            self._config.set("cache", "timestamp", manifest.timestamp)
            self._config.set("cache", "ignored_features", ",".join(self.ignored_features))
            self._config.set("cache", "mandatory_features", ",".join(self.mandatory_features))
            if not self._config.has_section("integrity"):
                self._config.add_section("integrity")
            for file, integrity_data in self.integrity.items():
                self._config.set("integrity", file, integrity_data)
            # see load(): the manifest id doubles as a marker that [integrity] is in sync
            self._config.set("integrity", "manifest", manifest.id_str)
        if key:
            self._config.set("cache", "key_type", str(key.TYPE))
        with SaveFile(self.config_path) as fd:
            self._config.write(fd)

    def close(self):
        """Release the cache lock; safe to call multiple times."""
        if self.lock is not None:
            self.lock.release()
            self.lock = None

    def _check_upgrade(self, config_path):
        """Raise if the on-disk cache version is not the one this code understands."""
        try:
            cache_version = self._config.getint("cache", "version")
            wanted_version = 1
            if cache_version != wanted_version:
                self.close()
                raise Exception(
                    "%s has unexpected cache version %d (wanted: %d)." % (config_path, cache_version, wanted_version)
                )
        except configparser.NoSectionError:
            self.close()
            raise Exception("%s does not look like a Borg cache." % config_path) from None
2015-03-17 22:03:36 +00:00
class Cache:
    """Client Side cache"""

    class RepositoryIDNotUnique(Error):
        """Cache is newer than repository - do you have multiple, independently updated repos with same ID?"""

    class RepositoryReplay(Error):
        """Cache, or information obtained from the security directory is newer than repository - this is either an attack or unsafe (multiple repos with same ID)"""

    class CacheInitAbortedError(Error):
        """Cache initialization aborted"""

    class RepositoryAccessAborted(Error):
        """Repository access aborted"""

    class EncryptionMethodMismatch(Error):
        """Repository encryption method changed since last access, refusing to continue"""

    @staticmethod
    def break_lock(repository, path=None):
        """Forcibly break the cache lock for ``repository`` (or the cache at ``path``)."""
        path = cache_dir(repository, path)
        Lock(os.path.join(path, "lock"), exclusive=True).break_lock()

    @staticmethod
    def destroy(repository, path=None):
        """destroy the cache for ``repository`` or at ``path``"""
        path = path or os.path.join(get_cache_dir(), repository.id_str)
        config = os.path.join(path, "config")
        if os.path.exists(config):
            os.remove(config)  # kill config first
            shutil.rmtree(path)

    def __new__(
        cls,
        repository,
        manifest,
        path=None,
        sync=True,
        warn_if_unencrypted=True,
        progress=False,
        lock_wait=None,
        permit_adhoc_cache=False,
        cache_mode=FILES_CACHE_MODE_DISABLED,
        iec=False,
    ):
        """Factory: return a LocalCache or an AdHocCache, depending on
        whether an ad-hoc cache is permitted and whether the local cache
        is in sync with the manifest.
        """

        def local():
            # full, persistent client-side cache
            return LocalCache(
                manifest=manifest,
                path=path,
                sync=sync,
                warn_if_unencrypted=warn_if_unencrypted,
                progress=progress,
                iec=iec,
                lock_wait=lock_wait,
                cache_mode=cache_mode,
            )

        def adhoc():
            # transient cache built on the fly
            return AdHocCache(manifest=manifest, lock_wait=lock_wait, iec=iec)

        if not permit_adhoc_cache:
            return local()

        # ad-hoc cache may be permitted, but if the local cache is in sync it'd be stupid to invalidate
        # it by needlessly using the ad-hoc cache.
        # Check if the local cache exists and is in sync.
        cache_config = CacheConfig(repository, path, lock_wait)
        if cache_config.exists():
            with cache_config:
                cache_in_sync = cache_config.manifest_id == manifest.id
            # Don't nest cache locks
            if cache_in_sync:
                # Local cache is in sync, use it
                logger.debug("Cache: choosing local cache (in sync)")
                return local()
        logger.debug("Cache: choosing ad-hoc cache (local cache does not exist or is not in sync)")
        return adhoc()
class CacheStatsMixin:
    """Mixin providing aggregated size/chunk statistics and their formatting.

    Expects the host class to provide ``self.chunks`` (a ChunkIndex) and
    ``self.manifest`` - TODO confirm against LocalCache/AdHocCache.
    """

    str_format = """\
Original size: {0.total_size}
Deduplicated size: {0.unique_size}
Unique chunks: {0.total_unique_chunks}
Total chunks: {0.total_chunks}
"""

    def __init__(self, iec=False):
        # iec: if True, sizes are formatted with IEC (binary) units
        self.iec = iec

    def __str__(self):
        return self.str_format.format(self.format_tuple())

    Summary = namedtuple("Summary", ["total_size", "unique_size", "total_unique_chunks", "total_chunks"])

    def stats(self):
        """Return a dict with total/unique sizes and chunk counts over all archives."""
        from .archive import Archive

        # XXX: this should really be moved down to `hashindex.pyx`
        total_size, unique_size, total_unique_chunks, total_chunks = self.chunks.summarize()
        # since borg 1.2 we have new archive metadata telling the total size per archive,
        # so we can just sum up all archives to get the "all archives" stats
        # (the total_size from summarize() above is deliberately discarded):
        total_size = 0
        for archive_name in self.manifest.archives:
            archive = Archive(self.manifest, archive_name)
            stats = archive.calc_stats(self, want_unique=False)
            total_size += stats.osize
        stats = self.Summary(total_size, unique_size, total_unique_chunks, total_chunks)._asdict()
        return stats

    def format_tuple(self):
        """Return a Summary namedtuple with the size fields human-formatted."""
        stats = self.stats()
        for field in ["total_size", "unique_size"]:
            stats[field] = format_file_size(stats[field], iec=self.iec)
        return self.Summary(**stats)
class LocalCache ( CacheStatsMixin ) :
"""
Persistent , local ( client - side ) cache .
"""
2018-03-08 02:39:38 +00:00
def __init__(
    self,
    manifest,
    path=None,
    sync=True,
    warn_if_unencrypted=True,
    progress=False,
    lock_wait=None,
    cache_mode=FILES_CACHE_MODE_DISABLED,
    iec=False,
):
    """
    :param warn_if_unencrypted: print warning if accessing unknown unencrypted repository
    :param lock_wait: timeout for lock acquisition (int [s] or None [wait forever])
    :param sync: do :meth:`.sync`
    :param cache_mode: what shall be compared in the file stat infos vs. cached stat infos comparison
    """
    CacheStatsMixin.__init__(self, iec=iec)
    assert isinstance(manifest, Manifest)
    self.manifest = manifest
    self.repository = manifest.repository
    self.key = manifest.key
    self.repo_objs = manifest.repo_objs
    self.progress = progress
    self.cache_mode = cache_mode
    self.timestamp = None
    self.txn_active = False

    self.path = cache_dir(self.repository, path)
    self.security_manager = SecurityManager(self.repository)
    self.cache_config = CacheConfig(self.repository, self.path, lock_wait)

    # Warn user before sending data to a never seen before unencrypted repository
    if not os.path.exists(self.path):
        self.security_manager.assert_access_unknown(warn_if_unencrypted, manifest, self.key)
        self.create()
    self.open()
    try:
        self.security_manager.assert_secure(manifest, self.key, cache_config=self.cache_config)

        # wipe and rebuild the cache if it is incompatible with this borg version
        if not self.check_cache_compatibility():
            self.wipe_cache()

        self.update_compatibility()

        # only sync if the cached manifest id differs from the repo's manifest id
        if sync and self.manifest.id != self.cache_config.manifest_id:
            self.sync()
            self.commit()
    except:  # noqa
        # make sure the lock is released even if anything above fails
        self.close()
        raise
2010-03-06 17:25:35 +00:00
2016-01-17 00:09:13 +00:00
def __enter__(self):
    # context manager entry; the cache was already opened in __init__
    return self
def __exit__(self, exc_type, exc_val, exc_tb):
    # context manager exit: always close (releases the cache lock)
    self.close()
2010-12-21 20:29:09 +00:00
def create(self):
    """Create a new empty cache at `self.path`"""
    os.makedirs(self.path)
    readme_path = os.path.join(self.path, "README")
    with open(readme_path, "w") as fd:
        fd.write(CACHE_README)
    self.cache_config.create()
    # start with an empty chunk index ...
    ChunkIndex().write(os.path.join(self.path, "chunks"))
    os.makedirs(os.path.join(self.path, "chunks.archive.d"))
    # ... and an empty files cache
    with SaveFile(os.path.join(self.path, files_cache_name()), binary=True):
        pass
2010-12-21 20:29:09 +00:00
2016-10-18 19:36:23 +00:00
def _do_open(self):
    """Load the cache config, the chunks index and (unless disabled) the files cache."""
    self.cache_config.load()
    with IntegrityCheckedFile(
        path=os.path.join(self.path, "chunks"),
        write=False,
        integrity_data=self.cache_config.integrity.get("chunks"),
    ) as fd:
        self.chunks = ChunkIndex.read(fd)
    if "d" in self.cache_mode:  # d(isabled)
        self.files = None
    else:
        self._read_files()
2011-07-02 18:39:35 +00:00
2017-05-10 13:30:51 +00:00
def open(self):
    """Open the cache: sanity-check the path, acquire the config lock, roll back to last committed state."""
    if not os.path.isdir(self.path):
        raise Exception("%s Does not look like a Borg cache" % self.path)
    self.cache_config.open()
    self.rollback()
2013-06-24 20:41:05 +00:00
def close(self):
    """Close the cache, releasing the config lock. Idempotent."""
    cfg = self.cache_config
    if cfg is None:
        return
    cfg.close()
    self.cache_config = None
2011-07-02 18:39:35 +00:00
def _read_files(self):
    """Load the files cache from disk into ``self.files``.

    On any read/parse/integrity error, log a warning and fall back to an
    empty files cache (degraded performance, but still correct).
    """
    self.files = {}
    self._newest_cmtime = None
    logger.debug("Reading files cache ...")
    files_cache_logger.debug("FILES-CACHE-LOAD: starting...")
    msg = None
    try:
        with IntegrityCheckedFile(
            path=os.path.join(self.path, files_cache_name()),
            write=False,
            integrity_data=self.cache_config.integrity.get(files_cache_name()),
        ) as fd:
            u = msgpack.Unpacker(use_list=True)
            while True:
                # stream-decode the file in 64 KiB chunks
                data = fd.read(64 * 1024)
                if not data:
                    break
                u.feed(data)
                try:
                    for path_hash, item in u:
                        entry = FileCacheEntry(*item)
                        # in the end, this takes about 240 Bytes per file
                        self.files[path_hash] = msgpack.packb(entry._replace(age=entry.age + 1))
                except (TypeError, ValueError) as exc:
                    msg = "The files cache seems invalid. [%s]" % str(exc)
                    break
    except OSError as exc:
        msg = "The files cache can't be read. [%s]" % str(exc)
    except FileIntegrityError as fie:
        msg = "The files cache is corrupted. [%s]" % str(fie)
    if msg is not None:
        logger.warning(msg)
        logger.warning("Continuing without files cache - expect lower performance.")
        self.files = {}
    files_cache_logger.debug("FILES-CACHE-LOAD: finished, %d entries loaded.", len(self.files))
2010-12-21 20:29:09 +00:00
def begin_txn(self):
    """Start a cache transaction.

    Snapshots config, chunks index and files cache into "txn.tmp", then
    atomically renames the directory to "txn.active" so rollback() can
    restore the pre-transaction state.
    """
    progress = ProgressIndicatorMessage(msgid="cache.begin_transaction")
    snapshot_dir = os.path.join(self.path, "txn.tmp")
    os.mkdir(snapshot_dir)
    progress.output("Initializing cache transaction: Reading config")
    shutil.copy(os.path.join(self.path, "config"), snapshot_dir)
    progress.output("Initializing cache transaction: Reading chunks")
    shutil.copy(os.path.join(self.path, "chunks"), snapshot_dir)
    progress.output("Initializing cache transaction: Reading files")
    fc_name = files_cache_name()
    try:
        shutil.copy(os.path.join(self.path, fc_name), snapshot_dir)
    except FileNotFoundError:
        # no files cache on disk yet - snapshot an empty one
        with SaveFile(os.path.join(snapshot_dir, fc_name), binary=True):
            pass
    os.replace(snapshot_dir, os.path.join(self.path, "txn.active"))
    self.txn_active = True
    progress.finish()
2010-12-21 20:29:09 +00:00
def commit(self):
    """Commit the active transaction.

    Persists the files cache (with TTL-based expiry), the chunks index and
    the cache config, then retires the "txn.active" snapshot directory.
    No-op if no transaction is active.
    """
    if not self.txn_active:
        return
    self.security_manager.save(self.manifest, self.key)
    progress = ProgressIndicatorMessage(msgid="cache.commit")
    if self.files is not None:
        if self._newest_cmtime is None:
            # was never set because no files were modified/added
            self._newest_cmtime = 2**63 - 1  # nanoseconds, good until y2262
        ttl = int(os.environ.get("BORG_FILES_CACHE_TTL", 20))
        progress.output("Saving files cache")
        files_cache_logger.debug("FILES-CACHE-SAVE: starting...")
        with IntegrityCheckedFile(path=os.path.join(self.path, files_cache_name()), write=True) as fd:
            saved = 0
            for path_hash, item in self.files.items():
                entry = FileCacheEntry(*msgpack.unpackb(item))
                # Keep entries seen in this backup (age == 0) only if they are older than the newest
                # cmtime seen in this backup - avoids trouble with fs snapshots and cmtime granularity.
                # Keep entries from earlier backups while they have not reached BORG_FILES_CACHE_TTL.
                fresh_and_old_enough = entry.age == 0 and timestamp_to_int(entry.cmtime) < self._newest_cmtime
                aging_but_alive = 0 < entry.age < ttl
                if fresh_and_old_enough or aging_but_alive:
                    msgpack.pack((path_hash, entry), fd)
                    saved += 1
            files_cache_logger.debug("FILES-CACHE-KILL: removed all old entries with age >= TTL [%d]", ttl)
            files_cache_logger.debug(
                "FILES-CACHE-KILL: removed all current entries with newest cmtime %d", self._newest_cmtime
            )
            files_cache_logger.debug("FILES-CACHE-SAVE: finished, %d remaining entries saved.", saved)
        self.cache_config.integrity[files_cache_name()] = fd.integrity_data
    progress.output("Saving chunks cache")
    with IntegrityCheckedFile(path=os.path.join(self.path, "chunks"), write=True) as fd:
        self.chunks.write(fd)
    self.cache_config.integrity["chunks"] = fd.integrity_data
    progress.output("Saving cache config")
    self.cache_config.save(self.manifest, self.key)
    os.replace(os.path.join(self.path, "txn.active"), os.path.join(self.path, "txn.tmp"))
    shutil.rmtree(os.path.join(self.path, "txn.tmp"))
    self.txn_active = False
    progress.finish()
2010-12-21 20:29:09 +00:00
def rollback(self):
    """Roll back partial and aborted transactions."""
    tmp_dir = os.path.join(self.path, "txn.tmp")
    # throw away a partially prepared transaction snapshot
    if os.path.exists(tmp_dir):
        shutil.rmtree(tmp_dir)
    # restore the snapshot taken by begin_txn(), if a transaction was active
    active_dir = os.path.join(self.path, "txn.active")
    if os.path.exists(active_dir):
        shutil.copy(os.path.join(active_dir, "config"), self.path)
        shutil.copy(os.path.join(active_dir, "chunks"), self.path)
        shutil.copy(os.path.join(active_dir, discover_files_cache_name(active_dir)), self.path)
        # retire the snapshot dir, then remove it
        os.replace(active_dir, tmp_dir)
        if os.path.exists(tmp_dir):
            shutil.rmtree(tmp_dir)
    self.txn_active = False
    self._do_open()
2010-12-21 20:29:09 +00:00
def sync(self):
    """Re-synchronize chunks cache with repository.

    Maintains a directory with known backup archive indexes, so it only
    needs to fetch infos from repo and build a chunk index once per backup
    archive.
    If out of sync, missing archive indexes get added, outdated indexes
    get removed and a new master chunks index is built by merging all
    archive indexes.
    """
    archive_path = os.path.join(self.path, "chunks.archive.d")
    # Instrumentation
    processed_item_metadata_bytes = 0
    processed_item_metadata_chunks = 0
    compact_chunks_archive_saved_space = 0

    def mkpath(id, suffix=""):
        # Build the on-disk path for the cached index of archive *id* (hex id + optional suffix).
        id_hex = bin_to_hex(id)
        path = os.path.join(archive_path, id_hex + suffix)
        return path

    def cached_archives():
        # Return the set of archive ids that have a cached chunk index on disk
        # (empty set when archive index caching is disabled).
        if self.do_cache:
            fns = os.listdir(archive_path)
            # filenames with 64 hex digits == 256bit,
            # or compact indices which are 64 hex digits + ".compact"
            return {unhexlify(fn) for fn in fns if len(fn) == 64} | {
                unhexlify(fn[:64]) for fn in fns if len(fn) == 72 and fn.endswith(".compact")
            }
        else:
            return set()

    def repo_archives():
        # Return the set of archive ids currently present in the repository manifest.
        return {info.id for info in self.manifest.archives.list()}

    def cleanup_outdated(ids):
        # Remove cached indexes for archives that no longer exist in the repository.
        for id in ids:
            cleanup_cached_archive(id)

    def cleanup_cached_archive(id, cleanup_compact=True):
        # Delete the cached (non-compact, and optionally compact) index files for *id*,
        # including their detached .integrity files; missing files are ignored.
        try:
            os.unlink(mkpath(id))
            os.unlink(mkpath(id) + ".integrity")
        except FileNotFoundError:
            pass
        if not cleanup_compact:
            return
        try:
            os.unlink(mkpath(id, suffix=".compact"))
            os.unlink(mkpath(id, suffix=".compact") + ".integrity")
        except FileNotFoundError:
            pass

    def fetch_and_build_idx(archive_id, decrypted_repository, chunk_idx):
        # Fetch an archive's metadata stream from the repository and account all of its
        # chunks (archive chunk, item_ptrs chunks, item metadata chunks, and the file
        # content chunks referenced by the items, via CacheSynchronizer) into *chunk_idx*.
        nonlocal processed_item_metadata_bytes
        nonlocal processed_item_metadata_chunks
        csize, data = decrypted_repository.get(archive_id)
        chunk_idx.add(archive_id, 1, len(data))
        archive, _ = self.key.unpack_and_verify_archive(data)
        archive = ArchiveItem(internal_dict=archive)
        if archive.version not in (1, 2):  # legacy
            raise Exception("Unknown archive metadata version")
        if archive.version == 1:
            items = archive.items
        elif archive.version == 2:
            # v2 archives store item ids indirectly via item_ptrs chunks.
            items = []
            for chunk_id, (csize, data) in zip(archive.item_ptrs, decrypted_repository.get_many(archive.item_ptrs)):
                chunk_idx.add(chunk_id, 1, len(data))
                ids = msgpack.unpackb(data)
                items.extend(ids)
        sync = CacheSynchronizer(chunk_idx)
        for item_id, (csize, data) in zip(items, decrypted_repository.get_many(items)):
            chunk_idx.add(item_id, 1, len(data))
            processed_item_metadata_bytes += len(data)
            processed_item_metadata_chunks += 1
            sync.feed(data)
        if self.do_cache:
            write_archive_index(archive_id, chunk_idx)

    def write_archive_index(archive_id, chunk_idx):
        # Compact *chunk_idx* and persist it as "<hexid>.compact", writing to a .tmp
        # file first and renaming into place only on success.
        nonlocal compact_chunks_archive_saved_space
        compact_chunks_archive_saved_space += chunk_idx.compact()
        fn = mkpath(archive_id, suffix=".compact")
        fn_tmp = mkpath(archive_id, suffix=".tmp")
        try:
            with DetachedIntegrityCheckedFile(
                path=fn_tmp, write=True, filename=bin_to_hex(archive_id) + ".compact"
            ) as fd:
                chunk_idx.write(fd)
        except Exception:
            safe_unlink(fn_tmp)
        else:
            os.replace(fn_tmp, fn)

    def read_archive_index(archive_id, archive_name):
        # Load the cached chunk index for one archive; convert legacy non-compact
        # indexes to compact form. Returns None (and sets a warning exit code) if
        # the cached index is corrupted.
        archive_chunk_idx_path = mkpath(archive_id)
        logger.info("Reading cached archive chunk index for %s", archive_name)
        try:
            try:
                # Attempt to load compact index first
                with DetachedIntegrityCheckedFile(path=archive_chunk_idx_path + ".compact", write=False) as fd:
                    archive_chunk_idx = ChunkIndex.read(fd, permit_compact=True)
                # In case a non-compact index exists, delete it.
                cleanup_cached_archive(archive_id, cleanup_compact=False)
                # Compact index read - return index, no conversion necessary (below).
                return archive_chunk_idx
            except FileNotFoundError:
                # No compact index found, load non-compact index, and convert below.
                with DetachedIntegrityCheckedFile(path=archive_chunk_idx_path, write=False) as fd:
                    archive_chunk_idx = ChunkIndex.read(fd)
        except FileIntegrityError as fie:
            logger.error("Cached archive chunk index of %s is corrupted: %s", archive_name, fie)
            # Delete corrupted index, set warning. A new index must be built.
            cleanup_cached_archive(archive_id)
            set_ec(EXIT_WARNING)
            return None
        # Convert to compact index. Delete the existing index first.
        logger.debug("Found non-compact index for %s, converting to compact.", archive_name)
        cleanup_cached_archive(archive_id)
        write_archive_index(archive_id, archive_chunk_idx)
        return archive_chunk_idx

    def get_archive_ids_to_names(archive_ids):
        # Pass once over all archives and build a mapping from ids to names.
        # The easier approach, doing a similar loop for each archive, has
        # square complexity and does about a dozen million functions calls
        # with 1100 archives (which takes 30s CPU seconds _alone_).
        archive_names = {}
        for info in self.manifest.archives.list():
            if info.id in archive_ids:
                archive_names[info.id] = info.name
        assert len(archive_names) == len(archive_ids)
        return archive_names

    def create_master_idx(chunk_idx):
        # Merge all per-archive indexes (cached or freshly fetched) into one master index.
        logger.debug("Synchronizing chunks index...")
        cached_ids = cached_archives()
        archive_ids = repo_archives()
        logger.info(
            "Cached archive chunk indexes: %d fresh, %d stale, %d need fetching.",
            len(archive_ids & cached_ids),
            len(cached_ids - archive_ids),
            len(archive_ids - cached_ids),
        )
        # deallocates old hashindex, creates empty hashindex:
        chunk_idx.clear()
        cleanup_outdated(cached_ids - archive_ids)
        # Explicitly set the usable initial hash table capacity to avoid performance issues
        # due to hash table "resonance".
        master_index_capacity = len(self.repository)
        if archive_ids:
            chunk_idx = None if not self.do_cache else ChunkIndex(usable=master_index_capacity)
            pi = ProgressIndicatorPercent(
                total=len(archive_ids),
                step=0.1,
                msg="%3.0f%% Syncing chunks index. Processing archive %s.",
                msgid="cache.sync",
            )
            archive_ids_to_names = get_archive_ids_to_names(archive_ids)
            for archive_id, archive_name in archive_ids_to_names.items():
                pi.show(info=[remove_surrogates(archive_name)])  # legacy. borg2 always has pure unicode arch names.
                if self.do_cache:
                    if archive_id in cached_ids:
                        archive_chunk_idx = read_archive_index(archive_id, archive_name)
                        if archive_chunk_idx is None:
                            cached_ids.remove(archive_id)
                    if archive_id not in cached_ids:
                        # Do not make this an else branch; the FileIntegrityError exception handler
                        # above can remove *archive_id* from *cached_ids*.
                        logger.info("Fetching and building archive index for %s.", archive_name)
                        archive_chunk_idx = ChunkIndex()
                        fetch_and_build_idx(archive_id, decrypted_repository, archive_chunk_idx)
                    logger.debug("Merging into master chunks index.")
                    chunk_idx.merge(archive_chunk_idx)
                else:
                    # no caching: accumulate everything directly into the master index
                    chunk_idx = chunk_idx or ChunkIndex(usable=master_index_capacity)
                    logger.info("Fetching archive index for %s.", archive_name)
                    fetch_and_build_idx(archive_id, decrypted_repository, chunk_idx)
            pi.finish()
        logger.debug(
            "Chunks index sync: processed %s (%d chunks) of metadata.",
            format_file_size(processed_item_metadata_bytes),
            processed_item_metadata_chunks,
        )
        logger.debug(
            "Chunks index sync: compact chunks.archive.d storage saved %s bytes.",
            format_file_size(compact_chunks_archive_saved_space),
        )
        logger.debug("Chunks index sync done.")
        return chunk_idx

    # The cache can be used by a command that e.g. only checks against Manifest.Operation.WRITE,
    # which does not have to include all flags from Manifest.Operation.READ.
    # Since the sync will attempt to read archives, check compatibility with Manifest.Operation.READ.
    self.manifest.check_repository_compatibility((Manifest.Operation.READ,))
    self.begin_txn()
    with cache_if_remote(self.repository, decrypted_cache=self.repo_objs) as decrypted_repository:
        # TEMPORARY HACK:
        # to avoid archive index caching, create a FILE named ~/.cache/borg/REPOID/chunks.archive.d -
        # this is only recommended if you have a fast, low latency connection to your repo (e.g. if repo is local).
        self.do_cache = os.path.isdir(archive_path)
        self.chunks = create_master_idx(self.chunks)
2010-03-06 17:25:35 +00:00
2017-05-28 16:04:33 +00:00
def check_cache_compatibility(self):
    """Return True if this cache can be used with this client, False if it must be rebuilt."""
    supported = Manifest.SUPPORTED_REPO_FEATURES
    # A cache built while ignoring a feature this version *does* support may lack chunk
    # references that operations using that feature need - force a rebuild to avoid corruption.
    if self.cache_config.ignored_features & supported:
        return False
    # The cache was built honoring at least one feature this version does not understand;
    # this client might misinterpret it - force a rebuild.
    if not self.cache_config.mandatory_features <= supported:
        return False
    return True
def wipe_cache(self):
    """Throw away the existing (incompatible) cache contents so a full rebuild happens."""
    logger.warning("Discarding incompatible cache and forcing a cache rebuild")
    archive_path = os.path.join(self.path, "chunks.archive.d")
    if os.path.isdir(archive_path):
        # recreate an empty per-archive index directory
        shutil.rmtree(archive_path)
        os.makedirs(archive_path)
    self.chunks = ChunkIndex()
    # truncate the files cache to an empty file
    with SaveFile(os.path.join(self.path, files_cache_name()), binary=True):
        pass
    # forget the manifest we were synced to and all feature bookkeeping
    self.cache_config.manifest_id = ""
    self.cache_config._config.set("cache", "manifest", "")
    self.cache_config.ignored_features = set()
    self.cache_config.mandatory_features = set()
def update_compatibility(self):
    """Record, per the manifest, which repo features we support and which we must ignore."""
    my_features = Manifest.SUPPORTED_REPO_FEATURES
    # union of the mandatory features over all operations
    repo_features = set()
    for features in self.manifest.get_all_mandatory_features().values():
        repo_features.update(features)
    self.cache_config.ignored_features.update(repo_features - my_features)
    self.cache_config.mandatory_features.update(repo_features & my_features)
2023-09-15 20:19:29 +00:00
def add_chunk(
    self,
    id,
    meta,
    data,
    *,
    stats,
    wait=True,
    compress=True,
    size=None,
    ctype=None,
    clevel=None,
    ro_type=ROBJ_FILE_STREAM,
):
    """Store chunk *data* under *id*, deduplicating against already-known chunks.

    Returns a ChunkListEntry(id, size); if the chunk is already stored, only its
    refcount is bumped (see chunk_incref). *size* must be given when *data* is
    already compressed; otherwise it is derived from the (uncompressed) data.
    """
    assert ro_type is not None
    if not self.txn_active:
        self.begin_txn()
    if size is None and compress:
        # data is still uncompressed here, so its length is the plaintext size
        size = len(data)
    refcount = self.seen_chunk(id, size)
    if refcount:
        # already stored: just increment the reference count
        return self.chunk_incref(id, stats)
    if size is None:
        raise ValueError("when giving compressed data for a new chunk, the uncompressed size must be given also")
    packed = self.repo_objs.format(
        id, meta, data, compress=compress, size=size, ctype=ctype, clevel=clevel, ro_type=ro_type
    )
    self.repository.put(id, packed, wait=wait)
    self.chunks.add(id, 1, size)
    stats.update(size, not refcount)
    return ChunkListEntry(id, size)
2010-03-06 17:25:35 +00:00
2015-09-05 23:10:43 +00:00
def seen_chunk(self, id, size=None):
    """Return the refcount of chunk *id* (0 if unknown).

    If *size* is given and a stored size is known, verify they agree -
    a mismatch indicates a hash collision, corruption or a bug.
    """
    refcount, stored_size = self.chunks.get(id, ChunkIndexEntry(0, None))
    if size is not None and stored_size is not None and stored_size != size:
        # we already have a chunk with that id, but different size.
        # this is either a hash collision (unlikely) or corruption or a bug.
        raise Exception(
            "chunk has same id [%r], but different size (stored: %d new: %d)!" % (id, stored_size, size)
        )
    return refcount
2010-03-06 17:25:35 +00:00
2023-01-31 20:05:12 +00:00
def chunk_incref(self, id, stats, size=None):
    """Increment the refcount of already-stored chunk *id* and account for it in *stats*.

    *size* is accepted for interface compatibility; the stored size is authoritative.
    """
    if not self.txn_active:
        self.begin_txn()
    refcount, stored_size = self.chunks.incref(id)
    stats.update(stored_size, False)
    return ChunkListEntry(id, stored_size)
2010-03-06 17:25:35 +00:00
2023-01-31 20:05:12 +00:00
def chunk_decref(self, id, stats, wait=True):
    """Decrement the refcount of chunk *id*; when it drops to zero, remove the chunk
    from the chunks index and delete it from the repository."""
    if not self.txn_active:
        self.begin_txn()
    remaining, size = self.chunks.decref(id)
    if remaining:
        # still referenced elsewhere
        stats.update(-size, False)
    else:
        # last reference gone
        del self.chunks[id]
        self.repository.delete(id, wait=wait)
        stats.update(-size, True)
2021-02-23 21:56:38 +00:00
def file_known_and_unchanged(self, hashed_path, path_hash, st):
    """
    Check whether the file with this path_hash is known (== present in the files cache)
    and unchanged (size / inode number / cmtime match, for whatever this cache_mode checks).

    :param hashed_path: the file's path as we gave it to hash(hashed_path)
    :param path_hash: hash(hashed_path), to save some memory in the files cache
    :param st: the file's stat() result
    :return: known, ids (known is True if we have infos about this file in the cache,
             ids is the list of chunk ids IF the file has not changed, otherwise None).
    """
    if not stat.S_ISREG(st.st_mode):
        # only regular files are tracked in the files cache
        return False, None
    mode = self.cache_mode
    if "d" in mode:  # d(isabled)
        files_cache_logger.debug("UNKNOWN: files cache disabled")
        return False, None
    # r(echunk) does not consult the cache here, but the cache still gets loaded,
    # updated and saved elsewhere, so older entry generations are preserved.
    if "r" in mode:  # r(echunk)
        files_cache_logger.debug("UNKNOWN: rechunking enforced")
        return False, None
    packed = self.files.get(path_hash)
    if not packed:
        files_cache_logger.debug("UNKNOWN: no file metadata in cache for: %r", hashed_path)
        return False, None
    # the file is known - now check whether it is unchanged
    cached = FileCacheEntry(*msgpack.unpackb(packed))
    if "s" in mode and cached.size != st.st_size:
        files_cache_logger.debug("KNOWN-CHANGED: file size has changed: %r", hashed_path)
        return True, None
    if "i" in mode and cached.inode != st.st_ino:
        files_cache_logger.debug("KNOWN-CHANGED: file inode number has changed: %r", hashed_path)
        return True, None
    if "c" in mode and timestamp_to_int(cached.cmtime) != st.st_ctime_ns:
        files_cache_logger.debug("KNOWN-CHANGED: file ctime has changed: %r", hashed_path)
        return True, None
    elif "m" in mode and timestamp_to_int(cached.cmtime) != st.st_mtime_ns:
        files_cache_logger.debug("KNOWN-CHANGED: file mtime has changed: %r", hashed_path)
        return True, None
    # Unchanged. Refresh the cached inode number (it may have been ignored above, e.g.
    # after files moved to a new disk/fs where a one-time inode change is expected) and
    # reset the entry's age, so a future run with inode checking enabled does not
    # rechunk everything.
    self.files[path_hash] = msgpack.packb(cached._replace(inode=st.st_ino, age=0))
    return True, cached.chunk_ids
implement files cache mode control, fixes #911
You can now control the files cache mode using this option:
--files-cache={ctime,mtime,size,inode,rechunk,disabled}*
(only some combinations are supported)
Previously, only these modes were supported:
- mtime,size,inode (default of borg < 1.1.0rc4)
- mtime,size (by using --ignore-inode)
- disabled (by using --no-files-cache)
Now, you additionally get:
- ctime alternatively to mtime (more safe), e.g.:
ctime,size,inode (this is the new default of borg >= 1.1.0rc4)
- rechunk (consider all files as changed, rechunk them)
Deprecated:
- --ignore-inodes (use modes without "inode")
- --no-files-cache (use "disabled" mode)
The tests needed some changes:
- previously, we use os.utime() to set a files mtime (atime) to specific
values, but that does not work for ctime.
- now use time.sleep() to create the "latest file" that usually does
not end up in the files cache (see FAQ)
2017-09-11 00:54:52 +00:00
2021-02-23 21:56:38 +00:00
def memorize_file(self, hashed_path, path_hash, st, ids):
    """Remember a file's fingerprint and chunk list in the files cache.

    :param hashed_path: the (possibly hashed/obfuscated) path, only used for log output
    :param path_hash: key under which the entry is stored in self.files
    :param st: os.stat_result of the file that was just backed up
    :param ids: list of chunk ids making up the file's content
    """
    # only regular files are tracked in the files cache
    if not stat.S_ISREG(st.st_mode):
        return
    cache_mode = self.cache_mode
    # note: r(echunk) modes will update the files cache, d(isabled) mode won't
    if "d" in cache_mode:
        files_cache_logger.debug("FILES-CACHE-NOUPDATE: files cache disabled")
        return
    # pick the timestamp source according to the configured cache mode:
    # 'c' -> ctime (safer, can't be faked by utime()), 'm' -> mtime
    if "c" in cache_mode:
        cmtime_type = "ctime"
        cmtime_ns = safe_ns(st.st_ctime_ns)
    elif "m" in cache_mode:
        cmtime_type = "mtime"
        cmtime_ns = safe_ns(st.st_mtime_ns)
    else:  # neither 'c' nor 'm' in cache_mode, avoid UnboundLocalError
        cmtime_type = "ctime"
        cmtime_ns = safe_ns(st.st_ctime_ns)
    entry = FileCacheEntry(
        age=0, inode=st.st_ino, size=st.st_size, cmtime=int_to_timestamp(cmtime_ns), chunk_ids=ids
    )
    self.files[path_hash] = msgpack.packb(entry)
    # track the newest timestamp seen, used elsewhere to discard "too recent" entries
    self._newest_cmtime = max(self._newest_cmtime or 0, cmtime_ns)
    files_cache_logger.debug(
        "FILES-CACHE-UPDATE: put %r [has %s] <- %r",
        # don't dump the (potentially huge) chunk id list into the log, just its length
        entry._replace(chunk_ids="[%d entries]" % len(entry.chunk_ids)),
        cmtime_type,
        hashed_path,
    )
2017-06-10 15:59:41 +00:00
class AdHocCache(CacheStatsMixin):
    """
    Ad-hoc, non-persistent cache.

    Compared to the standard LocalCache the AdHocCache does not maintain accurate reference count,
    nor does it provide a files cache (which would require persistence). Chunks that were not added
    during the current AdHocCache lifetime won't have correct size set (0 bytes) and will
    have an infinite reference count (MAX_VALUE).
    """

    str_format = """\
All archives:                unknown              unknown              unknown

                       Unique chunks         Total chunks
Chunk index:    {0.total_unique_chunks:20d}             unknown"""

    def __init__(self, manifest, warn_if_unencrypted=True, lock_wait=None, iec=False):
        CacheStatsMixin.__init__(self, iec=iec)
        assert isinstance(manifest, Manifest)
        self.manifest = manifest
        self.repository = manifest.repository
        self.key = manifest.key
        self.repo_objs = manifest.repo_objs
        self._txn_active = False

        self.security_manager = SecurityManager(self.repository)
        self.security_manager.assert_secure(manifest, self.key, lock_wait=lock_wait)

        logger.warning("Note: --no-cache-sync is an experimental feature.")

    # Public API

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        pass

    # no files cache in this implementation (would require persistence)
    files = None  # type: ignore
    cache_mode = "d"

    def file_known_and_unchanged(self, hashed_path, path_hash, st):
        """Files cache lookup; always a miss here, since AdHocCache has no files cache."""
        files_cache_logger.debug("UNKNOWN: files cache not implemented")
        return False, None

    def memorize_file(self, hashed_path, path_hash, st, ids):
        """No-op: AdHocCache does not maintain a files cache."""
        pass

    def add_chunk(self, id, meta, data, *, stats, wait=True, compress=True, size=None, ro_type=ROBJ_FILE_STREAM):
        """Add a chunk to the repository, or just increment its refcount if already present."""
        assert ro_type is not None
        if not self._txn_active:
            self.begin_txn()
        if size is None and compress:
            size = len(data)  # data is still uncompressed
        if size is None:
            raise ValueError("when giving compressed data for a chunk, the uncompressed size must be given also")
        refcount = self.seen_chunk(id, size)
        if refcount:
            return self.chunk_incref(id, stats, size=size)
        cdata = self.repo_objs.format(id, meta, data, compress=compress, ro_type=ro_type)
        self.repository.put(id, cdata, wait=wait)
        self.chunks.add(id, 1, size)
        stats.update(size, not refcount)
        return ChunkListEntry(id, size)

    def seen_chunk(self, id, size=None):
        """Return the refcount of chunk *id* (0 if unknown), updating cached size info if possible."""
        if not self._txn_active:
            self.begin_txn()
        entry = self.chunks.get(id, ChunkIndexEntry(0, None))
        if entry.refcount and size and not entry.size:
            # The LocalCache has existing size information and uses *size* to make an effort at detecting collisions.
            # This is of course not possible for the AdHocCache.
            # Here *size* is used to update the chunk's size information, which will be zero for existing chunks.
            self.chunks[id] = entry._replace(size=size)
        return entry.refcount

    def chunk_incref(self, id, stats, size=None):
        """Increment the refcount of chunk *id* and account for its size in *stats*."""
        if not self._txn_active:
            self.begin_txn()
        count, _size = self.chunks.incref(id)
        # When _size is 0 and size is not given, then this chunk has not been locally visited yet (seen_chunk with
        # size or add_chunk); we can't add references to those (size=0 is invalid) and generally don't try to.
        size = _size or size
        assert size
        stats.update(size, False)
        return ChunkListEntry(id, size)

    def chunk_decref(self, id, stats, wait=True):
        """Decrement the refcount of chunk *id*; delete it from the repository when it drops to 0."""
        if not self._txn_active:
            self.begin_txn()
        count, size = self.chunks.decref(id)
        if count == 0:
            del self.chunks[id]
            self.repository.delete(id, wait=wait)
            stats.update(-size, True)
        else:
            stats.update(-size, False)

    def commit(self):
        """Finish the transaction: persist security-related state (there is no cache to save)."""
        if not self._txn_active:
            return
        self.security_manager.save(self.manifest, self.key)
        self._txn_active = False

    def rollback(self):
        """Abort the transaction, discarding the in-memory chunk index."""
        self._txn_active = False
        del self.chunks

    def begin_txn(self):
        """Start a transaction: build the in-memory chunk index from the repository's chunk id list."""
        self._txn_active = True
        # Explicitly set the initial usable hash table capacity to avoid performance issues
        # due to hash table "resonance".
        # Since we're creating an archive, add 10 % from the start.
        num_chunks = len(self.repository)
        self.chunks = ChunkIndex(usable=num_chunks * 1.1)
        pi = ProgressIndicatorPercent(
            total=num_chunks, msg="Downloading chunk list... %3.0f%%", msgid="cache.download_chunks"
        )
        t0 = perf_counter()
        num_requests = 0
        marker = None
        while True:
            result = self.repository.list(limit=LIST_SCAN_LIMIT, marker=marker)
            num_requests += 1
            if not result:
                break
            pi.show(increase=len(result))
            marker = result[-1]
            # All chunks from the repository have a refcount of MAX_VALUE, which is sticky,
            # therefore we can't/won't delete them. Chunks we added ourselves in this transaction
            # (e.g. checkpoint archives) are tracked correctly.
            init_entry = ChunkIndexEntry(refcount=ChunkIndex.MAX_VALUE, size=0)
            for id_ in result:
                self.chunks[id_] = init_entry
        assert len(self.chunks) == num_chunks
        # LocalCache does not contain the manifest, either.
        del self.chunks[self.manifest.MANIFEST_ID]
        duration = perf_counter() - t0 or 0.01
        pi.finish()
        logger.debug(
            "AdHocCache: downloaded %d chunk IDs in %.2f s (%d requests), ~%s/s",
            num_chunks,
            duration,
            num_requests,
            format_file_size(num_chunks * 34 / duration),
        )
        # Chunk IDs in a list are encoded in 34 bytes: 1 byte msgpack header, 1 byte length, 32 ID bytes.
        # Protocol overhead is neglected in this calculation.