mirror of https://github.com/borgbackup/borg.git

commit 835b0e5ee0 (parent c786a5941e)
cache sync/remote: compressed, decrypted cache

2 changed files with 70 additions and 29 deletions
@@ -564,17 +564,15 @@ def cleanup_cached_archive(id):
             except FileNotFoundError:
                 pass
 
-        def fetch_and_build_idx(archive_id, repository, key, chunk_idx):
-            cdata = repository.get(archive_id)
-            data = key.decrypt(archive_id, cdata)
-            chunk_idx.add(archive_id, 1, len(data), len(cdata))
+        def fetch_and_build_idx(archive_id, decrypted_repository, key, chunk_idx):
+            csize, data = decrypted_repository.get(archive_id)
+            chunk_idx.add(archive_id, 1, len(data), csize)
             archive = ArchiveItem(internal_dict=msgpack.unpackb(data))
             if archive.version != 1:
                 raise Exception('Unknown archive metadata version')
             sync = CacheSynchronizer(chunk_idx)
-            for item_id, chunk in zip(archive.items, repository.get_many(archive.items)):
-                data = key.decrypt(item_id, chunk)
-                chunk_idx.add(item_id, 1, len(data), len(chunk))
+            for item_id, (csize, data) in zip(archive.items, decrypted_repository.get_many(archive.items)):
+                chunk_idx.add(item_id, 1, len(data), csize)
                 sync.feed(data)
             if self.do_cache:
                 fn = mkpath(archive_id)
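In the hunk above, the helper no longer decrypts anything itself: the wrapped repository hands it (csize, plaintext) tuples, where csize is the size of the stored (compressed, encrypted) object. The toy stand-in below is not from the commit; it only illustrates the return convention that decrypted_repository.get() / get_many() are expected to follow.

    # Hypothetical stand-in for the decrypted repository wrapper (illustration only).
    class FakeDecryptedRepository:
        def __init__(self, chunks):
            # chunks: dict mapping id -> (csize, plaintext)
            self.chunks = chunks

        def get(self, id_):
            return self.chunks[id_]

        def get_many(self, ids):
            for id_ in ids:
                yield self.chunks[id_]

    repo = FakeDecryptedRepository({b'id-1': (42, b'decrypted item data')})
    csize, data = repo.get(b'id-1')   # exactly the shape fetch_and_build_idx consumes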
@@ -641,7 +639,7 @@ def create_master_idx(chunk_idx):
                         # above can remove *archive_id* from *cached_ids*.
                         logger.info('Fetching and building archive index for %s ...', archive_name)
                         archive_chunk_idx = ChunkIndex()
-                        fetch_and_build_idx(archive_id, repository, self.key, archive_chunk_idx)
+                        fetch_and_build_idx(archive_id, decrypted_repository, self.key, archive_chunk_idx)
                     logger.info("Merging into master chunks index ...")
                     if chunk_idx is None:
                         # we just use the first archive's idx as starting point,
@@ -653,7 +651,7 @@ def create_master_idx(chunk_idx):
                 else:
                     chunk_idx = chunk_idx or ChunkIndex(master_index_capacity)
                     logger.info('Fetching archive index for %s ...', archive_name)
-                    fetch_and_build_idx(archive_id, repository, self.key, chunk_idx)
+                    fetch_and_build_idx(archive_id, decrypted_repository, self.key, chunk_idx)
             if self.progress:
                 pi.finish()
             logger.info('Done.')
@@ -675,7 +673,7 @@ def legacy_cleanup():
                 pass
 
         self.begin_txn()
-        with cache_if_remote(self.repository) as repository:
+        with cache_if_remote(self.repository, decrypted_cache=self.key) as decrypted_repository:
             legacy_cleanup()
             # TEMPORARY HACK: to avoid archive index caching, create a FILE named ~/.cache/borg/REPOID/chunks.archive.d -
             # this is only recommended if you have a fast, low latency connection to your repo (e.g. if repo is local disk)
@@ -8,6 +8,7 @@
 import select
 import shlex
 import shutil
+import struct
 import sys
 import tempfile
 import textwrap
@@ -18,6 +19,7 @@
 import msgpack
 
 from . import __version__
+from .compress import LZ4
 from .helpers import Error, IntegrityError
 from .helpers import bin_to_hex
 from .helpers import get_home_dir
@@ -1046,9 +1048,14 @@ class RepositoryNoCache:
     """A not caching Repository wrapper, passes through to repository.
 
     Just to have same API (including the context manager) as RepositoryCache.
+
+    *transform* is a callable taking two arguments, key and raw repository data.
+    The return value is returned from get()/get_many(). By default, the raw
+    repository data is returned.
     """
-    def __init__(self, repository):
+    def __init__(self, repository, transform=None):
         self.repository = repository
+        self.transform = transform or (lambda key, data: data)
 
     def close(self):
         pass
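The *transform* hook added here is a per-object post-processing step: get() and get_many() pass each raw repository blob through it before yielding. A minimal sketch of the contract; the function name below is invented for illustration and is not part of the commit.

    def sized_transform(key, data):
        # receives (key, raw repository data); the return value is what
        # get()/get_many() yield for that key
        return len(data), data

    assert sized_transform(b'id-1', b'raw bytes') == (9, b'raw bytes')
    # transform=None keeps the old behaviour: the raw bytes pass through unchanged.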
@@ -1063,8 +1070,8 @@ def get(self, key):
         return next(self.get_many([key], cache=False))
 
     def get_many(self, keys, cache=True):
-        for data in self.repository.get_many(keys):
-            yield data
+        for key, data in zip(keys, self.repository.get_many(keys)):
+            yield self.transform(key, data)
 
 
 class RepositoryCache(RepositoryNoCache):
@@ -1072,10 +1079,17 @@ class RepositoryCache(RepositoryNoCache):
     A caching Repository wrapper.
 
     Caches Repository GET operations locally.
+
+    *pack* and *unpack* complement *transform* of the base class.
+    *pack* receives the output of *transform* and should return bytes,
+    which are stored in the cache. *unpack* receives these bytes and
+    should return the initial data (as returned by *transform*).
     """
 
-    def __init__(self, repository):
-        super().__init__(repository)
+    def __init__(self, repository, pack=None, unpack=None, transform=None):
+        super().__init__(repository, transform)
+        self.pack = pack or (lambda data: data)
+        self.unpack = unpack or (lambda data: data)
         self.cache = set()
         self.basedir = tempfile.mkdtemp(prefix='borg-cache-')
         self.query_size_limit()
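*pack* and *unpack* must round-trip whatever *transform* produces, because only the packed bytes reach the on-disk cache. A hedged sketch of that invariant, using zlib as a stand-in codec rather than the LZ4 wiring the commit adds in cache_if_remote further down:

    import zlib

    def pack(transformed):
        # transformed value -> bytes written to the on-disk cache file
        return zlib.compress(transformed)

    def unpack(stored):
        # bytes read back from the cache file -> the original transformed value
        return zlib.decompress(stored)

    value = b'output of transform()'
    assert unpack(pack(value)) == value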
@@ -1106,11 +1120,15 @@ def backoff(self):
             os.unlink(file)
             self.evictions += 1
 
-    def add_entry(self, key, data):
+    def add_entry(self, key, data, cache):
+        transformed = self.transform(key, data)
+        if not cache:
+            return transformed
+        packed = self.pack(transformed)
         file = self.key_filename(key)
         try:
             with open(file, 'wb') as fd:
-                fd.write(data)
+                fd.write(packed)
         except OSError as os_error:
             if os_error.errno == errno.ENOSPC:
                 self.enospc += 1
@@ -1118,11 +1136,11 @@ def add_entry(self, key, data):
             else:
                 raise
         else:
-            self.size += len(data)
+            self.size += len(packed)
             self.cache.add(key)
             if self.size > self.size_limit:
                 self.backoff()
-        return data
+        return transformed
 
     def close(self):
         logger.debug('RepositoryCache: current items %d, size %s / %s, %d hits, %d misses, %d slow misses (+%.1fs), '
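With the two add_entry hunks above, the method now returns the transformed value to its caller while storing, and size-accounting, only the packed form. The stand-in hooks below are invented purely to show the data flow; they are not the pack/unpack/transform this commit wires up.

    # Stand-ins for the three hooks (illustration only):
    transform = lambda key, data: data.upper()
    pack = lambda transformed: transformed + b'!'
    unpack = lambda stored: stored[:-1]

    key, raw = b'id-1', b'raw repository bytes'
    transformed = transform(key, raw)     # what the caller of add_entry gets back
    packed = pack(transformed)            # what is written to the cache file and counted in self.size
    assert unpack(packed) == transformed  # what a later cache hit yields from get_many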
@@ -1141,31 +1159,56 @@ def get_many(self, keys, cache=True):
                 file = self.key_filename(key)
                 with open(file, 'rb') as fd:
                     self.hits += 1
-                    yield fd.read()
+                    yield self.unpack(fd.read())
             else:
                 for key_, data in repository_iterator:
                     if key_ == key:
-                        if cache:
-                            self.add_entry(key, data)
+                        transformed = self.add_entry(key, data, cache)
                         self.misses += 1
-                        yield data
+                        yield transformed
                         break
                 else:
                     # slow path: eviction during this get_many removed this key from the cache
                     t0 = time.perf_counter()
                     data = self.repository.get(key)
                     self.slow_lat += time.perf_counter() - t0
-                    if cache:
-                        self.add_entry(key, data)
+                    transformed = self.add_entry(key, data, cache)
                     self.slow_misses += 1
-                    yield data
+                    yield transformed
         # Consume any pending requests
         for _ in repository_iterator:
             pass
 
 
-def cache_if_remote(repository):
+def cache_if_remote(repository, *, decrypted_cache=False, pack=None, unpack=None, transform=None):
+    """
+    Return a Repository(No)Cache for *repository*.
+
+    If *decrypted_cache* is a key object, then get and get_many will return a tuple
+    (csize, plaintext) instead of the actual data in the repository. The cache will
+    store decrypted data, which increases CPU efficiency (by avoiding repeatedly decrypting
+    and more importantly MAC and ID checking cached objects).
+    Internally, objects are compressed with LZ4.
+    """
+    if decrypted_cache and (pack or unpack or transform):
+        raise ValueError('decrypted_cache and pack/unpack/transform are incompatible')
+    elif decrypted_cache:
+        key = decrypted_cache
+        cache_struct = struct.Struct('=I')
+        compressor = LZ4()
+
+        def pack(data):
+            return cache_struct.pack(data[0]) + compressor.compress(data[1])
+
+        def unpack(data):
+            return cache_struct.unpack(data[:cache_struct.size])[0], compressor.decompress(data[cache_struct.size:])
+
+        def transform(id_, data):
+            csize = len(data)
+            decrypted = key.decrypt(id_, data)
+            return csize, decrypted
+
     if isinstance(repository, RemoteRepository):
-        return RepositoryCache(repository)
+        return RepositoryCache(repository, pack, unpack, transform)
     else:
-        return RepositoryNoCache(repository)
+        return RepositoryNoCache(repository, transform)
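A hedged usage sketch of the new keyword-only interface, mirroring the with cache_if_remote(...) call site earlier in this commit; repository and key are assumed to be an already-open RemoteRepository and its matching key object, and ids a list of object ids:

    with cache_if_remote(repository, decrypted_cache=key) as decrypted_repository:
        for id_, (csize, plaintext) in zip(ids, decrypted_repository.get_many(ids)):
            print(bin_to_hex(id_), csize, len(plaintext))

    # Without decrypted_cache (the default False), the wrapper behaves as before and
    # yields the raw repository data for each id.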