mirror of https://github.com/borgbackup/borg.git
Merge pull request #33 from ThomasWaldmann/quick-sync
speed up chunks cache sync, fixes #18
This commit is contained in:
commit
69371153d1
147
borg/cache.py
147
borg/cache.py
|
@ -1,10 +1,13 @@
|
|||
from configparser import RawConfigParser
|
||||
from .remote import cache_if_remote
|
||||
import errno
|
||||
import msgpack
|
||||
import os
|
||||
import sys
|
||||
from binascii import hexlify
|
||||
import shutil
|
||||
import tarfile
|
||||
import tempfile
|
||||
|
||||
from .key import PlaintextKey
|
||||
from .helpers import Error, get_cache_dir, decode_dict, st_mtime_ns, unhexlify, UpgradableLock, int_to_bigint, \
|
||||
|
@ -93,6 +96,8 @@ class Cache:
|
|||
with open(os.path.join(self.path, 'config'), 'w') as fd:
|
||||
config.write(fd)
|
||||
ChunkIndex().write(os.path.join(self.path, 'chunks').encode('utf-8'))
|
||||
with open(os.path.join(self.path, 'chunks.archive'), 'wb') as fd:
|
||||
pass # empty file
|
||||
with open(os.path.join(self.path, 'files'), 'wb') as fd:
|
||||
pass # empty file
|
||||
|
||||
|
@ -148,6 +153,7 @@ class Cache:
|
|||
os.mkdir(txn_dir)
|
||||
shutil.copy(os.path.join(self.path, 'config'), txn_dir)
|
||||
shutil.copy(os.path.join(self.path, 'chunks'), txn_dir)
|
||||
shutil.copy(os.path.join(self.path, 'chunks.archive'), txn_dir)
|
||||
shutil.copy(os.path.join(self.path, 'files'), txn_dir)
|
||||
os.rename(os.path.join(self.path, 'txn.tmp'),
|
||||
os.path.join(self.path, 'txn.active'))
|
||||
|
@ -189,6 +195,7 @@ class Cache:
|
|||
if os.path.exists(txn_dir):
|
||||
shutil.copy(os.path.join(txn_dir, 'config'), self.path)
|
||||
shutil.copy(os.path.join(txn_dir, 'chunks'), self.path)
|
||||
shutil.copy(os.path.join(txn_dir, 'chunks.archive'), self.path)
|
||||
shutil.copy(os.path.join(txn_dir, 'files'), self.path)
|
||||
os.rename(txn_dir, os.path.join(self.path, 'txn.tmp'))
|
||||
if os.path.exists(os.path.join(self.path, 'txn.tmp')):
|
||||
|
@ -197,37 +204,139 @@ class Cache:
|
|||
self._do_open()
|
||||
|
||||
def sync(self):
|
||||
"""Initializes cache by fetching and reading all archive indicies
|
||||
"""Re-synchronize chunks cache with repository.
|
||||
|
||||
If present, uses a compressed tar archive of known backup archive
|
||||
indices, so it only needs to fetch infos from repo and build a chunk
|
||||
index once per backup archive.
|
||||
If out of sync, the tar gets rebuilt from known + fetched chunk infos,
|
||||
so it has complete and current information about all backup archives.
|
||||
Finally, it builds the master chunks index by merging all indices from
|
||||
the tar.
|
||||
|
||||
Note: compression (esp. xz) is very effective in keeping the tar
|
||||
relatively small compared to the files it contains.
|
||||
"""
|
||||
def add(id, size, csize):
|
||||
in_archive_path = os.path.join(self.path, 'chunks.archive')
|
||||
out_archive_path = os.path.join(self.path, 'chunks.archive.tmp')
|
||||
|
||||
def open_in_archive():
    """Open the existing chunks archive tar for reading.

    Returns the opened ``tarfile.TarFile`` for ``in_archive_path``, or
    ``None`` when that file does not exist or cannot be read as a tar
    archive (for example when it is still an empty file).  Any ``OSError``
    other than "file not found" is re-raised.
    """
    try:
        return tarfile.open(in_archive_path, 'r')
    except OSError as e:
        if e.errno != errno.ENOENT:
            raise
        # file not found
        return None
    except tarfile.ReadError:
        # empty file?
        return None
|
||||
|
||||
def open_out_archive():
    """Create the output tar at ``out_archive_path`` with the best compression.

    Tries xz, then bz2, then gz and returns the first ``tarfile.TarFile``
    (pax format, write mode) that can be opened.  Returns ``None`` if every
    attempt raised ``tarfile.CompressionError``.
    """
    # xz needs py 3.3, bz2 and gz also work on 3.2
    for codec in ('xz', 'bz2', 'gz'):
        try:
            return tarfile.open(out_archive_path, 'w:'+codec, format=tarfile.PAX_FORMAT)
        except tarfile.CompressionError:
            pass
    # shouldn't happen
    return None
|
||||
|
||||
def close_archive(tf):
    """Close *tf*; do nothing when it is falsy (e.g. ``None``)."""
    if not tf:
        return
    tf.close()
|
||||
|
||||
def delete_in_archive():
    """Remove the file at ``in_archive_path``, tolerating its absence.

    ``open_in_archive`` treats a missing file as "no archive", so deleting
    must not fail on the same condition.  Any ``OSError`` other than
    "file not found" is re-raised.
    """
    try:
        os.unlink(in_archive_path)
    except OSError as e:
        # errno check (instead of FileNotFoundError) matches open_in_archive
        if e.errno != errno.ENOENT:
            raise
|
||||
|
||||
def rename_out_archive():
    """Rename the file at ``out_archive_path`` to ``in_archive_path``."""
    source, target = out_archive_path, in_archive_path
    os.rename(source, target)
|
||||
|
||||
def add(chunk_idx, id, size, csize, incr=1):
    """Increase the reference count of chunk *id* in *chunk_idx* by *incr*.

    *chunk_idx* maps a chunk id to a ``(count, size, csize)`` tuple.  For an
    id that is already present, the stored size and csize are kept and only
    the count grows.  For a new id, an entry ``(incr, size, csize)`` is
    created from the given values.
    """
    try:
        # the stored sizes win over the ones passed in
        count, size, csize = chunk_idx[id]
    except KeyError:
        chunk_idx[id] = incr, size, csize
    else:
        chunk_idx[id] = count + incr, size, csize
|
||||
|
||||
def transfer_known_idx(archive_id, tf_in, tf_out):
    """Copy the stored index of an already known archive into *tf_out*.

    The member is looked up in *tf_in* under the hex form of *archive_id*;
    its tar header and contents are added to *tf_out* as they are.

    Returns the archive name read from the member's ``archive_name`` pax
    header.
    """
    member_name = hexlify(archive_id).decode('ascii')
    member = tf_in.getmember(member_name)
    name = member.pax_headers['archive_name']
    print('Already known archive:', name)
    tf_out.addfile(member, tf_in.extractfile(member_name))
    return name
|
||||
|
||||
def fetch_and_build_idx(archive_id, repository, key, tmp_dir, tf_out):
    """Build the chunk index of one archive and store it in *tf_out*.

    Fetches the archive metadata and all of its item metadata chunks from
    *repository*, decrypts them with *key*, and counts every referenced
    chunk (the archive chunk itself, the item metadata chunks, and the
    chunks listed by each item) in a fresh ``ChunkIndex``.

    The index is written to a temporary file in *tmp_dir*, added to
    *tf_out* under the hex form of *archive_id* with the archive name in
    the ``archive_name`` pax header, and the temporary file is removed.

    Raises ``Exception`` if the archive metadata version is not 1.
    """
    chunk_idx = ChunkIndex()
    cdata = repository.get(archive_id)
    data = key.decrypt(archive_id, cdata)
    add(chunk_idx, archive_id, len(data), len(cdata))
    archive = msgpack.unpackb(data)
    if archive[b'version'] != 1:
        raise Exception('Unknown archive metadata version')
    decode_dict(archive, (b'name',))
    print('Analyzing new archive:', archive[b'name'])
    unpacker = msgpack.Unpacker()
    for item_id, chunk in zip(archive[b'items'], repository.get_many(archive[b'items'])):
        data = key.decrypt(item_id, chunk)
        add(chunk_idx, item_id, len(data), len(chunk))
        # items may span chunk boundaries, so feed a streaming unpacker
        unpacker.feed(data)
        for item in unpacker:
            if b'chunks' in item:
                for chunk_id, size, csize in item[b'chunks']:
                    add(chunk_idx, chunk_id, size, csize)
    archive_id_hex = hexlify(archive_id).decode('ascii')
    # ChunkIndex.write is given an encoded (bytes) path, as elsewhere in this file
    file_tmp = os.path.join(tmp_dir, archive_id_hex).encode('utf-8')
    chunk_idx.write(file_tmp)
    tarinfo = tf_out.gettarinfo(file_tmp, archive_id_hex)
    tarinfo.pax_headers['archive_name'] = archive[b'name']
    with open(file_tmp, 'rb') as f:
        tf_out.addfile(tarinfo, f)
    os.unlink(file_tmp)
|
||||
|
||||
def create_master_idx(chunk_idx, tf_in, tmp_dir):
    """Rebuild *chunk_idx* by merging every per-archive index found in *tf_in*.

    *chunk_idx* is cleared first.  Each tar member is extracted into
    *tmp_dir*, read as a ``ChunkIndex``, merged entry by entry (the stored
    count is used as the increment), and the extracted file is removed
    again.
    """
    chunk_idx.clear()
    for member in tf_in:
        member_name = member.name
        tf_in.extract(member_name, tmp_dir)
        extracted = os.path.join(tmp_dir, member_name).encode('utf-8')
        per_archive_idx = ChunkIndex.read(extracted)
        for chunk_id, entry in per_archive_idx.iteritems():
            count, size, csize = entry
            add(chunk_idx, chunk_id, size, csize, incr=count)
        os.unlink(extracted)
|
||||
|
||||
self.begin_txn()
print('Synchronizing chunks cache...')
# XXX we have to do stuff on disk due to lacking ChunkIndex api
with tempfile.TemporaryDirectory() as tmp_dir:
    repository = cache_if_remote(self.repository)
    out_archive = open_out_archive()
    in_archive = open_in_archive()
    # archive ids whose index is already stored in the existing tar
    known_ids = set()
    if in_archive:
        known_ids = set(unhexlify(hexid) for hexid in in_archive.getnames())
    # archive ids currently listed in the repository manifest
    archive_ids = set(info[b'id'] for info in self.manifest.archives.values())
    unknown_ids = archive_ids - known_ids
    print('Rebuilding archive collection. Known: %d Repo: %d Unknown: %d' % (
        len(known_ids), len(archive_ids), len(unknown_ids), ))
    # step 1: carry over the indices we already have and still need
    for archive_id in archive_ids & known_ids:
        transfer_known_idx(archive_id, in_archive, out_archive)
    close_archive(in_archive)
    delete_in_archive()  # free disk space
    # step 2: fetch and index everything the old tar did not contain
    for archive_id in unknown_ids:
        fetch_and_build_idx(archive_id, repository, self.key, tmp_dir, out_archive)
    close_archive(out_archive)
    rename_out_archive()
    # step 3: merge all per-archive indices into the master index
    print('Merging collection into master chunks cache...')
    in_archive = open_in_archive()
    create_master_idx(self.chunks, in_archive, tmp_dir)
    close_archive(in_archive)
    print('Done.')
|
||||
|
||||
def add_chunk(self, id, data, stats):
|
||||
if not self.txn_active:
|
||||
|
|
Loading…
Reference in New Issue