Merge pull request #33 from ThomasWaldmann/quick-sync

speed up chunks cache sync, fixes #18
Thomas Waldmann 2015-05-31 19:27:23 +02:00
commit 69371153d1
1 changed file with 128 additions and 19 deletions
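In short, the patch keeps one chunk index per backup archive as a member of a compressed tar (chunks.archive), named by the archive id, and rebuilds the master chunks index by merging those per-archive indices; only archives not yet present in the tar have to be fetched from the repository and analyzed. A minimal sketch of that store-and-merge idea (illustrative only, not borg's code: plain dicts and JSON stand in for ChunkIndex and msgpack, and the helper names are made up):

    # Illustrative sketch (not borg's code): per-archive chunk indices are stored
    # as members of a tar, keyed by archive id, and the master index is rebuilt
    # by summing the per-archive reference counts.
    import io
    import json
    import tarfile

    def save_archive_index(tar_path, archive_id_hex, chunk_idx):
        """Append one archive's {chunk_id: [count, size, csize]} index to a tar."""
        payload = json.dumps(chunk_idx).encode('utf-8')
        # 'a' (append) only works uncompressed; the real patch instead rewrites the
        # whole tar into chunks.archive.tmp with xz/bz2/gz and renames it into place.
        with tarfile.open(tar_path, 'a') as tf:
            info = tarfile.TarInfo(name=archive_id_hex)
            info.size = len(payload)
            tf.addfile(info, io.BytesIO(payload))

    def merge_master_index(tar_path):
        """Merge all per-archive indices into one master {chunk_id: [count, size, csize]}."""
        master = {}
        with tarfile.open(tar_path, 'r') as tf:
            for member in tf:
                chunk_idx = json.loads(tf.extractfile(member).read().decode('utf-8'))
                for chunk_id, (count, size, csize) in chunk_idx.items():
                    old_count = master.get(chunk_id, (0, size, csize))[0]
                    master[chunk_id] = [old_count + count, size, csize]
        return master

    if __name__ == '__main__':
        save_archive_index('chunks.archive.demo', 'aa' * 16, {'c1': [2, 4096, 2048]})
        save_archive_index('chunks.archive.demo', 'bb' * 16, {'c1': [1, 4096, 2048], 'c2': [1, 1024, 512]})
        print(merge_master_index('chunks.archive.demo'))
        # -> {'c1': [3, 4096, 2048], 'c2': [1, 1024, 512]}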


@@ -1,10 +1,13 @@
 from configparser import RawConfigParser
 from .remote import cache_if_remote
+import errno
 import msgpack
 import os
 import sys
 from binascii import hexlify
 import shutil
+import tarfile
+import tempfile
 
 from .key import PlaintextKey
 from .helpers import Error, get_cache_dir, decode_dict, st_mtime_ns, unhexlify, UpgradableLock, int_to_bigint, \
@@ -93,6 +96,8 @@ class Cache:
         with open(os.path.join(self.path, 'config'), 'w') as fd:
             config.write(fd)
         ChunkIndex().write(os.path.join(self.path, 'chunks').encode('utf-8'))
+        with open(os.path.join(self.path, 'chunks.archive'), 'wb') as fd:
+            pass  # empty file
         with open(os.path.join(self.path, 'files'), 'wb') as fd:
             pass  # empty file
@@ -148,6 +153,7 @@ class Cache:
         os.mkdir(txn_dir)
         shutil.copy(os.path.join(self.path, 'config'), txn_dir)
         shutil.copy(os.path.join(self.path, 'chunks'), txn_dir)
+        shutil.copy(os.path.join(self.path, 'chunks.archive'), txn_dir)
         shutil.copy(os.path.join(self.path, 'files'), txn_dir)
         os.rename(os.path.join(self.path, 'txn.tmp'),
                   os.path.join(self.path, 'txn.active'))
@@ -189,6 +195,7 @@ class Cache:
         if os.path.exists(txn_dir):
             shutil.copy(os.path.join(txn_dir, 'config'), self.path)
             shutil.copy(os.path.join(txn_dir, 'chunks'), self.path)
+            shutil.copy(os.path.join(txn_dir, 'chunks.archive'), self.path)
             shutil.copy(os.path.join(txn_dir, 'files'), self.path)
             os.rename(txn_dir, os.path.join(self.path, 'txn.tmp'))
         if os.path.exists(os.path.join(self.path, 'txn.tmp')):
@@ -197,37 +204,139 @@ class Cache:
         self._do_open()
 
     def sync(self):
-        """Initializes cache by fetching and reading all archive indicies
+        """Re-synchronize chunks cache with repository.
+
+        If present, uses a compressed tar archive of known backup archive
+        indices, so it only needs to fetch infos from repo and build a chunk
+        index once per backup archive.
+
+        If out of sync, the tar gets rebuilt from known + fetched chunk infos,
+        so it has complete and current information about all backup archives.
+        Finally, it builds the master chunks index by merging all indices from
+        the tar.
+
+        Note: compression (esp. xz) is very effective in keeping the tar
+              relatively small compared to the files it contains.
         """
-        def add(id, size, csize):
+        in_archive_path = os.path.join(self.path, 'chunks.archive')
+        out_archive_path = os.path.join(self.path, 'chunks.archive.tmp')
+
+        def open_in_archive():
             try:
-                count, size, csize = self.chunks[id]
-                self.chunks[id] = count + 1, size, csize
+                tf = tarfile.open(in_archive_path, 'r')
+            except OSError as e:
+                if e.errno != errno.ENOENT:
+                    raise
+                # file not found
+                tf = None
+            except tarfile.ReadError:
+                # empty file?
+                tf = None
+            return tf
+
+        def open_out_archive():
+            for compression in ('xz', 'bz2', 'gz'):
+                # xz needs py 3.3, bz2 and gz also work on 3.2
+                try:
+                    tf = tarfile.open(out_archive_path, 'w:'+compression, format=tarfile.PAX_FORMAT)
+                    break
+                except tarfile.CompressionError:
+                    continue
+            else:  # shouldn't happen
+                tf = None
+            return tf
+
+        def close_archive(tf):
+            if tf:
+                tf.close()
+
+        def delete_in_archive():
+            os.unlink(in_archive_path)
+
+        def rename_out_archive():
+            os.rename(out_archive_path, in_archive_path)
+
+        def add(chunk_idx, id, size, csize, incr=1):
+            try:
+                count, size, csize = chunk_idx[id]
+                chunk_idx[id] = count + incr, size, csize
             except KeyError:
-                self.chunks[id] = 1, size, csize
-        self.begin_txn()
-        print('Initializing cache...')
-        self.chunks.clear()
-        unpacker = msgpack.Unpacker()
-        repository = cache_if_remote(self.repository)
-        for name, info in self.manifest.archives.items():
-            archive_id = info[b'id']
+                chunk_idx[id] = incr, size, csize
+
+        def transfer_known_idx(archive_id, tf_in, tf_out):
+            archive_id_hex = hexlify(archive_id).decode('ascii')
+            tarinfo = tf_in.getmember(archive_id_hex)
+            archive_name = tarinfo.pax_headers['archive_name']
+            print('Already known archive:', archive_name)
+            f_in = tf_in.extractfile(archive_id_hex)
+            tf_out.addfile(tarinfo, f_in)
+            return archive_name
+
+        def fetch_and_build_idx(archive_id, repository, key, tmp_dir, tf_out):
+            chunk_idx = ChunkIndex()
             cdata = repository.get(archive_id)
-            data = self.key.decrypt(archive_id, cdata)
-            add(archive_id, len(data), len(cdata))
+            data = key.decrypt(archive_id, cdata)
+            add(chunk_idx, archive_id, len(data), len(cdata))
             archive = msgpack.unpackb(data)
             if archive[b'version'] != 1:
                 raise Exception('Unknown archive metadata version')
             decode_dict(archive, (b'name',))
-            print('Analyzing archive:', archive[b'name'])
-            for key, chunk in zip(archive[b'items'], repository.get_many(archive[b'items'])):
-                data = self.key.decrypt(key, chunk)
-                add(key, len(data), len(chunk))
+            print('Analyzing new archive:', archive[b'name'])
+            unpacker = msgpack.Unpacker()
+            for item_id, chunk in zip(archive[b'items'], repository.get_many(archive[b'items'])):
+                data = key.decrypt(item_id, chunk)
+                add(chunk_idx, item_id, len(data), len(chunk))
                 unpacker.feed(data)
                 for item in unpacker:
                     if b'chunks' in item:
                         for chunk_id, size, csize in item[b'chunks']:
-                            add(chunk_id, size, csize)
+                            add(chunk_idx, chunk_id, size, csize)
+            archive_id_hex = hexlify(archive_id).decode('ascii')
+            file_tmp = os.path.join(tmp_dir, archive_id_hex).encode('utf-8')
+            chunk_idx.write(file_tmp)
+            tarinfo = tf_out.gettarinfo(file_tmp, archive_id_hex)
+            tarinfo.pax_headers['archive_name'] = archive[b'name']
+            with open(file_tmp, 'rb') as f:
+                tf_out.addfile(tarinfo, f)
+            os.unlink(file_tmp)
+
+        def create_master_idx(chunk_idx, tf_in, tmp_dir):
+            chunk_idx.clear()
+            for tarinfo in tf_in:
+                archive_id_hex = tarinfo.name
+                tf_in.extract(archive_id_hex, tmp_dir)
+                chunk_idx_path = os.path.join(tmp_dir, archive_id_hex).encode('utf-8')
+                archive_chunk_idx = ChunkIndex.read(chunk_idx_path)
+                for chunk_id, (count, size, csize) in archive_chunk_idx.iteritems():
+                    add(chunk_idx, chunk_id, size, csize, incr=count)
+                os.unlink(chunk_idx_path)
+
+        self.begin_txn()
+        print('Synchronizing chunks cache...')
+        # XXX we have to do stuff on disk due to lacking ChunkIndex api
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            repository = cache_if_remote(self.repository)
+            out_archive = open_out_archive()
+            in_archive = open_in_archive()
+            if in_archive:
+                known_ids = set(unhexlify(hexid) for hexid in in_archive.getnames())
+            else:
+                known_ids = set()
+            archive_ids = set(info[b'id'] for info in self.manifest.archives.values())
+            print('Rebuilding archive collection. Known: %d Repo: %d Unknown: %d' % (
+                len(known_ids), len(archive_ids), len(archive_ids - known_ids), ))
+            for archive_id in archive_ids & known_ids:
+                transfer_known_idx(archive_id, in_archive, out_archive)
+            close_archive(in_archive)
+            delete_in_archive()  # free disk space
+            for archive_id in archive_ids - known_ids:
+                fetch_and_build_idx(archive_id, repository, self.key, tmp_dir, out_archive)
+            close_archive(out_archive)
+            rename_out_archive()
+            print('Merging collection into master chunks cache...')
+            in_archive = open_in_archive()
+            create_master_idx(self.chunks, in_archive, tmp_dir)
+            close_archive(in_archive)
+            print('Done.')
 
     def add_chunk(self, id, data, stats):
         if not self.txn_active:
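For context, the driver at the end of the new sync() reduces to set arithmetic over archive ids: indices for archives present in both the manifest and the tar are copied over unchanged, missing ones are fetched and built, and indices of deleted archives are silently dropped when the tar is rewritten. A toy illustration (made-up ids, not borg code):

    # Toy illustration of the set arithmetic the new sync() uses to decide what to
    # reuse and what to fetch (made-up ids, not borg code).
    known_ids = {b'id-1', b'id-2'}      # archives whose index is already in chunks.archive
    archive_ids = {b'id-2', b'id-3'}    # archives currently listed in the manifest

    for archive_id in archive_ids & known_ids:
        print('reuse cached index for', archive_id)      # transfer_known_idx(...)
    for archive_id in archive_ids - known_ids:
        print('fetch and analyze archive', archive_id)   # fetch_and_build_idx(...)
    # id-1 no longer exists in the repo, so its index is simply not copied into the
    # rebuilt tar; the collection stays consistent with the manifest.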