reimplement the chunk index merging in C

The Python code could take a rather long time, and most of that time was likely spent converting values from Python to C and back.
2015-08-06 23:32:53 +02:00 · 2015-08-06 23:32:53 +02:00 · a1e039ba21
parent 7e21d95ded
commit a1e039ba21
4 changed files with 46 additions and 2 deletions
--- a/borg/_hashindex.c
+++ b/borg/_hashindex.c
@ -385,3 +385,22 @@ hashindex_summarize(HashIndex *index, long long *total_size, long long *total_cs
    *total_unique_chunks = unique_chunks;
    *total_chunks = chunks;
 }
+
+/*
+ * Merge every entry of `other` into `index`.
+ *
+ * For each key returned by hashindex_next_key(other, ...):
+ *   - if `index` has no entry for the key, the value block stored after the
+ *     key in `other` is inserted into `index` via hashindex_set();
+ *   - otherwise only the first int32_t of the existing value in `index` is
+ *     changed: the first int32_t of the value from `other` is added to it.
+ *     The remaining parts of the existing value are left untouched.
+ *
+ * NOTE(review): the value offset is derived from index->key_size, so both
+ * indexes are assumed to use the same key size -- confirm at the call site.
+ * NOTE(review): the result of hashindex_set(), if any, is not inspected, and
+ * the addition works on the raw stored int32_t without overflow or
+ * byte-order handling -- verify against how the rest of this file reads and
+ * writes values.
+ */
+static void
+hashindex_merge(HashIndex *index, HashIndex *other)
+{
+    int32_t key_size = index->key_size;
+    const int32_t *other_values;
+    int32_t *my_values;
+    void *key = NULL;
+
+    while((key = hashindex_next_key(other, key))) {
+        /* The value block directly follows the key bytes.  Step in bytes
+         * through a char pointer: arithmetic on void * is a compiler
+         * extension, not standard C. */
+        other_values = (const int32_t *)((const char *)key + key_size);
+        my_values = hashindex_get(index, key);
+        if(my_values == NULL) {
+            /* key only exists in `other`: copy its value block over */
+            hashindex_set(index, key, other_values);
+        } else {
+            /* key exists in both: accumulate the first value only */
+            *my_values += *other_values;
+        }
+    }
+}
--- a/borg/cache.py
+++ b/borg/cache.py
@ -309,8 +309,7 @@ class Cache:
                tf_in.extract(archive_id_hex, tmp_dir)
                chunk_idx_path = os.path.join(tmp_dir, archive_id_hex).encode('utf-8')
                archive_chunk_idx = ChunkIndex.read(chunk_idx_path)
-                for chunk_id, (count, size, csize) in archive_chunk_idx.iteritems():
-                    add(chunk_idx, chunk_id, size, csize, incr=count)
+                chunk_idx.merge(archive_chunk_idx)
                os.unlink(chunk_idx_path)

        self.begin_txn()
--- a/borg/hashindex.pyx
+++ b/borg/hashindex.pyx
@ -14,6 +14,7 @@ cdef extern from "_hashindex.c":
    void hashindex_summarize(HashIndex *index, long long *total_size, long long *total_csize,
                             long long *unique_size, long long *unique_csize,
                             long long *total_unique_chunks, long long *total_chunks)
+    void hashindex_merge(HashIndex *index, HashIndex *other)
    int hashindex_get_size(HashIndex *index)
    int hashindex_write(HashIndex *index, char *path)
    void *hashindex_get(HashIndex *index, void *key)
@ -190,6 +191,9 @@ cdef class ChunkIndex(IndexBase):
                            &total_unique_chunks, &total_chunks)
        return total_size, total_csize, unique_size, unique_csize, total_unique_chunks, total_chunks

+    def merge(self, ChunkIndex other):
+        """Merge the entries of *other* into this index, modifying it in place.
+
+        Delegates to the C function hashindex_merge() on the two underlying
+        index structures.
+        """
+        hashindex_merge(self.index, other.index)
+

 cdef class ChunkKeyIterator:
    cdef ChunkIndex idx
--- a/borg/testsuite/hashindex.py
+++ b/borg/testsuite/hashindex.py
@ -6,6 +6,11 @@ from ..hashindex import NSIndex, ChunkIndex
 from . import BaseTestCase


+def H(x):
+    # make some 32byte long thing that depends on x
+    return bytes('%-0.32d' % x, 'ascii')
+
+
 class HashIndexTestCase(BaseTestCase):

    def _generic_test(self, cls, make_value, sha):
@ -78,3 +83,20 @@ class HashIndexTestCase(BaseTestCase):
        second_half = list(idx.iteritems(marker=all[49][0]))
        self.assert_equal(len(second_half), 50)
        self.assert_equal(second_half, all[50:])
+
+    def test_chunkindex_merge(self):
+        idx1 = ChunkIndex()
+        idx1[H(1)] = 1, 100, 100
+        idx1[H(2)] = 2, 200, 200
+        idx1[H(3)] = 3, 300, 300
+        # no H(4) entry
+        idx2 = ChunkIndex()
+        idx2[H(1)] = 4, 100, 100
+        idx2[H(2)] = 5, 200, 200
+        # no H(3) entry
+        idx2[H(4)] = 6, 400, 400
+        idx1.merge(idx2)
+        assert idx1[H(1)] == (5, 100, 100)
+        assert idx1[H(2)] == (7, 200, 200)
+        assert idx1[H(3)] == (3, 300, 300)
+        assert idx1[H(4)] == (6, 400, 400)