reimplement the chunk index merging in C

The Python code could take a rather long time, and most of that time was likely spent converting values from Python to C and back.
This commit is contained in:
Thomas Waldmann 2015-08-06 23:32:53 +02:00
parent 7e21d95ded
commit a1e039ba21
4 changed files with 46 additions and 2 deletions

View File

@ -385,3 +385,22 @@ hashindex_summarize(HashIndex *index, long long *total_size, long long *total_cs
*total_unique_chunks = unique_chunks;
*total_chunks = chunks;
}
/*
 * Merge every entry of `other` into `index`.
 *
 * Walks the keys of `other` with hashindex_next_key(). A key that is not yet
 * present in `index` is inserted together with the value stored for it in
 * `other`. For a key that is already present, the first int32_t of the value
 * in `other` is added to the first int32_t of the value in `index`; nothing
 * else of the existing value is modified.
 *
 * `key_size` is read from `index` only, so both indexes are assumed to use
 * the same key size -- NOTE(review): not checked here, confirm at call sites.
 * NOTE(review): the addition is not guarded against int32_t overflow, and
 * whatever hashindex_set() reports back (if anything) is ignored.
 */
static void
hashindex_merge(HashIndex *index, HashIndex *other)
{
    int32_t key_size = index->key_size;
    const int32_t *other_values;
    int32_t *my_values;
    void *key = NULL;

    while((key = hashindex_next_key(other, key))) {
        /* The value sits key_size bytes behind the key. Step in bytes through
         * a char pointer: arithmetic on void * is a compiler extension, not
         * valid ISO C. */
        other_values = (const int32_t *)((const char *)key + key_size);
        my_values = hashindex_get(index, key);
        if(my_values == NULL) {
            /* key unknown to index: take over the entry from other */
            hashindex_set(index, key, other_values);
        } else {
            /* key known: accumulate the leading int32_t of the value */
            *my_values += *other_values;
        }
    }
}

View File

@ -309,8 +309,7 @@ class Cache:
tf_in.extract(archive_id_hex, tmp_dir)
chunk_idx_path = os.path.join(tmp_dir, archive_id_hex).encode('utf-8')
archive_chunk_idx = ChunkIndex.read(chunk_idx_path)
for chunk_id, (count, size, csize) in archive_chunk_idx.iteritems():
add(chunk_idx, chunk_id, size, csize, incr=count)
chunk_idx.merge(archive_chunk_idx)
os.unlink(chunk_idx_path)
self.begin_txn()

View File

@ -14,6 +14,7 @@ cdef extern from "_hashindex.c":
void hashindex_summarize(HashIndex *index, long long *total_size, long long *total_csize,
long long *unique_size, long long *unique_csize,
long long *total_unique_chunks, long long *total_chunks)
void hashindex_merge(HashIndex *index, HashIndex *other)
int hashindex_get_size(HashIndex *index)
int hashindex_write(HashIndex *index, char *path)
void *hashindex_get(HashIndex *index, void *key)
@ -190,6 +191,9 @@ cdef class ChunkIndex(IndexBase):
&total_unique_chunks, &total_chunks)
return total_size, total_csize, unique_size, unique_csize, total_unique_chunks, total_chunks
def merge(self, ChunkIndex other not None):
    """Merge all entries of *other* into this index via the C-level hashindex_merge()."""
    # "not None" makes Cython reject None with a TypeError up front; a plain
    # typed argument would accept None and other.index would then be read
    # from an object that is not a ChunkIndex.
    hashindex_merge(self.index, other.index)
cdef class ChunkKeyIterator:
cdef ChunkIndex idx

View File

@ -6,6 +6,11 @@ from ..hashindex import NSIndex, ChunkIndex
from . import BaseTestCase
def H(x):
    # Build a key-like bytes value that depends on x: the decimal digits of x,
    # zero-padded to 32 digits (so 32 bytes for non-negative x), ASCII-encoded.
    padded = '%-0.32d' % x
    return padded.encode('ascii')
class HashIndexTestCase(BaseTestCase):
def _generic_test(self, cls, make_value, sha):
@ -78,3 +83,20 @@ class HashIndexTestCase(BaseTestCase):
second_half = list(idx.iteritems(marker=all[49][0]))
self.assert_equal(len(second_half), 50)
self.assert_equal(second_half, all[50:])
def test_chunkindex_merge(self):
    """merge() adds the counts of shared keys and copies keys only present in the other index."""
    idx1 = ChunkIndex()
    idx1[H(1)] = 1, 100, 100
    idx1[H(2)] = 2, 200, 200
    idx1[H(3)] = 3, 300, 300
    # no H(4) entry
    idx2 = ChunkIndex()
    idx2[H(1)] = 4, 100, 100
    idx2[H(2)] = 5, 200, 200
    # no H(3) entry
    idx2[H(4)] = 6, 400, 400
    idx1.merge(idx2)
    # Use the test case's assert_equal helper (as the other tests here do)
    # instead of bare assert, which is stripped under "python -O".
    # keys in both indexes: first value (count) is summed, sizes stay as they were
    self.assert_equal(idx1[H(1)], (5, 100, 100))
    self.assert_equal(idx1[H(2)], (7, 200, 200))
    # key only in idx1: untouched
    self.assert_equal(idx1[H(3)], (3, 300, 300))
    # key only in idx2: copied over as-is
    self.assert_equal(idx1[H(4)], (6, 400, 400))