hashindex_add C implementation

This factors out the loop body of hashindex_merge into its own function, so it is also callable from Cython/Python code.

Doing the update in a single C call saves some cycles, especially if the key is already present in the index.
Thomas Waldmann 2015-12-07 19:13:58 +01:00
parent 2e5baa6ec2
commit 720fc49498
3 changed files with 24 additions and 19 deletions

_hashindex.c

@@ -390,21 +390,24 @@ hashindex_summarize(HashIndex *index, long long *total_size, long long *total_csize,
     *total_chunks = chunks;
 }
 
+static void
+hashindex_add(HashIndex *index, const void *key, int32_t *other_values)
+{
+    int32_t *my_values = (int32_t *)hashindex_get(index, key);
+    if(my_values == NULL) {
+        hashindex_set(index, key, other_values);
+    } else {
+        *my_values += *other_values;
+    }
+}
+
 static void
 hashindex_merge(HashIndex *index, HashIndex *other)
 {
     int32_t key_size = index->key_size;
-    const int32_t *other_values;
-    int32_t *my_values;
     void *key = NULL;
 
     while((key = hashindex_next_key(other, key))) {
-        other_values = key + key_size;
-        my_values = (int32_t *)hashindex_get(index, key);
-        if(my_values == NULL) {
-            hashindex_set(index, key, other_values);
-        } else {
-            *my_values += *other_values;
-        }
+        hashindex_add(index, key, key + key_size);
     }
 }
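
Note on the hunk above: *my_values += *other_values dereferences only the first int32_t of the three-value payload, so when the key already exists only the reference count is summed; the stored size/csize keep their original values. A rough pure-Python model of hashindex_add, for illustration only (a plain dict stands in for the open-addressing hash table; this is not borg code):

    def hashindex_add_model(index, key, other_values):
        # index maps key -> [refs, size, csize], mirroring the 3 x int32 payload
        my_values = index.get(key)
        if my_values is None:
            index[key] = list(other_values)   # new key: store all three values
        else:
            my_values[0] += other_values[0]   # existing key: bump refcount only

    idx = {}
    hashindex_add_model(idx, b'id', [1, 4096, 1234])
    hashindex_add_model(idx, b'id', [1, 4096, 1234])
    assert idx[b'id'] == [2, 4096, 1234]      # refs summed, size/csize unchanged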

cache.py

@@ -255,18 +255,11 @@ Chunk index: {0.total_unique_chunks:20d} {0.total_chunks:20d}"""
         for id in ids:
             os.unlink(mkpath(id))
 
-        def add(chunk_idx, id, size, csize, incr=1):
-            try:
-                count, size, csize = chunk_idx[id]
-                chunk_idx[id] = count + incr, size, csize
-            except KeyError:
-                chunk_idx[id] = incr, size, csize
-
         def fetch_and_build_idx(archive_id, repository, key):
             chunk_idx = ChunkIndex()
             cdata = repository.get(archive_id)
             data = key.decrypt(archive_id, cdata)
-            add(chunk_idx, archive_id, len(data), len(cdata))
+            chunk_idx.add(archive_id, 1, len(data), len(cdata))
             archive = msgpack.unpackb(data)
             if archive[b'version'] != 1:
                 raise Exception('Unknown archive metadata version')
@@ -274,7 +267,7 @@ Chunk index: {0.total_unique_chunks:20d} {0.total_chunks:20d}"""
             unpacker = msgpack.Unpacker()
             for item_id, chunk in zip(archive[b'items'], repository.get_many(archive[b'items'])):
                 data = key.decrypt(item_id, chunk)
-                add(chunk_idx, item_id, len(data), len(chunk))
+                chunk_idx.add(item_id, 1, len(data), len(chunk))
                 unpacker.feed(data)
                 for item in unpacker:
                     if not isinstance(item, dict):
@@ -282,7 +275,7 @@ Chunk index: {0.total_unique_chunks:20d} {0.total_chunks:20d}"""
                         continue
                     if b'chunks' in item:
                         for chunk_id, size, csize in item[b'chunks']:
-                            add(chunk_idx, chunk_id, size, csize)
+                            chunk_idx.add(chunk_id, 1, size, csize)
             if self.do_cache:
                 fn = mkpath(archive_id)
                 fn_tmp = mkpath(archive_id, suffix='.tmp')
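
Note: each call site now passes the refcount increment (1) explicitly as the second argument. The net effect equals the removed add() helper with incr=1; the difference is that the read-modify-write happens in a single C call instead of a Python try/except per chunk. Old vs. new, side by side (illustrative; chunk_id, size, csize as in the hunk above):

    # old: two index accesses plus exception handling per chunk; the helper
    # re-stored the size/csize it read back (shadowing its parameters), so an
    # existing entry effectively only had its count bumped
    try:
        count, size, csize = chunk_idx[chunk_id]
        chunk_idx[chunk_id] = count + 1, size, csize
    except KeyError:
        chunk_idx[chunk_id] = 1, size, csize

    # new: one call with the same net effect, done in C
    chunk_idx.add(chunk_id, 1, size, csize)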

hashindex.pyx

@@ -15,6 +15,7 @@ cdef extern from "_hashindex.c":
                             long long *unique_size, long long *unique_csize,
                             long long *total_unique_chunks, long long *total_chunks)
     void hashindex_merge(HashIndex *index, HashIndex *other)
+    void hashindex_add(HashIndex *index, void *key, void *value)
     int hashindex_get_size(HashIndex *index)
     int hashindex_write(HashIndex *index, char *path)
     void *hashindex_get(HashIndex *index, void *key)
@@ -196,6 +197,14 @@ cdef class ChunkIndex(IndexBase):
                              &total_unique_chunks, &total_chunks)
         return total_size, total_csize, unique_size, unique_csize, total_unique_chunks, total_chunks
 
+    def add(self, key, refs, size, csize):
+        assert len(key) == self.key_size
+        cdef int[3] data
+        data[0] = _htole32(refs)
+        data[1] = _htole32(size)
+        data[2] = _htole32(csize)
+        hashindex_add(self.index, <char *>key, data)
+
     def merge(self, ChunkIndex other):
         hashindex_merge(self.index, other.index)
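
Note: _htole32 converts the values from host to little-endian byte order before they are stored, so the index keeps one fixed on-disk format regardless of host endianness. A minimal usage sketch of the new method, assuming the compiled borg.hashindex extension from this commit and the default 32-byte key size that ChunkIndex uses in cache.py above:

    from borg.hashindex import ChunkIndex

    idx = ChunkIndex()
    key = b'\x00' * 32              # len(key) must equal self.key_size
    idx.add(key, 1, 4096, 1234)     # new key: stores (refs=1, size, csize)
    idx.add(key, 1, 4096, 1234)     # existing key: hashindex_add bumps refs to 2
    assert idx[key] == (2, 4096, 1234)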