refcounting: use uint32_t, protect against overflows, fix merging for BE

Marian Beermann, 2016-04-12 00:10:44 +02:00
parent c90745cdbb · commit 29ebdbadae
GPG Key ID: 9B8450B91D1362C1 (no known key found for this signature in database)
5 changed files with 300 additions and 57 deletions

_hashindex.c

@@ -439,25 +439,3 @@ hashindex_get_size(HashIndex *index)
 {
     return index->num_entries;
 }
-
-static void
-hashindex_add(HashIndex *index, const void *key, int32_t *other_values)
-{
-    int32_t *my_values = (int32_t *)hashindex_get(index, key);
-    if(my_values == NULL) {
-        hashindex_set(index, key, other_values);
-    } else {
-        *my_values += *other_values;
-    }
-}
-
-static void
-hashindex_merge(HashIndex *index, HashIndex *other)
-{
-    int32_t key_size = index->key_size;
-    void *key = NULL;
-    while((key = hashindex_next_key(other, key))) {
-        hashindex_add(index, key, key + key_size);
-    }
-}
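Removing the C-side merge is also what fixes merging on big-endian hosts: the stored values are little-endian, but `*my_values += *other_values;` added them in host byte order. A self-contained sketch of the failure mode (plain Python for illustration, not borg code; function names are hypothetical):

    import struct

    def add_as_be_host(le_a, le_b):
        """What the removed C code effectively computed on a big-endian machine."""
        a = struct.unpack('>I', le_a)[0]             # little-endian bytes misread as big-endian
        b = struct.unpack('>I', le_b)[0]
        s = struct.pack('>I', (a + b) & 0xffffffff)  # written back without swapping
        return struct.unpack('<I', s)[0]             # later read correctly as little-endian

    def add_endian_correct(le_a, le_b):
        """What the new Cython _add() below does: swap to host order, add, swap back."""
        return struct.unpack('<I', le_a)[0] + struct.unpack('<I', le_b)[0]

    r255, r1 = struct.pack('<I', 255), struct.pack('<I', 1)
    assert add_endian_correct(r255, r1) == 256   # correct on any host
    assert add_as_be_host(r255, r1) == 0         # silently wraps once a byte-level carry occurs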

archive.py

@@ -743,8 +743,7 @@ class ArchiveChecker:
         def add_reference(id_, size, csize, cdata=None):
             try:
-                count, _, _ = self.chunks[id_]
-                self.chunks[id_] = count + 1, size, csize
+                self.chunks.incref(id_)
             except KeyError:
                 assert cdata is not None
                 self.chunks[id_] = 1, size, csize
 

cache.py

@@ -379,21 +379,19 @@ Chunk index: {0.total_unique_chunks:20d} {0.total_chunks:20d}"""
     def chunk_incref(self, id, stats):
         if not self.txn_active:
             self.begin_txn()
-        count, size, csize = self.chunks[id]
-        self.chunks[id] = (count + 1, size, csize)
+        count, size, csize = self.chunks.incref(id)
         stats.update(size, csize, False)
         return id, size, csize
 
     def chunk_decref(self, id, stats):
         if not self.txn_active:
             self.begin_txn()
-        count, size, csize = self.chunks[id]
-        if count == 1:
+        count, size, csize = self.chunks.decref(id)
+        if count == 0:
             del self.chunks[id]
             self.repository.delete(id, wait=False)
             stats.update(-size, -csize, True)
         else:
-            self.chunks[id] = (count - 1, size, csize)
             stats.update(-size, -csize, False)
 
     def file_known_and_unchanged(self, path_hash, st, ignore_inode=False):
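For orientation, the contract chunk_incref()/chunk_decref() now rely on, as a hedged sketch (the key/size/csize values are illustrative and the import path assumes borg's package layout): incref()/decref() return the value triple *after* the update, so a returned count of 0 means the last reference is gone.

    from borg.hashindex import ChunkIndex

    idx = ChunkIndex()
    key = bytes(32)                        # any 32-byte chunk id
    idx[key] = (1, 100, 50)                # refcount 1, size 100, csize 50
    count, size, csize = idx.incref(key)   # returns the *new* refcount: (2, 100, 50)
    count, size, csize = idx.decref(key)   # (1, 100, 50)
    count, size, csize = idx.decref(key)   # (0, 100, 50): last reference gone
    if count == 0:                         # hence `if count == 0` above, not `count == 1`
        del idx[key]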

hashindex.pyx

@@ -1,6 +1,9 @@
 # -*- coding: utf-8 -*-
 import os
+cimport cython
+from libc.stdint cimport uint32_t, UINT32_MAX, uint64_t
 
 API_VERSION = 2
@@ -19,13 +22,34 @@ cdef extern from "_hashindex.c":
     void *hashindex_next_key(HashIndex *index, void *key)
     int hashindex_delete(HashIndex *index, void *key)
     int hashindex_set(HashIndex *index, void *key, void *value)
-    int _htole32(int v)
-    int _le32toh(int v)
+    uint32_t _htole32(uint32_t v)
+    uint32_t _le32toh(uint32_t v)
 
 cdef _NoDefault = object()
 
-cimport cython
+"""
+The HashIndex is *not* a general-purpose data structure. The value size must be at least 4 bytes, and these
+first bytes are used for in-band signalling in the data structure itself.
+
+The constant MAX_VALUE defines the valid range for these 4 bytes when interpreted as a uint32_t, from 0
+to MAX_VALUE (inclusive). The following reserved values beyond MAX_VALUE are currently in use
+(byte order is LE)::
+
+    0xffffffff marks empty entries in the hashtable
+    0xfffffffe marks deleted entries in the hashtable
+
+None of the publicly available classes in this module will accept or return a reserved value;
+an AssertionError is raised instead.
+"""
+
+assert UINT32_MAX == 2**32-1
+
+# module-level constant because cdef's in classes can't have default values
+cdef uint32_t _MAX_VALUE = 2**32-1025
+MAX_VALUE = _MAX_VALUE
+
+assert _MAX_VALUE % 2 == 1
 
 @cython.internal
 cdef class IndexBase:
@@ -98,22 +122,30 @@ cdef class NSIndex(IndexBase):
 
     def __getitem__(self, key):
         assert len(key) == self.key_size
-        data = <int *>hashindex_get(self.index, <char *>key)
+        data = <uint32_t *>hashindex_get(self.index, <char *>key)
         if not data:
-            raise KeyError
-        return _le32toh(data[0]), _le32toh(data[1])
+            raise KeyError(key)
+        cdef uint32_t segment = _le32toh(data[0])
+        assert segment <= _MAX_VALUE, "maximum number of segments reached"
+        return segment, _le32toh(data[1])
 
     def __setitem__(self, key, value):
         assert len(key) == self.key_size
-        cdef int[2] data
-        data[0] = _htole32(value[0])
+        cdef uint32_t[2] data
+        cdef uint32_t segment = value[0]
+        assert segment <= _MAX_VALUE, "maximum number of segments reached"
+        data[0] = _htole32(segment)
         data[1] = _htole32(value[1])
         if not hashindex_set(self.index, <char *>key, data):
             raise Exception('hashindex_set failed')
 
     def __contains__(self, key):
+        cdef uint32_t segment
         assert len(key) == self.key_size
-        data = <int *>hashindex_get(self.index, <char *>key)
+        data = <uint32_t *>hashindex_get(self.index, <char *>key)
+        if data != NULL:
+            segment = _le32toh(data[0])
+            assert segment <= _MAX_VALUE, "maximum number of segments reached"
         return data != NULL
 
     def iteritems(self, marker=None):
@@ -146,25 +178,46 @@ cdef class NSKeyIterator:
         self.key = hashindex_next_key(self.index, <char *>self.key)
         if not self.key:
             raise StopIteration
-        cdef int *value = <int *>(self.key + self.key_size)
-        return (<char *>self.key)[:self.key_size], (_le32toh(value[0]), _le32toh(value[1]))
+        cdef uint32_t *value = <uint32_t *>(self.key + self.key_size)
+        cdef uint32_t segment = _le32toh(value[0])
+        assert segment <= _MAX_VALUE, "maximum number of segments reached"
+        return (<char *>self.key)[:self.key_size], (segment, _le32toh(value[1]))
 
 
 cdef class ChunkIndex(IndexBase):
+    """
+    Mapping of 32-byte keys to (refcount, size, csize), which are all 32-bit unsigned.
+
+    The reference count cannot overflow. If an overflow would occur, the refcount
+    is fixed to MAX_VALUE and will neither increase nor decrease by incref(), decref()
+    or add().
+
+    Prior signed 32-bit overflow is handled correctly for most cases: all values
+    above MAX_VALUE (exclusive) up to UINT32_MAX (2**32-1, inclusive) are reserved and either
+    cause silent data loss (-1, -2) or raise an AssertionError when accessed.
+    Other values are handled correctly. Note that previously the refcount could also reach
+    0 by *increasing* it.
+
+    Assigning refcounts in this reserved range is an invalid operation and raises AssertionError.
+    """
 
     value_size = 12
 
     def __getitem__(self, key):
         assert len(key) == self.key_size
-        data = <int *>hashindex_get(self.index, <char *>key)
+        data = <uint32_t *>hashindex_get(self.index, <char *>key)
         if not data:
-            raise KeyError
-        return _le32toh(data[0]), _le32toh(data[1]), _le32toh(data[2])
+            raise KeyError(key)
+        cdef uint32_t refcount = _le32toh(data[0])
+        assert refcount <= _MAX_VALUE
+        return refcount, _le32toh(data[1]), _le32toh(data[2])
 
     def __setitem__(self, key, value):
         assert len(key) == self.key_size
-        cdef int[3] data
-        data[0] = _htole32(value[0])
+        cdef uint32_t[3] data
+        cdef uint32_t refcount = value[0]
+        assert refcount <= _MAX_VALUE, "invalid reference count"
+        data[0] = _htole32(refcount)
         data[1] = _htole32(value[1])
         data[2] = _htole32(value[2])
         if not hashindex_set(self.index, <char *>key, data):
@@ -172,9 +225,38 @@ cdef class ChunkIndex(IndexBase):
 
     def __contains__(self, key):
         assert len(key) == self.key_size
-        data = <int *>hashindex_get(self.index, <char *>key)
+        data = <uint32_t *>hashindex_get(self.index, <char *>key)
+        if data != NULL:
+            assert data[0] <= _MAX_VALUE
         return data != NULL
 
+    def incref(self, key):
+        """Increase refcount for 'key', return (refcount, size, csize)"""
+        assert len(key) == self.key_size
+        data = <uint32_t *>hashindex_get(self.index, <char *>key)
+        if not data:
+            raise KeyError(key)
+        cdef uint32_t refcount = _le32toh(data[0])
+        assert refcount <= _MAX_VALUE, "invalid reference count"
+        if refcount != _MAX_VALUE:
+            refcount += 1
+        data[0] = _htole32(refcount)
+        return refcount, _le32toh(data[1]), _le32toh(data[2])
+
+    def decref(self, key):
+        """Decrease refcount for 'key', return (refcount, size, csize)"""
+        assert len(key) == self.key_size
+        data = <uint32_t *>hashindex_get(self.index, <char *>key)
+        if not data:
+            raise KeyError(key)
+        cdef uint32_t refcount = _le32toh(data[0])
+        # Never decrease a reference count of zero
+        assert 0 < refcount <= _MAX_VALUE, "invalid reference count"
+        if refcount != _MAX_VALUE:
+            refcount -= 1
+        data[0] = _htole32(refcount)
+        return refcount, _le32toh(data[1]), _le32toh(data[2])
+
     def iteritems(self, marker=None):
         cdef const void *key
         iter = ChunkKeyIterator(self.key_size)
@@ -188,8 +270,9 @@ cdef class ChunkIndex(IndexBase):
         return iter
 
     def summarize(self):
-        cdef long long size = 0, csize = 0, unique_size = 0, unique_csize = 0, chunks = 0, unique_chunks = 0
-        cdef int *values
+        cdef uint64_t size = 0, csize = 0, unique_size = 0, unique_csize = 0, chunks = 0, unique_chunks = 0
+        cdef uint32_t *values
+        cdef uint32_t refcount
         cdef void *key = NULL
 
         while True:
@@ -197,25 +280,46 @@ cdef class ChunkIndex(IndexBase):
             if not key:
                 break
             unique_chunks += 1
-            values = <int*> (key + self.key_size)
-            chunks += _le32toh(values[0])
+            values = <uint32_t*> (key + self.key_size)
+            refcount = _le32toh(values[0])
+            assert refcount <= MAX_VALUE, "invalid reference count"
+            chunks += refcount
             unique_size += _le32toh(values[1])
             unique_csize += _le32toh(values[2])
-            size += <long long> _le32toh(values[1]) * _le32toh(values[0])
-            csize += <long long> _le32toh(values[2]) * _le32toh(values[0])
+            size += <uint64_t> _le32toh(values[1]) * _le32toh(values[0])
+            csize += <uint64_t> _le32toh(values[2]) * _le32toh(values[0])
 
         return size, csize, unique_size, unique_csize, unique_chunks, chunks
 
     def add(self, key, refs, size, csize):
         assert len(key) == self.key_size
-        cdef int[3] data
+        cdef uint32_t[3] data
         data[0] = _htole32(refs)
         data[1] = _htole32(size)
         data[2] = _htole32(csize)
-        hashindex_add(self.index, <char *>key, data)
+        self._add(<char*> key, data)
+
+    cdef _add(self, void *key, uint32_t *data):
+        cdef uint64_t refcount1, refcount2, result64
+        values = <uint32_t*> hashindex_get(self.index, key)
+        if values:
+            refcount1 = _le32toh(values[0])
+            refcount2 = _le32toh(data[0])
+            assert refcount1 <= _MAX_VALUE
+            assert refcount2 <= _MAX_VALUE
+            result64 = refcount1 + refcount2
+            values[0] = _htole32(min(result64, _MAX_VALUE))
+        else:
+            hashindex_set(self.index, key, data)
 
     def merge(self, ChunkIndex other):
-        hashindex_merge(self.index, other.index)
+        cdef void *key = NULL
+
+        while True:
+            key = hashindex_next_key(other.index, key)
+            if not key:
+                break
+            self._add(key, <uint32_t*> (key + self.key_size))
 
 
 cdef class ChunkKeyIterator:
@@ -235,5 +339,7 @@ cdef class ChunkKeyIterator:
         self.key = hashindex_next_key(self.index, <char *>self.key)
         if not self.key:
             raise StopIteration
-        cdef int *value = <int *>(self.key + self.key_size)
-        return (<char *>self.key)[:self.key_size], (_le32toh(value[0]), _le32toh(value[1]), _le32toh(value[2]))
+        cdef uint32_t *value = <uint32_t *>(self.key + self.key_size)
+        cdef uint32_t refcount = _le32toh(value[0])
+        assert refcount <= MAX_VALUE, "invalid reference count"
+        return (<char *>self.key)[:self.key_size], (refcount, _le32toh(value[1]), _le32toh(value[2]))
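The _add() helper above is the heart of the overflow protection: it widens both refcounts to 64 bits before adding, then clamps to MAX_VALUE. A minimal sketch of the same arithmetic in plain Python (MAX_VALUE restated from the module constants above; saturating_add is an illustrative name, not borg API):

    MAX_VALUE = 2**32 - 1025   # as defined in hashindex.pyx above

    def saturating_add(refcount1, refcount2):
        assert refcount1 <= MAX_VALUE and refcount2 <= MAX_VALUE
        return min(refcount1 + refcount2, MAX_VALUE)   # like uint64: no wraparound possible

    # In uint32 arithmetic this sum would wrap to 705032704; here it saturates:
    assert (3000000000 + 2000000000) % 2**32 == 705032704
    assert saturating_add(3000000000, 2000000000) == MAX_VALUE
    # Below the limit, plain addition is preserved (cf. test_chunkindex_merge_limit1 below):
    half = MAX_VALUE // 2
    assert saturating_add(half, half) == MAX_VALUE - 1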

testsuite/hashindex.py

@ -1,8 +1,13 @@
import base64
import hashlib
import os
import struct
import tempfile
import zlib
import pytest
from ..hashindex import NSIndex, ChunkIndex
from .. import hashindex
from . import BaseTestCase
@@ -114,3 +119,160 @@ class HashIndexTestCase(BaseTestCase):
         assert unique_csize == 100 + 200 + 300
         assert chunks == 1 + 2 + 3
         assert unique_chunks == 3
+
+
+class HashIndexRefcountingTestCase(BaseTestCase):
+    def test_chunkindex_limit(self):
+        idx = ChunkIndex()
+        idx[H(1)] = hashindex.MAX_VALUE - 1, 1, 2
+
+        # 5 is arbitrary, any number of increfs/decrefs shouldn't move it once it's limited
+        for i in range(5):
+            # first incref moves it to the limit
+            refcount, *_ = idx.incref(H(1))
+            assert refcount == hashindex.MAX_VALUE
+
+        for i in range(5):
+            refcount, *_ = idx.decref(H(1))
+            assert refcount == hashindex.MAX_VALUE
+
+    def _merge(self, refcounta, refcountb):
+        def merge(refcount1, refcount2):
+            idx1 = ChunkIndex()
+            idx1[H(1)] = refcount1, 1, 2
+            idx2 = ChunkIndex()
+            idx2[H(1)] = refcount2, 1, 2
+            idx1.merge(idx2)
+            refcount, *_ = idx1[H(1)]
+            return refcount
+
+        result = merge(refcounta, refcountb)
+        # check for commutativity
+        assert result == merge(refcountb, refcounta)
+        return result
+
+    def test_chunkindex_merge_limit1(self):
+        # Check that it does *not* limit at MAX_VALUE - 1
+        # (MAX_VALUE is odd)
+        half = hashindex.MAX_VALUE // 2
+        assert self._merge(half, half) == hashindex.MAX_VALUE - 1
+
+    def test_chunkindex_merge_limit2(self):
+        # 3000000000 + 2000000000 > MAX_VALUE
+        assert self._merge(3000000000, 2000000000) == hashindex.MAX_VALUE
+
+    def test_chunkindex_merge_limit3(self):
+        # Crossover point: both addition and limit semantics will yield the same result
+        half = hashindex.MAX_VALUE // 2
+        assert self._merge(half + 1, half) == hashindex.MAX_VALUE
+
+    def test_chunkindex_merge_limit4(self):
+        # Beyond the crossover point, the result of addition exceeds MAX_VALUE and is clamped
+        half = hashindex.MAX_VALUE // 2
+        assert self._merge(half + 2, half) == hashindex.MAX_VALUE
+        assert self._merge(half + 1, half + 1) == hashindex.MAX_VALUE
+
+    def test_chunkindex_add(self):
+        idx1 = ChunkIndex()
+        idx1.add(H(1), 5, 6, 7)
+        assert idx1[H(1)] == (5, 6, 7)
+        idx1.add(H(1), 1, 0, 0)
+        assert idx1[H(1)] == (6, 6, 7)
+
+    def test_incref_limit(self):
+        idx1 = ChunkIndex()
+        idx1[H(1)] = (hashindex.MAX_VALUE, 6, 7)
+        idx1.incref(H(1))
+        refcount, *_ = idx1[H(1)]
+        assert refcount == hashindex.MAX_VALUE
+
+    def test_decref_limit(self):
+        idx1 = ChunkIndex()
+        idx1[H(1)] = hashindex.MAX_VALUE, 6, 7
+        idx1.decref(H(1))
+        refcount, *_ = idx1[H(1)]
+        assert refcount == hashindex.MAX_VALUE
+
+    def test_decref_zero(self):
+        idx1 = ChunkIndex()
+        idx1[H(1)] = 0, 0, 0
+        with pytest.raises(AssertionError):
+            idx1.decref(H(1))
+
+    def test_incref_decref(self):
+        idx1 = ChunkIndex()
+        idx1.add(H(1), 5, 6, 7)
+        assert idx1[H(1)] == (5, 6, 7)
+        idx1.incref(H(1))
+        assert idx1[H(1)] == (6, 6, 7)
+        idx1.decref(H(1))
+        assert idx1[H(1)] == (5, 6, 7)
+
+    def test_setitem_raises(self):
+        idx1 = ChunkIndex()
+        with pytest.raises(AssertionError):
+            idx1[H(1)] = hashindex.MAX_VALUE + 1, 0, 0
+
+    def test_keyerror(self):
+        idx = ChunkIndex()
+        with pytest.raises(KeyError):
+            idx.incref(H(1))
+        with pytest.raises(KeyError):
+            idx.decref(H(1))
+        with pytest.raises(KeyError):
+            idx[H(1)]
+        with pytest.raises(OverflowError):
+            idx.add(H(1), -1, 0, 0)
+
+
+class HashIndexDataTestCase(BaseTestCase):
+    # This bytestring was created with 1.0-maint at c2f9533
+    HASHINDEX = b'eJzt0L0NgmAUhtHLT0LDEI6AuAEhMVYmVnSuYefC7AB3Aj9KNedJbnfyFne6P67P27w0EdG1Eac+Cm1ZybAsy7Isy7Isy7Isy7I' \
+                b'sy7Isy7Isy7Isy7Isy7Isy7Isy7Isy7Isy7Isy7Isy7Isy7Isy7Isy7Isy7Isy7Isy7Isy7Isy7Isy7Isy7Isy7Isy7LsL9nhc+cqTZ' \
+                b'3XlO2Ys++Du5fX+l1/YFmWZVmWZVmWZVmWZVmWZVmWZVmWZVmWZVmWZVmWZVmWZVmWZVmWZVmWZVmWZVmWZVn2/+0O2rYccw=='
+
+    def _serialize_hashindex(self, idx):
+        with tempfile.TemporaryDirectory() as tempdir:
+            file = os.path.join(tempdir, 'idx')
+            idx.write(file)
+            with open(file, 'rb') as f:
+                return self._pack(f.read())
+
+    def _deserialize_hashindex(self, bytestring):
+        with tempfile.TemporaryDirectory() as tempdir:
+            file = os.path.join(tempdir, 'idx')
+            with open(file, 'wb') as f:
+                f.write(self._unpack(bytestring))
+            return ChunkIndex.read(file)
+
+    def _pack(self, bytestring):
+        return base64.b64encode(zlib.compress(bytestring))
+
+    def _unpack(self, bytestring):
+        return zlib.decompress(base64.b64decode(bytestring))
+
+    def test_identical_creation(self):
+        idx1 = ChunkIndex()
+        idx1[H(1)] = 1, 2, 3
+        idx1[H(2)] = 2**31 - 1, 0, 0
+        idx1[H(3)] = 4294962296, 0, 0  # 4294962296 is -5000 interpreted as a uint32_t
+
+        assert self._serialize_hashindex(idx1) == self.HASHINDEX
+
+    def test_read_known_good(self):
+        idx1 = self._deserialize_hashindex(self.HASHINDEX)
+        assert idx1[H(1)] == (1, 2, 3)
+        assert idx1[H(2)] == (2**31 - 1, 0, 0)
+        assert idx1[H(3)] == (4294962296, 0, 0)
+
+        idx2 = ChunkIndex()
+        idx2[H(3)] = 2**32 - 123456, 6, 7
+        idx1.merge(idx2)
+        assert idx1[H(3)] == (hashindex.MAX_VALUE, 0, 0)
+
+
+def test_nsindex_segment_limit():
+    idx = NSIndex()
+    with pytest.raises(AssertionError):
+        idx[H(1)] = hashindex.MAX_VALUE + 1, 0
+    assert H(1) not in idx
+    idx[H(2)] = hashindex.MAX_VALUE, 0
+    assert H(2) in idx