2010-12-16 19:23:22 +00:00
|
|
|
# -*- coding: utf-8 -*-
|
2016-04-16 15:48:47 +00:00
|
|
|
from collections import namedtuple
|
2016-04-24 21:42:24 +00:00
|
|
|
import locale
|
2013-06-29 12:22:05 +00:00
|
|
|
import os
|
|
|
|
|
2016-04-11 22:10:44 +00:00
|
|
|
cimport cython
|
|
|
|
from libc.stdint cimport uint32_t, UINT32_MAX, uint64_t
|
2016-05-29 17:52:53 +00:00
|
|
|
from libc.errno cimport errno
|
2017-07-02 19:45:34 +00:00
|
|
|
from libc.string cimport memcpy
|
2016-05-29 17:52:53 +00:00
|
|
|
from cpython.exc cimport PyErr_SetFromErrnoWithFilename
|
2017-06-10 07:56:41 +00:00
|
|
|
from cpython.buffer cimport PyBUF_SIMPLE, PyObject_GetBuffer, PyBuffer_Release
|
2017-07-02 19:45:34 +00:00
|
|
|
from cpython.bytes cimport PyBytes_FromStringAndSize, PyBytes_CheckExact, PyBytes_GET_SIZE, PyBytes_AS_STRING
|
2016-04-11 22:10:44 +00:00
|
|
|
|
2019-02-24 14:42:21 +00:00
|
|
|
# Version tag of the API exposed by this compiled module.
# NOTE(review): presumably compared by the importing Python code to detect a stale build - confirm.
API_VERSION = '1.2_01'
|
2014-03-18 21:04:08 +00:00
|
|
|
|
2010-12-16 19:23:22 +00:00
|
|
|
|
2013-05-28 12:35:55 +00:00
|
|
|
cdef extern from "_hashindex.c":
    # Opaque handle of the C hash table; its fields are never accessed from Cython.
    ctypedef struct HashIndex:
        pass

    # Value layout used by FuseVersionsIndex: 32-bit version followed by a 16 byte hash.
    ctypedef struct FuseVersionsElement:
        uint32_t version
        char hash[16]

    # Read an index from a Python file object. "except *" makes Cython check for a
    # pending Python exception after every call.
    HashIndex *hashindex_read(object file_py, int permit_compact) except *
    # Create an empty index; the wrappers treat a NULL result as failure.
    HashIndex *hashindex_init(int capacity, int key_size, int value_size)
    void hashindex_free(HashIndex *index)
    # Number of entries (backs IndexBase.__len__).
    int hashindex_len(HashIndex *index)
    # Size of the hash table in bytes (backs IndexBase.size).
    int hashindex_size(HashIndex *index)
    # Write the index to a Python file object; may raise a Python exception.
    void hashindex_write(HashIndex *index, object file_py) except *
    # Look up a key; returns a pointer to the stored value or NULL if the key is absent.
    unsigned char *hashindex_get(HashIndex *index, unsigned char *key)
    # Iterate keys: pass NULL to start, pass the previous key to continue; NULL marks the end.
    # The value of an entry is located at (returned key + key_size).
    unsigned char *hashindex_next_key(HashIndex *index, unsigned char *key)
    # Return codes as interpreted by IndexBase.__delitem__: 1 = deleted, -1 = key not found, 0 = failure.
    int hashindex_delete(HashIndex *index, unsigned char *key)
    # Insert or overwrite an entry; a zero result is treated as failure by the wrappers.
    int hashindex_set(HashIndex *index, unsigned char *key, void *value)
    uint64_t hashindex_compact(HashIndex *index)
    # 32-bit byte order helpers (host <-> little endian, judging by their names);
    # applied to every 32-bit value that is stored in or loaded from the table.
    uint32_t _htole32(uint32_t v)
    uint32_t _le32toh(uint32_t v)

    # Exposed as IndexBase.MAX_LOAD_FACTOR; used to derive a capacity from a number of usable entries.
    double HASH_MAX_LOAD
|
|
|
|
|
2013-05-28 12:35:55 +00:00
|
|
|
|
2017-05-26 20:54:27 +00:00
|
|
|
cdef extern from "cache_sync/cache_sync.c":
    # Opaque state of the C cache synchronizer; only handled through the functions below.
    ctypedef struct CacheSyncCtx:
        pass

    # Create a synchronizer operating on the given chunk index; NULL is treated as failure.
    CacheSyncCtx *cache_sync_init(HashIndex *chunks)
    # Error message of the last failed feed, or NULL if there is none.
    const char *cache_sync_error(const CacheSyncCtx *ctx)
    # Counters exposed 1:1 as read-only properties of CacheSynchronizer.
    uint64_t cache_sync_num_files_totals(const CacheSyncCtx *ctx)
    uint64_t cache_sync_num_files_parts(const CacheSyncCtx *ctx)
    uint64_t cache_sync_size_totals(const CacheSyncCtx *ctx)
    uint64_t cache_sync_size_parts(const CacheSyncCtx *ctx)
    # Feed a block of data; a zero result signals an error (see cache_sync_error).
    # Note that the length is limited to 32 bits.
    int cache_sync_feed(CacheSyncCtx *ctx, void *data, uint32_t length)
    void cache_sync_free(CacheSyncCtx *ctx)

    # Largest non-reserved value of the first 4 bytes of an entry; exposed as IndexBase.MAX_VALUE.
    uint32_t _MAX_VALUE
|
|
|
|
|
|
|
|
|
2015-08-04 11:30:35 +00:00
|
|
|
# Private sentinel used as the default of IndexBase.pop(), so that None can be passed as a real default.
cdef _NoDefault = object()
|
2010-12-16 19:23:22 +00:00
|
|
|
|
2016-04-11 22:10:44 +00:00
|
|
|
"""
|
|
|
|
The HashIndex is *not* a general purpose data structure. The value size must be at least 4 bytes, and these
|
|
|
|
first bytes are used for in-band signalling in the data structure itself.
|
|
|
|
|
|
|
|
The constant MAX_VALUE defines the valid range for these 4 bytes when interpreted as an uint32_t from 0
|
|
|
|
to MAX_VALUE (inclusive). The following reserved values beyond MAX_VALUE are currently in use
|
|
|
|
(byte order is LE)::
|
|
|
|
|
|
|
|
0xffffffff marks empty entries in the hashtable
|
|
|
|
0xfffffffe marks deleted entries in the hashtable
|
|
|
|
|
|
|
|
None of the publicly available classes in this module will accept nor return a reserved value;
|
|
|
|
AssertionError is raised instead.
|
|
|
|
"""
|
|
|
|
|
|
|
|
# Import-time sanity check: the C constant must be the largest 32-bit unsigned value.
assert UINT32_MAX == 2**32-1

# Import-time sanity check: MAX_VALUE must be odd.
# NOTE(review): the reason for requiring an odd value is not visible in this file - confirm.
assert _MAX_VALUE % 2 == 1
|
2015-08-04 11:30:35 +00:00
|
|
|
|
2016-04-24 21:42:24 +00:00
|
|
|
|
2015-08-04 11:30:35 +00:00
|
|
|
@cython.internal
cdef class IndexBase:
    """
    Common base class of the indexes wrapping a C ``HashIndex``.

    Subclasses must define the class attribute ``value_size`` (and may override
    ``_key_size``) and implement ``__getitem__``, ``__setitem__`` and ``__contains__``;
    the dict-like helpers defined here (``get``, ``pop``, ``setdefault``) rely on them.
    """
    cdef HashIndex *index
    cdef int key_size

    # Key length in bytes; copied into the C attribute key_size by __cinit__.
    _key_size = 32

    MAX_LOAD_FACTOR = HASH_MAX_LOAD
    MAX_VALUE = _MAX_VALUE

    def __cinit__(self, capacity=0, path=None, permit_compact=False, usable=None):
        # Either load an existing index from *path* (a file name or a file object) or
        # create a new, empty one. *usable*, if given, overrides *capacity* with a
        # capacity large enough to hold that many entries at the maximum load factor.
        self.key_size = self._key_size
        if path:
            if isinstance(path, (str, bytes)):
                with open(path, 'rb') as fd:
                    self.index = hashindex_read(fd, permit_compact)
            else:
                self.index = hashindex_read(path, permit_compact)
            assert self.index, 'hashindex_read() returned NULL with no exception set'
        else:
            if usable is not None:
                capacity = int(usable / self.MAX_LOAD_FACTOR)
            self.index = hashindex_init(capacity, self.key_size, self.value_size)
            if not self.index:
                raise Exception('hashindex_init failed')

    def __dealloc__(self):
        if self.index:
            hashindex_free(self.index)

    @classmethod
    def read(cls, path, permit_compact=False):
        """Alternate constructor: load an index from *path* (file name or file object)."""
        return cls(path=path, permit_compact=permit_compact)

    def write(self, path):
        """Write the index to *path*, which may be a file name or a writable file object."""
        if isinstance(path, (str, bytes)):
            with open(path, 'wb') as fd:
                hashindex_write(self.index, fd)
        else:
            hashindex_write(self.index, path)

    def clear(self):
        """Replace the table by a new, empty one."""
        # Allocate the replacement first: if that fails, the object keeps its old
        # (valid) table instead of being left with a NULL index pointer.
        cdef HashIndex *fresh = hashindex_init(0, self.key_size, self.value_size)
        if not fresh:
            raise Exception('hashindex_init failed')
        hashindex_free(self.index)
        self.index = fresh

    def setdefault(self, key, value):
        """Store *value* under *key* unless the key is already present; return the stored value."""
        if key not in self:
            self[key] = value
        return self[key]

    def __delitem__(self, key):
        assert len(key) == self.key_size
        rc = hashindex_delete(self.index, <unsigned char *>key)
        if rc == 1:
            return  # success
        if rc == -1:
            raise KeyError(key)
        # rc == 0 is the documented failure code; any other unexpected code is
        # reported as a failure as well instead of being silently ignored.
        raise Exception('hashindex_delete failed')

    def get(self, key, default=None):
        """Return the value stored for *key*, or *default* if the key is missing."""
        try:
            return self[key]
        except KeyError:
            return default

    def pop(self, key, default=_NoDefault):
        """
        Remove *key* and return its value.

        If the key is missing, return *default* when one was given, else re-raise KeyError.
        """
        try:
            value = self[key]
            del self[key]
            return value
        except KeyError:
            # Identity check: the sentinel must not be compared via the (arbitrary)
            # __ne__ of whatever object the caller passed as default.
            if default is not _NoDefault:
                return default
            raise

    def __len__(self):
        return hashindex_len(self.index)

    def size(self):
        """Return size (bytes) of hash table."""
        return hashindex_size(self.index)

    def compact(self):
        """Compact the table via hashindex_compact() and return that function's result."""
        return hashindex_compact(self.index)
|
|
|
|
|
2010-12-16 19:23:22 +00:00
|
|
|
|
2017-07-02 19:45:34 +00:00
|
|
|
cdef class FuseVersionsIndex(IndexBase):
    """Index mapping 16 byte keys to ``(version, 16 byte hash)`` pairs."""
    # 4 byte version + 16 byte file contents hash
    value_size = 20
    _key_size = 16

    def __getitem__(self, key):
        """Return ``(version, hash)`` stored for *key*; raise KeyError if it is absent."""
        cdef FuseVersionsElement *entry
        assert len(key) == self.key_size
        entry = <FuseVersionsElement *>hashindex_get(self.index, <unsigned char *>key)
        if entry != NULL:
            return _le32toh(entry.version), PyBytes_FromStringAndSize(entry.hash, 16)
        raise KeyError(key)

    def __setitem__(self, key, value):
        """Store ``value == (version, hash)`` for *key*; *hash* must be exactly 16 bytes."""
        cdef FuseVersionsElement entry
        cdef int stored
        assert len(key) == self.key_size
        # The version is range-checked in host byte order before it is converted for storage.
        entry.version = value[0]
        assert entry.version <= _MAX_VALUE, "maximum number of versions reached"
        digest = value[1]
        if not (PyBytes_CheckExact(digest) and PyBytes_GET_SIZE(digest) == 16):
            raise TypeError("Expected bytes of length 16 for second value")
        memcpy(entry.hash, PyBytes_AS_STRING(digest), 16)
        entry.version = _htole32(entry.version)
        stored = hashindex_set(self.index, <unsigned char *>key, <void *> &entry)
        if not stored:
            raise Exception('hashindex_set failed')

    def __contains__(self, key):
        """Return True if *key* is present."""
        cdef unsigned char *found
        assert len(key) == self.key_size
        found = hashindex_get(self.index, <unsigned char *>key)
        return found != NULL
|
2017-07-02 19:45:34 +00:00
|
|
|
|
|
|
|
|
2010-12-16 19:23:22 +00:00
|
|
|
cdef class NSIndex(IndexBase):
    """
    Index mapping keys to pairs of 32-bit unsigned values.

    The first value of a pair is the segment and must not exceed MAX_VALUE.
    """

    value_size = 8

    def __getitem__(self, key):
        """Return the ``(segment, second value)`` pair stored for *key*; raise KeyError if absent."""
        cdef uint32_t *data
        cdef uint32_t segment
        assert len(key) == self.key_size
        data = <uint32_t *>hashindex_get(self.index, <unsigned char *>key)
        if not data:
            raise KeyError(key)
        segment = _le32toh(data[0])
        assert segment <= _MAX_VALUE, "maximum number of segments reached"
        return segment, _le32toh(data[1])

    def __setitem__(self, key, value):
        """Store the pair ``value`` (two 32-bit unsigned integers) for *key*."""
        assert len(key) == self.key_size
        cdef uint32_t[2] data
        cdef uint32_t segment = value[0]
        assert segment <= _MAX_VALUE, "maximum number of segments reached"
        data[0] = _htole32(segment)
        data[1] = _htole32(value[1])
        if not hashindex_set(self.index, <unsigned char *>key, data):
            raise Exception('hashindex_set failed')

    def __contains__(self, key):
        """Return True if *key* is present; a reserved segment value raises AssertionError."""
        cdef uint32_t *data
        cdef uint32_t segment
        assert len(key) == self.key_size
        data = <uint32_t *>hashindex_get(self.index, <unsigned char *>key)
        if data != NULL:
            segment = _le32toh(data[0])
            assert segment <= _MAX_VALUE, "maximum number of segments reached"
        return data != NULL

    def iteritems(self, marker=None):
        """
        Return an iterator over ``(key, (segment, second value))``.

        If *marker* is given (and not empty), iteration starts with the entry *after*
        the one stored under the key *marker*.

        Raises IndexError if *marker* is not contained in the index.
        """
        cdef const unsigned char *key
        iterator = NSKeyIterator(self.key_size)
        iterator.idx = self
        iterator.index = self.index
        if marker:
            # Same guard as for every other key argument: the C code reads key_size bytes.
            assert len(marker) == self.key_size
            key = hashindex_get(self.index, <unsigned char *>marker)
            # hashindex_get() returns NULL for an unknown key; without this check the
            # pointer arithmetic below would produce an invalid iterator position.
            if key == NULL:
                raise IndexError
            # hashindex_get() points at the value, which directly follows the key.
            iterator.key = key - self.key_size
        return iterator
|
|
|
|
|
|
|
|
|
|
|
|
cdef class NSKeyIterator:
    """Iterator over the entries of an NSIndex, created by NSIndex.iteritems()."""
    # Set by NSIndex.iteritems(); not read here.
    # NOTE(review): presumably held to keep the index alive while iterating - confirm.
    cdef NSIndex idx
    cdef HashIndex *index
    # Key returned last (NULL before the first step).
    cdef const unsigned char *key
    cdef int key_size
    # Set once the end was reached, so later __next__ calls stop right away.
    cdef int exhausted

    def __cinit__(self, key_size):
        self.exhausted = 0
        self.key_size = key_size
        self.key = NULL

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next ``(key, (segment, second value))`` item."""
        cdef uint32_t *entry
        cdef uint32_t segment
        if self.exhausted:
            raise StopIteration
        self.key = hashindex_next_key(self.index, <unsigned char *>self.key)
        if self.key == NULL:
            self.exhausted = 1
            raise StopIteration
        # The value of an entry is stored directly behind its key.
        entry = <uint32_t *>(self.key + self.key_size)
        segment = _le32toh(entry[0])
        assert segment <= _MAX_VALUE, "maximum number of segments reached"
        key_bytes = PyBytes_FromStringAndSize(<char *>self.key, self.key_size)
        return key_bytes, (segment, _le32toh(entry[1]))
|
2010-12-16 19:23:22 +00:00
|
|
|
|
|
|
|
|
2016-04-16 15:48:47 +00:00
|
|
|
# Record type returned by ChunkIndex.__getitem__ and by iteration over a ChunkIndex.
ChunkIndexEntry = namedtuple('ChunkIndexEntry', 'refcount size csize')
|
|
|
|
|
|
|
|
|
2011-07-30 19:13:48 +00:00
|
|
|
cdef class ChunkIndex(IndexBase):
    """
    Mapping of 32 byte keys to (refcount, size, csize), which are all 32-bit unsigned.

    The reference count cannot overflow. If an overflow would occur, the refcount
    is fixed to MAX_VALUE and will neither increase nor decrease by incref(), decref()
    or add().

    Prior signed 32-bit overflow is handled correctly for most cases: All values
    from UINT32_MAX (2**32-1, inclusive) to MAX_VALUE (exclusive) are reserved and either
    cause silent data loss (-1, -2) or will raise an AssertionError when accessed.
    Other values are handled correctly. Note that previously the refcount could also reach
    0 by *increasing* it.

    Assigning refcounts in this reserved range is an invalid operation and raises AssertionError.
    """

    value_size = 12

    def __getitem__(self, key):
        """Return the ChunkIndexEntry stored for *key*; raise KeyError if absent."""
        cdef uint32_t *data
        cdef uint32_t refcount
        assert len(key) == self.key_size
        data = <uint32_t *>hashindex_get(self.index, <unsigned char *>key)
        if not data:
            raise KeyError(key)
        refcount = _le32toh(data[0])
        assert refcount <= _MAX_VALUE, "invalid reference count"
        return ChunkIndexEntry(refcount, _le32toh(data[1]), _le32toh(data[2]))

    def __setitem__(self, key, value):
        """Store ``value == (refcount, size, csize)`` for *key*."""
        assert len(key) == self.key_size
        cdef uint32_t[3] data
        cdef uint32_t refcount = value[0]
        assert refcount <= _MAX_VALUE, "invalid reference count"
        data[0] = _htole32(refcount)
        data[1] = _htole32(value[1])
        data[2] = _htole32(value[2])
        if not hashindex_set(self.index, <unsigned char *>key, data):
            raise Exception('hashindex_set failed')

    def __contains__(self, key):
        """Return True if *key* is present; a reserved refcount raises AssertionError."""
        cdef uint32_t *data
        assert len(key) == self.key_size
        data = <uint32_t *>hashindex_get(self.index, <unsigned char *>key)
        if data != NULL:
            assert _le32toh(data[0]) <= _MAX_VALUE, "invalid reference count"
        return data != NULL

    def incref(self, key):
        """Increase refcount for 'key', return (refcount, size, csize)"""
        cdef uint32_t *data
        cdef uint32_t refcount
        assert len(key) == self.key_size
        data = <uint32_t *>hashindex_get(self.index, <unsigned char *>key)
        if not data:
            raise KeyError(key)
        refcount = _le32toh(data[0])
        assert refcount <= _MAX_VALUE, "invalid reference count"
        # A saturated refcount stays at MAX_VALUE.
        if refcount != _MAX_VALUE:
            refcount += 1
        data[0] = _htole32(refcount)
        return refcount, _le32toh(data[1]), _le32toh(data[2])

    def decref(self, key):
        """Decrease refcount for 'key', return (refcount, size, csize)"""
        cdef uint32_t *data
        cdef uint32_t refcount
        assert len(key) == self.key_size
        data = <uint32_t *>hashindex_get(self.index, <unsigned char *>key)
        if not data:
            raise KeyError(key)
        refcount = _le32toh(data[0])
        # Never decrease a reference count of zero
        assert 0 < refcount <= _MAX_VALUE, "invalid reference count"
        # A saturated refcount stays at MAX_VALUE.
        if refcount != _MAX_VALUE:
            refcount -= 1
        data[0] = _htole32(refcount)
        return refcount, _le32toh(data[1]), _le32toh(data[2])

    def iteritems(self, marker=None):
        """
        Return an iterator over ``(key, ChunkIndexEntry)``.

        If *marker* is given (and not empty), iteration starts with the entry *after*
        the one stored under the key *marker*.

        Raises IndexError if *marker* is not contained in the index.
        """
        cdef const unsigned char *key
        iterator = ChunkKeyIterator(self.key_size)
        iterator.idx = self
        iterator.index = self.index
        if marker:
            # Same guard as for every other key argument: the C code reads key_size bytes.
            assert len(marker) == self.key_size
            key = hashindex_get(self.index, <unsigned char *>marker)
            # hashindex_get() returns NULL for an unknown key; without this check the
            # pointer arithmetic below would produce an invalid iterator position.
            if key == NULL:
                raise IndexError
            # hashindex_get() points at the value, which directly follows the key.
            iterator.key = key - self.key_size
        return iterator

    def summarize(self):
        """
        Calculate chunk statistics over the whole index.

        Every entry counts once for the unique_* values and refcount times for the
        totals.

        Return: size, csize, unique_size, unique_csize, unique_chunks, chunks.
        """
        cdef uint64_t size = 0, csize = 0, unique_size = 0, unique_csize = 0, chunks = 0, unique_chunks = 0
        cdef uint32_t *values
        cdef uint32_t refcount
        cdef unsigned char *key = NULL

        while True:
            key = hashindex_next_key(self.index, key)
            if not key:
                break
            unique_chunks += 1
            values = <uint32_t*> (key + self.key_size)
            refcount = _le32toh(values[0])
            assert refcount <= _MAX_VALUE, "invalid reference count"
            chunks += refcount
            unique_size += _le32toh(values[1])
            # The compressed sizes have to be accumulated exactly like the plain sizes,
            # otherwise csize / unique_csize would always be reported as 0.
            unique_csize += _le32toh(values[2])
            size += <uint64_t> _le32toh(values[1]) * refcount
            csize += <uint64_t> _le32toh(values[2]) * refcount

        return size, csize, unique_size, unique_csize, unique_chunks, chunks

    def stats_against(self, ChunkIndex master_index):
        """
        Calculate chunk statistics of this index against *master_index*.

        A chunk is counted as unique if the number of references
        in this index matches the number of references in *master_index*.

        This index must be a subset of *master_index*.

        Return the same statistics tuple as summarize:
        size, csize, unique_size, unique_csize, unique_chunks, chunks.
        """
        cdef uint64_t size = 0, csize = 0, unique_size = 0, unique_csize = 0, chunks = 0, unique_chunks = 0
        cdef uint32_t our_refcount, chunk_size, chunk_csize
        cdef const uint32_t *our_values
        cdef const uint32_t *master_values
        cdef const unsigned char *key = NULL
        cdef HashIndex *master = master_index.index

        while True:
            key = hashindex_next_key(self.index, key)
            if not key:
                break
            our_values = <const uint32_t*> (key + self.key_size)
            master_values = <const uint32_t*> hashindex_get(master, key)
            if not master_values:
                raise ValueError('stats_against: key contained in self but not in master_index.')
            our_refcount = _le32toh(our_values[0])
            # Sizes are taken from the master index, only the refcount is ours.
            chunk_size = _le32toh(master_values[1])
            chunk_csize = _le32toh(master_values[2])

            chunks += our_refcount
            size += <uint64_t> chunk_size * our_refcount
            csize += <uint64_t> chunk_csize * our_refcount
            if our_values[0] == master_values[0]:
                # our refcount equals the master's refcount, so this chunk is unique to us
                unique_chunks += 1
                unique_size += chunk_size
                unique_csize += chunk_csize

        return size, csize, unique_size, unique_csize, unique_chunks, chunks

    def add(self, key, refs, size, csize):
        """
        Add *refs* references for *key*.

        An existing entry gets its refcount increased by *refs* (saturating at
        MAX_VALUE) and its sizes replaced; a missing entry is created.
        """
        assert len(key) == self.key_size
        cdef uint32_t[3] data
        data[0] = _htole32(refs)
        data[1] = _htole32(size)
        data[2] = _htole32(csize)
        self._add(<unsigned char*> key, data)

    cdef _add(self, unsigned char *key, uint32_t *data):
        # *data* points to 3 values (refcount, size, csize) in storage (little endian) byte order.
        cdef uint64_t refcount1, refcount2, result64
        cdef uint32_t *values
        values = <uint32_t*> hashindex_get(self.index, key)
        if values:
            refcount1 = _le32toh(values[0])
            refcount2 = _le32toh(data[0])
            assert refcount1 <= _MAX_VALUE, "invalid reference count"
            assert refcount2 <= _MAX_VALUE, "invalid reference count"
            # Sum in 64 bits so the addition itself cannot overflow, then saturate.
            result64 = refcount1 + refcount2
            values[0] = _htole32(min(result64, _MAX_VALUE))
            values[1] = data[1]
            values[2] = data[2]
        else:
            if not hashindex_set(self.index, key, data):
                raise Exception('hashindex_set failed')

    def merge(self, ChunkIndex other):
        """Add all entries of *other* to this index (see add() for the semantics)."""
        cdef unsigned char *key = NULL

        while True:
            key = hashindex_next_key(other.index, key)
            if not key:
                break
            self._add(key, <uint32_t*> (key + self.key_size))

    def zero_csize_ids(self):
        """Return a list of the keys of all entries whose csize is 0."""
        cdef unsigned char *key = NULL
        cdef uint32_t *values
        entries = []
        while True:
            key = hashindex_next_key(self.index, key)
            if not key:
                break
            values = <uint32_t*> (key + self.key_size)
            refcount = _le32toh(values[0])
            assert refcount <= _MAX_VALUE, "invalid reference count"
            if _le32toh(values[2]) == 0:
                # csize == 0
                entries.append(PyBytes_FromStringAndSize(<char*> key, self.key_size))
        return entries
|
|
|
|
|
2011-07-30 19:13:48 +00:00
|
|
|
|
|
|
|
cdef class ChunkKeyIterator:
    """Iterator over the entries of a ChunkIndex, created by ChunkIndex.iteritems()."""
    # Set by ChunkIndex.iteritems(); not read here.
    # NOTE(review): presumably held to keep the index alive while iterating - confirm.
    cdef ChunkIndex idx
    cdef HashIndex *index
    # Key returned last (NULL before the first step).
    cdef const unsigned char *key
    cdef int key_size
    # Set once the end was reached, so later __next__ calls stop right away.
    cdef int exhausted

    def __cinit__(self, key_size):
        self.exhausted = 0
        self.key_size = key_size
        self.key = NULL

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next ``(key, ChunkIndexEntry)`` item."""
        cdef uint32_t *entry
        cdef uint32_t refcount
        if self.exhausted:
            raise StopIteration
        self.key = hashindex_next_key(self.index, <unsigned char *>self.key)
        if self.key == NULL:
            self.exhausted = 1
            raise StopIteration
        # The value of an entry is stored directly behind its key.
        entry = <uint32_t *>(self.key + self.key_size)
        refcount = _le32toh(entry[0])
        assert refcount <= _MAX_VALUE, "invalid reference count"
        key_bytes = PyBytes_FromStringAndSize(<char *>self.key, self.key_size)
        return key_bytes, ChunkIndexEntry(refcount, _le32toh(entry[1]), _le32toh(entry[2]))
|
2017-03-07 14:13:59 +00:00
|
|
|
|
|
|
|
|
2017-06-10 07:56:41 +00:00
|
|
|
cdef Py_buffer ro_buffer(object data) except *:
    # Request a plain (PyBUF_SIMPLE) buffer of *data* and hand it back by value.
    # A failing request propagates as a Python exception ("except *").
    # The caller owns the result and has to call PyBuffer_Release() on it.
    cdef Py_buffer buf
    PyObject_GetBuffer(data, &buf, PyBUF_SIMPLE)
    return buf
|
|
|
|
|
|
|
|
|
2017-03-07 14:13:59 +00:00
|
|
|
cdef class CacheSynchronizer:
    """
    Wrapper around the C cache synchronizer operating on a ChunkIndex.

    Data is passed in via feed(); the counters maintained by the C code are
    available as read-only properties.
    """
    # Reference to the index the C context works on.
    cdef ChunkIndex chunks
    cdef CacheSyncCtx *sync

    def __cinit__(self, ChunkIndex chunks not None):
        # "not None": None would pass the attribute type check but crash when
        # chunks.index is dereferenced below, so it is rejected with a TypeError.
        self.chunks = chunks
        self.sync = cache_sync_init(self.chunks.index)
        if not self.sync:
            raise Exception('cache_sync_init failed')

    def __dealloc__(self):
        if self.sync:
            cache_sync_free(self.sync)

    def feed(self, chunk):
        """
        Pass the bytes of *chunk* (any object supporting the buffer protocol) to the C code.

        Raises ValueError if the chunk is too large for the 32-bit length of the
        C interface or if the C code reports an error.
        """
        cdef Py_buffer chunk_buf = ro_buffer(chunk)
        cdef int rc
        # cache_sync_feed() takes a uint32_t length; refuse oversized input instead of
        # letting the implicit conversion silently truncate it.
        if <uint64_t> chunk_buf.len > UINT32_MAX:
            PyBuffer_Release(&chunk_buf)
            raise ValueError('cache_sync_feed failed: chunk too large')
        rc = cache_sync_feed(self.sync, chunk_buf.buf, <uint32_t> chunk_buf.len)
        # The buffer is no longer needed once the C call has returned.
        PyBuffer_Release(&chunk_buf)
        if not rc:
            error = cache_sync_error(self.sync)
            if error != NULL:
                raise ValueError('cache_sync_feed failed: ' + error.decode('ascii'))

    @property
    def num_files_totals(self):
        return cache_sync_num_files_totals(self.sync)

    @property
    def num_files_parts(self):
        return cache_sync_num_files_parts(self.sync)

    @property
    def size_totals(self):
        return cache_sync_size_totals(self.sync)

    @property
    def size_parts(self):
        return cache_sync_size_parts(self.sync)
|