Use Cython for all native code

This commit is contained in:
Jonas Borgström 2013-05-28 14:35:55 +02:00
parent b994203c01
commit 3e5433855c
10 changed files with 137 additions and 196 deletions

1
.gitignore vendored
View File

@ -3,6 +3,7 @@ build
dist
env
hashindex.c
chunker.c
*.egg-info
*.pyc
*.pyo

View File

@ -1,6 +1,4 @@
#include <Python.h>
#include <structmember.h>
#include <stdint.h>
/* Cyclic polynomial / buzhash: https://en.wikipedia.org/wiki/Rolling_hash */
@ -44,7 +42,7 @@ static uint32_t table_base[] =
static uint32_t *
init_buzhash_table(uint32_t seed)
buzhash_init_table(uint32_t seed)
{
int i;
uint32_t *table = malloc(1024);
@ -56,9 +54,9 @@ init_buzhash_table(uint32_t seed)
}
static uint32_t
buzhash(const unsigned char *data, int len, const uint32_t *h)
buzhash(const unsigned char *data, size_t len, const uint32_t *h)
{
int i;
size_t i;
uint32_t sum = 0;
for(i = len - 1; i > 0; i--)
{
@ -69,63 +67,72 @@ buzhash(const unsigned char *data, int len, const uint32_t *h)
}
static uint32_t
buzhash_update(uint32_t sum, unsigned char remove, unsigned char add, int len, const uint32_t *h)
buzhash_update(uint32_t sum, unsigned char remove, unsigned char add, size_t len, const uint32_t *h)
{
return BARREL_SHIFT(sum, 1) ^ BARREL_SHIFT(h[remove], len) ^ h[add];
}
typedef struct {
PyObject_HEAD
int window_size, last, done, buf_size, remaining, position, chunk_mask, min_size;
size_t bytes_read, bytes_yielded;
uint32_t *h;
PyObject *chunks, *fd;
unsigned char *data;
} ChunkifyIter;
int window_size, chunk_mask, min_size;
size_t buf_size;
uint32_t *table;
uint8_t *data;
PyObject *fd;
int done;
size_t remaining, bytes_read, bytes_yielded, position, last;
} Chunker;
static PyObject*
ChunkifyIter_iter(PyObject *self)
static Chunker *
chunker_init(PyObject *fd, int window_size, int chunk_mask, int min_size, uint32_t seed)
{
ChunkifyIter *c = (ChunkifyIter *)self;
c->remaining = 0;
c->position = 0;
Chunker *c = malloc(sizeof(Chunker));
c->window_size = window_size;
c->chunk_mask = chunk_mask;
c->min_size = min_size;
c->table = buzhash_init_table(seed);
c->buf_size = 10 * 1024 * 1024;
c->data = malloc(c->buf_size);
c->fd = fd;
Py_INCREF(fd);
c->done = 0;
c->last = 0;
c->remaining = 0;
c->bytes_read = 0;
c->bytes_yielded = 0;
Py_INCREF(self);
return self;
c->position = 0;
c->last = 0;
return c;
}
static void
ChunkifyIter_dealloc(PyObject *self)
chunker_free(Chunker *c)
{
ChunkifyIter *c = (ChunkifyIter *)self;
Py_DECREF(c->fd);
free(c->table);
free(c->data);
free(c->h);
self->ob_type->tp_free(self);
free(c);
}
static void
ChunkifyIter_fill(PyObject *self)
static int
chunker_fill(Chunker *c)
{
ChunkifyIter *c = (ChunkifyIter *)self;
memmove(c->data, c->data + c->last, c->position + c->remaining - c->last);
c->position -= c->last;
c->last = 0;
PyObject *data = PyObject_CallMethod(c->fd, "read", "i", c->buf_size - c->position - c->remaining);
if(!data) {
return 0;
}
int n = PyString_Size(data);
memcpy(c->data + c->position + c->remaining, PyString_AsString(data), n);
c->remaining += n;
c->bytes_read += n;
Py_DECREF(data);
return 1;
}
static PyObject*
ChunkifyIter_iternext(PyObject *self)
static PyObject *
chunker_process(Chunker *c)
{
ChunkifyIter *c = (ChunkifyIter *)self;
uint32_t sum, chunk_mask = c->chunk_mask, min_size = c->min_size, window_size = c->window_size;
int n = 0;
@ -137,7 +144,9 @@ ChunkifyIter_iternext(PyObject *self)
return NULL;
}
if(c->remaining <= window_size) {
ChunkifyIter_fill(self);
if(!chunker_fill(c)) {
return NULL;
}
}
if(c->remaining < window_size) {
c->done = 1;
@ -153,16 +162,18 @@ ChunkifyIter_iternext(PyObject *self)
return NULL;
}
}
sum = buzhash(c->data + c->position, window_size, c->h);
sum = buzhash(c->data + c->position, window_size, c->table);
while(c->remaining >= c->window_size && ((sum & chunk_mask) || n < min_size)) {
sum = buzhash_update(sum, c->data[c->position],
c->data[c->position + window_size],
window_size, c->h);
window_size, c->table);
c->position++;
c->remaining--;
n++;
if(c->remaining <= window_size) {
ChunkifyIter_fill(self);
if(!chunker_fill(c)) {
return NULL;
}
}
}
if(c->remaining <= window_size) {
@ -174,117 +185,5 @@ ChunkifyIter_iternext(PyObject *self)
n = c->last - old_last;
c->bytes_yielded += n;
return PyBuffer_FromMemory(c->data + old_last, n);
}
static PyTypeObject ChunkifyIterType = {
PyObject_HEAD_INIT(NULL)
0, /*ob_size*/
"_chunkifier._ChunkifyIter", /*tp_name*/
sizeof(ChunkifyIter), /*tp_basicsize*/
0, /*tp_itemsize*/
ChunkifyIter_dealloc, /*tp_dealloc*/
0, /*tp_print*/
0, /*tp_getattr*/
0, /*tp_setattr*/
0, /*tp_compare*/
0, /*tp_repr*/
0, /*tp_as_number*/
0, /*tp_as_sequence*/
0, /*tp_as_mapping*/
0, /*tp_hash */
0, /*tp_call*/
0, /*tp_str*/
0, /*tp_getattro*/
0, /*tp_setattro*/
0, /*tp_as_buffer*/
Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_ITER,
/* tp_flags: Py_TPFLAGS_HAVE_ITER tells python to
use tp_iter and tp_iternext fields. */
"", /* tp_doc */
0, /* tp_traverse */
0, /* tp_clear */
0, /* tp_richcompare */
0, /* tp_weaklistoffset */
ChunkifyIter_iter, /* tp_iter: __iter__() method */
ChunkifyIter_iternext /* tp_iternext: next() method */
};
static PyObject *
chunkify(PyObject *self, PyObject *args)
{
PyObject *fd;
int seed, window_size, chunk_mask, min_size;
ChunkifyIter *c;
if (!PyArg_ParseTuple(args, "Oiiii", &fd, &window_size, &chunk_mask, &min_size, &seed))
{
return NULL;
}
if (!(c = PyObject_New(ChunkifyIter, &ChunkifyIterType)))
{
return NULL;
}
PyObject_Init((PyObject *)c, &ChunkifyIterType);
c->buf_size = 10 * 1024 * 1024;
c->data = malloc(c->buf_size);
c->h = init_buzhash_table(seed & 0xffffffff);
c->fd = fd;
c->window_size = window_size;
c->chunk_mask = chunk_mask;
c->min_size = min_size;
Py_INCREF(fd);
return (PyObject *)c;
}
static PyObject *
do_buzhash(PyObject *self, PyObject *args)
{
unsigned char *data;
int len;
unsigned long int seed, sum;
uint32_t *h;
if (!PyArg_ParseTuple(args, "s#k", &data, &len, &seed))
{
return NULL;
}
h = init_buzhash_table(seed & 0xffffffff);
sum = buzhash(data, len, h);
free(h);
return PyLong_FromUnsignedLong(sum);
}
static PyObject *
do_buzhash_update(PyObject *self, PyObject *args)
{
unsigned long int sum, seed;
unsigned char remove, add;
uint32_t *h;
int len;
if (!PyArg_ParseTuple(args, "kbbik", &sum, &remove, &add, &len, &seed))
{
return NULL;
}
h = init_buzhash_table(seed & 0xffffffff);
sum = buzhash_update(sum, remove, add, len, h);
free(h);
return PyLong_FromUnsignedLong(sum);
}
static PyMethodDef ChunkifierMethods[] = {
{"chunkify", chunkify, METH_VARARGS, ""},
{"buzhash", do_buzhash, METH_VARARGS, ""},
{"buzhash_update", do_buzhash_update, METH_VARARGS, ""},
{NULL, NULL, 0, NULL} /* Sentinel */
};
PyMODINIT_FUNC
init_speedups(void)
{
ChunkifyIterType.tp_new = PyType_GenericNew;
if (PyType_Ready(&ChunkifyIterType) < 0) return;
Py_InitModule("_speedups", ChunkifierMethods);
}
}

View File

@ -9,8 +9,6 @@
#include <unistd.h>
#include <sys/mman.h>
#include "hashindex.h"
typedef struct {
char magic[8];
int32_t num_entries;
@ -19,6 +17,18 @@ typedef struct {
int8_t value_size;
} __attribute__((__packed__)) HashHeader;
/*
 * In-memory handle for one hash index.
 *
 * hashindex_flush() stores num_entries into the mapping at map_addr + 8 and
 * then msync()s map_length bytes starting at map_addr, so the structure
 * describes a memory-mapped region whose header carries the entry count.
 */
typedef struct {
char *path;       /* presumably the path given to hashindex_open()/hashindex_create() -- TODO confirm */
void *map_addr;   /* start of the mapped region; passed to msync() by hashindex_flush() */
off_t map_length; /* length in bytes handed to msync() together with map_addr */
void *buckets;    /* presumably the first bucket inside the mapping -- TODO confirm */
int num_entries;  /* live entry count: returned by hashindex_get_size(), decremented by hashindex_delete() */
int num_buckets;  /* presumably the number of bucket slots in the table -- TODO confirm */
int key_size;     /* key width in bytes; a bucket's value (and its deleted marker) sits at bucket address + key_size */
int value_size;   /* presumably the value width in bytes -- TODO confirm */
int bucket_size;  /* presumably key_size + value_size -- TODO confirm */
int limit;        /* NOTE(review): looks like the entry count that triggers a resize -- confirm */
} HashIndex;
#define MAGIC "DARCHASH"
#define EMPTY ((int32_t)-1)
@ -33,6 +43,16 @@ typedef struct {
#define BUCKET_MARK_DELETED(index, idx) (*((int32_t *)(BUCKET_ADDR_WRITE(index, idx) + index->key_size)) = DELETED)
static HashIndex *hashindex_open(const char *path);
static void hashindex_close(HashIndex *index);
static void hashindex_clear(HashIndex *index);
static void hashindex_flush(HashIndex *index);
static HashIndex *hashindex_create(const char *path, int capacity, int key_size, int value_size);
static const void *hashindex_get(HashIndex *index, const void *key);
static void hashindex_set(HashIndex *index, const void *key, const void *value);
static void hashindex_delete(HashIndex *index, const void *key);
static void *hashindex_next_key(HashIndex *index, const void *key);
/* Private API */
static int
@ -97,7 +117,7 @@ hashindex_resize(HashIndex *index, int capacity)
}
/* Public API */
HashIndex *
static HashIndex *
hashindex_open(const char *path)
{
int fd = open(path, O_RDWR);
@ -127,7 +147,7 @@ hashindex_open(const char *path)
return index;
}
HashIndex *
static HashIndex *
hashindex_create(const char *path, int capacity, int key_size, int value_size)
{
FILE *fd;
@ -160,7 +180,7 @@ error:
return NULL;
}
void
static void
hashindex_clear(HashIndex *index)
{
int i;
@ -171,7 +191,7 @@ hashindex_clear(HashIndex *index)
hashindex_resize(index, 16);
}
void
static void
hashindex_flush(HashIndex *index)
{
*((int32_t *)(index->map_addr + 8)) = index->num_entries;
@ -179,7 +199,7 @@ hashindex_flush(HashIndex *index)
msync(index->map_addr, index->map_length, MS_SYNC);
}
void
static void
hashindex_close(HashIndex *index)
{
hashindex_flush(index);
@ -188,7 +208,7 @@ hashindex_close(HashIndex *index)
free(index);
}
const void *
static const void *
hashindex_get(HashIndex *index, const void *key)
{
int idx = hashindex_lookup(index, key);
@ -198,7 +218,7 @@ hashindex_get(HashIndex *index, const void *key)
return BUCKET_ADDR_READ(index, idx) + index->key_size;
}
void
static void
hashindex_set(HashIndex *index, const void *key, const void *value)
{
int idx = hashindex_lookup(index, key);
@ -223,7 +243,7 @@ hashindex_set(HashIndex *index, const void *key, const void *value)
}
}
void
static void
hashindex_delete(HashIndex *index, const void *key)
{
int idx = hashindex_lookup(index, key);
@ -234,7 +254,7 @@ hashindex_delete(HashIndex *index, const void *key)
index->num_entries -= 1;
}
void *
static void *
hashindex_next_key(HashIndex *index, const void *key)
{
int idx = 0;
@ -251,7 +271,7 @@ hashindex_next_key(HashIndex *index, const void *key)
return BUCKET_ADDR_READ(index, idx);
}
int
static int
hashindex_get_size(HashIndex *index)
{
return index->num_entries;

View File

@ -11,7 +11,7 @@ import time
from cStringIO import StringIO
from xattr import xattr, XATTR_NOFOLLOW
from ._speedups import chunkify
from .chunker import chunkify
from .helpers import uid2user, user2uid, gid2group, group2gid, \
encode_filename, Statistics

48
darc/chunker.pyx Normal file
View File

@ -0,0 +1,48 @@
# -*- coding: utf-8 -*-
from libc.stdlib cimport free
# C-level interface of _chunker.c. The C file is textually included by this
# extension module, which is why its functions can all be `static`.
cdef extern from "_chunker.c":
    # Declared unsigned so that Cython's own type analysis (integer promotion,
    # comparisons) agrees with the real <stdint.h> uint32_t; the exact width
    # is taken from the C definition at compile time.
    ctypedef unsigned int uint32_t

    # Opaque chunker state; only ever handled through a pointer.
    ctypedef struct Chunker:
        pass

    # Allocates chunker state and takes a new reference to `fd`.
    Chunker *chunker_init(object fd, int window_size, int chunk_mask, int min_size, uint32_t seed)
    # Drops the reference to `fd` and frees the state created by chunker_init().
    void chunker_free(Chunker *chunker)
    # Returns the next chunk; a NULL result is propagated by Cython as the
    # pending Python exception.
    object chunker_process(Chunker *chunker)

    # Returns a malloc()ed lookup table; the caller must free() it.
    uint32_t *buzhash_init_table(uint32_t seed)
    # Exposed under c_* names so the Python-level wrappers below can keep the
    # plain names `buzhash` / `buzhash_update`.
    uint32_t c_buzhash "buzhash"(const unsigned char *data, size_t len, const uint32_t *h)
    uint32_t c_buzhash_update "buzhash_update"(uint32_t sum, unsigned char remove, unsigned char add, size_t len, const uint32_t *h)
cdef class chunkify:
    """Iterator that splits the data read from ``fd`` into chunks.

    ``fd`` must provide a ``read(size)`` method; the C chunker calls it to
    refill its buffer. Each iteration step returns whatever
    ``chunker_process()`` produces for the next chunk.

    Only the low 32 bits of ``seed`` are used.
    """
    cdef Chunker *chunker

    def __cinit__(self, fd, window_size, chunk_mask, min_size, seed):
        self.chunker = chunker_init(fd, window_size, chunk_mask, min_size, seed & 0xffffffff)
        # Never hand a NULL state to chunker_process() later on.
        if self.chunker == NULL:
            raise MemoryError()

    def __dealloc__(self):
        # __cinit__ may have failed before the state was created.
        if self.chunker != NULL:
            chunker_free(self.chunker)
            self.chunker = NULL

    def __iter__(self):
        return self

    def __next__(self):
        return chunker_process(self.chunker)
def buzhash(bytes data not None, unsigned long seed):
    """Return the buzhash of ``data`` using the table derived from ``seed``.

    ``data`` is taken as a bytes object so that its real length is hashed.
    Measuring a bare ``unsigned char *`` with ``len()`` yields the C-string
    length, which stops at the first NUL byte and silently truncates binary
    input.

    Only the low 32 bits of ``seed`` are used.
    """
    cdef uint32_t *table
    cdef uint32_t sum
    cdef unsigned char *buf = data   # borrowed; valid while `data` is alive
    cdef size_t length = len(data)   # Python-level length, NUL bytes included
    table = buzhash_init_table(seed & 0xffffffff)
    sum = c_buzhash(buf, length, table)
    free(table)
    return sum
def buzhash_update(uint32_t sum, unsigned char remove, unsigned char add, size_t len, unsigned long seed):
    """Roll the hash ``sum`` forward by one byte.

    ``remove`` is the byte leaving the window, ``add`` the byte entering it
    and ``len`` the window length passed through to the C routine. The
    lookup table is rebuilt from the low 32 bits of ``seed`` on every call
    and released again before returning.
    """
    cdef uint32_t *lookup = buzhash_init_table(seed & 0xffffffff)
    cdef uint32_t rolled = c_buzhash_update(sum, remove, add, len, lookup)
    free(lookup)
    return rolled

View File

@ -1,28 +0,0 @@
#ifndef __HASHINDEX_H__
#define __HASHINDEX_H__
typedef struct {
char *path;
void *map_addr;
off_t map_length;
void *buckets;
int num_entries;
int num_buckets;
int key_size;
int value_size;
int bucket_size;
int limit;
} HashIndex;
HashIndex *hashindex_open(const char *path);
void hashindex_close(HashIndex *index);
void hashindex_clear(HashIndex *index);
void hashindex_flush(HashIndex *index);
HashIndex *hashindex_create(const char *path, int capacity, int key_size, int value_size);
const void *hashindex_get(HashIndex *index, const void *key);
void hashindex_set(HashIndex *index, const void *key, const void *value);
void hashindex_delete(HashIndex *index, const void *key);
void *hashindex_next_key(HashIndex *index, const void *key);
int hashindex_get_size(HashIndex *index);
#endif

View File

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
cdef extern from "hashindex.h":
cdef extern from "_hashindex.c":
ctypedef struct HashIndex:
pass
@ -15,6 +15,7 @@ cdef extern from "hashindex.h":
void hashindex_delete(HashIndex *index, void *key)
void hashindex_set(HashIndex *index, void *key, void *value)
_NoDefault = object()
cdef class IndexBase:

View File

@ -67,6 +67,7 @@ class RemoteStore(object):
self.name = name
def __init__(self, location, create=False):
self.p = None
self.cache = LRUCache(256)
self.to_send = ''
self.extra = {}

View File

@ -11,7 +11,7 @@ import unittest
from xattr import xattr, XATTR_NOFOLLOW
from . import helpers, lrucache
from ._speedups import buzhash, buzhash_update, chunkify
from .chunker import chunkify, buzhash, buzhash_update
from .archiver import Archiver
from .key import suite as KeySuite
from .store import Store, suite as StoreSuite

View File

@ -23,7 +23,6 @@ except ImportError:
from distutils.core import setup
from distutils.extension import Extension
from distutils.command.sdist import sdist
hashindex_sources = ['darc/hashindex.pyx', 'darc/_hashindex.c']
try:
from Cython.Distutils import build_ext
@ -57,8 +56,8 @@ setup(name='darc',
packages=['darc'],
cmdclass={'build_ext': build_ext, 'sdist': Sdist},
ext_modules=[
Extension('darc._speedups', ['darc/_speedups.c']),
Extension('darc.hashindex', hashindex_sources)],
scripts = ['scripts/darc'],
Extension('darc.chunker', ['darc/chunker.pyx']),
Extension('darc.hashindex', ['darc/hashindex.pyx'])],
scripts=['scripts/darc'],
)