From 3e5433855cd07613f07a9ce040ffb9745053f127 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonas=20Borgstr=C3=B6m?= Date: Tue, 28 May 2013 14:35:55 +0200 Subject: [PATCH] Use Cython for all native code --- .gitignore | 1 + darc/{_speedups.c => _chunker.c} | 197 ++++++++----------------------- darc/_hashindex.c | 44 +++++-- darc/archive.py | 2 +- darc/chunker.pyx | 48 ++++++++ darc/hashindex.h | 28 ----- darc/hashindex.pyx | 3 +- darc/remote.py | 1 + darc/test.py | 2 +- setup.py | 7 +- 10 files changed, 137 insertions(+), 196 deletions(-) rename darc/{_speedups.c => _chunker.c} (60%) create mode 100644 darc/chunker.pyx delete mode 100644 darc/hashindex.h diff --git a/.gitignore b/.gitignore index 021dc356..982aedac 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ build dist env hashindex.c +chunker.c *.egg-info *.pyc *.pyo diff --git a/darc/_speedups.c b/darc/_chunker.c similarity index 60% rename from darc/_speedups.c rename to darc/_chunker.c index d3186d30..cb9fda2e 100644 --- a/darc/_speedups.c +++ b/darc/_chunker.c @@ -1,6 +1,4 @@ #include -#include -#include /* Cyclic polynomial / buzhash: https://en.wikipedia.org/wiki/Rolling_hash */ @@ -44,7 +42,7 @@ static uint32_t table_base[] = static uint32_t * -init_buzhash_table(uint32_t seed) +buzhash_init_table(uint32_t seed) { int i; uint32_t *table = malloc(1024); @@ -56,9 +54,9 @@ init_buzhash_table(uint32_t seed) } static uint32_t -buzhash(const unsigned char *data, int len, const uint32_t *h) +buzhash(const unsigned char *data, size_t len, const uint32_t *h) { - int i; + size_t i; uint32_t sum = 0; for(i = len - 1; i > 0; i--) { @@ -69,63 +67,72 @@ buzhash(const unsigned char *data, int len, const uint32_t *h) } static uint32_t -buzhash_update(uint32_t sum, unsigned char remove, unsigned char add, int len, const uint32_t *h) +buzhash_update(uint32_t sum, unsigned char remove, unsigned char add, size_t len, const uint32_t *h) { return BARREL_SHIFT(sum, 1) ^ BARREL_SHIFT(h[remove], len) ^ h[add]; } typedef struct { - PyObject_HEAD - int window_size, last, done, buf_size, remaining, position, chunk_mask, min_size; - size_t bytes_read, bytes_yielded; - uint32_t *h; - PyObject *chunks, *fd; - unsigned char *data; -} ChunkifyIter; + int window_size, chunk_mask, min_size; + size_t buf_size; + uint32_t *table; + uint8_t *data; + PyObject *fd; + int done; + size_t remaining, bytes_read, bytes_yielded, position, last; +} Chunker; -static PyObject* -ChunkifyIter_iter(PyObject *self) +static Chunker * +chunker_init(PyObject *fd, int window_size, int chunk_mask, int min_size, uint32_t seed) { - ChunkifyIter *c = (ChunkifyIter *)self; - c->remaining = 0; - c->position = 0; + Chunker *c = malloc(sizeof(Chunker)); + c->window_size = window_size; + c->chunk_mask = chunk_mask; + c->min_size = min_size; + c->table = buzhash_init_table(seed); + c->buf_size = 10 * 1024 * 1024; + c->data = malloc(c->buf_size); + c->fd = fd; + Py_INCREF(fd); c->done = 0; - c->last = 0; + c->remaining = 0; c->bytes_read = 0; c->bytes_yielded = 0; - Py_INCREF(self); - return self; + c->position = 0; + c->last = 0; + return c; } static void -ChunkifyIter_dealloc(PyObject *self) +chunker_free(Chunker *c) { - ChunkifyIter *c = (ChunkifyIter *)self; Py_DECREF(c->fd); + free(c->table); free(c->data); - free(c->h); - self->ob_type->tp_free(self); + free(c); } -static void -ChunkifyIter_fill(PyObject *self) +static int +chunker_fill(Chunker *c) { - ChunkifyIter *c = (ChunkifyIter *)self; memmove(c->data, c->data + c->last, c->position + c->remaining - c->last); c->position -= c->last; c->last = 0; PyObject *data = PyObject_CallMethod(c->fd, "read", "i", c->buf_size - c->position - c->remaining); + if(!data) { + return 0; + } int n = PyString_Size(data); memcpy(c->data + c->position + c->remaining, PyString_AsString(data), n); c->remaining += n; c->bytes_read += n; Py_DECREF(data); + return 1; } -static PyObject* -ChunkifyIter_iternext(PyObject *self) +static PyObject * +chunker_process(Chunker *c) { - ChunkifyIter *c = (ChunkifyIter *)self; uint32_t sum, chunk_mask = c->chunk_mask, min_size = c->min_size, window_size = c->window_size; int n = 0; @@ -137,7 +144,9 @@ ChunkifyIter_iternext(PyObject *self) return NULL; } if(c->remaining <= window_size) { - ChunkifyIter_fill(self); + if(!chunker_fill(c)) { + return NULL; + } } if(c->remaining < window_size) { c->done = 1; @@ -153,16 +162,18 @@ ChunkifyIter_iternext(PyObject *self) return NULL; } } - sum = buzhash(c->data + c->position, window_size, c->h); + sum = buzhash(c->data + c->position, window_size, c->table); while(c->remaining >= c->window_size && ((sum & chunk_mask) || n < min_size)) { sum = buzhash_update(sum, c->data[c->position], c->data[c->position + window_size], - window_size, c->h); + window_size, c->table); c->position++; c->remaining--; n++; if(c->remaining <= window_size) { - ChunkifyIter_fill(self); + if(!chunker_fill(c)) { + return NULL; + } } } if(c->remaining <= window_size) { @@ -174,117 +185,5 @@ ChunkifyIter_iternext(PyObject *self) n = c->last - old_last; c->bytes_yielded += n; return PyBuffer_FromMemory(c->data + old_last, n); -} - -static PyTypeObject ChunkifyIterType = { - PyObject_HEAD_INIT(NULL) - 0, /*ob_size*/ - "_chunkifier._ChunkifyIter", /*tp_name*/ - sizeof(ChunkifyIter), /*tp_basicsize*/ - 0, /*tp_itemsize*/ - ChunkifyIter_dealloc, /*tp_dealloc*/ - 0, /*tp_print*/ - 0, /*tp_getattr*/ - 0, /*tp_setattr*/ - 0, /*tp_compare*/ - 0, /*tp_repr*/ - 0, /*tp_as_number*/ - 0, /*tp_as_sequence*/ - 0, /*tp_as_mapping*/ - 0, /*tp_hash */ - 0, /*tp_call*/ - 0, /*tp_str*/ - 0, /*tp_getattro*/ - 0, /*tp_setattro*/ - 0, /*tp_as_buffer*/ - Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_ITER, - /* tp_flags: Py_TPFLAGS_HAVE_ITER tells python to - use tp_iter and tp_iternext fields. */ - "", /* tp_doc */ - 0, /* tp_traverse */ - 0, /* tp_clear */ - 0, /* tp_richcompare */ - 0, /* tp_weaklistoffset */ - ChunkifyIter_iter, /* tp_iter: __iter__() method */ - ChunkifyIter_iternext /* tp_iternext: next() method */ -}; - -static PyObject * -chunkify(PyObject *self, PyObject *args) -{ - PyObject *fd; - int seed, window_size, chunk_mask, min_size; - ChunkifyIter *c; - - if (!PyArg_ParseTuple(args, "Oiiii", &fd, &window_size, &chunk_mask, &min_size, &seed)) - { - return NULL; - } - if (!(c = PyObject_New(ChunkifyIter, &ChunkifyIterType))) - { - return NULL; - } - PyObject_Init((PyObject *)c, &ChunkifyIterType); - c->buf_size = 10 * 1024 * 1024; - c->data = malloc(c->buf_size); - c->h = init_buzhash_table(seed & 0xffffffff); - c->fd = fd; - c->window_size = window_size; - c->chunk_mask = chunk_mask; - c->min_size = min_size; - Py_INCREF(fd); - return (PyObject *)c; -} - -static PyObject * -do_buzhash(PyObject *self, PyObject *args) -{ - unsigned char *data; - int len; - unsigned long int seed, sum; - uint32_t *h; - - if (!PyArg_ParseTuple(args, "s#k", &data, &len, &seed)) - { - return NULL; - } - h = init_buzhash_table(seed & 0xffffffff); - sum = buzhash(data, len, h); - free(h); - return PyLong_FromUnsignedLong(sum); -} - -static PyObject * -do_buzhash_update(PyObject *self, PyObject *args) -{ - unsigned long int sum, seed; - unsigned char remove, add; - uint32_t *h; - int len; - - if (!PyArg_ParseTuple(args, "kbbik", &sum, &remove, &add, &len, &seed)) - { - return NULL; - } - h = init_buzhash_table(seed & 0xffffffff); - sum = buzhash_update(sum, remove, add, len, h); - free(h); - return PyLong_FromUnsignedLong(sum); -} - - -static PyMethodDef ChunkifierMethods[] = { - {"chunkify", chunkify, METH_VARARGS, ""}, - {"buzhash", do_buzhash, METH_VARARGS, ""}, - {"buzhash_update", do_buzhash_update, METH_VARARGS, ""}, - {NULL, NULL, 0, NULL} /* Sentinel */ -}; - -PyMODINIT_FUNC -init_speedups(void) -{ - ChunkifyIterType.tp_new = PyType_GenericNew; - if (PyType_Ready(&ChunkifyIterType) < 0) return; - - Py_InitModule("_speedups", ChunkifierMethods); -} + +} \ No newline at end of file diff --git a/darc/_hashindex.c b/darc/_hashindex.c index f340b63c..98c723ac 100644 --- a/darc/_hashindex.c +++ b/darc/_hashindex.c @@ -9,8 +9,6 @@ #include #include -#include "hashindex.h" - typedef struct { char magic[8]; int32_t num_entries; @@ -19,6 +17,18 @@ typedef struct { int8_t value_size; } __attribute__((__packed__)) HashHeader; +typedef struct { + char *path; + void *map_addr; + off_t map_length; + void *buckets; + int num_entries; + int num_buckets; + int key_size; + int value_size; + int bucket_size; + int limit; +} HashIndex; #define MAGIC "DARCHASH" #define EMPTY ((int32_t)-1) @@ -33,6 +43,16 @@ typedef struct { #define BUCKET_MARK_DELETED(index, idx) (*((int32_t *)(BUCKET_ADDR_WRITE(index, idx) + index->key_size)) = DELETED) +static HashIndex *hashindex_open(const char *path); +static void hashindex_close(HashIndex *index); +static void hashindex_clear(HashIndex *index); +static void hashindex_flush(HashIndex *index); +static HashIndex *hashindex_create(const char *path, int capacity, int key_size, int value_size); +static const void *hashindex_get(HashIndex *index, const void *key); +static void hashindex_set(HashIndex *index, const void *key, const void *value); +static void hashindex_delete(HashIndex *index, const void *key); +static void *hashindex_next_key(HashIndex *index, const void *key); + /* Private API */ static int @@ -97,7 +117,7 @@ hashindex_resize(HashIndex *index, int capacity) } /* Public API */ -HashIndex * +static HashIndex * hashindex_open(const char *path) { int fd = open(path, O_RDWR); @@ -127,7 +147,7 @@ hashindex_open(const char *path) return index; } -HashIndex * +static HashIndex * hashindex_create(const char *path, int capacity, int key_size, int value_size) { FILE *fd; @@ -160,7 +180,7 @@ error: return NULL; } -void +static void hashindex_clear(HashIndex *index) { int i; @@ -171,7 +191,7 @@ hashindex_clear(HashIndex *index) hashindex_resize(index, 16); } -void +static void hashindex_flush(HashIndex *index) { *((int32_t *)(index->map_addr + 8)) = index->num_entries; @@ -179,7 +199,7 @@ hashindex_flush(HashIndex *index) msync(index->map_addr, index->map_length, MS_SYNC); } -void +static void hashindex_close(HashIndex *index) { hashindex_flush(index); @@ -188,7 +208,7 @@ hashindex_close(HashIndex *index) free(index); } -const void * +static const void * hashindex_get(HashIndex *index, const void *key) { int idx = hashindex_lookup(index, key); @@ -198,7 +218,7 @@ hashindex_get(HashIndex *index, const void *key) return BUCKET_ADDR_READ(index, idx) + index->key_size; } -void +static void hashindex_set(HashIndex *index, const void *key, const void *value) { int idx = hashindex_lookup(index, key); @@ -223,7 +243,7 @@ hashindex_set(HashIndex *index, const void *key, const void *value) } } -void +static void hashindex_delete(HashIndex *index, const void *key) { int idx = hashindex_lookup(index, key); @@ -234,7 +254,7 @@ hashindex_delete(HashIndex *index, const void *key) index->num_entries -= 1; } -void * +static void * hashindex_next_key(HashIndex *index, const void *key) { int idx = 0; @@ -251,7 +271,7 @@ hashindex_next_key(HashIndex *index, const void *key) return BUCKET_ADDR_READ(index, idx); } -int +static int hashindex_get_size(HashIndex *index) { return index->num_entries; diff --git a/darc/archive.py b/darc/archive.py index 862c6990..e2793a43 100644 --- a/darc/archive.py +++ b/darc/archive.py @@ -11,7 +11,7 @@ import time from cStringIO import StringIO from xattr import xattr, XATTR_NOFOLLOW -from ._speedups import chunkify +from .chunker import chunkify from .helpers import uid2user, user2uid, gid2group, group2gid, \ encode_filename, Statistics diff --git a/darc/chunker.pyx b/darc/chunker.pyx new file mode 100644 index 00000000..8342b990 --- /dev/null +++ b/darc/chunker.pyx @@ -0,0 +1,48 @@ +# -*- coding: utf-8 -*- + +from libc.stdlib cimport free + +cdef extern from "_chunker.c": + ctypedef int uint32_t + ctypedef struct Chunker: + pass + Chunker *chunker_init(object fd, int window_size, int chunk_mask, int min_size, uint32_t seed) + void chunker_free(Chunker *chunker) + object chunker_process(Chunker *chunker) + uint32_t *buzhash_init_table(uint32_t seed) + uint32_t c_buzhash "buzhash"(const unsigned char *data, size_t len, const uint32_t *h) + uint32_t c_buzhash_update "buzhash_update"(uint32_t sum, unsigned char remove, unsigned char add, size_t len, const uint32_t *h) + + +cdef class chunkify: + cdef Chunker *chunker + + def __cinit__(self, fd, window_size, chunk_mask, min_size, seed): + self.chunker = chunker_init(fd, window_size, chunk_mask, min_size, seed & 0xffffffff) + + def __dealloc__(self): + if self.chunker: + chunker_free(self.chunker) + + def __iter__(self): + return self + + def __next__(self): + return chunker_process(self.chunker) + + +def buzhash(unsigned char *data, unsigned long seed): + cdef uint32_t *table + cdef uint32_t sum + table = buzhash_init_table(seed & 0xffffffff) + sum = c_buzhash(data, len(data), table) + free(table) + return sum + + +def buzhash_update(uint32_t sum, unsigned char remove, unsigned char add, size_t len, unsigned long seed): + cdef uint32_t *table + table = buzhash_init_table(seed & 0xffffffff) + sum = c_buzhash_update(sum, remove, add, len, table) + free(table) + return sum \ No newline at end of file diff --git a/darc/hashindex.h b/darc/hashindex.h deleted file mode 100644 index 3ab7d348..00000000 --- a/darc/hashindex.h +++ /dev/null @@ -1,28 +0,0 @@ -#ifndef __HASHINDEX_H__ -#define __HASHINDEX_H__ - -typedef struct { - char *path; - void *map_addr; - off_t map_length; - void *buckets; - int num_entries; - int num_buckets; - int key_size; - int value_size; - int bucket_size; - int limit; -} HashIndex; - -HashIndex *hashindex_open(const char *path); -void hashindex_close(HashIndex *index); -void hashindex_clear(HashIndex *index); -void hashindex_flush(HashIndex *index); -HashIndex *hashindex_create(const char *path, int capacity, int key_size, int value_size); -const void *hashindex_get(HashIndex *index, const void *key); -void hashindex_set(HashIndex *index, const void *key, const void *value); -void hashindex_delete(HashIndex *index, const void *key); -void *hashindex_next_key(HashIndex *index, const void *key); -int hashindex_get_size(HashIndex *index); - -#endif diff --git a/darc/hashindex.pyx b/darc/hashindex.pyx index d5614d51..debd9cf7 100644 --- a/darc/hashindex.pyx +++ b/darc/hashindex.pyx @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -cdef extern from "hashindex.h": +cdef extern from "_hashindex.c": ctypedef struct HashIndex: pass @@ -15,6 +15,7 @@ cdef extern from "hashindex.h": void hashindex_delete(HashIndex *index, void *key) void hashindex_set(HashIndex *index, void *key, void *value) + _NoDefault = object() cdef class IndexBase: diff --git a/darc/remote.py b/darc/remote.py index 396e3c93..3a13e7c2 100644 --- a/darc/remote.py +++ b/darc/remote.py @@ -67,6 +67,7 @@ class RemoteStore(object): self.name = name def __init__(self, location, create=False): + self.p = None self.cache = LRUCache(256) self.to_send = '' self.extra = {} diff --git a/darc/test.py b/darc/test.py index ad93bd43..bb318ddd 100644 --- a/darc/test.py +++ b/darc/test.py @@ -11,7 +11,7 @@ import unittest from xattr import xattr, XATTR_NOFOLLOW from . import helpers, lrucache -from ._speedups import buzhash, buzhash_update, chunkify +from .chunker import chunkify, buzhash, buzhash_update from .archiver import Archiver from .key import suite as KeySuite from .store import Store, suite as StoreSuite diff --git a/setup.py b/setup.py index d2abc5a2..426e9dd9 100644 --- a/setup.py +++ b/setup.py @@ -23,7 +23,6 @@ except ImportError: from distutils.core import setup from distutils.extension import Extension from distutils.command.sdist import sdist -hashindex_sources = ['darc/hashindex.pyx', 'darc/_hashindex.c'] try: from Cython.Distutils import build_ext @@ -57,8 +56,8 @@ setup(name='darc', packages=['darc'], cmdclass={'build_ext': build_ext, 'sdist': Sdist}, ext_modules=[ - Extension('darc._speedups', ['darc/_speedups.c']), - Extension('darc.hashindex', hashindex_sources)], - scripts = ['scripts/darc'], + Extension('darc.chunker', ['darc/chunker.pyx']), + Extension('darc.hashindex', ['darc/hashindex.pyx'])], + scripts=['scripts/darc'], )