mirror of
https://github.com/borgbackup/borg.git
synced 2025-02-22 06:01:54 +00:00
Use Cython for all native code
This commit is contained in:
parent
b994203c01
commit
3e5433855c
10 changed files with 137 additions and 196 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -3,6 +3,7 @@ build
|
|||
dist
|
||||
env
|
||||
hashindex.c
|
||||
chunker.c
|
||||
*.egg-info
|
||||
*.pyc
|
||||
*.pyo
|
||||
|
|
|
@ -1,6 +1,4 @@
|
|||
#include <Python.h>
|
||||
#include <structmember.h>
|
||||
#include <stdint.h>
|
||||
|
||||
/* Cyclic polynomial / buzhash: https://en.wikipedia.org/wiki/Rolling_hash */
|
||||
|
||||
|
@ -44,7 +42,7 @@ static uint32_t table_base[] =
|
|||
|
||||
|
||||
static uint32_t *
|
||||
init_buzhash_table(uint32_t seed)
|
||||
buzhash_init_table(uint32_t seed)
|
||||
{
|
||||
int i;
|
||||
uint32_t *table = malloc(1024);
|
||||
|
@ -56,9 +54,9 @@ init_buzhash_table(uint32_t seed)
|
|||
}
|
||||
|
||||
static uint32_t
|
||||
buzhash(const unsigned char *data, int len, const uint32_t *h)
|
||||
buzhash(const unsigned char *data, size_t len, const uint32_t *h)
|
||||
{
|
||||
int i;
|
||||
size_t i;
|
||||
uint32_t sum = 0;
|
||||
for(i = len - 1; i > 0; i--)
|
||||
{
|
||||
|
@ -69,63 +67,72 @@ buzhash(const unsigned char *data, int len, const uint32_t *h)
|
|||
}
|
||||
|
||||
static uint32_t
|
||||
buzhash_update(uint32_t sum, unsigned char remove, unsigned char add, int len, const uint32_t *h)
|
||||
buzhash_update(uint32_t sum, unsigned char remove, unsigned char add, size_t len, const uint32_t *h)
|
||||
{
|
||||
return BARREL_SHIFT(sum, 1) ^ BARREL_SHIFT(h[remove], len) ^ h[add];
|
||||
}
|
||||
|
||||
typedef struct {
|
||||
PyObject_HEAD
|
||||
int window_size, last, done, buf_size, remaining, position, chunk_mask, min_size;
|
||||
size_t bytes_read, bytes_yielded;
|
||||
uint32_t *h;
|
||||
PyObject *chunks, *fd;
|
||||
unsigned char *data;
|
||||
} ChunkifyIter;
|
||||
int window_size, chunk_mask, min_size;
|
||||
size_t buf_size;
|
||||
uint32_t *table;
|
||||
uint8_t *data;
|
||||
PyObject *fd;
|
||||
int done;
|
||||
size_t remaining, bytes_read, bytes_yielded, position, last;
|
||||
} Chunker;
|
||||
|
||||
static PyObject*
|
||||
ChunkifyIter_iter(PyObject *self)
|
||||
static Chunker *
|
||||
chunker_init(PyObject *fd, int window_size, int chunk_mask, int min_size, uint32_t seed)
|
||||
{
|
||||
ChunkifyIter *c = (ChunkifyIter *)self;
|
||||
c->remaining = 0;
|
||||
c->position = 0;
|
||||
Chunker *c = malloc(sizeof(Chunker));
|
||||
c->window_size = window_size;
|
||||
c->chunk_mask = chunk_mask;
|
||||
c->min_size = min_size;
|
||||
c->table = buzhash_init_table(seed);
|
||||
c->buf_size = 10 * 1024 * 1024;
|
||||
c->data = malloc(c->buf_size);
|
||||
c->fd = fd;
|
||||
Py_INCREF(fd);
|
||||
c->done = 0;
|
||||
c->last = 0;
|
||||
c->remaining = 0;
|
||||
c->bytes_read = 0;
|
||||
c->bytes_yielded = 0;
|
||||
Py_INCREF(self);
|
||||
return self;
|
||||
c->position = 0;
|
||||
c->last = 0;
|
||||
return c;
|
||||
}
|
||||
|
||||
static void
|
||||
ChunkifyIter_dealloc(PyObject *self)
|
||||
chunker_free(Chunker *c)
|
||||
{
|
||||
ChunkifyIter *c = (ChunkifyIter *)self;
|
||||
Py_DECREF(c->fd);
|
||||
free(c->table);
|
||||
free(c->data);
|
||||
free(c->h);
|
||||
self->ob_type->tp_free(self);
|
||||
free(c);
|
||||
}
|
||||
|
||||
static void
|
||||
ChunkifyIter_fill(PyObject *self)
|
||||
static int
|
||||
chunker_fill(Chunker *c)
|
||||
{
|
||||
ChunkifyIter *c = (ChunkifyIter *)self;
|
||||
memmove(c->data, c->data + c->last, c->position + c->remaining - c->last);
|
||||
c->position -= c->last;
|
||||
c->last = 0;
|
||||
PyObject *data = PyObject_CallMethod(c->fd, "read", "i", c->buf_size - c->position - c->remaining);
|
||||
if(!data) {
|
||||
return 0;
|
||||
}
|
||||
int n = PyString_Size(data);
|
||||
memcpy(c->data + c->position + c->remaining, PyString_AsString(data), n);
|
||||
c->remaining += n;
|
||||
c->bytes_read += n;
|
||||
Py_DECREF(data);
|
||||
return 1;
|
||||
}
|
||||
|
||||
static PyObject*
|
||||
ChunkifyIter_iternext(PyObject *self)
|
||||
static PyObject *
|
||||
chunker_process(Chunker *c)
|
||||
{
|
||||
ChunkifyIter *c = (ChunkifyIter *)self;
|
||||
uint32_t sum, chunk_mask = c->chunk_mask, min_size = c->min_size, window_size = c->window_size;
|
||||
int n = 0;
|
||||
|
||||
|
@ -137,7 +144,9 @@ ChunkifyIter_iternext(PyObject *self)
|
|||
return NULL;
|
||||
}
|
||||
if(c->remaining <= window_size) {
|
||||
ChunkifyIter_fill(self);
|
||||
if(!chunker_fill(c)) {
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
if(c->remaining < window_size) {
|
||||
c->done = 1;
|
||||
|
@ -153,16 +162,18 @@ ChunkifyIter_iternext(PyObject *self)
|
|||
return NULL;
|
||||
}
|
||||
}
|
||||
sum = buzhash(c->data + c->position, window_size, c->h);
|
||||
sum = buzhash(c->data + c->position, window_size, c->table);
|
||||
while(c->remaining >= c->window_size && ((sum & chunk_mask) || n < min_size)) {
|
||||
sum = buzhash_update(sum, c->data[c->position],
|
||||
c->data[c->position + window_size],
|
||||
window_size, c->h);
|
||||
window_size, c->table);
|
||||
c->position++;
|
||||
c->remaining--;
|
||||
n++;
|
||||
if(c->remaining <= window_size) {
|
||||
ChunkifyIter_fill(self);
|
||||
if(!chunker_fill(c)) {
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
}
|
||||
if(c->remaining <= window_size) {
|
||||
|
@ -174,117 +185,5 @@ ChunkifyIter_iternext(PyObject *self)
|
|||
n = c->last - old_last;
|
||||
c->bytes_yielded += n;
|
||||
return PyBuffer_FromMemory(c->data + old_last, n);
|
||||
}
|
||||
|
||||
static PyTypeObject ChunkifyIterType = {
|
||||
PyObject_HEAD_INIT(NULL)
|
||||
0, /*ob_size*/
|
||||
"_chunkifier._ChunkifyIter", /*tp_name*/
|
||||
sizeof(ChunkifyIter), /*tp_basicsize*/
|
||||
0, /*tp_itemsize*/
|
||||
ChunkifyIter_dealloc, /*tp_dealloc*/
|
||||
0, /*tp_print*/
|
||||
0, /*tp_getattr*/
|
||||
0, /*tp_setattr*/
|
||||
0, /*tp_compare*/
|
||||
0, /*tp_repr*/
|
||||
0, /*tp_as_number*/
|
||||
0, /*tp_as_sequence*/
|
||||
0, /*tp_as_mapping*/
|
||||
0, /*tp_hash */
|
||||
0, /*tp_call*/
|
||||
0, /*tp_str*/
|
||||
0, /*tp_getattro*/
|
||||
0, /*tp_setattro*/
|
||||
0, /*tp_as_buffer*/
|
||||
Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_ITER,
|
||||
/* tp_flags: Py_TPFLAGS_HAVE_ITER tells python to
|
||||
use tp_iter and tp_iternext fields. */
|
||||
"", /* tp_doc */
|
||||
0, /* tp_traverse */
|
||||
0, /* tp_clear */
|
||||
0, /* tp_richcompare */
|
||||
0, /* tp_weaklistoffset */
|
||||
ChunkifyIter_iter, /* tp_iter: __iter__() method */
|
||||
ChunkifyIter_iternext /* tp_iternext: next() method */
|
||||
};
|
||||
|
||||
static PyObject *
|
||||
chunkify(PyObject *self, PyObject *args)
|
||||
{
|
||||
PyObject *fd;
|
||||
int seed, window_size, chunk_mask, min_size;
|
||||
ChunkifyIter *c;
|
||||
|
||||
if (!PyArg_ParseTuple(args, "Oiiii", &fd, &window_size, &chunk_mask, &min_size, &seed))
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
if (!(c = PyObject_New(ChunkifyIter, &ChunkifyIterType)))
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
PyObject_Init((PyObject *)c, &ChunkifyIterType);
|
||||
c->buf_size = 10 * 1024 * 1024;
|
||||
c->data = malloc(c->buf_size);
|
||||
c->h = init_buzhash_table(seed & 0xffffffff);
|
||||
c->fd = fd;
|
||||
c->window_size = window_size;
|
||||
c->chunk_mask = chunk_mask;
|
||||
c->min_size = min_size;
|
||||
Py_INCREF(fd);
|
||||
return (PyObject *)c;
|
||||
}
|
||||
|
||||
static PyObject *
|
||||
do_buzhash(PyObject *self, PyObject *args)
|
||||
{
|
||||
unsigned char *data;
|
||||
int len;
|
||||
unsigned long int seed, sum;
|
||||
uint32_t *h;
|
||||
|
||||
if (!PyArg_ParseTuple(args, "s#k", &data, &len, &seed))
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
h = init_buzhash_table(seed & 0xffffffff);
|
||||
sum = buzhash(data, len, h);
|
||||
free(h);
|
||||
return PyLong_FromUnsignedLong(sum);
|
||||
}
|
||||
|
||||
static PyObject *
|
||||
do_buzhash_update(PyObject *self, PyObject *args)
|
||||
{
|
||||
unsigned long int sum, seed;
|
||||
unsigned char remove, add;
|
||||
uint32_t *h;
|
||||
int len;
|
||||
|
||||
if (!PyArg_ParseTuple(args, "kbbik", &sum, &remove, &add, &len, &seed))
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
h = init_buzhash_table(seed & 0xffffffff);
|
||||
sum = buzhash_update(sum, remove, add, len, h);
|
||||
free(h);
|
||||
return PyLong_FromUnsignedLong(sum);
|
||||
}
|
||||
|
||||
|
||||
static PyMethodDef ChunkifierMethods[] = {
|
||||
{"chunkify", chunkify, METH_VARARGS, ""},
|
||||
{"buzhash", do_buzhash, METH_VARARGS, ""},
|
||||
{"buzhash_update", do_buzhash_update, METH_VARARGS, ""},
|
||||
{NULL, NULL, 0, NULL} /* Sentinel */
|
||||
};
|
||||
|
||||
PyMODINIT_FUNC
|
||||
init_speedups(void)
|
||||
{
|
||||
ChunkifyIterType.tp_new = PyType_GenericNew;
|
||||
if (PyType_Ready(&ChunkifyIterType) < 0) return;
|
||||
|
||||
Py_InitModule("_speedups", ChunkifierMethods);
|
||||
}
|
||||
|
||||
}
|
|
@ -9,8 +9,6 @@
|
|||
#include <unistd.h>
|
||||
#include <sys/mman.h>
|
||||
|
||||
#include "hashindex.h"
|
||||
|
||||
typedef struct {
|
||||
char magic[8];
|
||||
int32_t num_entries;
|
||||
|
@ -19,6 +17,18 @@ typedef struct {
|
|||
int8_t value_size;
|
||||
} __attribute__((__packed__)) HashHeader;
|
||||
|
||||
typedef struct {
|
||||
char *path;
|
||||
void *map_addr;
|
||||
off_t map_length;
|
||||
void *buckets;
|
||||
int num_entries;
|
||||
int num_buckets;
|
||||
int key_size;
|
||||
int value_size;
|
||||
int bucket_size;
|
||||
int limit;
|
||||
} HashIndex;
|
||||
|
||||
#define MAGIC "DARCHASH"
|
||||
#define EMPTY ((int32_t)-1)
|
||||
|
@ -33,6 +43,16 @@ typedef struct {
|
|||
|
||||
#define BUCKET_MARK_DELETED(index, idx) (*((int32_t *)(BUCKET_ADDR_WRITE(index, idx) + index->key_size)) = DELETED)
|
||||
|
||||
static HashIndex *hashindex_open(const char *path);
|
||||
static void hashindex_close(HashIndex *index);
|
||||
static void hashindex_clear(HashIndex *index);
|
||||
static void hashindex_flush(HashIndex *index);
|
||||
static HashIndex *hashindex_create(const char *path, int capacity, int key_size, int value_size);
|
||||
static const void *hashindex_get(HashIndex *index, const void *key);
|
||||
static void hashindex_set(HashIndex *index, const void *key, const void *value);
|
||||
static void hashindex_delete(HashIndex *index, const void *key);
|
||||
static void *hashindex_next_key(HashIndex *index, const void *key);
|
||||
|
||||
|
||||
/* Private API */
|
||||
static int
|
||||
|
@ -97,7 +117,7 @@ hashindex_resize(HashIndex *index, int capacity)
|
|||
}
|
||||
|
||||
/* Public API */
|
||||
HashIndex *
|
||||
static HashIndex *
|
||||
hashindex_open(const char *path)
|
||||
{
|
||||
int fd = open(path, O_RDWR);
|
||||
|
@ -127,7 +147,7 @@ hashindex_open(const char *path)
|
|||
return index;
|
||||
}
|
||||
|
||||
HashIndex *
|
||||
static HashIndex *
|
||||
hashindex_create(const char *path, int capacity, int key_size, int value_size)
|
||||
{
|
||||
FILE *fd;
|
||||
|
@ -160,7 +180,7 @@ error:
|
|||
return NULL;
|
||||
}
|
||||
|
||||
void
|
||||
static void
|
||||
hashindex_clear(HashIndex *index)
|
||||
{
|
||||
int i;
|
||||
|
@ -171,7 +191,7 @@ hashindex_clear(HashIndex *index)
|
|||
hashindex_resize(index, 16);
|
||||
}
|
||||
|
||||
void
|
||||
static void
|
||||
hashindex_flush(HashIndex *index)
|
||||
{
|
||||
*((int32_t *)(index->map_addr + 8)) = index->num_entries;
|
||||
|
@ -179,7 +199,7 @@ hashindex_flush(HashIndex *index)
|
|||
msync(index->map_addr, index->map_length, MS_SYNC);
|
||||
}
|
||||
|
||||
void
|
||||
static void
|
||||
hashindex_close(HashIndex *index)
|
||||
{
|
||||
hashindex_flush(index);
|
||||
|
@ -188,7 +208,7 @@ hashindex_close(HashIndex *index)
|
|||
free(index);
|
||||
}
|
||||
|
||||
const void *
|
||||
static const void *
|
||||
hashindex_get(HashIndex *index, const void *key)
|
||||
{
|
||||
int idx = hashindex_lookup(index, key);
|
||||
|
@ -198,7 +218,7 @@ hashindex_get(HashIndex *index, const void *key)
|
|||
return BUCKET_ADDR_READ(index, idx) + index->key_size;
|
||||
}
|
||||
|
||||
void
|
||||
static void
|
||||
hashindex_set(HashIndex *index, const void *key, const void *value)
|
||||
{
|
||||
int idx = hashindex_lookup(index, key);
|
||||
|
@ -223,7 +243,7 @@ hashindex_set(HashIndex *index, const void *key, const void *value)
|
|||
}
|
||||
}
|
||||
|
||||
void
|
||||
static void
|
||||
hashindex_delete(HashIndex *index, const void *key)
|
||||
{
|
||||
int idx = hashindex_lookup(index, key);
|
||||
|
@ -234,7 +254,7 @@ hashindex_delete(HashIndex *index, const void *key)
|
|||
index->num_entries -= 1;
|
||||
}
|
||||
|
||||
void *
|
||||
static void *
|
||||
hashindex_next_key(HashIndex *index, const void *key)
|
||||
{
|
||||
int idx = 0;
|
||||
|
@ -251,7 +271,7 @@ hashindex_next_key(HashIndex *index, const void *key)
|
|||
return BUCKET_ADDR_READ(index, idx);
|
||||
}
|
||||
|
||||
int
|
||||
static int
|
||||
hashindex_get_size(HashIndex *index)
|
||||
{
|
||||
return index->num_entries;
|
||||
|
|
|
@ -11,7 +11,7 @@
|
|||
from cStringIO import StringIO
|
||||
from xattr import xattr, XATTR_NOFOLLOW
|
||||
|
||||
from ._speedups import chunkify
|
||||
from .chunker import chunkify
|
||||
from .helpers import uid2user, user2uid, gid2group, group2gid, \
|
||||
encode_filename, Statistics
|
||||
|
||||
|
|
48
darc/chunker.pyx
Normal file
48
darc/chunker.pyx
Normal file
|
@ -0,0 +1,48 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
from libc.stdlib cimport free
|
||||
|
||||
cdef extern from "_chunker.c":
|
||||
ctypedef int uint32_t
|
||||
ctypedef struct Chunker:
|
||||
pass
|
||||
Chunker *chunker_init(object fd, int window_size, int chunk_mask, int min_size, uint32_t seed)
|
||||
void chunker_free(Chunker *chunker)
|
||||
object chunker_process(Chunker *chunker)
|
||||
uint32_t *buzhash_init_table(uint32_t seed)
|
||||
uint32_t c_buzhash "buzhash"(const unsigned char *data, size_t len, const uint32_t *h)
|
||||
uint32_t c_buzhash_update "buzhash_update"(uint32_t sum, unsigned char remove, unsigned char add, size_t len, const uint32_t *h)
|
||||
|
||||
|
||||
cdef class chunkify:
|
||||
cdef Chunker *chunker
|
||||
|
||||
def __cinit__(self, fd, window_size, chunk_mask, min_size, seed):
|
||||
self.chunker = chunker_init(fd, window_size, chunk_mask, min_size, seed & 0xffffffff)
|
||||
|
||||
def __dealloc__(self):
|
||||
if self.chunker:
|
||||
chunker_free(self.chunker)
|
||||
|
||||
def __iter__(self):
|
||||
return self
|
||||
|
||||
def __next__(self):
|
||||
return chunker_process(self.chunker)
|
||||
|
||||
|
||||
def buzhash(unsigned char *data, unsigned long seed):
|
||||
cdef uint32_t *table
|
||||
cdef uint32_t sum
|
||||
table = buzhash_init_table(seed & 0xffffffff)
|
||||
sum = c_buzhash(data, len(data), table)
|
||||
free(table)
|
||||
return sum
|
||||
|
||||
|
||||
def buzhash_update(uint32_t sum, unsigned char remove, unsigned char add, size_t len, unsigned long seed):
|
||||
cdef uint32_t *table
|
||||
table = buzhash_init_table(seed & 0xffffffff)
|
||||
sum = c_buzhash_update(sum, remove, add, len, table)
|
||||
free(table)
|
||||
return sum
|
|
@ -1,28 +0,0 @@
|
|||
#ifndef __HASHINDEX_H__
|
||||
#define __HASHINDEX_H__
|
||||
|
||||
typedef struct {
|
||||
char *path;
|
||||
void *map_addr;
|
||||
off_t map_length;
|
||||
void *buckets;
|
||||
int num_entries;
|
||||
int num_buckets;
|
||||
int key_size;
|
||||
int value_size;
|
||||
int bucket_size;
|
||||
int limit;
|
||||
} HashIndex;
|
||||
|
||||
HashIndex *hashindex_open(const char *path);
|
||||
void hashindex_close(HashIndex *index);
|
||||
void hashindex_clear(HashIndex *index);
|
||||
void hashindex_flush(HashIndex *index);
|
||||
HashIndex *hashindex_create(const char *path, int capacity, int key_size, int value_size);
|
||||
const void *hashindex_get(HashIndex *index, const void *key);
|
||||
void hashindex_set(HashIndex *index, const void *key, const void *value);
|
||||
void hashindex_delete(HashIndex *index, const void *key);
|
||||
void *hashindex_next_key(HashIndex *index, const void *key);
|
||||
int hashindex_get_size(HashIndex *index);
|
||||
|
||||
#endif
|
|
@ -1,6 +1,6 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
cdef extern from "hashindex.h":
|
||||
cdef extern from "_hashindex.c":
|
||||
ctypedef struct HashIndex:
|
||||
pass
|
||||
|
||||
|
@ -15,6 +15,7 @@ cdef extern from "hashindex.h":
|
|||
void hashindex_delete(HashIndex *index, void *key)
|
||||
void hashindex_set(HashIndex *index, void *key, void *value)
|
||||
|
||||
|
||||
_NoDefault = object()
|
||||
|
||||
cdef class IndexBase:
|
||||
|
|
|
@ -67,6 +67,7 @@ def __init__(self, name):
|
|||
self.name = name
|
||||
|
||||
def __init__(self, location, create=False):
|
||||
self.p = None
|
||||
self.cache = LRUCache(256)
|
||||
self.to_send = ''
|
||||
self.extra = {}
|
||||
|
|
|
@ -11,7 +11,7 @@
|
|||
from xattr import xattr, XATTR_NOFOLLOW
|
||||
|
||||
from . import helpers, lrucache
|
||||
from ._speedups import buzhash, buzhash_update, chunkify
|
||||
from .chunker import chunkify, buzhash, buzhash_update
|
||||
from .archiver import Archiver
|
||||
from .key import suite as KeySuite
|
||||
from .store import Store, suite as StoreSuite
|
||||
|
|
7
setup.py
7
setup.py
|
@ -23,7 +23,6 @@
|
|||
from distutils.core import setup
|
||||
from distutils.extension import Extension
|
||||
from distutils.command.sdist import sdist
|
||||
hashindex_sources = ['darc/hashindex.pyx', 'darc/_hashindex.c']
|
||||
|
||||
try:
|
||||
from Cython.Distutils import build_ext
|
||||
|
@ -57,8 +56,8 @@ def make_distribution(self):
|
|||
packages=['darc'],
|
||||
cmdclass={'build_ext': build_ext, 'sdist': Sdist},
|
||||
ext_modules=[
|
||||
Extension('darc._speedups', ['darc/_speedups.c']),
|
||||
Extension('darc.hashindex', hashindex_sources)],
|
||||
scripts = ['scripts/darc'],
|
||||
Extension('darc.chunker', ['darc/chunker.pyx']),
|
||||
Extension('darc.hashindex', ['darc/hashindex.pyx'])],
|
||||
scripts=['scripts/darc'],
|
||||
)
|
||||
|
||||
|
|
Loading…
Reference in a new issue