let chunker optionally work with os-level file descriptor

this saves some back-and-forth between C and Python code and also some memory
management overhead, as we can always reuse the same read_buf instead of letting
Python allocate and free a buffer of up to 10MB for each buffer-filling read.

we can't use os-level file descriptors all the time though, as chunkify also gets invoked
on objects like BytesIO that are not backed by an os-level file.

Note: this changeset is also a preparation for O_DIRECT support, which can be
 implemented a lot more easily at the C level.
This commit is contained in:
Thomas Waldmann 2015-04-08 18:43:53 +02:00
parent d43cb4bac8
commit 7ad1093951
3 changed files with 54 additions and 24 deletions

View File

@ -78,8 +78,9 @@ typedef struct {
int window_size, chunk_mask, min_size;
size_t buf_size;
uint32_t *table;
uint8_t *data;
uint8_t *data, *read_buf;
PyObject *fd;
int fh;
int done, eof;
size_t remaining, bytes_read, bytes_yielded, position, last;
} Chunker;
@ -94,15 +95,17 @@ chunker_init(int window_size, int chunk_mask, int min_size, uint32_t seed)
c->table = buzhash_init_table(seed);
c->buf_size = 10 * 1024 * 1024;
c->data = malloc(c->buf_size);
c->read_buf = malloc(c->buf_size);
return c;
}
static void
chunker_set_fd(Chunker *c, PyObject *fd)
chunker_set_fd(Chunker *c, PyObject *fd, int fh)
{
Py_XDECREF(c->fd);
c->fd = fd;
Py_INCREF(fd);
c->fh = fh;
c->done = 0;
c->remaining = 0;
c->bytes_read = 0;
@ -118,6 +121,7 @@ chunker_free(Chunker *c)
Py_XDECREF(c->fd);
free(c->table);
free(c->data);
free(c->read_buf);
free(c);
}
@ -133,20 +137,40 @@ chunker_fill(Chunker *c)
if(c->eof || n == 0) {
return 1;
}
data = PyObject_CallMethod(c->fd, "read", "i", n);
if(!data) {
return 0;
}
n = PyBytes_Size(data);
if(n) {
memcpy(c->data + c->position + c->remaining, PyBytes_AsString(data), n);
c->remaining += n;
c->bytes_read += n;
if(c->fh >= 0) {
// if we have a os-level file descriptor, use os-level API
n = read(c->fh, c->read_buf, n);
if(n > 0) {
memcpy(c->data + c->position + c->remaining, c->read_buf, n);
c->remaining += n;
c->bytes_read += n;
}
else
if(n == 0) {
c->eof = 1;
}
else {
// some error happened
return 0;
}
}
else {
c->eof = 1;
// no os-level file descriptor, use Python file object API
data = PyObject_CallMethod(c->fd, "read", "i", n);
if(!data) {
return 0;
}
n = PyBytes_Size(data);
if(n) {
memcpy(c->data + c->position + c->remaining, PyBytes_AsString(data), n);
c->remaining += n;
c->bytes_read += n;
}
else {
c->eof = 1;
}
Py_DECREF(data);
}
Py_DECREF(data);
return 1;
}

View File

@ -405,9 +405,10 @@ class Archive:
chunks = [cache.chunk_incref(id_, self.stats) for id_ in ids]
# Only chunkify the file if needed
if chunks is None:
with Archive._open_rb(path, st) as fd:
fh = Archive._open_rb(path, st)
with os.fdopen(fh, 'rb') as fd:
chunks = []
for chunk in self.chunker.chunkify(fd):
for chunk in self.chunker.chunkify(fd, fh):
chunks.append(cache.add_chunk(self.key.id_hash(chunk), chunk, self.stats))
cache.memorize_file(path_hash, st, [c[0] for c in chunks])
item = {b'path': safe_path, b'chunks': chunks}
@ -427,12 +428,10 @@ class Archive:
euid = None
def open_simple(p, s):
fd = os.open(p, flags_normal)
return os.fdopen(fd, 'rb')
return os.open(p, flags_normal)
def open_noatime(p, s):
fd = os.open(p, flags_noatime)
return os.fdopen(fd, 'rb')
return os.open(p, flags_noatime)
def open_noatime_if_owner(p, s):
if euid == 0 or s.st_uid == euid:
@ -454,7 +453,7 @@ class Archive:
# So in future, let's check whether the file is owned by us
# before attempting to use O_NOATIME.
Archive._open_rb = open_noatime_if_owner
return os.fdopen(fd, 'rb')
return fd
if flags_noatime != flags_normal:
# Always use O_NOATIME version.

View File

@ -9,7 +9,7 @@ cdef extern from "_chunker.c":
ctypedef struct _Chunker "Chunker":
pass
_Chunker *chunker_init(int window_size, int chunk_mask, int min_size, uint32_t seed)
void chunker_set_fd(_Chunker *chunker, object fd)
void chunker_set_fd(_Chunker *chunker, object f, int fd)
void chunker_free(_Chunker *chunker)
object chunker_process(_Chunker *chunker)
uint32_t *buzhash_init_table(uint32_t seed)
@ -23,8 +23,15 @@ cdef class Chunker:
def __cinit__(self, window_size, chunk_mask, min_size, seed):
self.chunker = chunker_init(window_size, chunk_mask, min_size, seed & 0xffffffff)
def chunkify(self, fd):
chunker_set_fd(self.chunker, fd)
def chunkify(self, fd, fh=-1):
"""
Cut a file into chunks.
:param fd: Python file object
:param fh: OS-level file handle (if available),
defaults to -1 which means not to use OS-level fd.
"""
chunker_set_fd(self.chunker, fd, fh)
return self
def __dealloc__(self):
@ -52,4 +59,4 @@ def buzhash_update(uint32_t sum, unsigned char remove, unsigned char add, size_t
table = buzhash_init_table(seed & 0xffffffff)
sum = c_buzhash_update(sum, remove, add, len, table)
free(table)
return sum
return sum