Reuse chunker buffer between files.

This commit is contained in:
Jonas Borgström 2014-08-03 15:04:41 +02:00
parent 193fb1fcd5
commit 9f64e39d9f
5 changed files with 43 additions and 29 deletions

View File

@ -85,15 +85,22 @@ typedef struct {
} Chunker;
static Chunker *
chunker_init(PyObject *fd, int window_size, int chunk_mask, int min_size, uint32_t seed)
chunker_init(int window_size, int chunk_mask, int min_size, uint32_t seed)
{
Chunker *c = malloc(sizeof(Chunker));
Chunker *c = calloc(sizeof(Chunker), 1);
c->window_size = window_size;
c->chunk_mask = chunk_mask;
c->min_size = min_size;
c->table = buzhash_init_table(seed);
c->buf_size = 10 * 1024 * 1024;
c->data = malloc(c->buf_size);
return c;
}
static void
chunker_set_fd(Chunker *c, PyObject *fd)
{
Py_XDECREF(c->fd);
c->fd = fd;
Py_INCREF(fd);
c->done = 0;
@ -103,13 +110,12 @@ chunker_init(PyObject *fd, int window_size, int chunk_mask, int min_size, uint32
c->position = 0;
c->last = 0;
c->eof = 0;
return c;
}
static void
chunker_free(Chunker *c)
{
Py_DECREF(c->fd);
Py_XDECREF(c->fd);
free(c->table);
free(c->data);
free(c);

View File

@ -15,7 +15,7 @@ import time
from io import BytesIO
from attic import xattr
from attic.platform import acl_get, acl_set
from attic.chunker import chunkify
from attic.chunker import Chunker
from attic.hashindex import ChunkIndex
from attic.helpers import Error, uid2user, user2uid, gid2group, group2gid, \
Manifest, Statistics, decode_dict, st_mtime_ns, make_path_safe, StableDict, int_to_bigint, bigint_to_int
@ -65,6 +65,7 @@ class ChunkBuffer:
self.packer = msgpack.Packer(unicode_errors='surrogateescape')
self.chunks = []
self.key = key
self.chunker = Chunker(WINDOW_SIZE, CHUNK_MASK, CHUNK_MIN, self.key.chunk_seed)
def add(self, item):
self.buffer.write(self.packer.pack(StableDict(item)))
@ -78,7 +79,7 @@ class ChunkBuffer:
if self.buffer.tell() == 0:
return
self.buffer.seek(0)
chunks = list(bytes(s) for s in chunkify(self.buffer, WINDOW_SIZE, CHUNK_MASK, CHUNK_MIN, self.key.chunk_seed))
chunks = list(bytes(s) for s in self.chunker.chunkify(self.buffer))
self.buffer.seek(0)
self.buffer.truncate(0)
# Leave the last partial chunk in the buffer unless flush is True
@ -126,6 +127,7 @@ class Archive:
self.numeric_owner = numeric_owner
self.items_buffer = CacheChunkBuffer(self.cache, self.key, self.stats)
self.pipeline = DownloadPipeline(self.repository, self.key)
self.chunker = Chunker(WINDOW_SIZE, CHUNK_MASK, CHUNK_MIN, self.key.chunk_seed)
if create:
if name in manifest.archives:
raise self.AlreadyExists(name)
@ -399,7 +401,7 @@ class Archive:
if chunks is None:
with open(path, 'rb') as fd:
chunks = []
for chunk in chunkify(fd, WINDOW_SIZE, CHUNK_MASK, CHUNK_MIN, self.key.chunk_seed):
for chunk in self.chunker.chunkify(fd):
chunks.append(cache.add_chunk(self.key.id_hash(chunk), chunk, self.stats))
cache.memorize_file(path_hash, st, [c[0] for c in chunks])
item = {b'path': safe_path, b'chunks': chunks}

View File

@ -1,26 +1,31 @@
# -*- coding: utf-8 -*-
API_VERSION = 1
API_VERSION = 2
from libc.stdlib cimport free
cdef extern from "_chunker.c":
ctypedef int uint32_t
ctypedef struct Chunker:
ctypedef struct _Chunker "Chunker":
pass
Chunker *chunker_init(object fd, int window_size, int chunk_mask, int min_size, uint32_t seed)
void chunker_free(Chunker *chunker)
object chunker_process(Chunker *chunker)
_Chunker *chunker_init(int window_size, int chunk_mask, int min_size, uint32_t seed)
void chunker_set_fd(_Chunker *chunker, object fd)
void chunker_free(_Chunker *chunker)
object chunker_process(_Chunker *chunker)
uint32_t *buzhash_init_table(uint32_t seed)
uint32_t c_buzhash "buzhash"(unsigned char *data, size_t len, uint32_t *h)
uint32_t c_buzhash_update "buzhash_update"(uint32_t sum, unsigned char remove, unsigned char add, size_t len, uint32_t *h)
cdef class chunkify:
cdef Chunker *chunker
cdef class Chunker:
cdef _Chunker *chunker
def __cinit__(self, fd, window_size, chunk_mask, min_size, seed):
self.chunker = chunker_init(fd, window_size, chunk_mask, min_size, seed & 0xffffffff)
def __cinit__(self, window_size, chunk_mask, min_size, seed):
self.chunker = chunker_init(window_size, chunk_mask, min_size, seed & 0xffffffff)
def chunkify(self, fd):
chunker_set_fd(self.chunker, fd)
return self
def __dealloc__(self):
if self.chunker:

View File

@ -74,7 +74,7 @@ class UpgradableLock:
def check_extension_modules():
import attic.platform
if (attic.hashindex.API_VERSION != 2 or
attic.chunker.API_VERSION != 1 or
attic.chunker.API_VERSION != 2 or
attic.crypto.API_VERSION != 2 or
attic.platform.API_VERSION != 2):
raise ExtensionModuleError
@ -577,3 +577,4 @@ def int_to_bigint(value):
if value.bit_length() > 63:
return value.to_bytes((value.bit_length() + 9) // 8, 'little', signed=True)
return value

View File

@ -1,4 +1,4 @@
from attic.chunker import chunkify, buzhash, buzhash_update
from attic.chunker import Chunker, buzhash, buzhash_update
from attic.testsuite import AtticTestCase
from io import BytesIO
@ -7,19 +7,19 @@ class ChunkerTestCase(AtticTestCase):
def test_chunkify(self):
data = b'0' * 1024 * 1024 * 15 + b'Y'
parts = [bytes(c) for c in chunkify(BytesIO(data), 2, 0x3, 2, 0)]
parts = [bytes(c) for c in Chunker(2, 0x3, 2, 0).chunkify(BytesIO(data))]
self.assert_equal(len(parts), 2)
self.assert_equal(b''.join(parts), data)
self.assert_equal([bytes(c) for c in chunkify(BytesIO(b''), 2, 0x3, 2, 0)], [])
self.assert_equal([bytes(c) for c in chunkify(BytesIO(b'foobarboobaz' * 3), 2, 0x3, 2, 0)], [b'fooba', b'rboobaz', b'fooba', b'rboobaz', b'fooba', b'rboobaz'])
self.assert_equal([bytes(c) for c in chunkify(BytesIO(b'foobarboobaz' * 3), 2, 0x3, 2, 1)], [b'fo', b'obarb', b'oob', b'azf', b'oobarb', b'oob', b'azf', b'oobarb', b'oobaz'])
self.assert_equal([bytes(c) for c in chunkify(BytesIO(b'foobarboobaz' * 3), 2, 0x3, 2, 2)], [b'foob', b'ar', b'boobazfoob', b'ar', b'boobazfoob', b'ar', b'boobaz'])
self.assert_equal([bytes(c) for c in chunkify(BytesIO(b'foobarboobaz' * 3), 3, 0x3, 3, 0)], [b'foobarboobaz' * 3])
self.assert_equal([bytes(c) for c in chunkify(BytesIO(b'foobarboobaz' * 3), 3, 0x3, 3, 1)], [b'foobar', b'boo', b'bazfo', b'obar', b'boo', b'bazfo', b'obar', b'boobaz'])
self.assert_equal([bytes(c) for c in chunkify(BytesIO(b'foobarboobaz' * 3), 3, 0x3, 3, 2)], [b'foo', b'barboobaz', b'foo', b'barboobaz', b'foo', b'barboobaz'])
self.assert_equal([bytes(c) for c in chunkify(BytesIO(b'foobarboobaz' * 3), 3, 0x3, 4, 0)], [b'foobarboobaz' * 3])
self.assert_equal([bytes(c) for c in chunkify(BytesIO(b'foobarboobaz' * 3), 3, 0x3, 4, 1)], [b'foobar', b'boobazfo', b'obar', b'boobazfo', b'obar', b'boobaz'])
self.assert_equal([bytes(c) for c in chunkify(BytesIO(b'foobarboobaz' * 3), 3, 0x3, 4, 2)], [b'foob', b'arboobaz', b'foob', b'arboobaz', b'foob', b'arboobaz'])
self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, 0).chunkify(BytesIO(b''))], [])
self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, 0).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fooba', b'rboobaz', b'fooba', b'rboobaz', b'fooba', b'rboobaz'])
self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, 1).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fo', b'obarb', b'oob', b'azf', b'oobarb', b'oob', b'azf', b'oobarb', b'oobaz'])
self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'ar', b'boobazfoob', b'ar', b'boobazfoob', b'ar', b'boobaz'])
self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 3, 0).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 3, 1).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobar', b'boo', b'bazfo', b'obar', b'boo', b'bazfo', b'obar', b'boobaz'])
self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 3, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foo', b'barboobaz', b'foo', b'barboobaz', b'foo', b'barboobaz'])
self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 4, 0).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 4, 1).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobar', b'boobazfo', b'obar', b'boobazfo', b'obar', b'boobaz'])
self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 4, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'arboobaz', b'foob', b'arboobaz', b'foob', b'arboobaz'])
def test_buzhash(self):
self.assert_equal(buzhash(b'abcdefghijklmnop', 0), 3795437769)