2010-10-13 20:07:55 +00:00
|
|
|
#include <Python.h>
|
2015-04-10 23:09:03 +00:00
|
|
|
#include <fcntl.h>
|
2010-02-28 19:22:45 +00:00
|
|
|
|
2012-12-18 19:38:16 +00:00
|
|
|
/* Cyclic polynomial / buzhash: https://en.wikipedia.org/wiki/Rolling_hash */
|
2010-11-04 20:19:01 +00:00
|
|
|
|
2012-12-18 19:38:16 +00:00
|
|
|
static uint32_t table_base[] =
|
2010-03-03 21:52:57 +00:00
|
|
|
{
|
2012-12-18 19:38:16 +00:00
|
|
|
0xe7f831ec, 0xf4026465, 0xafb50cae, 0x6d553c7a, 0xd639efe3, 0x19a7b895, 0x9aba5b21, 0x5417d6d4,
|
|
|
|
0x35fd2b84, 0xd1f6a159, 0x3f8e323f, 0xb419551c, 0xf444cebf, 0x21dc3b80, 0xde8d1e36, 0x84a32436,
|
|
|
|
0xbeb35a9d, 0xa36f24aa, 0xa4e60186, 0x98d18ffe, 0x3f042f9e, 0xdb228bcd, 0x096474b7, 0x5c20c2f7,
|
|
|
|
0xf9eec872, 0xe8625275, 0xb9d38f80, 0xd48eb716, 0x22a950b4, 0x3cbaaeaa, 0xc37cddd3, 0x8fea6f6a,
|
|
|
|
0x1d55d526, 0x7fd6d3b3, 0xdaa072ee, 0x4345ac40, 0xa077c642, 0x8f2bd45b, 0x28509110, 0x55557613,
|
|
|
|
0xffc17311, 0xd961ffef, 0xe532c287, 0xaab95937, 0x46d38365, 0xb065c703, 0xf2d91d0f, 0x92cd4bb0,
|
|
|
|
0x4007c712, 0xf35509dd, 0x505b2f69, 0x557ead81, 0x310f4563, 0xbddc5be8, 0x9760f38c, 0x701e0205,
|
|
|
|
0x00157244, 0x14912826, 0xdc4ca32b, 0x67b196de, 0x5db292e8, 0x8c1b406b, 0x01f34075, 0xfa2520f7,
|
|
|
|
0x73bc37ab, 0x1e18bc30, 0xfe2c6cb3, 0x20c522d0, 0x5639e3db, 0x942bda35, 0x899af9d1, 0xced44035,
|
|
|
|
0x98cc025b, 0x255f5771, 0x70fefa24, 0xe928fa4d, 0x2c030405, 0xb9325590, 0x20cb63bd, 0xa166305d,
|
|
|
|
0x80e52c0a, 0xa8fafe2f, 0x1ad13f7d, 0xcfaf3685, 0x6c83a199, 0x7d26718a, 0xde5dfcd9, 0x79cf7355,
|
|
|
|
0x8979d7fb, 0xebf8c55e, 0xebe408e4, 0xcd2affba, 0xe483be6e, 0xe239d6de, 0x5dc1e9e0, 0x0473931f,
|
|
|
|
0x851b097c, 0xac5db249, 0x09c0f9f2, 0xd8d2f134, 0xe6f38e41, 0xb1c71bf1, 0x52b6e4db, 0x07224424,
|
|
|
|
0x6cf73e85, 0x4f25d89c, 0x782a7d74, 0x10a68dcd, 0x3a868189, 0xd570d2dc, 0x69630745, 0x9542ed86,
|
|
|
|
0x331cd6b2, 0xa84b5b28, 0x07879c9d, 0x38372f64, 0x7185db11, 0x25ba7c83, 0x01061523, 0xe6792f9f,
|
|
|
|
0xe5df07d1, 0x4321b47f, 0x7d2469d8, 0x1a3a4f90, 0x48be29a3, 0x669071af, 0x8ec8dd31, 0x0810bfbf,
|
|
|
|
0x813a06b4, 0x68538345, 0x65865ddc, 0x43a71b8e, 0x78619a56, 0x5a34451d, 0x5bdaa3ed, 0x71edc7e9,
|
|
|
|
0x17ac9a20, 0x78d10bfa, 0x6c1e7f35, 0xd51839d9, 0x240cbc51, 0x33513cc1, 0xd2b4f795, 0xccaa8186,
|
|
|
|
0x0babe682, 0xa33cf164, 0x18c643ea, 0xc1ca105f, 0x9959147a, 0x6d3d94de, 0x0b654fbe, 0xed902ca0,
|
2014-03-30 15:43:31 +00:00
|
|
|
0x7d835cb5, 0x99ba1509, 0x6445c922, 0x495e76c2, 0xf07194bc, 0xa1631d7e, 0x677076a5, 0x89fffe35,
|
2012-12-18 19:38:16 +00:00
|
|
|
0x1a49bcf3, 0x8e6c948a, 0x0144c917, 0x8d93aea1, 0x16f87ddf, 0xc8f25d49, 0x1fb11297, 0x27e750cd,
|
|
|
|
0x2f422da1, 0xdee89a77, 0x1534c643, 0x457b7b8b, 0xaf172f7a, 0x6b9b09d6, 0x33573f7f, 0xf14e15c4,
|
|
|
|
0x526467d5, 0xaf488241, 0x87c3ee0d, 0x33be490c, 0x95aa6e52, 0x43ec242e, 0xd77de99b, 0xd018334f,
|
|
|
|
0x5b78d407, 0x498eb66b, 0xb1279fa8, 0xb38b0ea6, 0x90718376, 0xe325dee2, 0x8e2f2cba, 0xcaa5bdec,
|
|
|
|
0x9d652c56, 0xad68f5cb, 0xa77591af, 0x88e37ee8, 0xf8faa221, 0xfcbbbe47, 0x4f407786, 0xaf393889,
|
|
|
|
0xf444a1d9, 0x15ae1a2f, 0x40aa7097, 0x6f9486ac, 0x29d232a3, 0xe47609e9, 0xe8b631ff, 0xba8565f4,
|
|
|
|
0x11288749, 0x46c9a838, 0xeb1b7cd8, 0xf516bbb1, 0xfb74fda0, 0x010996e6, 0x4c994653, 0x1d889512,
|
|
|
|
0x53dcd9a3, 0xdd074697, 0x1e78e17c, 0x637c98bf, 0x930bb219, 0xcf7f75b0, 0xcb9355fb, 0x9e623009,
|
|
|
|
0xe466d82c, 0x28f968d3, 0xfeb385d9, 0x238e026c, 0xb8ed0560, 0x0c6a027a, 0x3d6fec4b, 0xbb4b2ec2,
|
|
|
|
0xe715031c, 0xeded011d, 0xcdc4d3b9, 0xc456fc96, 0xdd0eea20, 0xb3df8ec9, 0x12351993, 0xd9cbb01c,
|
|
|
|
0x603147a2, 0xcf37d17d, 0xf7fcd9dc, 0xd8556fa3, 0x104c8131, 0x13152774, 0xb4715811, 0x6a72c2c9,
|
|
|
|
0xc5ae37bb, 0xa76ce12a, 0x8150d8f3, 0x2ec29218, 0xa35f0984, 0x48c0647e, 0x0b5ff98c, 0x71893f7b
|
|
|
|
};
|
|
|
|
|
2014-05-13 17:52:42 +00:00
|
|
|
#define BARREL_SHIFT(v, shift) ( ((v) << shift) | ((v) >> (32 - shift)) )
|
2012-12-18 19:38:16 +00:00
|
|
|
|
|
|
|
|
|
|
|
static uint32_t *
|
2013-05-28 12:35:55 +00:00
|
|
|
buzhash_init_table(uint32_t seed)
|
2012-12-18 19:38:16 +00:00
|
|
|
{
|
|
|
|
int i;
|
|
|
|
uint32_t *table = malloc(1024);
|
|
|
|
for(i = 0; i < 256; i++)
|
|
|
|
{
|
|
|
|
table[i] = table_base[i] ^ seed;
|
|
|
|
}
|
|
|
|
return table;
|
|
|
|
}
|
|
|
|
|
|
|
|
static uint32_t
|
2013-05-28 12:35:55 +00:00
|
|
|
buzhash(const unsigned char *data, size_t len, const uint32_t *h)
|
2012-12-18 19:38:16 +00:00
|
|
|
{
|
2014-05-13 17:52:42 +00:00
|
|
|
uint32_t i;
|
|
|
|
uint32_t sum = 0, imod;
|
2012-12-18 19:38:16 +00:00
|
|
|
for(i = len - 1; i > 0; i--)
|
2010-03-03 21:52:57 +00:00
|
|
|
{
|
2014-05-13 17:52:42 +00:00
|
|
|
imod = i & 0x1f;
|
|
|
|
sum ^= BARREL_SHIFT(h[*data], imod);
|
2012-12-18 19:38:16 +00:00
|
|
|
data++;
|
2010-03-03 21:52:57 +00:00
|
|
|
}
|
2012-12-18 19:38:16 +00:00
|
|
|
return sum ^ h[*data];
|
2010-03-03 21:52:57 +00:00
|
|
|
}
|
|
|
|
|
2012-12-18 19:38:16 +00:00
|
|
|
static uint32_t
|
2013-05-28 12:35:55 +00:00
|
|
|
buzhash_update(uint32_t sum, unsigned char remove, unsigned char add, size_t len, const uint32_t *h)
|
2010-03-03 21:52:57 +00:00
|
|
|
{
|
2014-05-13 17:52:42 +00:00
|
|
|
uint32_t lenmod = len & 0x1f;
|
|
|
|
return BARREL_SHIFT(sum, 1) ^ BARREL_SHIFT(h[remove], lenmod) ^ h[add];
|
2010-03-03 21:52:57 +00:00
|
|
|
}
|
|
|
|
|
2010-02-28 19:22:45 +00:00
|
|
|
typedef struct {
|
2013-05-28 12:35:55 +00:00
|
|
|
int window_size, chunk_mask, min_size;
|
|
|
|
size_t buf_size;
|
|
|
|
uint32_t *table;
|
2015-05-31 16:41:23 +00:00
|
|
|
uint8_t *data;
|
2013-05-28 12:35:55 +00:00
|
|
|
PyObject *fd;
|
2015-04-08 16:43:53 +00:00
|
|
|
int fh;
|
2013-06-22 17:51:07 +00:00
|
|
|
int done, eof;
|
2015-08-20 16:19:48 +00:00
|
|
|
size_t remaining, position, last;
|
|
|
|
off_t bytes_read, bytes_yielded;
|
2013-05-28 12:35:55 +00:00
|
|
|
} Chunker;
|
2010-02-28 19:22:45 +00:00
|
|
|
|
2013-05-28 12:35:55 +00:00
|
|
|
static Chunker *
|
simple sparse file support, made chunk buffer size flexible
Implemented sparse file support to remove this blocker for people backing up lots of
huge sparse files (like VM images). Attic could not support this use case yet as it would
have restored all files to their fully expanded size, possibly running out of disk space if
the total expanded size would be bigger than the available space.
Please note that this is a very simple implementation of sparse file support - at backup time,
it does not do anything special (it just reads all these zero bytes, chunks, compresses and
encrypts them as usual). At restore time, it detects chunks that are completely filled with zeros
and does a seek on the output file rather than a normal data write, so it creates a hole in
a sparse file. The chunk size for these all-zero chunks is currently 10MiB, so it'll create holes
of multiples of that size (depends also a bit on fs block size, alignment, previously written data).
Special cases like sparse files starting and/or ending with a hole are supported.
Please note that it will currently always create sparse files at restore time if it detects all-zero
chunks.
Also improved:
I needed a constant for the max. chunk size, so I introduced CHUNK_MAX (see also
existing CHUNK_MIN) for the maximum chunk size (which is the same as the chunk
buffer size).
Attic still always uses 10MiB chunk buffer size now, but it could be changed now more easily.
2015-04-15 14:29:18 +00:00
|
|
|
chunker_init(int window_size, int chunk_mask, int min_size, int max_size, uint32_t seed)
|
2010-03-03 21:52:57 +00:00
|
|
|
{
|
2014-08-03 13:04:41 +00:00
|
|
|
Chunker *c = calloc(sizeof(Chunker), 1);
|
2013-05-28 12:35:55 +00:00
|
|
|
c->window_size = window_size;
|
|
|
|
c->chunk_mask = chunk_mask;
|
|
|
|
c->min_size = min_size;
|
|
|
|
c->table = buzhash_init_table(seed);
|
simple sparse file support, made chunk buffer size flexible
Implemented sparse file support to remove this blocker for people backing up lots of
huge sparse files (like VM images). Attic could not support this use case yet as it would
have restored all files to their fully expanded size, possibly running out of disk space if
the total expanded size would be bigger than the available space.
Please note that this is a very simple implementation of sparse file support - at backup time,
it does not do anything special (it just reads all these zero bytes, chunks, compresses and
encrypts them as usual). At restore time, it detects chunks that are completely filled with zeros
and does a seek on the output file rather than a normal data write, so it creates a hole in
a sparse file. The chunk size for these all-zero chunks is currently 10MiB, so it'll create holes
of multiples of that size (depends also a bit on fs block size, alignment, previously written data).
Special cases like sparse files starting and/or ending with a hole are supported.
Please note that it will currently always create sparse files at restore time if it detects all-zero
chunks.
Also improved:
I needed a constant for the max. chunk size, so I introduced CHUNK_MAX (see also
existing CHUNK_MIN) for the maximum chunk size (which is the same as the chunk
buffer size).
Attic still always uses 10MiB chunk buffer size now, but it could be changed now more easily.
2015-04-15 14:29:18 +00:00
|
|
|
c->buf_size = max_size;
|
2013-05-28 12:35:55 +00:00
|
|
|
c->data = malloc(c->buf_size);
|
2015-08-20 14:55:12 +00:00
|
|
|
c->fh = -1;
|
2014-08-03 13:04:41 +00:00
|
|
|
return c;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2015-04-08 16:43:53 +00:00
|
|
|
chunker_set_fd(Chunker *c, PyObject *fd, int fh)
|
2014-08-03 13:04:41 +00:00
|
|
|
{
|
|
|
|
Py_XDECREF(c->fd);
|
2013-05-28 12:35:55 +00:00
|
|
|
c->fd = fd;
|
|
|
|
Py_INCREF(fd);
|
2015-04-08 16:43:53 +00:00
|
|
|
c->fh = fh;
|
2010-10-13 20:07:55 +00:00
|
|
|
c->done = 0;
|
2013-05-28 12:35:55 +00:00
|
|
|
c->remaining = 0;
|
2012-12-18 19:38:16 +00:00
|
|
|
c->bytes_read = 0;
|
|
|
|
c->bytes_yielded = 0;
|
2013-05-28 12:35:55 +00:00
|
|
|
c->position = 0;
|
|
|
|
c->last = 0;
|
2013-06-22 17:51:07 +00:00
|
|
|
c->eof = 0;
|
2010-03-03 21:52:57 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2013-05-28 12:35:55 +00:00
|
|
|
chunker_free(Chunker *c)
|
2010-02-28 19:22:45 +00:00
|
|
|
{
|
2014-08-03 13:04:41 +00:00
|
|
|
Py_XDECREF(c->fd);
|
2013-05-28 12:35:55 +00:00
|
|
|
free(c->table);
|
2010-03-03 21:52:57 +00:00
|
|
|
free(c->data);
|
2013-05-28 12:35:55 +00:00
|
|
|
free(c);
|
2010-02-28 19:22:45 +00:00
|
|
|
}
|
|
|
|
|
2013-05-28 12:35:55 +00:00
|
|
|
static int
|
|
|
|
chunker_fill(Chunker *c)
|
2010-12-16 21:56:04 +00:00
|
|
|
{
|
2015-07-30 13:21:13 +00:00
|
|
|
ssize_t n;
|
2015-09-06 20:06:52 +00:00
|
|
|
off_t offset, length;
|
2014-03-30 15:43:31 +00:00
|
|
|
PyObject *data;
|
2010-12-16 21:56:04 +00:00
|
|
|
memmove(c->data, c->data + c->last, c->position + c->remaining - c->last);
|
|
|
|
c->position -= c->last;
|
|
|
|
c->last = 0;
|
2013-06-22 19:24:17 +00:00
|
|
|
n = c->buf_size - c->position - c->remaining;
|
|
|
|
if(c->eof || n == 0) {
|
2013-06-22 17:51:07 +00:00
|
|
|
return 1;
|
|
|
|
}
|
2015-04-08 16:43:53 +00:00
|
|
|
if(c->fh >= 0) {
|
2015-08-20 03:33:51 +00:00
|
|
|
offset = c->bytes_read;
|
2015-04-08 16:43:53 +00:00
|
|
|
// if we have a os-level file descriptor, use os-level API
|
2015-05-31 16:41:23 +00:00
|
|
|
n = read(c->fh, c->data + c->position + c->remaining, n);
|
2015-04-08 16:43:53 +00:00
|
|
|
if(n > 0) {
|
|
|
|
c->remaining += n;
|
|
|
|
c->bytes_read += n;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
if(n == 0) {
|
|
|
|
c->eof = 1;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
// some error happened
|
|
|
|
return 0;
|
|
|
|
}
|
2015-08-20 03:33:51 +00:00
|
|
|
length = c->bytes_read - offset;
|
2015-09-14 15:36:04 +00:00
|
|
|
#if ( ( _XOPEN_SOURCE >= 600 || _POSIX_C_SOURCE >= 200112L ) && defined(POSIX_FADV_DONTNEED) )
|
2015-08-20 03:33:51 +00:00
|
|
|
// We tell the OS that we do not need the data that we just have read any
|
|
|
|
// more (that it maybe has in the cache). This avoids that we spoil the
|
2015-04-10 23:09:03 +00:00
|
|
|
// complete cache with data that we only read once and (due to cache
|
|
|
|
// size limit) kick out data from the cache that might be still useful
|
|
|
|
// for the OS or other processes.
|
2015-08-20 03:33:51 +00:00
|
|
|
if (length > 0) {
|
2015-08-20 16:48:59 +00:00
|
|
|
posix_fadvise(c->fh, offset, length, POSIX_FADV_DONTNEED);
|
2015-08-20 03:33:51 +00:00
|
|
|
}
|
2015-04-10 23:09:03 +00:00
|
|
|
#endif
|
2013-06-22 17:51:07 +00:00
|
|
|
}
|
|
|
|
else {
|
2015-04-08 16:43:53 +00:00
|
|
|
// no os-level file descriptor, use Python file object API
|
|
|
|
data = PyObject_CallMethod(c->fd, "read", "i", n);
|
|
|
|
if(!data) {
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
n = PyBytes_Size(data);
|
|
|
|
if(n) {
|
|
|
|
memcpy(c->data + c->position + c->remaining, PyBytes_AsString(data), n);
|
|
|
|
c->remaining += n;
|
|
|
|
c->bytes_read += n;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
c->eof = 1;
|
|
|
|
}
|
|
|
|
Py_DECREF(data);
|
2013-06-22 17:51:07 +00:00
|
|
|
}
|
2013-05-28 12:35:55 +00:00
|
|
|
return 1;
|
2010-12-16 21:56:04 +00:00
|
|
|
}
|
|
|
|
|
2014-02-09 20:25:05 +00:00
|
|
|
static PyObject *
|
2013-06-03 11:45:48 +00:00
|
|
|
PyBuffer_FromMemory(void *data, Py_ssize_t len)
|
|
|
|
{
|
|
|
|
Py_buffer buffer;
|
|
|
|
PyObject *mv;
|
|
|
|
|
|
|
|
PyBuffer_FillInfo(&buffer, NULL, data, len, 1, PyBUF_CONTIG_RO);
|
|
|
|
mv = PyMemoryView_FromBuffer(&buffer);
|
|
|
|
PyBuffer_Release(&buffer);
|
|
|
|
return mv;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2013-05-28 12:35:55 +00:00
|
|
|
static PyObject *
|
|
|
|
chunker_process(Chunker *c)
|
2010-02-28 19:22:45 +00:00
|
|
|
{
|
2012-12-18 19:38:16 +00:00
|
|
|
uint32_t sum, chunk_mask = c->chunk_mask, min_size = c->min_size, window_size = c->window_size;
|
|
|
|
int n = 0;
|
2014-03-30 15:43:31 +00:00
|
|
|
int old_last;
|
2010-11-04 20:19:01 +00:00
|
|
|
|
2010-12-16 21:56:04 +00:00
|
|
|
if(c->done) {
|
2012-12-18 19:38:16 +00:00
|
|
|
if(c->bytes_read == c->bytes_yielded)
|
|
|
|
PyErr_SetNone(PyExc_StopIteration);
|
|
|
|
else
|
|
|
|
PyErr_SetString(PyExc_Exception, "chunkifier byte count mismatch");
|
2010-03-03 21:52:57 +00:00
|
|
|
return NULL;
|
|
|
|
}
|
2012-12-18 19:38:16 +00:00
|
|
|
if(c->remaining <= window_size) {
|
2013-05-28 12:35:55 +00:00
|
|
|
if(!chunker_fill(c)) {
|
|
|
|
return NULL;
|
|
|
|
}
|
2010-12-16 21:56:04 +00:00
|
|
|
}
|
2012-12-18 19:38:16 +00:00
|
|
|
if(c->remaining < window_size) {
|
2010-12-16 21:56:04 +00:00
|
|
|
c->done = 1;
|
2011-07-17 22:00:03 +00:00
|
|
|
if(c->remaining) {
|
2012-12-18 19:38:16 +00:00
|
|
|
c->bytes_yielded += c->remaining;
|
2011-07-17 22:00:03 +00:00
|
|
|
return PyBuffer_FromMemory(c->data + c->position, c->remaining);
|
|
|
|
}
|
|
|
|
else {
|
2012-12-18 19:38:16 +00:00
|
|
|
if(c->bytes_read == c->bytes_yielded)
|
|
|
|
PyErr_SetNone(PyExc_StopIteration);
|
|
|
|
else
|
|
|
|
PyErr_SetString(PyExc_Exception, "chunkifier byte count mismatch");
|
2011-07-17 22:00:03 +00:00
|
|
|
return NULL;
|
|
|
|
}
|
2010-12-16 21:56:04 +00:00
|
|
|
}
|
2013-05-28 12:35:55 +00:00
|
|
|
sum = buzhash(c->data + c->position, window_size, c->table);
|
2013-06-22 17:51:07 +00:00
|
|
|
while(c->remaining > c->window_size && ((sum & chunk_mask) || n < min_size)) {
|
2012-12-18 19:38:16 +00:00
|
|
|
sum = buzhash_update(sum, c->data[c->position],
|
|
|
|
c->data[c->position + window_size],
|
2013-05-28 12:35:55 +00:00
|
|
|
window_size, c->table);
|
2010-12-16 21:56:04 +00:00
|
|
|
c->position++;
|
|
|
|
c->remaining--;
|
2012-12-18 19:38:16 +00:00
|
|
|
n++;
|
|
|
|
if(c->remaining <= window_size) {
|
2013-05-28 12:35:55 +00:00
|
|
|
if(!chunker_fill(c)) {
|
|
|
|
return NULL;
|
|
|
|
}
|
2010-03-03 22:27:40 +00:00
|
|
|
}
|
2010-03-03 21:52:57 +00:00
|
|
|
}
|
2012-12-18 19:38:16 +00:00
|
|
|
if(c->remaining <= window_size) {
|
|
|
|
c->position += c->remaining;
|
|
|
|
c->remaining = 0;
|
|
|
|
}
|
2014-03-30 15:43:31 +00:00
|
|
|
old_last = c->last;
|
2010-12-16 21:56:04 +00:00
|
|
|
c->last = c->position;
|
2012-12-18 19:38:16 +00:00
|
|
|
n = c->last - old_last;
|
|
|
|
c->bytes_yielded += n;
|
|
|
|
return PyBuffer_FromMemory(c->data + old_last, n);
|
2013-06-03 11:45:48 +00:00
|
|
|
}
|