2010-10-13 20:07:55 +00:00
|
|
|
#include <Python.h>
|
2015-04-10 23:09:03 +00:00
|
|
|
#include <fcntl.h>
|
2023-11-05 10:48:16 +00:00
|
|
|
#if !defined(_MSC_VER)
|
|
|
|
# include <unistd.h>
|
|
|
|
#endif
|
2010-02-28 19:22:45 +00:00
|
|
|
|
2016-04-16 21:27:01 +00:00
|
|
|
/* Cyclic polynomial / buzhash
|
|
|
|
|
|
|
|
https://en.wikipedia.org/wiki/Rolling_hash
|
|
|
|
|
|
|
|
http://www.serve.net/buz/Notes.1st.year/HTML/C6/rand.012.html (by "BUZ", the inventor)
|
|
|
|
|
|
|
|
http://www.dcs.gla.ac.uk/~hamer/cakes-talk.pdf (see buzhash slide)
|
|
|
|
|
|
|
|
Some properties of buzhash / of this implementation:
|
|
|
|
|
|
|
|
(1) the hash is designed for inputs <= 32 bytes, but the chunker uses it on a 4095 byte window;
|
|
|
|
any repeating bytes at distance 32 within those 4095 bytes can cause cancellation within
|
|
|
|
the hash function, e.g. in "X <any 31 bytes> X", the last X would cancel out the influence
|
|
|
|
of the first X on the hash value.
|
|
|
|
|
|
|
|
(2) the hash table is supposed to have (according to the BUZ) exactly a 50% distribution of
|
|
|
|
0/1 bit values per position, but the hard coded table below doesn't fit that property.
|
|
|
|
|
|
|
|
(3) if you would use a window size divisible by 64, the seed would cancel itself out completely.
|
|
|
|
this is why we use a window size of 4095 bytes.
|
|
|
|
|
|
|
|
Another quirk is that, even with the 4095 byte window, XORing the entire table by a constant
is equivalent to XORing the hash output with a different constant. But since the seed is stored
encrypted, I think it still serves its purpose.
*/
|
2010-11-04 20:19:01 +00:00
|
|
|
|
2012-12-18 19:38:16 +00:00
|
|
|
/* 256 fixed pseudo-random 32-bit values, one per possible byte value.
 * This is the base buzhash lookup table; buzhash_init_table() XORs every
 * entry with a seed to derive the per-run table.
 * NOTE(review): the 0/1 bit distribution per bit position of these values
 * does not exactly match the 50% ideal the buzhash construction assumes. */
static uint32_t table_base[] =
{
    0xe7f831ec, 0xf4026465, 0xafb50cae, 0x6d553c7a, 0xd639efe3, 0x19a7b895, 0x9aba5b21, 0x5417d6d4,
    0x35fd2b84, 0xd1f6a159, 0x3f8e323f, 0xb419551c, 0xf444cebf, 0x21dc3b80, 0xde8d1e36, 0x84a32436,
    0xbeb35a9d, 0xa36f24aa, 0xa4e60186, 0x98d18ffe, 0x3f042f9e, 0xdb228bcd, 0x096474b7, 0x5c20c2f7,
    0xf9eec872, 0xe8625275, 0xb9d38f80, 0xd48eb716, 0x22a950b4, 0x3cbaaeaa, 0xc37cddd3, 0x8fea6f6a,
    0x1d55d526, 0x7fd6d3b3, 0xdaa072ee, 0x4345ac40, 0xa077c642, 0x8f2bd45b, 0x28509110, 0x55557613,
    0xffc17311, 0xd961ffef, 0xe532c287, 0xaab95937, 0x46d38365, 0xb065c703, 0xf2d91d0f, 0x92cd4bb0,
    0x4007c712, 0xf35509dd, 0x505b2f69, 0x557ead81, 0x310f4563, 0xbddc5be8, 0x9760f38c, 0x701e0205,
    0x00157244, 0x14912826, 0xdc4ca32b, 0x67b196de, 0x5db292e8, 0x8c1b406b, 0x01f34075, 0xfa2520f7,
    0x73bc37ab, 0x1e18bc30, 0xfe2c6cb3, 0x20c522d0, 0x5639e3db, 0x942bda35, 0x899af9d1, 0xced44035,
    0x98cc025b, 0x255f5771, 0x70fefa24, 0xe928fa4d, 0x2c030405, 0xb9325590, 0x20cb63bd, 0xa166305d,
    0x80e52c0a, 0xa8fafe2f, 0x1ad13f7d, 0xcfaf3685, 0x6c83a199, 0x7d26718a, 0xde5dfcd9, 0x79cf7355,
    0x8979d7fb, 0xebf8c55e, 0xebe408e4, 0xcd2affba, 0xe483be6e, 0xe239d6de, 0x5dc1e9e0, 0x0473931f,
    0x851b097c, 0xac5db249, 0x09c0f9f2, 0xd8d2f134, 0xe6f38e41, 0xb1c71bf1, 0x52b6e4db, 0x07224424,
    0x6cf73e85, 0x4f25d89c, 0x782a7d74, 0x10a68dcd, 0x3a868189, 0xd570d2dc, 0x69630745, 0x9542ed86,
    0x331cd6b2, 0xa84b5b28, 0x07879c9d, 0x38372f64, 0x7185db11, 0x25ba7c83, 0x01061523, 0xe6792f9f,
    0xe5df07d1, 0x4321b47f, 0x7d2469d8, 0x1a3a4f90, 0x48be29a3, 0x669071af, 0x8ec8dd31, 0x0810bfbf,
    0x813a06b4, 0x68538345, 0x65865ddc, 0x43a71b8e, 0x78619a56, 0x5a34451d, 0x5bdaa3ed, 0x71edc7e9,
    0x17ac9a20, 0x78d10bfa, 0x6c1e7f35, 0xd51839d9, 0x240cbc51, 0x33513cc1, 0xd2b4f795, 0xccaa8186,
    0x0babe682, 0xa33cf164, 0x18c643ea, 0xc1ca105f, 0x9959147a, 0x6d3d94de, 0x0b654fbe, 0xed902ca0,
    0x7d835cb5, 0x99ba1509, 0x6445c922, 0x495e76c2, 0xf07194bc, 0xa1631d7e, 0x677076a5, 0x89fffe35,
    0x1a49bcf3, 0x8e6c948a, 0x0144c917, 0x8d93aea1, 0x16f87ddf, 0xc8f25d49, 0x1fb11297, 0x27e750cd,
    0x2f422da1, 0xdee89a77, 0x1534c643, 0x457b7b8b, 0xaf172f7a, 0x6b9b09d6, 0x33573f7f, 0xf14e15c4,
    0x526467d5, 0xaf488241, 0x87c3ee0d, 0x33be490c, 0x95aa6e52, 0x43ec242e, 0xd77de99b, 0xd018334f,
    0x5b78d407, 0x498eb66b, 0xb1279fa8, 0xb38b0ea6, 0x90718376, 0xe325dee2, 0x8e2f2cba, 0xcaa5bdec,
    0x9d652c56, 0xad68f5cb, 0xa77591af, 0x88e37ee8, 0xf8faa221, 0xfcbbbe47, 0x4f407786, 0xaf393889,
    0xf444a1d9, 0x15ae1a2f, 0x40aa7097, 0x6f9486ac, 0x29d232a3, 0xe47609e9, 0xe8b631ff, 0xba8565f4,
    0x11288749, 0x46c9a838, 0xeb1b7cd8, 0xf516bbb1, 0xfb74fda0, 0x010996e6, 0x4c994653, 0x1d889512,
    0x53dcd9a3, 0xdd074697, 0x1e78e17c, 0x637c98bf, 0x930bb219, 0xcf7f75b0, 0xcb9355fb, 0x9e623009,
    0xe466d82c, 0x28f968d3, 0xfeb385d9, 0x238e026c, 0xb8ed0560, 0x0c6a027a, 0x3d6fec4b, 0xbb4b2ec2,
    0xe715031c, 0xeded011d, 0xcdc4d3b9, 0xc456fc96, 0xdd0eea20, 0xb3df8ec9, 0x12351993, 0xd9cbb01c,
    0x603147a2, 0xcf37d17d, 0xf7fcd9dc, 0xd8556fa3, 0x104c8131, 0x13152774, 0xb4715811, 0x6a72c2c9,
    0xc5ae37bb, 0xa76ce12a, 0x8150d8f3, 0x2ec29218, 0xa35f0984, 0x48c0647e, 0x0b5ff98c, 0x71893f7b
};
|
|
|
|
|
2017-12-13 03:01:59 +00:00
|
|
|
/* Rotate the 32-bit value v left by `shift` bits.
 * The "& 0x1f" makes the right-shift count 0 (instead of the undefined 32)
 * when shift == 0, so the macro is safe for shift values 0..31. */
#define BARREL_SHIFT(v, shift) ( ((v) << (shift)) | ((v) >> ((32 - (shift)) & 0x1f)) )
|
2012-12-18 19:38:16 +00:00
|
|
|
|
2016-04-15 23:27:44 +00:00
|
|
|
/* Cached (page size - 1) bitmask used to page-align posix_fadvise() requests
 * in chunker_fill(); zero-initialized and filled in lazily on first use. */
size_t pagemask;
|
2012-12-18 19:38:16 +00:00
|
|
|
|
|
|
|
static uint32_t *
|
2013-05-28 12:35:55 +00:00
|
|
|
buzhash_init_table(uint32_t seed)
|
2012-12-18 19:38:16 +00:00
|
|
|
{
|
|
|
|
int i;
|
|
|
|
uint32_t *table = malloc(1024);
|
|
|
|
for(i = 0; i < 256; i++)
|
|
|
|
{
|
|
|
|
table[i] = table_base[i] ^ seed;
|
|
|
|
}
|
|
|
|
return table;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Compute the buzhash (cyclic polynomial rolling hash) over the first `len`
 * bytes of `data`, using the 256-entry lookup table `h`.
 * The byte at offset j contributes its table value rotated left by
 * (len - 1 - j) mod 32 bits, so the last byte enters unrotated.
 * Requires len >= 1. */
static uint32_t
buzhash(const unsigned char *data, size_t len, const uint32_t *h)
{
    uint32_t sum = 0;
    uint32_t shift;  /* rotation amount for the current byte */

    for(shift = (uint32_t)(len - 1); shift != 0; shift--) {
        uint32_t v = h[*data++];
        uint32_t r = shift & 0x1f;
        /* rotate v left by r bits; "& 0x1f" keeps the right-shift < 32 when r == 0 */
        sum ^= (v << r) | (v >> ((32 - r) & 0x1f));
    }
    /* final byte is mixed in without rotation */
    return sum ^ h[*data];
}
|
|
|
|
|
2012-12-18 19:38:16 +00:00
|
|
|
/* Roll the hash one byte forward: `remove` is the byte leaving the window,
 * `add` the byte entering it, `len` the window size in bytes.
 * Equivalent to recomputing buzhash() over the shifted window. */
static uint32_t
buzhash_update(uint32_t sum, unsigned char remove, unsigned char add, size_t len, const uint32_t *h)
{
    uint32_t lenmod = len & 0x1f;  /* Note: replace by constant to get small speedup */
    /* rotate the old sum left by 1 (every byte's rotation grows by one) */
    uint32_t rolled = (sum << 1) | (sum >> 31);
    /* rotate the leaving byte's table value by len mod 32 so it cancels
     * the contribution it made when it entered the window */
    uint32_t gone = h[remove];
    gone = (gone << lenmod) | (gone >> ((32 - lenmod) & 0x1f));
    return rolled ^ gone ^ h[add];
}
|
|
|
|
|
2010-02-28 19:22:45 +00:00
|
|
|
/* State of one content-defined chunking run over a single input file. */
typedef struct {
    /* bitmask tested against the rolling hash; a cut is made when
       (sum & chunk_mask) == 0 (see chunker_process) */
    uint32_t chunk_mask;
    /* seeded 256-entry buzhash table from buzhash_init_table(); owned, freed
       in chunker_free() */
    uint32_t *table;
    /* read buffer of buf_size bytes; owned, freed in chunker_free() */
    uint8_t *data;
    /* Python file object used for reading when no OS-level fd is available;
       reference held (incref in chunker_set_fd, decref in chunker_free) */
    PyObject *fd;
    /* OS-level file descriptor, or -1 to fall back to the fd object API */
    int fh;
    /* done: iteration finished (StopIteration already due);
       eof: underlying input is exhausted */
    int done, eof;
    /* min_size: smallest chunk to cut; buf_size: allocated size of data;
       window_size: rolling-hash window length; remaining: valid bytes
       starting at position; position: current offset into data; last: start
       of the not-yet-yielded region */
    size_t min_size, buf_size, window_size, remaining, position, last;
    /* running totals, compared in chunker_process for a consistency check */
    off_t bytes_read, bytes_yielded;
} Chunker;
|
2010-02-28 19:22:45 +00:00
|
|
|
|
2013-05-28 12:35:55 +00:00
|
|
|
static Chunker *
|
2015-11-21 21:08:30 +00:00
|
|
|
chunker_init(size_t window_size, uint32_t chunk_mask, size_t min_size, size_t max_size, uint32_t seed)
|
2010-03-03 21:52:57 +00:00
|
|
|
{
|
2014-08-03 13:04:41 +00:00
|
|
|
Chunker *c = calloc(sizeof(Chunker), 1);
|
2013-05-28 12:35:55 +00:00
|
|
|
c->window_size = window_size;
|
|
|
|
c->chunk_mask = chunk_mask;
|
|
|
|
c->min_size = min_size;
|
|
|
|
c->table = buzhash_init_table(seed);
|
simple sparse file support, made chunk buffer size flexible
Implemented sparse file support to remove this blocker for people backing up lots of
huge sparse files (like VM images). Attic could not support this use case yet as it would
have restored all files to their fully expanded size, possibly running out of disk space if
the total expanded size would be bigger than the available space.
Please note that this is a very simple implementation of sparse file support - at backup time,
it does not do anything special (it just reads all these zero bytes, chunks, compresses and
encrypts them as usual). At restore time, it detects chunks that are completely filled with zeros
and does a seek on the output file rather than a normal data write, so it creates a hole in
a sparse file. The chunk size for these all-zero chunks is currently 10MiB, so it'll create holes
of multiples of that size (depends also a bit on fs block size, alignment, previously written data).
Special cases like sparse files starting and/or ending with a hole are supported.
Please note that it will currently always create sparse files at restore time if it detects all-zero
chunks.
Also improved:
I needed a constant for the max. chunk size, so I introduced CHUNK_MAX (see also
existing CHUNK_MIN) for the maximum chunk size (which is the same as the chunk
buffer size).
Attic still always uses 10MiB chunk buffer size now, but it could be changed now more easily.
2015-04-15 14:29:18 +00:00
|
|
|
c->buf_size = max_size;
|
2013-05-28 12:35:55 +00:00
|
|
|
c->data = malloc(c->buf_size);
|
2015-08-20 14:55:12 +00:00
|
|
|
c->fh = -1;
|
2014-08-03 13:04:41 +00:00
|
|
|
return c;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2015-04-08 16:43:53 +00:00
|
|
|
chunker_set_fd(Chunker *c, PyObject *fd, int fh)
|
2014-08-03 13:04:41 +00:00
|
|
|
{
|
|
|
|
Py_XDECREF(c->fd);
|
2013-05-28 12:35:55 +00:00
|
|
|
c->fd = fd;
|
|
|
|
Py_INCREF(fd);
|
2015-04-08 16:43:53 +00:00
|
|
|
c->fh = fh;
|
2010-10-13 20:07:55 +00:00
|
|
|
c->done = 0;
|
2013-05-28 12:35:55 +00:00
|
|
|
c->remaining = 0;
|
2012-12-18 19:38:16 +00:00
|
|
|
c->bytes_read = 0;
|
|
|
|
c->bytes_yielded = 0;
|
2013-05-28 12:35:55 +00:00
|
|
|
c->position = 0;
|
|
|
|
c->last = 0;
|
2013-06-22 17:51:07 +00:00
|
|
|
c->eof = 0;
|
2010-03-03 21:52:57 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2013-05-28 12:35:55 +00:00
|
|
|
chunker_free(Chunker *c)
|
2010-02-28 19:22:45 +00:00
|
|
|
{
|
2014-08-03 13:04:41 +00:00
|
|
|
Py_XDECREF(c->fd);
|
2013-05-28 12:35:55 +00:00
|
|
|
free(c->table);
|
2010-03-03 21:52:57 +00:00
|
|
|
free(c->data);
|
2013-05-28 12:35:55 +00:00
|
|
|
free(c);
|
2010-02-28 19:22:45 +00:00
|
|
|
}
|
|
|
|
|
2013-05-28 12:35:55 +00:00
|
|
|
static int
|
|
|
|
chunker_fill(Chunker *c)
|
2010-12-16 21:56:04 +00:00
|
|
|
{
|
2015-07-30 13:21:13 +00:00
|
|
|
ssize_t n;
|
2014-03-30 15:43:31 +00:00
|
|
|
PyObject *data;
|
2017-03-30 11:09:02 +00:00
|
|
|
PyThreadState *thread_state;
|
|
|
|
|
2010-12-16 21:56:04 +00:00
|
|
|
memmove(c->data, c->data + c->last, c->position + c->remaining - c->last);
|
|
|
|
c->position -= c->last;
|
|
|
|
c->last = 0;
|
2013-06-22 19:24:17 +00:00
|
|
|
n = c->buf_size - c->position - c->remaining;
|
|
|
|
if(c->eof || n == 0) {
|
2013-06-22 17:51:07 +00:00
|
|
|
return 1;
|
|
|
|
}
|
2015-04-08 16:43:53 +00:00
|
|
|
if(c->fh >= 0) {
|
2017-03-30 11:09:02 +00:00
|
|
|
thread_state = PyEval_SaveThread();
|
|
|
|
|
2022-08-04 08:46:36 +00:00
|
|
|
#if ( ( _XOPEN_SOURCE >= 600 || _POSIX_C_SOURCE >= 200112L ) && defined(POSIX_FADV_DONTNEED) )
|
|
|
|
off_t offset = c->bytes_read;
|
|
|
|
#endif
|
|
|
|
|
2015-04-08 16:43:53 +00:00
|
|
|
// if we have a os-level file descriptor, use os-level API
|
2015-05-31 16:41:23 +00:00
|
|
|
n = read(c->fh, c->data + c->position + c->remaining, n);
|
2015-04-08 16:43:53 +00:00
|
|
|
if(n > 0) {
|
|
|
|
c->remaining += n;
|
|
|
|
c->bytes_read += n;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
if(n == 0) {
|
|
|
|
c->eof = 1;
|
|
|
|
}
|
|
|
|
else {
|
2017-03-30 11:09:02 +00:00
|
|
|
PyEval_RestoreThread(thread_state);
|
2015-04-08 16:43:53 +00:00
|
|
|
// some error happened
|
2016-03-14 19:25:30 +00:00
|
|
|
PyErr_SetFromErrno(PyExc_OSError);
|
2015-04-08 16:43:53 +00:00
|
|
|
return 0;
|
|
|
|
}
|
2015-09-14 15:36:04 +00:00
|
|
|
#if ( ( _XOPEN_SOURCE >= 600 || _POSIX_C_SOURCE >= 200112L ) && defined(POSIX_FADV_DONTNEED) )
|
2022-08-04 08:46:36 +00:00
|
|
|
off_t length = c->bytes_read - offset;
|
2016-04-15 23:27:44 +00:00
|
|
|
|
2016-05-07 16:18:40 +00:00
|
|
|
// Only do it once per run.
|
|
|
|
if (pagemask == 0)
|
|
|
|
pagemask = getpagesize() - 1;
|
2016-04-15 23:27:44 +00:00
|
|
|
|
2015-08-20 03:33:51 +00:00
|
|
|
// We tell the OS that we do not need the data that we just have read any
|
|
|
|
// more (that it maybe has in the cache). This avoids that we spoil the
|
2015-04-10 23:09:03 +00:00
|
|
|
// complete cache with data that we only read once and (due to cache
|
|
|
|
// size limit) kick out data from the cache that might be still useful
|
|
|
|
// for the OS or other processes.
|
2016-04-15 23:27:44 +00:00
|
|
|
// We rollback the initial offset back to the start of the page,
|
|
|
|
// to avoid it not being truncated as a partial page request.
|
2022-08-04 08:46:36 +00:00
|
|
|
int overshoot;
|
2015-08-20 03:33:51 +00:00
|
|
|
if (length > 0) {
|
2016-05-21 19:08:03 +00:00
|
|
|
// All Linux kernels (at least up to and including 4.6(.0)) have a bug where
|
|
|
|
// they truncate last partial page of POSIX_FADV_DONTNEED request, so we need
|
|
|
|
// to page-align it ourselves. We'll need the rest of this page on the next
|
|
|
|
// read (assuming this was not EOF).
|
2016-04-15 23:27:44 +00:00
|
|
|
overshoot = (offset + length) & pagemask;
|
|
|
|
} else {
|
|
|
|
// For length == 0 we set overshoot 0, so the below
|
|
|
|
// length - overshoot is 0, which means till end of file for
|
|
|
|
// fadvise. This will cancel the final page and is not part
|
|
|
|
// of the above workaround.
|
|
|
|
overshoot = 0;
|
2016-05-07 16:18:40 +00:00
|
|
|
}
|
2016-04-15 23:27:44 +00:00
|
|
|
|
|
|
|
posix_fadvise(c->fh, offset & ~pagemask, length - overshoot, POSIX_FADV_DONTNEED);
|
2015-04-10 23:09:03 +00:00
|
|
|
#endif
|
2017-03-30 11:09:02 +00:00
|
|
|
|
|
|
|
PyEval_RestoreThread(thread_state);
|
2013-06-22 17:51:07 +00:00
|
|
|
}
|
|
|
|
else {
|
2015-04-08 16:43:53 +00:00
|
|
|
// no os-level file descriptor, use Python file object API
|
|
|
|
data = PyObject_CallMethod(c->fd, "read", "i", n);
|
|
|
|
if(!data) {
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
n = PyBytes_Size(data);
|
2016-03-31 20:03:17 +00:00
|
|
|
if(PyErr_Occurred()) {
|
|
|
|
// we wanted bytes(), but got something else
|
|
|
|
return 0;
|
|
|
|
}
|
2015-04-08 16:43:53 +00:00
|
|
|
if(n) {
|
|
|
|
memcpy(c->data + c->position + c->remaining, PyBytes_AsString(data), n);
|
|
|
|
c->remaining += n;
|
|
|
|
c->bytes_read += n;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
c->eof = 1;
|
|
|
|
}
|
|
|
|
Py_DECREF(data);
|
2013-06-22 17:51:07 +00:00
|
|
|
}
|
2013-05-28 12:35:55 +00:00
|
|
|
return 1;
|
2010-12-16 21:56:04 +00:00
|
|
|
}
|
|
|
|
|
2013-05-28 12:35:55 +00:00
|
|
|
/* Produce the next chunk as a read-only memoryview over the internal buffer,
 * or set StopIteration / an error and return NULL when the input is done.
 * The returned view is only valid until the next call (the buffer is
 * compacted by chunker_fill). */
static PyObject *
chunker_process(Chunker *c)
{
    uint32_t sum, chunk_mask = c->chunk_mask;
    size_t n, old_last, min_size = c->min_size, window_size = c->window_size;

    /* already finished on an earlier call: re-raise StopIteration (or the
       consistency error if read/yield byte totals disagree) */
    if(c->done) {
        if(c->bytes_read == c->bytes_yielded)
            PyErr_SetNone(PyExc_StopIteration);
        else
            PyErr_SetString(PyExc_Exception, "chunkifier byte count mismatch");
        return NULL;
    }
    /* ensure there is no chance of a cut being missed: keep filling until we
       either have min_size + window_size + 1 bytes buffered or hit EOF */
    while(c->remaining < min_size + window_size + 1 && !c->eof) { /* see assert in Chunker init */
        if(!chunker_fill(c)) {
            return NULL;
        }
    }
    /* here we either are at eof ... */
    if(c->eof) {
        c->done = 1;
        if(c->remaining) {
            /* yield whatever is left as the final (possibly short) chunk */
            c->bytes_yielded += c->remaining;
            return PyMemoryView_FromMemory((char *)(c->data + c->position), c->remaining, PyBUF_READ);
        }
        else {
            if(c->bytes_read == c->bytes_yielded)
                PyErr_SetNone(PyExc_StopIteration);
            else
                PyErr_SetString(PyExc_Exception, "chunkifier byte count mismatch");
            return NULL;
        }
    }
    /* ... or we have at least min_size + window_size + 1 bytes remaining.
     * We do not want to "cut" a chunk smaller than min_size and the hash
     * window starts at the potential cutting place.
     */
    c->position += min_size;
    c->remaining -= min_size;
    /* seed the rolling hash over the first full window past min_size */
    sum = buzhash(c->data + c->position, window_size, c->table);
    /* roll forward byte by byte until the hash hits the cut condition
       ((sum & chunk_mask) == 0) or we run out of lookahead */
    while(c->remaining > c->window_size && (sum & chunk_mask)) {
        uint8_t *p = c->data + c->position;
        uint8_t *stop_at = p + c->remaining - window_size;
        size_t did_bytes;
        /* inner loop avoids re-reading c->position/c->remaining each step */
        while (p < stop_at && (sum & chunk_mask)) {
            sum = buzhash_update(sum, p[0], p[window_size],
                                 window_size, c->table);
            p++;
        }
        did_bytes = p - (c->data + c->position);
        c->position += did_bytes;
        c->remaining -= did_bytes;
        /* lookahead exhausted without a cut: try to read more input */
        if(c->remaining <= window_size) {
            if(!chunker_fill(c)) {
                return NULL;
            }
        }
    }
    /* less than a full window left: absorb the tail into this chunk */
    if(c->remaining <= window_size) {
        c->position += c->remaining;
        c->remaining = 0;
    }
    old_last = c->last;
    c->last = c->position;
    n = c->last - old_last;
    c->bytes_yielded += n;
    /* hand out the bytes between the previous cut and this one */
    return PyMemoryView_FromMemory((char *)(c->data + old_last), n, PyBUF_READ);
}
|