# borg/attic/chunker.pyx

# -*- coding: utf-8 -*-

# Version number of the interface exposed by this compiled extension module.
# NOTE(review): presumably compared against an expected value by the Python
# side to detect out-of-date binary builds -- confirm against the importer.
API_VERSION = 2

from libc.stdlib cimport free

# C-level declarations taken from "_chunker.c".  With ``cdef extern from``
# Cython emits an #include of that file, so the C code is compiled as part of
# this module.
cdef extern from "_chunker.c":
    # Stand-in declaration for the C uint32_t typedef.
    # NOTE(review): declared as plain ``int`` although the name suggests an
    # unsigned 32-bit type; Cython only needs an integer-like type for extern
    # typedefs, but confirm this or consider cimporting it from libc.stdint.
    ctypedef int uint32_t
    # Opaque handle: the C struct "Chunker" is visible in Cython as _Chunker,
    # which keeps the name Chunker free for the extension class in this file.
    ctypedef struct _Chunker "Chunker":
        pass
    # Returns the chunker state pointer that the Chunker class stores and later
    # hands to chunker_free().
    _Chunker *chunker_init(int window_size, int chunk_mask, int min_size, int max_size, uint32_t seed)
    # Receives the Python file object ``f`` and the OS-level handle ``fd``
    # passed to Chunker.chunkify (-1 there means "no OS-level fd").
    void chunker_set_fd(_Chunker *chunker, object f, int fd)
    # Releases a state pointer obtained from chunker_init().
    void chunker_free(_Chunker *chunker)
    # Returns a Python object; Chunker.__next__ returns it unchanged.
    object chunker_process(_Chunker *chunker)
    # Returns a table pointer; the callers in this file release it with free().
    uint32_t *buzhash_init_table(uint32_t seed)
    # The C functions "buzhash" / "buzhash_update" are renamed with a c_ prefix
    # so the Python-level def functions of the same names can wrap them.
    uint32_t c_buzhash "buzhash"(unsigned char *data, size_t len, uint32_t *h)
    uint32_t c_buzhash_update  "buzhash_update"(uint32_t sum, unsigned char remove, unsigned char add, size_t len, uint32_t *h)


cdef class Chunker:
    """
    Iterator-style wrapper around the C chunker state.

    Attach an input with :meth:`chunkify`, then iterate over the instance;
    each iteration step returns the object produced by ``chunker_process``.
    """
    cdef _Chunker *chunker

    # --- lifecycle of the C state -------------------------------------------

    def __cinit__(self, window_size, chunk_mask, min_size, max_size, seed):
        # Only the low 32 bits of the seed are forwarded to the C code.
        self.chunker = chunker_init(
            window_size,
            chunk_mask,
            min_size,
            max_size,
            seed & 0xffffffff,
        )

    def __dealloc__(self):
        # Release the C state only if a pointer was actually stored.
        if self.chunker is not NULL:
            chunker_free(self.chunker)

    # --- public API -----------------------------------------------------------

    def chunkify(self, fd, fh=-1):
        """
        Attach a file to the chunker and return the chunker itself.

        :param fd: Python file object to read from
        :param fh: OS-level file handle, if one is available; the default
                   of -1 means that no OS-level fd is used.
        :return: this instance, so the call result can be iterated directly
        """
        chunker_set_fd(self.chunker, fd, fh)
        return self

    # --- iterator protocol ----------------------------------------------------

    def __iter__(self):
        # The instance is its own iterator.
        return self

    def __next__(self):
        # Hand back whatever the C side returns for this step.
        return chunker_process(self.chunker)


def buzhash(data, unsigned long seed):
    """
    Compute the buzhash of *data* with a table built from *seed*.

    The length handed to the C code is the Python length of *data*, so input
    containing NUL bytes is hashed in full.  (Typing the parameter as
    ``unsigned char *`` made ``len(data)`` a C string length, which stops at
    the first NUL byte.)

    :param data: bytes object to hash
    :param seed: table seed; only the low 32 bits are used
    :return: the value returned by the C ``buzhash`` function
    :raises TypeError: if *data* cannot be converted to a C byte buffer
    """
    cdef uint32_t *table
    cdef uint32_t digest
    # Borrowed pointer into the bytes object; ``data`` stays referenced for the
    # whole call, so the buffer remains valid while the C code reads it.
    cdef unsigned char *buf = data
    # Python-level length: counts every byte, embedded NULs included.
    cdef size_t size = len(data)
    # NOTE(review): the table pointer is used without a NULL check -- confirm
    # that buzhash_init_table() cannot fail, or handle that case in C.
    table = buzhash_init_table(seed & 0xffffffff)
    digest = c_buzhash(buf, size, table)
    free(table)
    return digest


def buzhash_update(uint32_t sum, unsigned char remove, unsigned char add, size_t len, unsigned long seed):
    """
    Call the C ``buzhash_update`` function with a table built from *seed*.

    :param sum: existing hash value passed to the C function
    :param remove: byte value passed as the "remove" argument
    :param add: byte value passed as the "add" argument
    :param len: length argument forwarded unchanged to the C function
                (presumably the hashed window size -- confirm in _chunker.c)
    :param seed: table seed; only the low 32 bits are used
    :return: the value returned by the C ``buzhash_update`` function
    """
    # The table is built for this call only and released right after use.
    cdef uint32_t *lookup = buzhash_init_table(seed & 0xffffffff)
    cdef uint32_t rolled = c_buzhash_update(sum, remove, add, len, lookup)
    free(lookup)
    return rolled
Use Cython for all native code 2013-05-28 12:35:55 +00:00			`# -*- coding: utf-8 -*-`

Reuse chunker buffer between files. 2014-08-03 13:04:41 +00:00			`API_VERSION = 2`
Add a method to detect out of date binary extension modules 2014-03-18 21:04:08 +00:00
Use Cython for all native code 2013-05-28 12:35:55 +00:00			`from libc.stdlib cimport free`

			`cdef extern from "_chunker.c":`
			`ctypedef int uint32_t`
Reuse chunker buffer between files. 2014-08-03 13:04:41 +00:00			`ctypedef struct _Chunker "Chunker":`
Use Cython for all native code 2013-05-28 12:35:55 +00:00			`pass`
simple sparse file support, made chunk buffer size flexible Implemented sparse file support to remove this blocker for people backing up lots of huge sparse files (like VM images). Attic could not support this use case yet as it would have restored all files to their fully expanded size, possibly running out of disk space if the total expanded size would be bigger than the available space. Please note that this is a very simple implementation of sparse file support - at backup time, it does not do anything special (it just reads all these zero bytes, chunks, compresses and encrypts them as usual). At restore time, it detects chunks that are completely filled with zeros and does a seek on the output file rather than a normal data write, so it creates a hole in a sparse file. The chunk size for these all-zero chunks is currently 10MiB, so it'll create holes of multiples of that size (depends also a bit on fs block size, alignment, previously written data). Special cases like sparse files starting and/or ending with a hole are supported. Please note that it will currently always create sparse files at restore time if it detects all-zero chunks. Also improved: I needed a constant for the max. chunk size, so I introduced CHUNK_MAX (see also existing CHUNK_MIN) for the maximum chunk size (which is the same as the chunk buffer size). Attic still always uses 10MiB chunk buffer size now, but it could be changed now more easily. 2015-04-15 14:29:18 +00:00			`_Chunker *chunker_init(int window_size, int chunk_mask, int min_size, int max_size, uint32_t seed)`
let chunker optionally work with os-level file descriptor this saves some back-and-forth between C and Python code and also some memory management overhead as we can always reuse the same read_buf instead of letting Python allocate and free an up to 10MB big buffer for each buffer filling read. we can't use os-level file descriptors all the time though, as chunkify gets also invoked on objects like BytesIO that are not backed by an os-level file. Note: this changeset is also a preparation for O_DIRECT support which can be implemented a lot easier on C level. 2015-04-08 16:43:53 +00:00			`void chunker_set_fd(_Chunker *chunker, object f, int fd)`
Reuse chunker buffer between files. 2014-08-03 13:04:41 +00:00			`void chunker_free(_Chunker *chunker)`
			`object chunker_process(_Chunker *chunker)`
Use Cython for all native code 2013-05-28 12:35:55 +00:00			`uint32_t *buzhash_init_table(uint32_t seed)`
Remove const usage from pyx files to not confuse older cython versions. 2013-06-01 12:17:16 +00:00			`uint32_t c_buzhash "buzhash"(unsigned char *data, size_t len, uint32_t *h)`
			`uint32_t c_buzhash_update "buzhash_update"(uint32_t sum, unsigned char remove, unsigned char add, size_t len, uint32_t *h)`
Use Cython for all native code 2013-05-28 12:35:55 +00:00

Reuse chunker buffer between files. 2014-08-03 13:04:41 +00:00			`cdef class Chunker:`
			`cdef _Chunker *chunker`
Use Cython for all native code 2013-05-28 12:35:55 +00:00
simple sparse file support, made chunk buffer size flexible Implemented sparse file support to remove this blocker for people backing up lots of huge sparse files (like VM images). Attic could not support this use case yet as it would have restored all files to their fully expanded size, possibly running out of disk space if the total expanded size would be bigger than the available space. Please note that this is a very simple implementation of sparse file support - at backup time, it does not do anything special (it just reads all these zero bytes, chunks, compresses and encrypts them as usual). At restore time, it detects chunks that are completely filled with zeros and does a seek on the output file rather than a normal data write, so it creates a hole in a sparse file. The chunk size for these all-zero chunks is currently 10MiB, so it'll create holes of multiples of that size (depends also a bit on fs block size, alignment, previously written data). Special cases like sparse files starting and/or ending with a hole are supported. Please note that it will currently always create sparse files at restore time if it detects all-zero chunks. Also improved: I needed a constant for the max. chunk size, so I introduced CHUNK_MAX (see also existing CHUNK_MIN) for the maximum chunk size (which is the same as the chunk buffer size). Attic still always uses 10MiB chunk buffer size now, but it could be changed now more easily. 2015-04-15 14:29:18 +00:00			`def __cinit__(self, window_size, chunk_mask, min_size, max_size, seed):`
			`self.chunker = chunker_init(window_size, chunk_mask, min_size, max_size, seed & 0xffffffff)`
Reuse chunker buffer between files. 2014-08-03 13:04:41 +00:00
let chunker optionally work with os-level file descriptor this saves some back-and-forth between C and Python code and also some memory management overhead as we can always reuse the same read_buf instead of letting Python allocate and free an up to 10MB big buffer for each buffer filling read. we can't use os-level file descriptors all the time though, as chunkify gets also invoked on objects like BytesIO that are not backed by an os-level file. Note: this changeset is also a preparation for O_DIRECT support which can be implemented a lot easier on C level. 2015-04-08 16:43:53 +00:00			`def chunkify(self, fd, fh=-1):`
			`"""`
			`Cut a file into chunks.`

			`:param fd: Python file object`
			`:param fh: OS-level file handle (if available),`
			`defaults to -1 which means not to use OS-level fd.`
			`"""`
			`chunker_set_fd(self.chunker, fd, fh)`
Reuse chunker buffer between files. 2014-08-03 13:04:41 +00:00			`return self`
Use Cython for all native code 2013-05-28 12:35:55 +00:00
			`def __dealloc__(self):`
			`if self.chunker:`
			`chunker_free(self.chunker)`

			`def __iter__(self):`
			`return self`

			`def __next__(self):`
			`return chunker_process(self.chunker)`


			`def buzhash(unsigned char *data, unsigned long seed):`
			`cdef uint32_t *table`
			`cdef uint32_t sum`
			`table = buzhash_init_table(seed & 0xffffffff)`
			`sum = c_buzhash(data, len(data), table)`
			`free(table)`
			`return sum`


			`def buzhash_update(uint32_t sum, unsigned char remove, unsigned char add, size_t len, unsigned long seed):`
			`cdef uint32_t *table`
			`table = buzhash_init_table(seed & 0xffffffff)`
			`sum = c_buzhash_update(sum, remove, add, len, table)`
			`free(table)`
let chunker optionally work with os-level file descriptor this saves some back-and-forth between C and Python code and also some memory management overhead as we can always reuse the same read_buf instead of letting Python allocate and free an up to 10MB big buffer for each buffer filling read. we can't use os-level file descriptors all the time though, as chunkify gets also invoked on objects like BytesIO that are not backed by an os-level file. Note: this changeset is also a preparation for O_DIRECT support which can be implemented a lot easier on C level. 2015-04-08 16:43:53 +00:00			`return sum`