# -*- coding: utf-8 -*-
API_VERSION = '1.2_01'
import os
from libc.stdlib cimport free


cdef extern from "_chunker.c":
    ctypedef int uint32_t
    ctypedef struct _Chunker "Chunker":
        pass
    _Chunker *chunker_init(int window_size, int chunk_mask, int min_size, int max_size, uint32_t seed)
    void chunker_set_fd(_Chunker *chunker, object f, int fd)
    void chunker_free(_Chunker *chunker)
    object chunker_process(_Chunker *chunker)
    uint32_t *buzhash_init_table(uint32_t seed)
    uint32_t c_buzhash "buzhash"(unsigned char *data, size_t len, uint32_t *h)
    uint32_t c_buzhash_update "buzhash_update"(uint32_t sum, unsigned char remove, unsigned char add, size_t len, uint32_t *h)


class ChunkerFixed:
    """
    Fixed blocksize Chunker, optionally supporting a header block of different size.

    This is a very simple chunker for input data with known block/record sizes:

    - raw disk images
    - block devices
    - database files with simple header + fixed-size records layout

    Note: the last block of the input data may be less than the block size,
          this is supported and not considered to be an error.
    """
    def __init__(self, block_size, header_size=0):
        self.block_size = block_size
        self.header_size = header_size

    def chunkify(self, fd, fh=-1):
        """
        Cut a file into chunks.

        :param fd: Python file object
        :param fh: OS-level file handle (if available),
                   defaults to -1 which means not to use OS-level fd.
        """
        offset = 0
        use_fh = fh >= 0

        if use_fh:
            def read(size):
                nonlocal offset
                data = os.read(fh, size)
                amount = len(data)
                if hasattr(os, 'posix_fadvise'):
                    # UNIX only. For block sizes that are not a multiple of the
                    # system's page size, this works best with a Linux kernel > 4.6.0
                    # that has the relevant bug fixed, see the comment/workaround
                    # in _chunker.c and borgbackup issue #907.
                    os.posix_fadvise(fh, offset, amount, os.POSIX_FADV_DONTNEED)
                offset += amount
                return data
        else:
            def read(size):
                nonlocal offset
                data = fd.read(size)
                amount = len(data)
                offset += amount
                return data

        if self.header_size > 0:
            data = read(self.header_size)
            if data:
                yield data
        else:
            data = True  # get into next while loop
        while data:
            data = read(self.block_size)
            if data:
                yield data
        # empty data means we are at EOF and we terminate the generator.
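
# A minimal usage sketch for ChunkerFixed (illustrative, not part of the module
# API): chunk a file into fixed 4 KiB blocks, treating the first 512 bytes as a
# separate header chunk. The file name and the sizes are assumptions, not
# recommendations.
#
#   chunker = ChunkerFixed(4096, header_size=512)
#   with open('disk.img', 'rb') as fd:  # 'disk.img' is a placeholder
#       for chunk in chunker.chunkify(fd, fd.fileno()):
#           pass  # feed each bytes chunk into hashing/compression/storage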


cdef class Chunker:
    """
    Content-Defined Chunker, variable chunk sizes.

    This chunker goes to some effort to cut mostly the same chunks, even if
    the content moves to a different offset inside the file. It uses the buzhash
    rolling-hash algorithm to identify the chunk cutting places by looking at the
    content inside the moving window and computing the rolling hash value over the
    window contents. If the last n bits of the rolling hash are 0, a chunk is cut.
    Additionally, it obeys further criteria, such as a minimum and maximum chunk size.
    It also uses a per-repo random seed to avoid some chunk-length fingerprinting attacks.
    """

    cdef _Chunker *chunker

    def __cinit__(self, int seed, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int hash_window_size):
        min_size = 1 << chunk_min_exp
        max_size = 1 << chunk_max_exp
        # see chunker_process: in the first while loop condition, the first term
        # must be able to become True:
        assert hash_window_size + min_size + 1 <= max_size, "too small max_size"
        hash_mask = (1 << hash_mask_bits) - 1
        self.chunker = chunker_init(hash_window_size, hash_mask, min_size, max_size, seed & 0xffffffff)

    def chunkify(self, fd, fh=-1):
        """
        Cut a file into chunks.

        :param fd: Python file object
        :param fh: OS-level file handle (if available),
                   defaults to -1 which means not to use OS-level fd.
        """
        chunker_set_fd(self.chunker, fd, fh)
        return self

    def __dealloc__(self):
        if self.chunker:
            chunker_free(self.chunker)

    def __iter__(self):
        return self

    def __next__(self):
        return chunker_process(self.chunker)
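
# A minimal usage sketch for Chunker (illustrative): the parameters below mean
# min chunk size 2**19, max chunk size 2**23, cut where the low 21 bits of the
# rolling hash are zero, 4095-byte hash window. These values and seed=0 are
# assumptions for the example; the seed would normally come from the repository.
#
#   chunker = Chunker(0, 19, 23, 21, 4095)
#   with open('some.file', 'rb') as fd:  # 'some.file' is a placeholder
#       for chunk in chunker.chunkify(fd):
#           pass  # variable-sized chunks; the last one may be < 2**19 bytes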


def get_chunker(algo, *params, **kw):
    if algo == 'buzhash':
        seed = kw['seed']
        return Chunker(seed, *params)
    if algo == 'fixed':
        return ChunkerFixed(*params)
    raise TypeError('unsupported chunker algo %r' % algo)
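
# Sketch of how the factory is called (parameter values are example
# assumptions; the positional params mirror the parsed chunker spec):
#
#   c1 = get_chunker('buzhash', 19, 23, 21, 4095, seed=0)  # -> Chunker
#   c2 = get_chunker('fixed', 4096, 512)  # block_size, header_size -> ChunkerFixed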


def max_chunk_size(algo, *params):
    # see also parseformat.ChunkerParams return values
    if algo == 'buzhash':
        return 1 << params[1]
    if algo == 'fixed':
        return max(params[0], params[1])
    raise TypeError('unsupported chunker algo %r' % algo)
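
# Worked examples (values chosen for illustration):
#
#   max_chunk_size('buzhash', 19, 23, 21, 4095)  # 1 << 23 == 8 MiB
#   max_chunk_size('fixed', 4096, 512)           # max(4096, 512) == 4096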


def buzhash(data, unsigned long seed):
    cdef uint32_t *table
    cdef uint32_t sum
    table = buzhash_init_table(seed & 0xffffffff)
    sum = c_buzhash(<const unsigned char *> data, len(data), table)
    free(table)
    return sum


def buzhash_update(uint32_t sum, unsigned char remove, unsigned char add, size_t len, unsigned long seed):
    cdef uint32_t *table
    table = buzhash_init_table(seed & 0xffffffff)
    sum = c_buzhash_update(sum, remove, add, len, table)
    free(table)
    return sum
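
# The rolling property connecting buzhash and buzhash_update (a sketch, assuming
# the same seed for both calls): updating the hash of a window with the byte
# that left and the byte that entered should equal hashing the slid window
# directly.
#
#   h1 = buzhash(b'abcdefghi', 0)                      # hash of window [0:9]
#   h2 = buzhash_update(h1, ord('a'), ord('j'), 9, 0)  # slide window by one byte
#   assert h2 == buzhash(b'bcdefghij', 0)              # hash of window [1:10]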