2013-05-28 12:35:55 +00:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
2019-02-24 14:42:21 +00:00
|
|
|
API_VERSION = '1.2_01'
|
2019-01-05 03:40:25 +00:00
|
|
|
|
2020-12-10 23:34:11 +00:00
|
|
|
import errno
|
2019-01-05 03:40:25 +00:00
|
|
|
import os
|
2020-12-14 22:46:04 +00:00
|
|
|
from collections import namedtuple
|
|
|
|
|
2021-01-14 19:02:18 +00:00
|
|
|
from .constants import CH_DATA, CH_ALLOC, CH_HOLE, MAX_DATA_SIZE, zeros
|
2014-03-18 21:04:08 +00:00
|
|
|
|
2013-05-28 12:35:55 +00:00
|
|
|
from libc.stdlib cimport free
|
|
|
|
|
2017-05-02 16:52:36 +00:00
|
|
|
# Declarations for the C implementation of the buzhash chunker (see _chunker.c).
cdef extern from "_chunker.c":
    # fallback typedef so Cython knows the name; the real uint32_t comes from the C side
    ctypedef int uint32_t
    # opaque chunker state struct, defined in _chunker.c as "Chunker"
    ctypedef struct _Chunker "Chunker":
        pass
    _Chunker *chunker_init(int window_size, int chunk_mask, int min_size, int max_size, uint32_t seed)
    void chunker_set_fd(_Chunker *chunker, object f, int fd)
    void chunker_free(_Chunker *chunker)
    # reads input and returns the next cut chunk as a Python object
    object chunker_process(_Chunker *chunker)
    uint32_t *buzhash_init_table(uint32_t seed)
    # C names aliased to c_* here so they do not clash with the Python wrappers below
    uint32_t c_buzhash "buzhash"(unsigned char *data, size_t len, uint32_t *h)
    uint32_t c_buzhash_update "buzhash_update"(uint32_t sum, unsigned char remove, unsigned char add, size_t len, uint32_t *h)
|
2013-05-28 12:35:55 +00:00
|
|
|
|
|
|
|
|
2020-12-25 21:04:15 +00:00
|
|
|
# this will be True if Python's seek implementation supports data/holes seeking.
# this does not imply that it will actually work on the filesystem,
# because the FS also needs to support this.
has_seek_hole = hasattr(os, 'SEEK_DATA') and hasattr(os, 'SEEK_HOLE')
|
|
|
|
|
|
|
|
|
2020-12-14 22:46:04 +00:00
|
|
|
# container for one chunk of file content plus metadata describing it
_Chunk = namedtuple('_Chunk', 'meta data')
# namedtuple() has no docstring parameter, so the docstring is assigned afterwards:
_Chunk.__doc__ = """\
Chunk namedtuple

meta is always a dictionary, data depends on allocation.

data chunk read from a DATA range of a file (not from a sparse hole):
meta = {'allocation' = CH_DATA, 'size' = size_of_chunk }
data = read_data [bytes or memoryview]

all-zero chunk read from a DATA range of a file (not from a sparse hole, but detected to be all-zero):
meta = {'allocation' = CH_ALLOC, 'size' = size_of_chunk }
data = None

all-zero chunk from a HOLE range of a file (from a sparse hole):
meta = {'allocation' = CH_HOLE, 'size' = size_of_chunk }
data = None
"""
|
|
|
|
|
|
|
|
def Chunk(data, **meta):
    """Build a _Chunk from the chunk payload *data* and keyword metadata (see _Chunk.__doc__)."""
    return _Chunk(meta=meta, data=data)
|
|
|
|
|
|
|
|
|
2020-12-10 23:34:11 +00:00
|
|
|
def dread(offset, size, fd=None, fh=-1):
    """
    Read up to *size* bytes from the current file position.

    :param offset: position of this read, only used for the cache advice call
                   (the read itself happens at the current position)
    :param size: number of bytes wanted
    :param fd: Python file object (used when fh is not given)
    :param fh: OS-level file handle (if available),
               defaults to -1 which means not to use OS-level I/O.
    """
    if fh < 0:
        return fd.read(size)
    data = os.read(fh, size)
    if hasattr(os, 'posix_fadvise'):
        # UNIX only and, in case of block sizes that are not a multiple of the
        # system's page size, better be used with a bug fixed linux kernel > 4.6.0,
        # see comment/workaround in _chunker.c and borgbackup issue #907.
        os.posix_fadvise(fh, offset, len(data), os.POSIX_FADV_DONTNEED)
    return data
|
|
|
|
|
|
|
|
|
|
|
|
def dseek(amount, whence, fd=None, fh=-1):
    """
    Seek in the file and return the new absolute position.

    Dispatches to OS-level lseek when fh is a valid handle (>= 0),
    otherwise to the Python file object's seek method.
    """
    if fh < 0:
        return fd.seek(amount, whence)
    return os.lseek(fh, amount, whence)
|
|
|
|
|
|
|
|
|
|
|
|
def dpos_curr_end(fd=None, fh=-1):
    """
    determine current position, file end position (== file length)
    """
    here = dseek(0, os.SEEK_CUR, fd, fh)
    total = dseek(0, os.SEEK_END, fd, fh)
    # restore the position we found the file in before returning
    dseek(here, os.SEEK_SET, fd, fh)
    return here, total
|
|
|
|
|
2019-01-05 03:40:25 +00:00
|
|
|
|
2020-12-10 23:34:11 +00:00
|
|
|
def sparsemap(fd=None, fh=-1):
    """
    generator yielding a (start, length, is_data) tuple for each range.
    is_data is indicating data ranges (True) or hole ranges (False).

    note:
    the map is generated starting from the current seek position (it
    is not required to be 0 / to be at the start of the file) and
    work from there up to the end of the file.
    when the generator is finished, the file pointer position will be
    reset to where it was before calling this function.
    """
    curr, file_len = dpos_curr_end(fd, fh)
    start = curr
    try:
        # we start by looking for the end of the current data range,
        # i.e. the position of the next hole:
        whence = os.SEEK_HOLE
        while True:
            is_data = whence == os.SEEK_HOLE  # True: range with data, False: range is a hole
            try:
                end = dseek(start, whence, fd, fh)
            except OSError as e:
                # ENXIO: no more data (SEEK_DATA) resp. no more holes (SEEK_HOLE) after start
                if e.errno == errno.ENXIO:
                    if not is_data and start < file_len:
                        # if there is a hole at the end of a file, we can not find the file end by SEEK_DATA
                        # (because we run into ENXIO), thus we must manually deal with this case:
                        end = file_len
                        yield (start, end - start, is_data)
                    break
                else:
                    raise
            # we do not want to yield zero-length ranges with start == end:
            if end > start:
                yield (start, end - start, is_data)
            start = end
            # alternate: after a data range look for the next data start, and vice versa
            whence = os.SEEK_DATA if is_data else os.SEEK_HOLE
    finally:
        # seek to same position as before calling this function
        dseek(curr, os.SEEK_SET, fd, fh)
|
|
|
|
|
|
|
|
|
|
|
|
class ChunkerFixed:
    """
    This is a simple chunker for input data with data usually staying at same
    offset and / or with known block/record sizes:

    - raw disk images
    - block devices
    - database files with simple header + fixed-size records layout

    It optionally supports:

    - a header block of different size
    - using a sparsemap to only read data ranges and seek over hole ranges
      for sparse files.
    - using an externally given filemap to only read specific ranges from
      a file.

    Note: the last block of a data or hole range may be less than the block size,
          this is supported and not considered to be an error.
    """
    def __init__(self, block_size, header_size=0, sparse=False):
        """
        :param block_size: size of the fixed-size blocks to cut
        :param header_size: size of a differently-sized header block at offset 0 (0 = no header)
        :param sparse: whether to try sparse (hole-skipping) input processing
        """
        self.block_size = block_size
        self.header_size = header_size
        # should borg try to do sparse input processing?
        # whether it actually can be done depends on the input file being seekable.
        self.try_sparse = sparse and has_seek_hole
        # zeros is the all-zero comparison buffer, so it must cover a full block
        assert block_size <= len(zeros)

    def chunkify(self, fd=None, fh=-1, fmap=None):
        """
        Cut a file into chunks.

        :param fd: Python file object
        :param fh: OS-level file handle (if available),
                   defaults to -1 which means not to use OS-level fd.
        :param fmap: a file map, same format as generated by sparsemap
        """
        if fmap is None:
            if self.try_sparse:
                try:
                    if self.header_size > 0:
                        # the header is always treated as one data range of its own,
                        # the sparse map is built for the rest of the file.
                        header_map = [(0, self.header_size, True), ]
                        dseek(self.header_size, os.SEEK_SET, fd, fh)
                        body_map = list(sparsemap(fd, fh))
                        dseek(0, os.SEEK_SET, fd, fh)
                    else:
                        header_map = []
                        body_map = list(sparsemap(fd, fh))
                except OSError:
                    # seeking did not work, fall through to the non-sparse fake fmap below
                    pass
                else:
                    fmap = header_map + body_map

        if fmap is None:
            # either sparse processing (building the fmap) was not tried or it failed.
            # in these cases, we just build a "fake fmap" that considers the whole file
            # as range(s) of data (no holes), so we can use the same code.
            # we build different fmaps here for the purpose of correct block alignment
            # with or without a header block (of potentially different size).
            if self.header_size > 0:
                header_map = [(0, self.header_size, True), ]
                body_map = [(self.header_size, 2 ** 62, True), ]
            else:
                header_map = []
                body_map = [(0, 2 ** 62, True), ]
            fmap = header_map + body_map

        offset = 0
        for range_start, range_size, is_data in fmap:
            if range_start != offset:
                # this is for the case when the fmap does not cover the file completely,
                # e.g. it could be without the ranges of holes or of unchanged data.
                offset = range_start
                dseek(offset, os.SEEK_SET, fd, fh)
            while range_size:
                wanted = min(range_size, self.block_size)
                if is_data:
                    # read block from the range
                    data = dread(offset, wanted, fd, fh)
                    got = len(data)
                    if zeros.startswith(data):
                        # all-zero block inside a data range: store as allocation, not data
                        data = None
                        is_zero = True
                    else:
                        is_zero = False
                else:  # hole
                    # seek over block from the range
                    pos = dseek(wanted, os.SEEK_CUR, fd, fh)
                    got = pos - offset
                    data = None
                    is_zero = True
                if got > 0:
                    offset += got
                    range_size -= got
                    yield Chunk(data, size=got, allocation=(CH_ALLOC if is_zero else CH_DATA) if is_data else CH_HOLE)
                if got < wanted:
                    # we did not get enough data, looks like EOF.
                    return
|
2019-01-05 03:40:25 +00:00
|
|
|
|
|
|
|
|
2014-08-03 13:04:41 +00:00
|
|
|
cdef class Chunker:
    """
    Content-Defined Chunker, variable chunk sizes.

    This chunker does quite some effort to mostly cut the same-content chunks, even if
    the content moves to a different offset inside the file. It uses the buzhash
    rolling-hash algorithm to identify the chunk cutting places by looking at the
    content inside the moving window and computing the rolling hash value over the
    window contents. If the last n bits of the rolling hash are 0, a chunk is cut.
    Additionally it obeys some more criteria, like a minimum and maximum chunk size.
    It also uses a per-repo random seed to avoid some chunk length fingerprinting attacks.
    """
    # pointer to the C chunker state allocated by chunker_init (see _chunker.c)
    cdef _Chunker *chunker

    def __cinit__(self, int seed, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int hash_window_size):
        # chunk size bounds are given as powers of two
        min_size = 1 << chunk_min_exp
        max_size = 1 << chunk_max_exp
        # see chunker_process, first while loop condition, first term must be able to get True:
        assert hash_window_size + min_size + 1 <= max_size, "too small max_size"
        hash_mask = (1 << hash_mask_bits) - 1
        self.chunker = chunker_init(hash_window_size, hash_mask, min_size, max_size, seed & 0xffffffff)

    def chunkify(self, fd, fh=-1):
        """
        Cut a file into chunks.

        :param fd: Python file object
        :param fh: OS-level file handle (if available),
                   defaults to -1 which means not to use OS-level fd.
        """
        chunker_set_fd(self.chunker, fd, fh)
        return self

    def __dealloc__(self):
        # free the C-side state (guard against a failed __cinit__)
        if self.chunker:
            chunker_free(self.chunker)

    def __iter__(self):
        return self

    def __next__(self):
        # chunker_process raises StopIteration at EOF (via the C side)
        data = chunker_process(self.chunker)
        return Chunk(data, size=len(data), allocation=CH_DATA)  # no sparse support here
|
2013-05-28 12:35:55 +00:00
|
|
|
|
|
|
|
|
2019-01-05 03:38:06 +00:00
|
|
|
def get_chunker(algo, *params, **kw):
    """
    Return a chunker instance for the given algorithm name.

    :param algo: 'buzhash' (content-defined) or 'fixed' (fixed block size)
    :param params: positional parameters forwarded to the chunker class
    :param kw: 'seed' required for buzhash, 'sparse' required for fixed
    """
    if algo == 'buzhash':
        return Chunker(kw['seed'], *params)
    if algo == 'fixed':
        return ChunkerFixed(*params, sparse=kw['sparse'])
    raise TypeError('unsupported chunker algo %r' % algo)
|
|
|
|
|
|
|
|
|
2017-06-14 17:16:36 +00:00
|
|
|
def buzhash(data, unsigned long seed):
    """Return the buzhash of *data*, using a hash table derived from *seed*."""
    cdef uint32_t *table
    cdef uint32_t sum
    # table is malloc'd by the C side and must be freed here
    table = buzhash_init_table(seed & 0xffffffff)
    sum = c_buzhash(<const unsigned char *> data, len(data), table)
    free(table)
    return sum
|
|
|
|
|
|
|
|
|
|
|
|
def buzhash_update(uint32_t sum, unsigned char remove, unsigned char add, size_t len, unsigned long seed):
    """Roll the buzhash *sum* one byte forward: drop *remove*, take in *add* (window size *len*)."""
    cdef uint32_t *table
    # table is malloc'd by the C side and must be freed here
    table = buzhash_init_table(seed & 0xffffffff)
    sum = c_buzhash_update(sum, remove, add, len, table)
    free(table)
    return sum
|