simple sparse file support, made chunk buffer size flexible

Implemented sparse file support to remove a blocker for people backing up lots of
huge sparse files (like VM images). Until now, Attic could not support this use case, as it would
have restored all files to their fully expanded size, possibly running out of disk space if
the total expanded size was bigger than the available space.

Please note that this is a very simple implementation of sparse file support - at backup time,
it does not do anything special (it just reads all these zero bytes, chunks, compresses and
encrypts them as usual). At restore time, it detects chunks that are completely filled with zeros
and does a seek on the output file rather than a normal data write, so it creates a hole in
a sparse file. The chunk size for these all-zero chunks is currently 10MiB, so it will create holes
whose sizes are multiples of that (the exact result also depends a bit on the filesystem block size,
alignment and previously written data).

Special cases like sparse files starting and/or ending with a hole are supported.

Please note that it will currently always create sparse files at restore time if it detects all-zero
chunks.

Also improved:
I needed a constant for the maximum chunk size, so I introduced CHUNK_MAX (see also the
existing CHUNK_MIN). Its value is the same as the chunk buffer size.

Attic still always uses a 10MiB chunk buffer size, but it can now be changed more easily.
This commit is contained in:
Thomas Waldmann 2015-04-15 16:29:18 +02:00
parent bbc8886bfe
commit a2bf2aea22
5 changed files with 64 additions and 22 deletions

View File

@ -85,14 +85,14 @@ typedef struct {
} Chunker;
static Chunker *
chunker_init(int window_size, int chunk_mask, int min_size, uint32_t seed)
chunker_init(int window_size, int chunk_mask, int min_size, int max_size, uint32_t seed)
{
Chunker *c = calloc(sizeof(Chunker), 1);
c->window_size = window_size;
c->chunk_mask = chunk_mask;
c->min_size = min_size;
c->table = buzhash_init_table(seed);
c->buf_size = 10 * 1024 * 1024;
c->buf_size = max_size;
c->data = malloc(c->buf_size);
return c;
}

View File

@ -22,9 +22,12 @@ from attic.helpers import Error, uid2user, user2uid, gid2group, group2gid, \
ITEMS_BUFFER = 1024 * 1024
CHUNK_MIN = 1024
CHUNK_MAX = 10 * 1024 * 1024
WINDOW_SIZE = 0xfff
CHUNK_MASK = 0xffff
ZEROS = b'\0' * CHUNK_MAX
utime_supports_fd = os.utime in getattr(os, 'supports_fd', {})
utime_supports_follow_symlinks = os.utime in getattr(os, 'supports_follow_symlinks', {})
has_mtime_ns = sys.version >= '3.3'
@ -71,7 +74,7 @@ class ChunkBuffer:
self.packer = msgpack.Packer(unicode_errors='surrogateescape')
self.chunks = []
self.key = key
self.chunker = Chunker(WINDOW_SIZE, CHUNK_MASK, CHUNK_MIN, self.key.chunk_seed)
self.chunker = Chunker(WINDOW_SIZE, CHUNK_MASK, CHUNK_MIN, CHUNK_MAX,self.key.chunk_seed)
def add(self, item):
self.buffer.write(self.packer.pack(StableDict(item)))
@ -134,7 +137,7 @@ class Archive:
self.pipeline = DownloadPipeline(self.repository, self.key)
if create:
self.items_buffer = CacheChunkBuffer(self.cache, self.key, self.stats)
self.chunker = Chunker(WINDOW_SIZE, CHUNK_MASK, CHUNK_MIN, self.key.chunk_seed)
self.chunker = Chunker(WINDOW_SIZE, CHUNK_MASK, CHUNK_MIN, CHUNK_MAX, self.key.chunk_seed)
if name in manifest.archives:
raise self.AlreadyExists(name)
self.last_checkpoint = time.time()
@ -269,7 +272,13 @@ class Archive:
with open(path, 'wb') as fd:
ids = [c[0] for c in item[b'chunks']]
for data in self.pipeline.fetch_many(ids, is_preloaded=True):
fd.write(data)
if ZEROS.startswith(data):
# all-zero chunk: create a hole in a sparse file
fd.seek(len(data), 1)
else:
fd.write(data)
pos = fd.tell()
fd.truncate(pos)
fd.flush()
self.restore_attrs(path, item, fd=fd.fileno())
elif stat.S_ISFIFO(mode):

View File

@ -8,7 +8,7 @@ cdef extern from "_chunker.c":
ctypedef int uint32_t
ctypedef struct _Chunker "Chunker":
pass
_Chunker *chunker_init(int window_size, int chunk_mask, int min_size, uint32_t seed)
_Chunker *chunker_init(int window_size, int chunk_mask, int min_size, int max_size, uint32_t seed)
void chunker_set_fd(_Chunker *chunker, object fd)
void chunker_free(_Chunker *chunker)
object chunker_process(_Chunker *chunker)
@ -20,8 +20,8 @@ cdef extern from "_chunker.c":
cdef class Chunker:
cdef _Chunker *chunker
def __cinit__(self, window_size, chunk_mask, min_size, seed):
self.chunker = chunker_init(window_size, chunk_mask, min_size, seed & 0xffffffff)
def __cinit__(self, window_size, chunk_mask, min_size, max_size, seed):
self.chunker = chunker_init(window_size, chunk_mask, min_size, max_size, seed & 0xffffffff)
def chunkify(self, fd):
chunker_set_fd(self.chunker, fd)
@ -52,4 +52,4 @@ def buzhash_update(uint32_t sum, unsigned char remove, unsigned char add, size_t
table = buzhash_init_table(seed & 0xffffffff)
sum = c_buzhash_update(sum, remove, add, len, table)
free(table)
return sum
return sum

View File

@ -11,7 +11,7 @@ import time
import unittest
from hashlib import sha256
from attic import xattr
from attic.archive import Archive, ChunkBuffer
from attic.archive import Archive, ChunkBuffer, CHUNK_MAX
from attic.archiver import Archiver
from attic.cache import Cache
from attic.crypto import bytes_to_long, num_aes_blocks
@ -197,6 +197,38 @@ class ArchiverTestCase(ArchiverTestCaseBase):
config.write(fd)
return Repository(self.repository_path).id
def test_sparse_file(self):
# Round-trip a sparse file through create/extract and verify that its content and
# size are preserved and, where st_blocks is available, that both the input and the
# extracted output occupy far less disk space than their logical size.
filename = os.path.join(self.input_path, 'sparse')
content = b'foobar'
hole_size = 5 * CHUNK_MAX # 5 full chunker buffers
with open(filename, 'wb') as fd:
# create a file that has a hole at the beginning and end
# (seek with whence=1 moves relative to the current position without writing data)
fd.seek(hole_size, 1)
fd.write(content)
fd.seek(hole_size, 1)
# seeking past the end does not by itself change the file size; truncating to the
# current position makes the trailing hole part of the file
pos = fd.tell()
fd.truncate(pos)
total_len = hole_size + len(content) + hole_size
st = os.stat(filename)
self.assert_equal(st.st_size, total_len)
# st_blocks is not available on every platform; where it is, it counts 512-byte
# blocks, so the check requires allocated space below a tenth of the logical size
if hasattr(st, 'st_blocks'):
self.assert_true(st.st_blocks * 512 < total_len / 10) # is input sparse?
# NOTE(review): self.attic is defined outside this hunk - presumably it runs the attic
# command with the given arguments (init repository, create archive 'test', extract it)
self.attic('init', self.repository_location)
self.attic('create', self.repository_location + '::test', 'input')
with changedir('output'):
self.attic('extract', self.repository_location + '::test')
self.assert_dirs_equal('input', 'output/input')
filename = os.path.join(self.output_path, 'input', 'sparse')
with open(filename, 'rb') as fd:
# check if file contents are as expected
# (leading hole reads back as zeros, then the payload, then the trailing hole)
self.assert_equal(fd.read(hole_size), b'\0' * hole_size)
self.assert_equal(fd.read(len(content)), content)
self.assert_equal(fd.read(hole_size), b'\0' * hole_size)
st = os.stat(filename)
self.assert_equal(st.st_size, total_len)
# same sparseness check as for the input file, now applied to the extracted file
if hasattr(st, 'st_blocks'):
self.assert_true(st.st_blocks * 512 < total_len / 10) # is output sparse?
def test_repository_swap_detection(self):
self.create_test_files()
os.environ['ATTIC_PASSPHRASE'] = 'passphrase'

View File

@ -1,25 +1,26 @@
from attic.chunker import Chunker, buzhash, buzhash_update
from attic.testsuite import AtticTestCase
from attic.archive import CHUNK_MAX
from io import BytesIO
class ChunkerTestCase(AtticTestCase):
def test_chunkify(self):
data = b'0' * 1024 * 1024 * 15 + b'Y'
parts = [bytes(c) for c in Chunker(2, 0x3, 2, 0).chunkify(BytesIO(data))]
data = b'0' * int(1.5 * CHUNK_MAX) + b'Y'
parts = [bytes(c) for c in Chunker(2, 0x3, 2, CHUNK_MAX, 0).chunkify(BytesIO(data))]
self.assert_equal(len(parts), 2)
self.assert_equal(b''.join(parts), data)
self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, 0).chunkify(BytesIO(b''))], [])
self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, 0).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fooba', b'rboobaz', b'fooba', b'rboobaz', b'fooba', b'rboobaz'])
self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, 1).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fo', b'obarb', b'oob', b'azf', b'oobarb', b'oob', b'azf', b'oobarb', b'oobaz'])
self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'ar', b'boobazfoob', b'ar', b'boobazfoob', b'ar', b'boobaz'])
self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 3, 0).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 3, 1).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobar', b'boo', b'bazfo', b'obar', b'boo', b'bazfo', b'obar', b'boobaz'])
self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 3, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foo', b'barboobaz', b'foo', b'barboobaz', b'foo', b'barboobaz'])
self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 4, 0).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 4, 1).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobar', b'boobazfo', b'obar', b'boobazfo', b'obar', b'boobaz'])
self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 4, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'arboobaz', b'foob', b'arboobaz', b'foob', b'arboobaz'])
self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, CHUNK_MAX, 0).chunkify(BytesIO(b''))], [])
self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, CHUNK_MAX, 0).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fooba', b'rboobaz', b'fooba', b'rboobaz', b'fooba', b'rboobaz'])
self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, CHUNK_MAX, 1).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fo', b'obarb', b'oob', b'azf', b'oobarb', b'oob', b'azf', b'oobarb', b'oobaz'])
self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, CHUNK_MAX, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'ar', b'boobazfoob', b'ar', b'boobazfoob', b'ar', b'boobaz'])
self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 3, CHUNK_MAX, 0).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 3, CHUNK_MAX, 1).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobar', b'boo', b'bazfo', b'obar', b'boo', b'bazfo', b'obar', b'boobaz'])
self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 3, CHUNK_MAX, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foo', b'barboobaz', b'foo', b'barboobaz', b'foo', b'barboobaz'])
self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 4, CHUNK_MAX, 0).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 4, CHUNK_MAX, 1).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobar', b'boobazfo', b'obar', b'boobazfo', b'obar', b'boobaz'])
self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 4, CHUNK_MAX, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'arboobaz', b'foob', b'arboobaz', b'foob', b'arboobaz'])
def test_buzhash(self):
self.assert_equal(buzhash(b'abcdefghijklmnop', 0), 3795437769)