From a2bf2aea22c08301e55eafe06474f2c4b7712608 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Wed, 15 Apr 2015 16:29:18 +0200 Subject: [PATCH] simple sparse file support, made chunk buffer size flexible Implemented sparse file support to remove this blocker for people backing up lots of huge sparse files (like VM images). Until now, Attic could not support this use case, as it would have restored all files to their fully expanded size, possibly running out of disk space if the total expanded size were bigger than the available space. Please note that this is a very simple implementation of sparse file support - at backup time, it does not do anything special (it just reads all these zero bytes and chunks, compresses and encrypts them as usual). At restore time, it detects chunks that are completely filled with zeros and does a seek on the output file rather than a normal data write, so it creates a hole in a sparse file. The chunk size for these all-zero chunks is currently 10MiB, so it'll create holes of multiples of that size (the exact hole size also depends a bit on filesystem block size, alignment, and previously written data). Special cases like sparse files starting and/or ending with a hole are supported. Please note that it will currently always create sparse files at restore time if it detects all-zero chunks. Also improved: I needed a constant for the max. chunk size, so I introduced CHUNK_MAX (see also the existing CHUNK_MIN) for the maximum chunk size (which is the same as the chunk buffer size). Attic still always uses a 10MiB chunk buffer size, but this can now be changed more easily. 
--- attic/_chunker.c | 4 ++-- attic/archive.py | 15 ++++++++++++--- attic/chunker.pyx | 8 ++++---- attic/testsuite/archiver.py | 34 +++++++++++++++++++++++++++++++++- attic/testsuite/chunker.py | 25 +++++++++++++------------ 5 files changed, 64 insertions(+), 22 deletions(-) diff --git a/attic/_chunker.c b/attic/_chunker.c index 94d4e47ae..f384a56b6 100644 --- a/attic/_chunker.c +++ b/attic/_chunker.c @@ -85,14 +85,14 @@ typedef struct { } Chunker; static Chunker * -chunker_init(int window_size, int chunk_mask, int min_size, uint32_t seed) +chunker_init(int window_size, int chunk_mask, int min_size, int max_size, uint32_t seed) { Chunker *c = calloc(sizeof(Chunker), 1); c->window_size = window_size; c->chunk_mask = chunk_mask; c->min_size = min_size; c->table = buzhash_init_table(seed); - c->buf_size = 10 * 1024 * 1024; + c->buf_size = max_size; c->data = malloc(c->buf_size); return c; } diff --git a/attic/archive.py b/attic/archive.py index d78a7fdb3..b637d7f1e 100644 --- a/attic/archive.py +++ b/attic/archive.py @@ -22,9 +22,12 @@ from attic.helpers import Error, uid2user, user2uid, gid2group, group2gid, \ ITEMS_BUFFER = 1024 * 1024 CHUNK_MIN = 1024 +CHUNK_MAX = 10 * 1024 * 1024 WINDOW_SIZE = 0xfff CHUNK_MASK = 0xffff +ZEROS = b'\0' * CHUNK_MAX + utime_supports_fd = os.utime in getattr(os, 'supports_fd', {}) utime_supports_follow_symlinks = os.utime in getattr(os, 'supports_follow_symlinks', {}) has_mtime_ns = sys.version >= '3.3' @@ -71,7 +74,7 @@ class ChunkBuffer: self.packer = msgpack.Packer(unicode_errors='surrogateescape') self.chunks = [] self.key = key - self.chunker = Chunker(WINDOW_SIZE, CHUNK_MASK, CHUNK_MIN, self.key.chunk_seed) + self.chunker = Chunker(WINDOW_SIZE, CHUNK_MASK, CHUNK_MIN, CHUNK_MAX,self.key.chunk_seed) def add(self, item): self.buffer.write(self.packer.pack(StableDict(item))) @@ -134,7 +137,7 @@ class Archive: self.pipeline = DownloadPipeline(self.repository, self.key) if create: self.items_buffer = CacheChunkBuffer(self.cache, 
self.key, self.stats) - self.chunker = Chunker(WINDOW_SIZE, CHUNK_MASK, CHUNK_MIN, self.key.chunk_seed) + self.chunker = Chunker(WINDOW_SIZE, CHUNK_MASK, CHUNK_MIN, CHUNK_MAX, self.key.chunk_seed) if name in manifest.archives: raise self.AlreadyExists(name) self.last_checkpoint = time.time() @@ -269,7 +272,13 @@ class Archive: with open(path, 'wb') as fd: ids = [c[0] for c in item[b'chunks']] for data in self.pipeline.fetch_many(ids, is_preloaded=True): - fd.write(data) + if ZEROS.startswith(data): + # all-zero chunk: create a hole in a sparse file + fd.seek(len(data), 1) + else: + fd.write(data) + pos = fd.tell() + fd.truncate(pos) fd.flush() self.restore_attrs(path, item, fd=fd.fileno()) elif stat.S_ISFIFO(mode): diff --git a/attic/chunker.pyx b/attic/chunker.pyx index 44ec31fc7..10a6adae3 100644 --- a/attic/chunker.pyx +++ b/attic/chunker.pyx @@ -8,7 +8,7 @@ cdef extern from "_chunker.c": ctypedef int uint32_t ctypedef struct _Chunker "Chunker": pass - _Chunker *chunker_init(int window_size, int chunk_mask, int min_size, uint32_t seed) + _Chunker *chunker_init(int window_size, int chunk_mask, int min_size, int max_size, uint32_t seed) void chunker_set_fd(_Chunker *chunker, object fd) void chunker_free(_Chunker *chunker) object chunker_process(_Chunker *chunker) @@ -20,8 +20,8 @@ cdef extern from "_chunker.c": cdef class Chunker: cdef _Chunker *chunker - def __cinit__(self, window_size, chunk_mask, min_size, seed): - self.chunker = chunker_init(window_size, chunk_mask, min_size, seed & 0xffffffff) + def __cinit__(self, window_size, chunk_mask, min_size, max_size, seed): + self.chunker = chunker_init(window_size, chunk_mask, min_size, max_size, seed & 0xffffffff) def chunkify(self, fd): chunker_set_fd(self.chunker, fd) @@ -52,4 +52,4 @@ def buzhash_update(uint32_t sum, unsigned char remove, unsigned char add, size_t table = buzhash_init_table(seed & 0xffffffff) sum = c_buzhash_update(sum, remove, add, len, table) free(table) - return sum \ No newline at end of 
file + return sum diff --git a/attic/testsuite/archiver.py b/attic/testsuite/archiver.py index c115b460f..b9743fd58 100644 --- a/attic/testsuite/archiver.py +++ b/attic/testsuite/archiver.py @@ -11,7 +11,7 @@ import time import unittest from hashlib import sha256 from attic import xattr -from attic.archive import Archive, ChunkBuffer +from attic.archive import Archive, ChunkBuffer, CHUNK_MAX from attic.archiver import Archiver from attic.cache import Cache from attic.crypto import bytes_to_long, num_aes_blocks @@ -197,6 +197,38 @@ class ArchiverTestCase(ArchiverTestCaseBase): config.write(fd) return Repository(self.repository_path).id + def test_sparse_file(self): + filename = os.path.join(self.input_path, 'sparse') + content = b'foobar' + hole_size = 5 * CHUNK_MAX # 5 full chunker buffers + with open(filename, 'wb') as fd: + # create a file that has a hole at the beginning and end + fd.seek(hole_size, 1) + fd.write(content) + fd.seek(hole_size, 1) + pos = fd.tell() + fd.truncate(pos) + total_len = hole_size + len(content) + hole_size + st = os.stat(filename) + self.assert_equal(st.st_size, total_len) + if hasattr(st, 'st_blocks'): + self.assert_true(st.st_blocks * 512 < total_len / 10) # is input sparse? + self.attic('init', self.repository_location) + self.attic('create', self.repository_location + '::test', 'input') + with changedir('output'): + self.attic('extract', self.repository_location + '::test') + self.assert_dirs_equal('input', 'output/input') + filename = os.path.join(self.output_path, 'input', 'sparse') + with open(filename, 'rb') as fd: + # check if file contents are as expected + self.assert_equal(fd.read(hole_size), b'\0' * hole_size) + self.assert_equal(fd.read(len(content)), content) + self.assert_equal(fd.read(hole_size), b'\0' * hole_size) + st = os.stat(filename) + self.assert_equal(st.st_size, total_len) + if hasattr(st, 'st_blocks'): + self.assert_true(st.st_blocks * 512 < total_len / 10) # is output sparse? 
+ def test_repository_swap_detection(self): self.create_test_files() os.environ['ATTIC_PASSPHRASE'] = 'passphrase' diff --git a/attic/testsuite/chunker.py b/attic/testsuite/chunker.py index 2e666265a..90c4a8c50 100644 --- a/attic/testsuite/chunker.py +++ b/attic/testsuite/chunker.py @@ -1,25 +1,26 @@ from attic.chunker import Chunker, buzhash, buzhash_update from attic.testsuite import AtticTestCase +from attic.archive import CHUNK_MAX from io import BytesIO class ChunkerTestCase(AtticTestCase): def test_chunkify(self): - data = b'0' * 1024 * 1024 * 15 + b'Y' - parts = [bytes(c) for c in Chunker(2, 0x3, 2, 0).chunkify(BytesIO(data))] + data = b'0' * int(1.5 * CHUNK_MAX) + b'Y' + parts = [bytes(c) for c in Chunker(2, 0x3, 2, CHUNK_MAX, 0).chunkify(BytesIO(data))] self.assert_equal(len(parts), 2) self.assert_equal(b''.join(parts), data) - self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, 0).chunkify(BytesIO(b''))], []) - self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, 0).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fooba', b'rboobaz', b'fooba', b'rboobaz', b'fooba', b'rboobaz']) - self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, 1).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fo', b'obarb', b'oob', b'azf', b'oobarb', b'oob', b'azf', b'oobarb', b'oobaz']) - self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'ar', b'boobazfoob', b'ar', b'boobazfoob', b'ar', b'boobaz']) - self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 3, 0).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3]) - self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 3, 1).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobar', b'boo', b'bazfo', b'obar', b'boo', b'bazfo', b'obar', b'boobaz']) - self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 3, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foo', b'barboobaz', b'foo', b'barboobaz', b'foo', b'barboobaz']) - self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 4, 
0).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3]) - self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 4, 1).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobar', b'boobazfo', b'obar', b'boobazfo', b'obar', b'boobaz']) - self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 4, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'arboobaz', b'foob', b'arboobaz', b'foob', b'arboobaz']) + self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, CHUNK_MAX, 0).chunkify(BytesIO(b''))], []) + self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, CHUNK_MAX, 0).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fooba', b'rboobaz', b'fooba', b'rboobaz', b'fooba', b'rboobaz']) + self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, CHUNK_MAX, 1).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fo', b'obarb', b'oob', b'azf', b'oobarb', b'oob', b'azf', b'oobarb', b'oobaz']) + self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, CHUNK_MAX, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'ar', b'boobazfoob', b'ar', b'boobazfoob', b'ar', b'boobaz']) + self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 3, CHUNK_MAX, 0).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3]) + self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 3, CHUNK_MAX, 1).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobar', b'boo', b'bazfo', b'obar', b'boo', b'bazfo', b'obar', b'boobaz']) + self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 3, CHUNK_MAX, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foo', b'barboobaz', b'foo', b'barboobaz', b'foo', b'barboobaz']) + self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 4, CHUNK_MAX, 0).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3]) + self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 4, CHUNK_MAX, 1).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobar', b'boobazfo', b'obar', b'boobazfo', b'obar', b'boobaz']) + self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 4, CHUNK_MAX, 
2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'arboobaz', b'foob', b'arboobaz', b'foob', b'arboobaz']) def test_buzhash(self): self.assert_equal(buzhash(b'abcdefghijklmnop', 0), 3795437769)