PR #284 - Merge branch 'sparse_files' into merge

This commit is contained in:
Thomas Waldmann 2015-04-15 16:43:07 +02:00
commit b6ed1c742b
5 changed files with 63 additions and 21 deletions

View File

@ -87,14 +87,14 @@ typedef struct {
} Chunker;
static Chunker *
chunker_init(int window_size, int chunk_mask, int min_size, uint32_t seed)
chunker_init(int window_size, int chunk_mask, int min_size, int max_size, uint32_t seed)
{
Chunker *c = calloc(sizeof(Chunker), 1);
c->window_size = window_size;
c->chunk_mask = chunk_mask;
c->min_size = min_size;
c->table = buzhash_init_table(seed);
c->buf_size = 10 * 1024 * 1024;
c->buf_size = max_size;
c->data = malloc(c->buf_size);
c->read_buf = malloc(c->buf_size);
return c;

View File

@ -22,9 +22,12 @@ from attic.helpers import Error, uid2user, user2uid, gid2group, group2gid, \
ITEMS_BUFFER = 1024 * 1024
CHUNK_MIN = 1024
CHUNK_MAX = 10 * 1024 * 1024
WINDOW_SIZE = 0xfff
CHUNK_MASK = 0xffff
ZEROS = b'\0' * CHUNK_MAX
utime_supports_fd = os.utime in getattr(os, 'supports_fd', {})
utime_supports_follow_symlinks = os.utime in getattr(os, 'supports_follow_symlinks', {})
has_mtime_ns = sys.version >= '3.3'
@ -71,7 +74,7 @@ class ChunkBuffer:
self.packer = msgpack.Packer(unicode_errors='surrogateescape')
self.chunks = []
self.key = key
self.chunker = Chunker(WINDOW_SIZE, CHUNK_MASK, CHUNK_MIN, self.key.chunk_seed)
self.chunker = Chunker(WINDOW_SIZE, CHUNK_MASK, CHUNK_MIN, CHUNK_MAX,self.key.chunk_seed)
def add(self, item):
self.buffer.write(self.packer.pack(StableDict(item)))
@ -136,7 +139,7 @@ class Archive:
self.pipeline = DownloadPipeline(self.repository, self.key)
if create:
self.items_buffer = CacheChunkBuffer(self.cache, self.key, self.stats)
self.chunker = Chunker(WINDOW_SIZE, CHUNK_MASK, CHUNK_MIN, self.key.chunk_seed)
self.chunker = Chunker(WINDOW_SIZE, CHUNK_MASK, CHUNK_MIN, CHUNK_MAX, self.key.chunk_seed)
if name in manifest.archives:
raise self.AlreadyExists(name)
self.last_checkpoint = time.time()
@ -285,7 +288,13 @@ class Archive:
with open(path, 'wb') as fd:
ids = [c[0] for c in item[b'chunks']]
for data in self.pipeline.fetch_many(ids, is_preloaded=True):
fd.write(data)
if ZEROS.startswith(data):
# all-zero chunk: create a hole in a sparse file
fd.seek(len(data), 1)
else:
fd.write(data)
pos = fd.tell()
fd.truncate(pos)
fd.flush()
self.restore_attrs(path, item, fd=fd.fileno())
elif stat.S_ISFIFO(mode):

View File

@ -8,7 +8,7 @@ cdef extern from "_chunker.c":
ctypedef int uint32_t
ctypedef struct _Chunker "Chunker":
pass
_Chunker *chunker_init(int window_size, int chunk_mask, int min_size, uint32_t seed)
_Chunker *chunker_init(int window_size, int chunk_mask, int min_size, int max_size, uint32_t seed)
void chunker_set_fd(_Chunker *chunker, object f, int fd)
void chunker_free(_Chunker *chunker)
object chunker_process(_Chunker *chunker)
@ -20,8 +20,8 @@ cdef extern from "_chunker.c":
cdef class Chunker:
cdef _Chunker *chunker
def __cinit__(self, window_size, chunk_mask, min_size, seed):
self.chunker = chunker_init(window_size, chunk_mask, min_size, seed & 0xffffffff)
def __cinit__(self, window_size, chunk_mask, min_size, max_size, seed):
self.chunker = chunker_init(window_size, chunk_mask, min_size, max_size, seed & 0xffffffff)
def chunkify(self, fd, fh=-1):
"""

View File

@ -11,7 +11,7 @@ import time
import unittest
from hashlib import sha256
from attic import xattr
from attic.archive import Archive, ChunkBuffer
from attic.archive import Archive, ChunkBuffer, CHUNK_MAX
from attic.archiver import Archiver
from attic.cache import Cache
from attic.crypto import bytes_to_long, num_aes_blocks
@ -206,6 +206,38 @@ class ArchiverTestCase(ArchiverTestCaseBase):
config.write(fd)
return Repository(self.repository_path).id
def test_sparse_file(self):
filename = os.path.join(self.input_path, 'sparse')
content = b'foobar'
hole_size = 5 * CHUNK_MAX # 5 full chunker buffers
with open(filename, 'wb') as fd:
# create a file that has a hole at the beginning and end
fd.seek(hole_size, 1)
fd.write(content)
fd.seek(hole_size, 1)
pos = fd.tell()
fd.truncate(pos)
total_len = hole_size + len(content) + hole_size
st = os.stat(filename)
self.assert_equal(st.st_size, total_len)
if hasattr(st, 'st_blocks'):
self.assert_true(st.st_blocks * 512 < total_len / 10) # is input sparse?
self.attic('init', self.repository_location)
self.attic('create', self.repository_location + '::test', 'input')
with changedir('output'):
self.attic('extract', self.repository_location + '::test')
self.assert_dirs_equal('input', 'output/input')
filename = os.path.join(self.output_path, 'input', 'sparse')
with open(filename, 'rb') as fd:
# check if file contents are as expected
self.assert_equal(fd.read(hole_size), b'\0' * hole_size)
self.assert_equal(fd.read(len(content)), content)
self.assert_equal(fd.read(hole_size), b'\0' * hole_size)
st = os.stat(filename)
self.assert_equal(st.st_size, total_len)
if hasattr(st, 'st_blocks'):
self.assert_true(st.st_blocks * 512 < total_len / 10) # is output sparse?
def test_repository_swap_detection(self):
self.create_test_files()
os.environ['ATTIC_PASSPHRASE'] = 'passphrase'

View File

@ -1,25 +1,26 @@
from attic.chunker import Chunker, buzhash, buzhash_update
from attic.testsuite import AtticTestCase
from attic.archive import CHUNK_MAX
from io import BytesIO
class ChunkerTestCase(AtticTestCase):
def test_chunkify(self):
data = b'0' * 1024 * 1024 * 15 + b'Y'
parts = [bytes(c) for c in Chunker(2, 0x3, 2, 0).chunkify(BytesIO(data))]
data = b'0' * int(1.5 * CHUNK_MAX) + b'Y'
parts = [bytes(c) for c in Chunker(2, 0x3, 2, CHUNK_MAX, 0).chunkify(BytesIO(data))]
self.assert_equal(len(parts), 2)
self.assert_equal(b''.join(parts), data)
self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, 0).chunkify(BytesIO(b''))], [])
self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, 0).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fooba', b'rboobaz', b'fooba', b'rboobaz', b'fooba', b'rboobaz'])
self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, 1).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fo', b'obarb', b'oob', b'azf', b'oobarb', b'oob', b'azf', b'oobarb', b'oobaz'])
self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'ar', b'boobazfoob', b'ar', b'boobazfoob', b'ar', b'boobaz'])
self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 3, 0).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 3, 1).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobar', b'boo', b'bazfo', b'obar', b'boo', b'bazfo', b'obar', b'boobaz'])
self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 3, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foo', b'barboobaz', b'foo', b'barboobaz', b'foo', b'barboobaz'])
self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 4, 0).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 4, 1).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobar', b'boobazfo', b'obar', b'boobazfo', b'obar', b'boobaz'])
self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 4, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'arboobaz', b'foob', b'arboobaz', b'foob', b'arboobaz'])
self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, CHUNK_MAX, 0).chunkify(BytesIO(b''))], [])
self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, CHUNK_MAX, 0).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fooba', b'rboobaz', b'fooba', b'rboobaz', b'fooba', b'rboobaz'])
self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, CHUNK_MAX, 1).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fo', b'obarb', b'oob', b'azf', b'oobarb', b'oob', b'azf', b'oobarb', b'oobaz'])
self.assert_equal([bytes(c) for c in Chunker(2, 0x3, 2, CHUNK_MAX, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'ar', b'boobazfoob', b'ar', b'boobazfoob', b'ar', b'boobaz'])
self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 3, CHUNK_MAX, 0).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 3, CHUNK_MAX, 1).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobar', b'boo', b'bazfo', b'obar', b'boo', b'bazfo', b'obar', b'boobaz'])
self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 3, CHUNK_MAX, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foo', b'barboobaz', b'foo', b'barboobaz', b'foo', b'barboobaz'])
self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 4, CHUNK_MAX, 0).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 4, CHUNK_MAX, 1).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobar', b'boobazfo', b'obar', b'boobazfo', b'obar', b'boobaz'])
self.assert_equal([bytes(c) for c in Chunker(3, 0x3, 4, CHUNK_MAX, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'arboobaz', b'foob', b'arboobaz', b'foob', b'arboobaz'])
def test_buzhash(self):
self.assert_equal(buzhash(b'abcdefghijklmnop', 0), 3795437769)