diff --git a/dedupstore/archiver.py b/dedupstore/archiver.py index 2d801c152..9c160ca95 100644 --- a/dedupstore/archiver.py +++ b/dedupstore/archiver.py @@ -6,7 +6,7 @@ import struct import cPickle from optparse import OptionParser -from chunker import chunker, checksum +from chunkifier import chunkify, checksum from store import Store @@ -70,12 +70,12 @@ class Cache(object): def add_chunk(self, data): sum = checksum(data) + data = zlib.compress(data) #print 'chunk %d: %d' % (len(data), sum) - hash = struct.pack('I', sum) + hashlib.sha1(data).digest() - if not self.seen_chunk(hash): - zdata = zlib.compress(data) - size = len(zdata) - self.store.put(NS_CHUNKS, hash, zdata) + id = struct.pack('I', sum) + hashlib.sha1(data).digest() + if not self.seen_chunk(id): + size = len(data) + self.store.put(NS_CHUNKS, id, data) else: size = 0 #print 'seen chunk', hash.encode('hex') @@ -164,7 +164,7 @@ class Archiver(object): print item['path'], '...', for chunk in item['chunks']: data = self.store.get(NS_CHUNKS, chunk) - if hashlib.sha1(data).digest() != chunk: + if hashlib.sha1(data).digest() != chunk[4:]: print 'ERROR' break else: @@ -184,7 +184,10 @@ class Archiver(object): if item['type'] == 'FILE': with open(item['path'], 'wb') as fd: for chunk in item['chunks']: - fd.write(zlib.decompress(self.store.get(NS_CHUNKS, chunk))) + data = self.store.get(NS_CHUNKS, chunk) + if hashlib.sha1(data).digest() != chunk[4:]: + raise Exception('Invalid chunk checksum') + fd.write(zlib.decompress(data)) def process_dir(self, path, cache): path = path.lstrip('/\\:') @@ -198,7 +201,7 @@ class Archiver(object): origsize = 0 compsize = 0 chunks = [] - for chunk in chunker(fd, CHUNKSIZE, self.cache.summap): + for chunk in chunkify(fd, CHUNKSIZE, self.cache.summap): origsize += len(chunk) id, size = cache.add_chunk(chunk) compsize += size diff --git a/dedupstore/chunker.py b/dedupstore/chunkifier.py similarity index 90% rename from dedupstore/chunker.py rename to dedupstore/chunkifier.py index 802f533e4..026481dcd 100644 --- a/dedupstore/chunker.py +++ b/dedupstore/chunkifier.py @@ -28,25 +28,25 @@ def roll_checksum(sum, remove, add, len): return (s1 & 0xffff) + ((s2 & 0xffff) << 16) -def chunker(fd, chunk_size, chunks): +def chunkify(fd, chunk_size, chunks): """ >>> fd = StringIO.StringIO('ABCDEFGHIJKLMN') - >>> list(chunker(fd, 4, {})) + >>> list(chunkify(fd, 4, {})) ['ABCD', 'EFGH', 'IJ', 'KLMN'] >>> fd = StringIO.StringIO('ABCDEFGHIJKLMN') >>> chunks = {44564754: True} # 'BCDE' - >>> list(chunker(fd, 4, chunks)) + >>> list(chunkify(fd, 4, chunks)) ['A', 'BCDE', 'FGHI', 'J', 'KLMN'] >>> fd = StringIO.StringIO('ABCDEFGHIJKLMN') >>> chunks = {44564754: True, 48496938: True} # 'BCDE', 'HIJK' - >>> list(chunker(fd, 4, chunks)) + >>> list(chunkify(fd, 4, chunks)) ['A', 'BCDE', 'FG', 'HIJK', 'LMN'] >>> fd = StringIO.StringIO('ABCDEFGHIJKLMN') >>> chunks = {43909390: True, 50463030: True} # 'ABCD', 'KLMN' - >>> list(chunker(fd, 4, chunks)) + >>> list(chunkify(fd, 4, chunks)) ['ABCD', 'EFGH', 'IJ', 'KLMN'] """ data = 'X' + fd.read(chunk_size * 3) @@ -62,7 +62,7 @@ def chunker(fd, chunk_size, chunks): if len(data) - i <= chunk_size: # EOF? if len(data) > chunk_size + 1: yield data[1:len(data) - chunk_size] - yield data[:chunk_size] + yield data[-chunk_size:] else: yield data[1:] return