diff --git a/dedupstore/archiver.py b/dedupstore/archiver.py index e930fade..2d801c15 100644 --- a/dedupstore/archiver.py +++ b/dedupstore/archiver.py @@ -73,11 +73,14 @@ class Cache(object): #print 'chunk %d: %d' % (len(data), sum) hash = struct.pack('I', sum) + hashlib.sha1(data).digest() if not self.seen_chunk(hash): - self.store.put(NS_CHUNKS, hash, zlib.compress(data)) + zdata = zlib.compress(data) + size = len(zdata) + self.store.put(NS_CHUNKS, hash, zdata) else: - print 'seen chunk', hash.encode('hex') + size = 0 + #print 'seen chunk', hash.encode('hex') self.chunk_incref(hash) - return hash + return hash, size def seen_chunk(self, hash): return self.chunkmap.get(hash, 0) > 0 @@ -189,14 +192,20 @@ class Archiver(object): return {'type': 'DIR', 'path': path} def process_file(self, path, cache): + print 'Adding: %s...' % path, + sys.stdout.flush() with open(path, 'rb') as fd: - size = 0 + origsize = 0 + compsize = 0 chunks = [] for chunk in chunker(fd, CHUNKSIZE, self.cache.summap): - size += len(chunk) - chunks.append(cache.add_chunk(chunk)) + origsize += len(chunk) + id, size = cache.add_chunk(chunk) + compsize += size + chunks.append(id) path = path.lstrip('/\\:') - print 'File: %s (%d chunks)' % (path, len(chunks)) + ratio = origsize and compsize * 100 / origsize or 0 + print '(%d chunks: %d%%)' % (len(chunks), ratio) return {'type': 'FILE', 'path': path, 'size': size, 'chunks': chunks} def run(self): diff --git a/dedupstore/chunker.py b/dedupstore/chunker.py index 4d5c17df..802f533e 100644 --- a/dedupstore/chunker.py +++ b/dedupstore/chunker.py @@ -49,11 +49,11 @@ def chunker(fd, chunk_size, chunks): >>> list(chunker(fd, 4, chunks)) ['ABCD', 'EFGH', 'IJ', 'KLMN'] """ - data = 'X' + fd.read(chunk_size * 2) + data = 'X' + fd.read(chunk_size * 3) i = 1 sum = checksum(data[:chunk_size]) while True: - if len(data) - i - 2 <= chunk_size: + if len(data) - i <= chunk_size * 2: data += fd.read(chunk_size * 2) if i == chunk_size + 1: yield data[1:chunk_size + 1] @@ -62,14 +62,13 @@ def chunker(fd, chunk_size, chunks): if len(data) - i <= chunk_size: # EOF? if len(data) > chunk_size + 1: yield data[1:len(data) - chunk_size] - yield data[-chunk_size:] + yield data[:chunk_size] else: yield data[1:] return sum = roll_checksum(sum, data[i - 1], data[i - 1 + chunk_size], chunk_size) #print data[i:i + chunk_size], sum if chunks.get(sum): - print 'Woot', i if i > 1: yield data[1:i] yield data[i:i + chunk_size]