From 42ff0a850b427bd3c2f1fc7be64460d3cb222445 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonas=20Borgstr=C3=B6m?= Date: Mon, 15 Mar 2010 21:23:34 +0100 Subject: [PATCH] Store more chunk metadata in the archive. --- dedupestore/archiver.py | 98 ++++++++++++++++++++++------------------- dedupestore/cache.py | 61 ++++++++++++------------- 2 files changed, 82 insertions(+), 77 deletions(-) diff --git a/dedupestore/archiver.py b/dedupestore/archiver.py index c2e6c39f9..cf1935ea1 100644 --- a/dedupestore/archiver.py +++ b/dedupestore/archiver.py @@ -15,15 +15,28 @@ class Archive(object): def __init__(self, store, name=None): self.store = store self.items = [] + self.chunks = [] + self.chunk_idx = {} if name: self.open(name) + def add_chunk(self, id, sum, csize, osize): + try: + return self.chunk_idx[id] + except KeyError: + idx = len(self.chunks) + self.chunks.append((id, sum, csize, osize)) + self.chunk_idx[id] = idx + return idx + def open(self, name): archive = cPickle.loads(zlib.decompress(self.store.get(NS_ARCHIVES, name))) self.items = archive['items'] + for i, (id, sum, csize, osize) in enumerate(archive['chunks']): + self.chunk_idx[i] = id def save(self, name): - archive = {'name': name, 'items': self.items} + archive = {'name': name, 'items': self.items, 'chunks': self.chunks} self.store.put(NS_ARCHIVES, name, zlib.compress(cPickle.dumps(archive))) self.store.commit() @@ -39,10 +52,14 @@ class Archive(object): if not os.path.exists(item['path']): os.makedirs(item['path']) if item['type'] == 'FILE': + path = item['path'] + if not os.path.exists(os.path.dirname(path)): + os.makedirs(os.path.dirname(path)) with open(item['path'], 'wb') as fd: for chunk in item['chunks']: - data = self.store.get(NS_CHUNKS, chunk) - if hashlib.sha1(data).digest() != chunk[4:]: + id = self.chunk_idx[chunk] + data = self.store.get(NS_CHUNKS, id) + if hashlib.sha1(data).digest() != id: raise Exception('Invalid chunk checksum') fd.write(zlib.decompress(data)) @@ -50,9 +67,11 @@ class Archive(object): for item in self.items: if item['type'] == 'FILE': print item['path'], '...', + print self.chunk_idx[0].encode('hex') for chunk in item['chunks']: - data = self.store.get(NS_CHUNKS, chunk) - if hashlib.sha1(data).digest() != chunk[4:]: + id = self.chunk_idx[chunk] + data = self.store.get(NS_CHUNKS, id) + if hashlib.sha1(data).digest() != id: print 'ERROR' break else: @@ -68,28 +87,39 @@ class Archive(object): cache.archives.remove(self.name) cache.save() - -class Archiver(object): - - def create_archive(self, archive_name, paths): - try: - self.store.get(NS_ARCHIVES, archive_name) - except Store.DoesNotExist: - pass - else: - raise Exception('Archive "%s" already exists' % archive_name) - archive = Archive(self.store) + def create(self, name, paths, cache): for path in paths: for root, dirs, files in os.walk(path): for d in dirs: - name = os.path.join(root, d) - archive.items.append(self.process_dir(name, self.cache)) + p = os.path.join(root, d) + self.items.append(self.process_dir(p, cache)) for f in files: - name = os.path.join(root, f) - archive.items.append(self.process_file(name, self.cache)) - archive.save(archive_name) - self.cache.archives.append(archive_name) - self.cache.save() + p = os.path.join(root, f) + self.items.append(self.process_file(p, cache)) + self.save(name) + cache.archives.append(name) + cache.save() + + def process_dir(self, path, cache): + path = path.lstrip('/\\:') + print 'Directory: %s' % (path) + return {'type': 'DIR', 'path': path} + + def process_file(self, path, cache): + with open(path, 'rb') as fd: + path = path.lstrip('/\\:') + print 'Adding: %s...' % path + chunks = [] + for chunk in chunkify(fd, CHUNK_SIZE, cache.summap): + chunks.append(self.add_chunk(*cache.add_chunk(chunk))) + return {'type': 'FILE', 'path': path, 'chunks': chunks} + + +class Archiver(object): + + def create_archive(self, name, paths): + archive = Archive(self.store) + archive.create(name, paths, self.cache) def delete_archive(self, archive_name): archive = Archive(self.store, archive_name) @@ -112,28 +142,6 @@ class Archiver(object): archive = Archive(self.store, archive_name) archive.extract() - def process_dir(self, path, cache): - path = path.lstrip('/\\:') - print 'Directory: %s' % (path) - return {'type': 'DIR', 'path': path} - - def process_file(self, path, cache): - with open(path, 'rb') as fd: - path = path.lstrip('/\\:') - print 'Adding: %s...' % path, - sys.stdout.flush() - origsize = 0 - compsize = 0 - chunks = [] - for chunk in chunkify(fd, CHUNK_SIZE, self.cache.summap): - origsize += len(chunk) - id, size = cache.add_chunk(chunk) - compsize += size - chunks.append(id) - ratio = origsize and compsize * 100 / origsize or 0 - print '(%d chunks: %d%%)' % (len(chunks), ratio) - return {'type': 'FILE', 'path': path, 'size': origsize, 'chunks': chunks} - def run(self): parser = OptionParser() parser.add_option("-s", "--store", dest="store", diff --git a/dedupestore/cache.py b/dedupestore/cache.py index 99d274341..00fafb190 100644 --- a/dedupestore/cache.py +++ b/dedupestore/cache.py @@ -2,7 +2,6 @@ import cPickle import hashlib import os import sys -import struct import zlib from chunkifier import checksum @@ -49,10 +48,11 @@ class Cache(object): for id in self.store.list(NS_ARCHIVES): archive = cPickle.loads(zlib.decompress(self.store.get(NS_ARCHIVES, id))) self.archives.append(archive['name']) - for item in archive['items']: - if item['type'] == 'FILE': - for c in item['chunks']: - self.chunk_incref(c) + for id, sum, csize, osize in archive['chunks']: + if self.seen_chunk(id): + self.chunk_incref(id) + else: + self.init_chunk(id, sum, csize, osize) print 'done' def save(self): @@ -71,42 +71,39 @@ class Cache(object): def add_chunk(self, data): sum = checksum(data) + osize = len(data) data = zlib.compress(data) - #print 'chunk %d: %d' % (len(data), sum) - id = struct.pack('I', sum) + hashlib.sha1(data).digest() - if not self.seen_chunk(id): - size = len(data) - self.store.put(NS_CHUNKS, id, data) - else: - size = 0 - #print 'seen chunk', hash.encode('hex') - self.chunk_incref(id) - return id, size + id = hashlib.sha1(data).digest() + if self.seen_chunk(id): + return self.chunk_incref(id) + csize = len(data) + self.store.put(NS_CHUNKS, id, data) + return self.init_chunk(id, sum, csize, osize) - def seen_chunk(self, hash): - return self.chunkmap.get(hash, 0) > 0 + def init_chunk(self, id, sum, csize, osize): + self.chunkmap[id] = (1, sum, osize, csize) + self.summap.setdefault(sum, 1) + return id, sum, csize, osize + + def seen_chunk(self, id): + return id in self.chunkmap def chunk_incref(self, id): - sum = struct.unpack('I', id[:4])[0] - self.chunkmap.setdefault(id, 0) - self.summap.setdefault(sum, 0) - self.chunkmap[id] += 1 + count, sum, csize, osize = self.chunkmap[id] + self.chunkmap[id] = (count + 1, sum, osize, csize) self.summap[sum] += 1 + return id, sum, csize, osize def chunk_decref(self, id): - sum = struct.unpack('I', id[:4])[0] - sumcount = self.summap[sum] - 1 - count = self.chunkmap[id] - 1 - assert sumcount >= 0 - assert count >= 0 - if sumcount: - self.summap[sum] = sumcount - else: + count, sum, csize, osize = self.chunkmap[id] + sumcount = self.summap[sum] + if sumcount == 1: del self.summap[sum] - if count: - self.chunkmap[id] = count else: + self.summap[sum] = sumcount - 1 + if count == 1: del self.chunkmap[id] print 'deleting chunk: ', id.encode('hex') self.store.delete(NS_CHUNKS, id) - return count + else: + self.chunkmap[id] = (count - 1, sum, csize, osize)