From 723c636f06435837a160f726c1c13da4b788d1f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonas=20Borgstr=C3=B6m?= Date: Sat, 6 Mar 2010 18:25:35 +0100 Subject: [PATCH] Various code refactoring. --- dedupestore/__init__.py | 1 + dedupestore/archiver.py | 119 +++------------------------------------- dedupestore/cache.py | 112 +++++++++++++++++++++++++++++++++++++ dedupestore/store.py | 14 +++-- 4 files changed, 128 insertions(+), 118 deletions(-) create mode 100644 dedupestore/__init__.py create mode 100644 dedupestore/cache.py diff --git a/dedupestore/__init__.py b/dedupestore/__init__.py new file mode 100644 index 000000000..ae23f852c --- /dev/null +++ b/dedupestore/__init__.py @@ -0,0 +1 @@ +# This is a python package \ No newline at end of file diff --git a/dedupestore/archiver.py b/dedupestore/archiver.py index a88867c69..e7e39f4de 100644 --- a/dedupestore/archiver.py +++ b/dedupestore/archiver.py @@ -2,118 +2,12 @@ import os import sys import hashlib import zlib -import struct import cPickle from optparse import OptionParser -from chunkifier import chunkify, checksum -from store import Store - - -CHUNKSIZE = 64 * 1024 -NS_ARCHIVES = 'ARCHIVES' -NS_CHUNKS = 'CHUNKS' - -class Cache(object): - """Client Side cache - """ - def __init__(self, store): - self.store = store - self.path = os.path.join(os.path.expanduser('~'), '.dedupestore', 'cache', - '%s.cache' % self.store.uuid) - self.tid = -1 - self.open() - if self.tid != self.store.tid: - self.init() - - def open(self): - if not os.path.exists(self.path): - return - print 'Loading cache: ', self.path, '...' 
- data = cPickle.loads(zlib.decompress(open(self.path, 'rb').read())) - if data['uuid'] != self.store.uuid: - print >> sys.stderr, 'Cache UUID mismatch' - return - self.chunkmap = data['chunkmap'] - self.summap = data['summap'] - self.archives = data['archives'] - self.tid = data['tid'] - print 'done' - - def init(self): - """Initializes cache by fetching and reading all archive indicies - """ - self.summap = {} - self.chunkmap = {} - self.archives = [] - self.tid = self.store.tid - if self.store.tid == 0: - return - print 'Recreating cache...' - for id in self.store.list(NS_ARCHIVES): - archive = cPickle.loads(zlib.decompress(self.store.get(NS_ARCHIVES, id))) - self.archives.append(archive['name']) - for item in archive['items']: - if item['type'] == 'FILE': - for c in item['chunks']: - self.chunk_incref(c) - print 'done' - - def save(self): - assert self.store.state == Store.OPEN - print 'saving cache' - data = {'uuid': self.store.uuid, - 'chunkmap': self.chunkmap, 'summap': self.summap, - 'tid': self.store.tid, 'archives': self.archives} - print 'Saving cache as:', self.path - cachedir = os.path.dirname(self.path) - if not os.path.exists(cachedir): - os.makedirs(cachedir) - with open(self.path, 'wb') as fd: - fd.write(zlib.compress(cPickle.dumps(data))) - print 'done' - - def add_chunk(self, data): - sum = checksum(data) - data = zlib.compress(data) - #print 'chunk %d: %d' % (len(data), sum) - id = struct.pack('I', sum) + hashlib.sha1(data).digest() - if not self.seen_chunk(id): - size = len(data) - self.store.put(NS_CHUNKS, id, data) - else: - size = 0 - #print 'seen chunk', hash.encode('hex') - self.chunk_incref(id) - return id, size - - def seen_chunk(self, hash): - return self.chunkmap.get(hash, 0) > 0 - - def chunk_incref(self, id): - sum = struct.unpack('I', id[:4])[0] - self.chunkmap.setdefault(id, 0) - self.summap.setdefault(sum, 0) - self.chunkmap[id] += 1 - self.summap[sum] += 1 - - def chunk_decref(self, id): - sum = struct.unpack('I', id[:4])[0] - 
sumcount = self.summap[sum] - 1 - count = self.chunkmap[id] - 1 - assert sumcount >= 0 - assert count >= 0 - if sumcount: - self.summap[sum] = sumcount - else: - del self.summap[sum] - if count: - self.chunkmap[id] = count - else: - del self.chunkmap[id] - print 'deleting chunk: ', id.encode('hex') - self.store.delete(NS_CHUNKS, id) - return count +from chunkifier import chunkify +from cache import Cache +from store import Store, NS_ARCHIVES, NS_CHUNKS, CHUNK_SIZE class Archiver(object): @@ -135,7 +29,7 @@ class Archiver(object): name = os.path.join(root, f) items.append(self.process_file(name, self.cache)) archive = {'name': archive_name, 'items': items} - hash = self.store.put(NS_ARCHIVES, archive_name, zlib.compress(cPickle.dumps(archive))) + self.store.put(NS_ARCHIVES, archive_name, zlib.compress(cPickle.dumps(archive))) self.store.commit() self.cache.archives.append(archive_name) self.cache.save() @@ -214,7 +108,7 @@ class Archiver(object): origsize = 0 compsize = 0 chunks = [] - for chunk in chunkify(fd, CHUNKSIZE, self.cache.summap): + for chunk in chunkify(fd, CHUNK_SIZE, self.cache.summap): origsize += len(chunk) id, size = cache.add_chunk(chunk) compsize += size @@ -260,9 +154,10 @@ class Archiver(object): else: self.create_archive(options.create_archive, args) + def main(): archiver = Archiver() archiver.run() if __name__ == '__main__': - main() \ No newline at end of file + main() diff --git a/dedupestore/cache.py b/dedupestore/cache.py new file mode 100644 index 000000000..99d274341 --- /dev/null +++ b/dedupestore/cache.py @@ -0,0 +1,112 @@ +import cPickle +import hashlib +import os +import sys +import struct +import zlib + +from chunkifier import checksum +from store import Store, NS_ARCHIVES, NS_CHUNKS + + +class Cache(object): + """Client Side cache + """ + + def __init__(self, store): + self.store = store + self.path = os.path.join(os.path.expanduser('~'), '.dedupestore', 'cache', + '%s.cache' % self.store.uuid) + self.tid = -1 + self.open() + if 
self.tid != self.store.tid: + self.init() + + def open(self): + if not os.path.exists(self.path): + return + print 'Loading cache: ', self.path, '...' + data = cPickle.loads(zlib.decompress(open(self.path, 'rb').read())) + if data['uuid'] != self.store.uuid: + print >> sys.stderr, 'Cache UUID mismatch' + return + self.chunkmap = data['chunkmap'] + self.summap = data['summap'] + self.archives = data['archives'] + self.tid = data['tid'] + print 'done' + + def init(self): + """Initializes cache by fetching and reading all archive indices + """ + self.summap = {} + self.chunkmap = {} + self.archives = [] + self.tid = self.store.tid + if self.store.tid == 0: + return + print 'Recreating cache...' + for id in self.store.list(NS_ARCHIVES): + archive = cPickle.loads(zlib.decompress(self.store.get(NS_ARCHIVES, id))) + self.archives.append(archive['name']) + for item in archive['items']: + if item['type'] == 'FILE': + for c in item['chunks']: + self.chunk_incref(c) + print 'done' + + def save(self): + assert self.store.state == Store.OPEN + print 'saving cache' + data = {'uuid': self.store.uuid, + 'chunkmap': self.chunkmap, 'summap': self.summap, + 'tid': self.store.tid, 'archives': self.archives} + print 'Saving cache as:', self.path + cachedir = os.path.dirname(self.path) + if not os.path.exists(cachedir): + os.makedirs(cachedir) + with open(self.path, 'wb') as fd: + fd.write(zlib.compress(cPickle.dumps(data))) + print 'done' + + def add_chunk(self, data): + sum = checksum(data) + data = zlib.compress(data) + #print 'chunk %d: %d' % (len(data), sum) + id = struct.pack('I', sum) + hashlib.sha1(data).digest() + if not self.seen_chunk(id): + size = len(data) + self.store.put(NS_CHUNKS, id, data) + else: + size = 0 + #print 'seen chunk', hash.encode('hex') + self.chunk_incref(id) + return id, size + + def seen_chunk(self, hash): + return self.chunkmap.get(hash, 0) > 0 + + def chunk_incref(self, id): + sum = struct.unpack('I', id[:4])[0] + self.chunkmap.setdefault(id, 0) + 
self.summap.setdefault(sum, 0) + self.chunkmap[id] += 1 + self.summap[sum] += 1 + + def chunk_decref(self, id): + sum = struct.unpack('I', id[:4])[0] + sumcount = self.summap[sum] - 1 + count = self.chunkmap[id] - 1 + assert sumcount >= 0 + assert count >= 0 + if sumcount: + self.summap[sum] = sumcount + else: + del self.summap[sum] + if count: + self.chunkmap[id] = count + else: + del self.chunkmap[id] + print 'deleting chunk: ', id.encode('hex') + self.store.delete(NS_CHUNKS, id) + return count diff --git a/dedupestore/store.py b/dedupestore/store.py index 0573953ce..b9ccc001d 100644 --- a/dedupestore/store.py +++ b/dedupestore/store.py @@ -1,16 +1,21 @@ #!/usr/bin/env python import os import fcntl -import hashlib import tempfile import shutil import unittest import uuid +CHUNK_SIZE = 256 * 1024 +NS_ARCHIVES = 'ARCHIVES' +NS_CHUNKS = 'CHUNKS' + + class Store(object): """ """ + class DoesNotExist(KeyError): """""" @@ -173,18 +178,15 @@ class Store(object): raise Store.DoesNotExist('Object does not exist: %s' % hash.encode('hex')) def list(self, ns, prefix='', marker=None, max_keys=1000000): - for x in self.foo(os.path.join(self.path, 'data', ns.encode('hex')), + for x in self._walker(os.path.join(self.path, 'data', ns.encode('hex')), prefix, marker, '', max_keys): yield x - - def foo(self, path, prefix, marker, base, max_keys): + def _walker(self, path, prefix, marker, base, max_keys): n = 0 for name in sorted(os.listdir(path)): if n >= max_keys: return - dirs = [] - names = [] id = name.decode('hex') if os.path.isdir(os.path.join(path, name)): if prefix and not id.startswith(prefix[:len(id)]):