From cf7d12ba5050f2a22bf87838401603f134b3eb22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonas=20Borgstr=C3=B6m?= Date: Wed, 20 Oct 2010 20:28:29 +0200 Subject: [PATCH] Initial cache redesign --- dedupestore/archive.py | 37 +++++++++++++++++++------------------ dedupestore/cache.py | 24 +++++++++--------------- 2 files changed, 28 insertions(+), 33 deletions(-) diff --git a/dedupestore/archive.py b/dedupestore/archive.py index 64b220025..083530f3a 100644 --- a/dedupestore/archive.py +++ b/dedupestore/archive.py @@ -61,18 +61,12 @@ def stats(self, cache): total_osize = 0 total_csize = 0 total_usize = 0 - chunk_count = {} for item in self.items: if item['type'] == 'FILE': total_osize += item['size'] - for idx in item['chunks']: - id = self.chunk_idx[idx] - chunk_count.setdefault(id, 0) - chunk_count[id] += 1 - for id, c in chunk_count.items(): - count, size = cache.chunkmap[id] + for id, size in self.chunks: total_csize += size - if c == count: + if self.cache.seen_chunk(id) == 1: total_usize += size return dict(osize=total_osize, csize=total_csize, usize=total_usize) @@ -124,7 +118,6 @@ def verify(self): for chunk in item['chunks']: id = self.chunk_idx[chunk] data = self.store.get(NS_CHUNKS, id) - data = self.store.get(NS_CHUNKS, id) cid = data[:32] data = data[32:] if (hashlib.sha256(data).digest() != cid): @@ -135,20 +128,17 @@ def verify(self): def delete(self, cache): self.store.delete(NS_ARCHIVES, self.cache.archives[self.name]) - for item in self.items: - if item['type'] == 'FILE': - for c in item['chunks']: - id = self.chunk_idx[c] - cache.chunk_decref(id) + for id, size in self.chunks: + cache.chunk_decref(id) self.store.commit() del cache.archives[self.name] cache.save() - def walk(self, path): + def _walk(self, path): st = os.lstat(path) if stat.S_ISDIR(st.st_mode): for f in os.listdir(path): - for x in self.walk(os.path.join(path, f)): + for x in self._walk(os.path.join(path, f)): yield x else: yield path, st @@ -157,7 +147,7 @@ def create(self, name, paths, cache): if name in cache.archives: raise NameError('Archive already exists') for path in paths: - for path, st in self.walk(unicode(path)): + for path, st in self._walk(unicode(path)): if stat.S_ISDIR(st.st_mode): self.process_dir(path, st) elif stat.S_ISLNK(st.st_mode): @@ -193,8 +183,8 @@ def process_file(self, path, st): chunks = [] size = 0 for chunk in chunkify(fd, CHUNK_SIZE, 30): + chunks.append(self.process_chunk(chunk)) size += len(chunk) - chunks.append(self.add_chunk(*self.cache.add_chunk(chunk))) self.items.append({ 'type': 'FILE', 'path': path, 'chunks': chunks, 'size': size, 'mode': st.st_mode, @@ -203,5 +193,16 @@ def process_file(self, path, st): 'ctime': st.st_ctime, 'mtime': st.st_mtime, }) + def process_chunk(self, data): + id = hashlib.sha256(data).digest() + try: + return self.chunk_idx[id] + except KeyError: + idx = len(self.chunks) + size = self.cache.add_chunk(id, data) + self.chunks.append((id, size)) + self.chunk_idx[idx] = id + return idx + diff --git a/dedupestore/cache.py b/dedupestore/cache.py index e88e0fda4..6da6781c8 100644 --- a/dedupestore/cache.py +++ b/dedupestore/cache.py @@ -49,18 +49,14 @@ def init(self): raise Exception('Archive hash did not match') archive = msgpack.unpackb(zlib.decompress(data)) self.archives[archive['name']] = id - for item in archive['items']: - if item['type'] != 'FILE': - continue - for idx in item['chunks']: - id, size = archive['chunks'][idx] - if self.seen_chunk(id): - self.chunk_incref(id) - else: - self.init_chunk(id, size) + for id, size in archive['chunks']: + try: + count, size = self.chunkmap[id] + self.chunkmap[id] = count + 1, size + except KeyError: + self.chunkmap[id] = 1, size self.save() - def save(self): assert self.store.state == self.store.OPEN data = {'uuid': self.store.uuid, @@ -74,16 +70,14 @@ def save(self): id = hashlib.sha256(data).digest() fd.write(id + data) - def add_chunk(self, data): - id = hashlib.sha256(data).digest() + def add_chunk(self, id, data): if self.seen_chunk(id): return self.chunk_incref(id) - osize = len(data) data = zlib.compress(data) data = hashlib.sha256(data).digest() + data csize = len(data) self.store.put(NS_CHUNKS, id, data) - return self.init_chunk(id, csize) + return self.init_chunk(id, csize)[1] def init_chunk(self, id, size): self.chunkmap[id] = (1, size) @@ -96,7 +90,7 @@ def seen_chunk(self, id): def chunk_incref(self, id): count, size = self.chunkmap[id] self.chunkmap[id] = (count + 1, size) - return id, size + return size def chunk_decref(self, id): count, size = self.chunkmap[id]