Switched to using avro serialization for archives

Jonas Borgström 2010-10-17 17:44:41 +02:00
parent 5cd5d761cd
commit 6eb65d07f9
3 changed files with 91 additions and 30 deletions

View File

@@ -1 +1,38 @@
# This is a python package
ARCHIVE_SCHEMA = """
{
"name": "Archive",
"type": "record",
"fields" : [
{ "name": "name", "type": "string" },
{ "name": "ts", "type": "string" },
{ "name": "chunks", "type": { "type": "array", "items":
{ "type": "record",
"name": "Chunk",
"fields": [
{ "name": "id", "type": {"type": "fixed", "size": 32, "name": "sha256" }},
{ "name": "size", "type": "int" }
]
}
}},
{ "name": "items", "type": {"type": "array", "items":
{
"type": "record",
"name": "Item",
"fields": [
{ "name": "type", "type":
{ "name": "ItemType", "type": "enum", "symbols": ["FILE", "DIRECTORY"] } },
{ "name": "path", "type": "string" },
{ "name": "size", "type": ["null", "long"] },
{ "name": "chunks", "type": ["null",
{ "type": "array", "items": "int" }
]}
]
}
}}
]
}
"""
from avro import schema
archive_schema = schema.parse(ARCHIVE_SCHEMA)
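
As an aside, the parsed schema can be exercised on its own. A minimal sketch, assuming avro.io.validate is available in the avro library imported above; the sample values are invented:

from avro import io
from dedupestore import archive_schema

sample_archive = {
    'name': 'test-archive',
    'ts': '2010-10-17T15:44:41',
    # the "sha256" fixed field expects exactly 32 raw bytes
    'chunks': [{'id': '\0' * 32, 'size': 4096}],
    'items': [{'type': 'DIRECTORY', 'path': 'home/user',
               'size': None, 'chunks': None}],
}
assert io.validate(archive_schema, sample_archive)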

View File

@@ -2,10 +2,14 @@ import os
import hashlib
import logging
import zlib
import cPickle
import argparse
import sys
from cStringIO import StringIO
from datetime import datetime
from avro import io
from dedupestore import archive_schema
from chunkifier import chunkify
from cache import Cache, NS_ARCHIVES, NS_CHUNKS
from bandstore import BandStore
@@ -41,26 +45,39 @@ class Archive(object):
data = self.store.get(NS_ARCHIVES, id)
if hashlib.sha256(data).digest() != id:
raise Exception('Archive hash did not match')
archive = cPickle.loads(zlib.decompress(data))
buffer = StringIO(zlib.decompress(data))
reader = io.DatumReader(archive_schema)
decoder = io.BinaryDecoder(buffer)
archive = reader.read(decoder)
self.items = archive['items']
self.name = archive['name']
self.chunks = archive['chunks']
for i, (id, csize, osize) in enumerate(archive['chunks']):
self.chunk_idx[i] = id
for i, chunk in enumerate(archive['chunks']):
self.chunk_idx[i] = chunk['id']
def save(self, name):
archive = {'name': name, 'items': self.items, 'chunks': self.chunks}
data = zlib.compress(cPickle.dumps(archive))
archive = {
'name': name,
'ts': datetime.utcnow().isoformat(),
'items': self.items,
'chunks': self.chunks
}
writer = StringIO()
encoder = io.BinaryEncoder(writer)
datum_writer = io.DatumWriter(archive_schema)
datum_writer.write(archive, encoder)
data = zlib.compress(writer.getvalue())
print 'archive size: %d' % len(data)
self.id = hashlib.sha256(data).digest()
self.store.put(NS_ARCHIVES, self.id, data)
self.store.commit()
def add_chunk(self, id, csize, osize):
def add_chunk(self, id, size):
try:
return self.chunk_idx[id]
except KeyError:
idx = len(self.chunks)
self.chunks.append((id, csize, osize))
self.chunks.append(dict(id=id, size=size))
self.chunk_idx[id] = idx
return idx
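
Taken together, the new save() and load() paths boil down to the following round trip. This is a standalone sketch using the same avro, zlib and hashlib calls as the code above; the archive contents are placeholders, not real backup data:

import hashlib
import zlib
from cStringIO import StringIO
from avro import io
from dedupestore import archive_schema

archive = {'name': 'example', 'ts': '2010-10-17T15:44:41',
           'chunks': [], 'items': []}

# encode: Avro binary -> zlib -> sha256 id, as in Archive.save()
writer = StringIO()
io.DatumWriter(archive_schema).write(archive, io.BinaryEncoder(writer))
data = zlib.compress(writer.getvalue())
id = hashlib.sha256(data).digest()

# decode: verify id -> zlib -> Avro binary, as in Archive.load()
assert hashlib.sha256(data).digest() == id
buffer = StringIO(zlib.decompress(data))
restored = io.DatumReader(archive_schema).read(io.BinaryDecoder(buffer))
assert restored == archive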
@@ -77,10 +94,10 @@ class Archive(object):
chunk_count.setdefault(id, 0)
chunk_count[id] += 1
for id, c in chunk_count.items():
count, csize, osize = cache.chunkmap[id]
total_csize += csize
count, size = cache.chunkmap[id]
total_csize += size
if c == count:
total_usize += csize
total_usize += size
return dict(osize=total_osize, csize=total_csize, usize=total_usize)
def list(self):
@@ -93,7 +110,7 @@ class Archive(object):
assert item['path'][0] not in ('/', '\\', ':')
path = os.path.join(dest, item['path'])
logging.info(path)
if item['type'] == 'DIR':
if item['type'] == 'DIRECTORY':
if not os.path.exists(path):
os.makedirs(path)
if item['type'] == 'FILE':
@@ -142,7 +159,7 @@ class Archive(object):
if name in cache.archives:
raise NameError('Archive already exists')
for path in paths:
for root, dirs, files in os.walk(path):
for root, dirs, files in os.walk(unicode(path)):
for d in dirs:
p = os.path.join(root, d)
self.items.append(self.process_dir(p, cache))
@@ -158,7 +175,7 @@ class Archive(object):
def process_dir(self, path, cache):
path = path.lstrip('/\\:')
logging.info(path)
return {'type': 'DIR', 'path': path}
return {'type': 'DIRECTORY', 'path': path}
def process_file(self, path, cache):
try:

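The rename from 'DIR' to 'DIRECTORY' above is forced by the ItemType enum in ARCHIVE_SCHEMA: only the declared symbols can be written. A small sketch of the item dicts the archiver now builds, with invented paths and sizes:

directory_item = {'type': 'DIRECTORY', 'path': 'home/user/photos'}
file_item = {'type': 'FILE', 'path': 'home/user/photos/cat.jpg',
             'size': 4096, 'chunks': [0, 1]}
# size and chunks are ["null", ...] unions in the schema,
# which is why process_dir() can leave them out entirely
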
View File

@@ -1,8 +1,11 @@
import cPickle
import hashlib
import os
import sys
import zlib
from avro import io
from cStringIO import StringIO
import cPickle
from dedupestore import archive_schema
NS_ARCHIVES = 'ARCHIVES'
NS_CHUNKS = 'CHUNKS'
@@ -48,13 +51,17 @@ class Cache(object):
data = self.store.get(NS_ARCHIVES, id)
if hashlib.sha256(data).digest() != id:
raise Exception('Archive hash did not match')
archive = cPickle.loads(zlib.decompress(data))
buffer = StringIO(zlib.decompress(data))
reader = io.DatumReader(archive_schema)
decoder = io.BinaryDecoder(buffer)
archive = reader.read(decoder)
self.archives[archive['name']] = id
for id, csize, osize in archive['chunks']:
for id, size in archive['chunks']:
if self.seen_chunk(id):
self.chunk_incref(id)
else:
self.init_chunk(id, csize, osize)
self.init_chunk(id, size)
def save(self):
assert self.store.state == self.store.OPEN
@@ -78,27 +85,27 @@ class Cache(object):
data = hashlib.sha256(data).digest() + data
csize = len(data)
self.store.put(NS_CHUNKS, id, data)
return self.init_chunk(id, csize, osize)
return self.init_chunk(id, csize)
def init_chunk(self, id, csize, osize):
self.chunkmap[id] = (1, csize, osize)
return id, csize, osize
def init_chunk(self, id, size):
self.chunkmap[id] = (1, size)
return id, size
def seen_chunk(self, id):
count, csize, osize = self.chunkmap.get(id, (0, 0, 0))
count, size = self.chunkmap.get(id, (0, 0))
return count
def chunk_incref(self, id):
count, csize, osize = self.chunkmap[id]
self.chunkmap[id] = (count + 1, csize, osize)
return id, csize, osize
count, size = self.chunkmap[id]
self.chunkmap[id] = (count + 1, size)
return id, size
def chunk_decref(self, id):
count, csize, osize = self.chunkmap[id]
count, size = self.chunkmap[id]
if count == 1:
del self.chunkmap[id]
self.store.delete(NS_CHUNKS, id)
else:
self.chunkmap[id] = (count - 1, csize, osize)
self.chunkmap[id] = (count - 1, size)
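
For reference, the cache's chunk bookkeeping now stores (count, size) pairs instead of (count, csize, osize) triples. A standalone sketch of the reference counting, with a made-up chunk id and size:

chunkmap = {}

def init_chunk(id, size):
    chunkmap[id] = (1, size)

def chunk_incref(id):
    count, size = chunkmap[id]
    chunkmap[id] = (count + 1, size)

def chunk_decref(id):
    count, size = chunkmap[id]
    if count == 1:
        del chunkmap[id]               # last reference: chunk can be deleted from the store
    else:
        chunkmap[id] = (count - 1, size)

cid = '\0' * 32
init_chunk(cid, 4096)                  # first archive references the chunk
chunk_incref(cid)                      # a second archive references it
chunk_decref(cid)
chunk_decref(cid)
assert cid not in chunkmap             # no references left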