mirror of
https://github.com/borgbackup/borg.git
synced 2025-01-31 03:31:41 +00:00
Started adding support for using variable-length chunks.
This commit is contained in:
parent
e4807b11c9
commit
bc959ebb37
2 changed files with 110 additions and 10 deletions
|
@ -5,9 +5,11 @@
|
|||
import cPickle
from optparse import OptionParser

from chunker import chunker, checksum
from store import Store

# Target chunk size for the rolling chunker, in bytes.
# (The previous fixed-size value of 256 * 1024 was dead code: it was
# immediately reassigned below, so only one assignment is kept.)
CHUNKSIZE = 64 * 1024

# Store namespaces for archive metadata and chunk data.
NS_ARCHIVES = 'ARCHIVES'
NS_CHUNKS = 'CHUNKS'
|
||||
|
||||
|
@ -155,21 +157,34 @@ def verify_archive(self, archive_name):
|
|||
else:
|
||||
print 'OK'
|
||||
|
||||
def extract_archive(self, archive_name):
    """Extract every item of the named archive into the current directory.

    Loads the pickled archive metadata from the store, recreates
    directories, and reassembles files by concatenating their
    decompressed chunks.

    Raises an Exception if the archive does not exist or if an item
    carries an unsafe (absolute or drive-relative) path.
    """
    try:
        archive = cPickle.loads(zlib.decompress(self.store.get(NS_ARCHIVES, archive_name)))
    except Store.DoesNotExist:
        raise Exception('Archive "%s" does not exist' % archive_name)
    for item in archive['items']:
        # Refuse absolute / drive-relative paths from the (untrusted)
        # archive. An `assert` here would be stripped under `python -O`,
        # so validate explicitly and always raise.
        if item['path'][0] in ('/', '\\', ':'):
            raise Exception('Unsafe path in archive: "%s"' % item['path'])
        print(item['path'])
        if item['type'] == 'DIR':
            if not os.path.exists(item['path']):
                os.makedirs(item['path'])
        if item['type'] == 'FILE':
            # Rebuild the file from its stored, compressed chunks.
            with open(item['path'], 'wb') as fd:
                for chunk in item['chunks']:
                    fd.write(zlib.decompress(self.store.get(NS_CHUNKS, chunk)))
|
||||
|
||||
def process_dir(self, path, cache):
    """Return the archive item dict describing directory *path*."""
    # Stored paths are relative: drop leading separators / drive markers.
    stripped = path.lstrip('/\\:')
    print('Directory: %s' % (stripped))
    return {'type': 'DIR', 'path': stripped}
|
||||
|
||||
def process_file(self, path, cache):
    """Chunk the contents of *path*, store each compressed chunk via
    *cache*, and return the archive item dict describing the file.

    The previous revision additionally ran a fixed-size read loop over
    an `fd = open(path, 'rb')` that was never closed and whose
    `size`/`chunks` results were immediately discarded and recomputed
    below — that leaking dead code has been removed.
    """
    with open(path, 'rb') as fd:
        size = 0
        chunks = []
        # Variable-length chunks from the rolling-checksum chunker.
        for chunk in chunker(fd, CHUNKSIZE, {}):
            size += len(chunk)
            chunks.append(cache.add_chunk(zlib.compress(chunk)))
    path = path.lstrip('/\\:')
    print('File: %s (%d chunks)' % (path, len(chunks)))
    return {'type': 'FILE', 'path': path, 'size': size, 'chunks': chunks}
|
||||
|
@ -208,6 +223,8 @@ def run(self):
|
|||
self.list_archive(options.list_archive)
|
||||
elif options.verify_archive:
|
||||
self.verify_archive(options.verify_archive)
|
||||
elif options.extract_archive:
|
||||
self.extract_archive(options.extract_archive)
|
||||
elif options.delete_archive:
|
||||
self.delete_archive(options.delete_archive)
|
||||
else:
|
||||
|
|
83
dedupstore/chunker.py
Normal file
83
dedupstore/chunker.py
Normal file
|
@ -0,0 +1,83 @@
|
|||
def checksum(data, sum=0):
    """Simple but fast checksum that can be updated at either end.

    Fletcher/Adler-style: the low 16 bits of the result hold the running
    byte sum, the high 16 bits hold the sum-of-sums.

    >>> checksum('FOOBAR')
    102367679
    >>> checksum('FOOBAR') == checksum('BAR', checksum('FOO'))
    True
    """
    lo = sum & 0xffff
    hi = sum >> 16
    for ch in data:
        # +1 so that runs of NUL bytes still change the checksum.
        lo += ord(ch) + 1
        hi += lo
    return ((hi & 0xffff) << 16) + (lo & 0xffff)
|
||||
|
||||
|
||||
def roll_checksum(sum, remove, add, len):
|
||||
"""
|
||||
>>> roll_checksum(checksum('XFOOBA'), 'X', 'R', 6) == checksum('FOOBAR')
|
||||
True
|
||||
"""
|
||||
s1 = sum & 0xffff
|
||||
s2 = sum >> 16
|
||||
add = ord(add)
|
||||
remove = ord(remove)
|
||||
s1 -= remove - add
|
||||
s2 -= len * (remove + 1) - s1
|
||||
return (s1 & 0xffff) + ((s2 & 0xffff) << 16)
|
||||
|
||||
|
||||
def chunker(fd, chunk_size, chunks):
    """Yield chunks read from *fd*, preferring boundaries whose rolling
    checksum is already present in the *chunks* mapping; unmatched data
    is flushed in pieces of at most *chunk_size* bytes.

    >>> fd = StringIO.StringIO('ABCDEFGHIJKLMN')
    >>> list(chunker(fd, 4, {}))
    ['ABCD', 'EFGH', 'IJ', 'KLMN']

    >>> fd = StringIO.StringIO('ABCDEFGHIJKLMN')
    >>> chunks = {44564754: True} # 'BCDE'
    >>> list(chunker(fd, 4, chunks))
    ['A', 'BCDE', 'FGHI', 'J', 'KLMN']

    >>> fd = StringIO.StringIO('ABCDEFGHIJKLMN')
    >>> chunks = {44564754: True, 48496938: True} # 'BCDE', 'HIJK'
    >>> list(chunker(fd, 4, chunks))
    ['A', 'BCDE', 'FG', 'HIJK', 'LMN']

    >>> fd = StringIO.StringIO('ABCDEFGHIJKLMN')
    >>> chunks = {43909390: True, 50463030: True} # 'ABCD', 'KLMN'
    >>> list(chunker(fd, 4, chunks))
    ['ABCD', 'EFGH', 'IJ', 'KLMN']
    """
    # buf[0] is a one-byte sentinel holding the byte just before the
    # current window; pos indexes the start of the rolling window.
    buf = 'X' + fd.read(chunk_size * 2)
    pos = 1
    csum = checksum(buf[:chunk_size])
    while True:
        # Top up the buffer before the window could run past its end.
        if len(buf) - pos - 2 <= chunk_size:
            buf += fd.read(chunk_size * 2)
        # A full chunk_size of unmatched data accumulated: flush it.
        if pos == chunk_size + 1:
            yield buf[1:chunk_size + 1]
            pos = 1
            buf = buf[chunk_size:]
        # End of input: emit whatever remains (in at most two pieces).
        if len(buf) - pos <= chunk_size:
            if len(buf) > chunk_size + 1:
                yield buf[1:len(buf) - chunk_size]
                yield buf[-chunk_size:]
            else:
                yield buf[1:]
            return
        # Slide the checksum window one byte to the right.
        csum = roll_checksum(csum, buf[pos - 1], buf[pos - 1 + chunk_size], chunk_size)
        if csum in chunks:
            # Known chunk: flush any pending prefix, then the chunk
            # itself, and restart with the chunk's last byte as sentinel.
            if pos > 1:
                yield buf[1:pos]
            yield buf[pos:pos + chunk_size]
            buf = buf[pos + chunk_size - 1:]
            pos = 0
            csum = checksum(buf[:chunk_size])
        pos += 1
|
||||
|
||||
if __name__ == '__main__':
    # Self-test: run this module's doctests (they build file-like
    # objects via StringIO, so it must be importable here).
    import doctest
    import StringIO
    doctest.testmod()
|
Loading…
Reference in a new issue