mirror of https://github.com/borgbackup/borg.git

Initial rough implementation of chunkification cache

Jonas Borgström 2010-10-25 22:31:18 +02:00
parent 00a98082ab
commit e181829365
2 changed files with 59 additions and 16 deletions
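With this change the archiver remembers, per file, which chunk ids the file produced the last time it was backed up, keyed by a hash of the path and validated against the file's inode, size and mtime. If the file looks unchanged and every cached chunk is still present, the file is not read or chunkified again. A minimal sketch of that fast path follows; backup_file, chunkify_file and the plain hash() call are illustrative stand-ins, not the API this commit adds.

# Hedged sketch of the fast path introduced here (illustrative names only):
# reuse cached chunk ids when the file appears unchanged, otherwise
# re-chunk the data and memorize the result for the next run.
def backup_file(path, st, cache, chunkify_file):
    path_hash = hash(path)                       # stands in for crypto.id_hash
    ids, size = cache.file_known_and_unchanged(path_hash, st)
    if ids is not None and all(cache.seen_chunk(i) for i in ids):
        return ids, size                         # cache hit: no read, no chunking
    ids, size = chunkify_file(path)              # cache miss: chunk the file data
    cache.memorize_file_chunks(path_hash, st, ids)
    return ids, size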

View file

@@ -243,6 +243,7 @@ def process_symlink(self, path, st):
        })
    def process_file(self, path, st, cache):
        safe_path = path.lstrip('/\\:')
        # Is it a hard link?
        if st.st_nlink > 1:
            source = self.hard_links.get((st.st_ino, st.st_dev))
            if (st.st_ino, st.st_dev) in self.hard_links:
@@ -252,18 +253,34 @@ def process_file(self, path, st, cache):
                return
            else:
                self.hard_links[st.st_ino, st.st_dev] = safe_path
        try:
            fd = open(path, 'rb')
        except IOError, e:
            logging.error(e)
            return
        with fd:
            logging.info(safe_path)
            chunks = []
            size = 0
            for chunk in chunkify(fd, CHUNK_SIZE, 30):
                chunks.append(self.process_chunk(chunk, cache))
                size += len(chunk)
        logging.info(safe_path)
        path_hash = self.crypto.id_hash(path.encode('utf-8'))
        ids, size = cache.file_known_and_unchanged(path_hash, st)
        if ids is not None:
            # Make sure all ids are available
            for id in ids:
                if not cache.seen_chunk(id):
                    ids = None
                    break
            else:
                chunks = [self.process_chunk2(id, cache) for id in ids]
        # Only chunkify the file if needed
        if ids is None:
            try:
                fd = open(path, 'rb')
            except IOError, e:
                logging.error(e)
                return
            with fd:
                size = 0
                ids = []
                chunks = []
                for chunk in chunkify(fd, CHUNK_SIZE, 30):
                    ids.append(self.crypto.id_hash(chunk))
                    chunks.append(chunk)
                    size += len(chunk)
            cache.memorize_file_chunks(path_hash, st, ids)
            chunks = [self.process_chunk(chunk, cache) for chunk in chunks]
        self.items.append({
            'type': 'FILE', 'path': safe_path, 'chunks': chunks, 'size': size,
            'mode': st.st_mode,
@@ -272,6 +289,16 @@ def process_file(self, path, st, cache):
            'ctime': st.st_ctime, 'mtime': st.st_mtime,
        })

    def process_chunk2(self, id, cache):
        try:
            return self.chunk_idx[id]
        except KeyError:
            idx = len(self.chunks)
            size = cache.chunk_incref(id)
            self.chunks.append((id, size))
            self.chunk_idx[id] = idx
            return idx

    def process_chunk(self, data, cache):
        id = self.crypto.id_hash(data)
        try:

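The availability check above leans on Python's for/else: the else branch runs only when the loop finishes without break, i.e. when every cached chunk id passed cache.seen_chunk. The new process_chunk2 then registers such a chunk in the archive by id alone, bumping its reference count with cache.chunk_incref instead of re-hashing any data. A tiny self-contained illustration of the for/else control flow (hypothetical helper, not part of the diff):

def chunks_if_all_available(ids, seen_chunk):
    # The else branch runs only if the loop completed without break,
    # mirroring the "ids = None; break" / else pattern in process_file.
    chunks = None
    for chunk_id in ids:
        if not seen_chunk(chunk_id):
            break
    else:
        chunks = list(ids)
    return chunks

print(chunks_if_all_available([1, 2], {1, 2, 3}.__contains__))   # [1, 2]
print(chunks_if_all_available([1, 9], {1, 2, 3}.__contains__))   # None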
View file

@@ -14,6 +14,7 @@ def __init__(self, store, crypto):
        self.path = os.path.join(os.path.expanduser('~'), '.dedupestore', 'cache',
                                 '%s.cache' % self.store.uuid)
        self.tid = -1
        self.file_chunks = {}
        self.open()
        if self.tid != self.store.tid:
            self.init(crypto)
@@ -22,13 +23,15 @@ def open(self):
        if not os.path.exists(self.path):
            return
        cache = msgpack.unpackb(open(self.path, 'rb').read())
        version = cache.get('version')
        if version != 1:
            logging.error('Unsupported cache version %r' % version)
            return
        assert cache['version'] == 1
        if cache['store'] != self.store.uuid:
            raise Exception('Cache UUID mismatch')
        self.chunkmap = cache['chunkmap']
        # Discard old file_chunks entries
        for hash, entry in cache['file_chunks'].iteritems():
            count = entry[0]
            if count < 8:
                self.file_chunks[hash] = [count + 1] + list(entry[1:])
        self.tid = cache['tid']

    def init(self, crypto):
@@ -56,6 +59,7 @@ def save(self):
            'store': self.store.uuid,
            'chunkmap': self.chunkmap,
            'tid': self.store.tid,
            'file_chunks': self.file_chunks,
        }
        data = msgpack.packb(cache)
        cachedir = os.path.dirname(self.path)
@@ -90,4 +94,16 @@ def chunk_decref(self, id):
        else:
            self.chunkmap[id] = (count - 1, size)

    def file_known_and_unchanged(self, path_hash, st):
        entry = self.file_chunks.get(path_hash)
        if (entry and entry[1] == st.st_ino
                and entry[2] == st.st_size and entry[3] == st.st_mtime):
            entry[0] = 0  # reset entry age
            return entry[4], entry[2]
        else:
            return None, 0

    def memorize_file_chunks(self, path_hash, st, ids):
        # Entry: Age, inode, size, mtime, chunk ids
        self.file_chunks[path_hash] = 0, st.st_ino, st.st_size, st.st_mtime, ids
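Each file_chunks entry stores (age, inode, size, mtime, chunk ids). open() ages every entry by one on each cache load and drops entries once the age reaches 8, while a successful lookup in file_known_and_unchanged resets the age to 0, so files that keep turning up unchanged stay cached and stale paths eventually expire. A small standalone sketch of that aging rule, assuming the entry layout above (names here are not the module's API):

MAX_AGE = 8   # matches the "count < 8" check in open()

def load_entries(saved):
    # Age every stored entry by one and drop those unused for MAX_AGE loads.
    kept = {}
    for key, (age, inode, size, mtime, ids) in saved.items():
        if age < MAX_AGE:
            kept[key] = [age + 1, inode, size, mtime, ids]
    return kept

def lookup(entries, key, inode, size, mtime):
    # Return the cached chunk ids only if the stat data still matches.
    entry = entries.get(key)
    if entry and entry[1:4] == [inode, size, mtime]:
        entry[0] = 0          # a hit resets the age, as in file_known_and_unchanged
        return entry[4]
    return None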