From 41a4842518fd513e5ec2f490a8bcb5ece16e8693 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jonas=20Borgstr=C3=B6m?=
Date: Sun, 28 Feb 2010 21:34:56 +0100
Subject: [PATCH] Switched to an inter based python implementation of chunkify.

---
 dedupstore/chunkifier.py | 108 +++++++++++++++++++++++++++++++++-----------
 1 file changed, 80 insertions(+), 28 deletions(-)

diff --git a/dedupstore/chunkifier.py b/dedupstore/chunkifier.py
index 026481dcd..b8e8203a5 100644
--- a/dedupstore/chunkifier.py
+++ b/dedupstore/chunkifier.py
@@ -28,12 +28,89 @@ def roll_checksum(sum, remove, add, len):
     return (s1 & 0xffff) + ((s2 & 0xffff) << 16)
 
 
+class ChunkifyIter(object):
+    """Iterator that splits the data read from ``fd`` into chunks.
+
+    A chunk is returned when the checksum of the current ``chunk_size``
+    byte window is found in ``chunks``, or when ``chunk_size`` bytes have
+    been scanned without a match.  At end of input the buffered rest is
+    returned, with its last ``chunk_size`` bytes as a separate chunk if
+    more than ``chunk_size`` bytes are left.
+    """
+
+    def __init__(self, fd, chunk_size, chunks):
+        self.fd = fd
+        self.chunk_size = chunk_size
+        self.chunks = chunks
+        # Scanning state is set up here (not in __iter__) so that next()
+        # works without a prior iter() call and a repeated iter() call
+        # cannot discard already buffered data.
+        self.data = ''
+        self.i = 0
+        self.full_sum = True
+        self.extra = None
+        self.done = False
+
+    def __iter__(self):
+        return self
+
+    def next(self):
+        """Return the next chunk; raise StopIteration once ``fd`` is exhausted."""
+        if self.done:
+            raise StopIteration
+        if self.extra:
+            self.done = True
+            return self.extra
+        while True:
+            if len(self.data) - self.i < self.chunk_size:
+                self.data += self.fd.read(self.chunk_size * 3)
+                if not self.data:
+                    raise StopIteration
+            if len(self.data) - self.i < self.chunk_size:  # EOF?
+                # Handled before any checksum work: no full window is left
+                # and self.data[self.i] may not exist (chunk_size == 1).
+                if len(self.data) > self.chunk_size:
+                    self.extra = self.data[-self.chunk_size:]
+                    return self.data[:len(self.data) - self.chunk_size]
+                else:
+                    self.done = True
+                    return self.data
+            if self.full_sum:
+                self.sum = checksum(self.data[self.i:self.i + self.chunk_size])
+                self.full_sum = False
+            else:
+                self.sum = roll_checksum(self.sum, self.remove,
+                                         self.data[self.i + self.chunk_size - 1],
+                                         self.chunk_size)
+            self.remove = self.data[self.i]
+            if self.sum in self.chunks:
+                if self.i > 0:
+                    chunk = self.data[:self.i]
+                    self.data = self.data[self.i:]
+                else:
+                    chunk = self.data[:self.chunk_size]
+                    self.data = self.data[self.chunk_size:]
+                self.full_sum = True
+                self.i = 0
+                return chunk
+            elif self.i == self.chunk_size:
+                chunk = self.data[:self.chunk_size]
+                self.data = self.data[self.chunk_size:]
+                # The window restarts at offset 0 without advancing, so the
+                # rolling sum must be rebuilt instead of rolled.
+                self.full_sum = True
+                self.i = 0
+                return chunk
+            else:
+                self.i += 1
+
+
 def chunkify(fd, chunk_size, chunks):
     """
     >>> fd = StringIO.StringIO('ABCDEFGHIJKLMN')
     >>> list(chunkify(fd, 4, {}))
     ['ABCD', 'EFGH', 'IJ', 'KLMN']
-    
+
     >>> fd = StringIO.StringIO('ABCDEFGHIJKLMN')
     >>> chunks = {44564754: True} # 'BCDE'
     >>> list(chunkify(fd, 4, chunks))
@@ -49,33 +126,8 @@ def chunkify(fd, chunk_size, chunks):
     >>> list(chunkify(fd, 4, chunks))
     ['ABCD', 'EFGH', 'IJ', 'KLMN']
     """
-    data = 'X' + fd.read(chunk_size * 3)
-    i = 1
-    sum = checksum(data[:chunk_size])
-    while True:
-        if len(data) - i <= chunk_size * 2:
-            data += fd.read(chunk_size * 2)
-        if i == chunk_size + 1:
-            yield data[1:chunk_size + 1]
-            i = 1
-            data = data[chunk_size:]
-        if len(data) - i <= chunk_size:  # EOF?
-            if len(data) > chunk_size + 1:
-                yield data[1:len(data) - chunk_size]
-                yield data[-chunk_size:]
-            else:
-                yield data[1:]
-            return
-        sum = roll_checksum(sum, data[i - 1], data[i - 1 + chunk_size], chunk_size)
-        #print data[i:i + chunk_size], sum
-        if chunks.get(sum):
-            if i > 1:
-                yield data[1:i]
-            yield data[i:i + chunk_size]
-            data = data[i + chunk_size - 1:]
-            i = 0
-            sum = checksum(data[:chunk_size])
-        i += 1
+    return ChunkifyIter(fd, chunk_size, chunks)
+
 
 if __name__ == '__main__':
     import StringIO