# borg/dedupestore/chunkifier.py
#
# 146 lines
# 4.6 KiB
# Python

def checksum(data, sum=0):
    """Simple but fast checksum that can be updated at either end.

    The result packs two 16-bit running sums into one integer: the low
    half ``s1`` accumulates ``byte + 1`` for every element of ``data`` and
    the high half ``s2`` accumulates the successive values of ``s1``.

    :param data: the bytes to add. Either a string (each character is
        converted with ``ord``) or a ``bytes``/``bytearray`` object whose
        iteration yields integers.
    :param sum: checksum of the data preceding ``data`` (0 to start from
        scratch), so that ``checksum(a + b) == checksum(b, checksum(a))``.
    :returns: the updated checksum as an integer.

    >>> checksum('FOOBAR')
    102367679
    >>> checksum('FOOBAR') == checksum('BAR', checksum('FOO'))
    True
    >>> checksum(bytearray(b'FOOBAR')) == checksum('FOOBAR')
    True
    """
    s1 = sum & 0xffff
    s2 = sum >> 16
    for c in data:
        # bytearray (and bytes on Python 3) iterate as ints, on which ord()
        # would raise TypeError; strings yield 1-character strings.
        s1 += (c if isinstance(c, int) else ord(c)) + 1
        s2 += s1
    return ((s2 & 0xffff) << 16) + (s1 & 0xffff)
def roll_checksum(sum, remove, add, len):
    """Slide a checksummed window forward by one byte.

    Given the ``checksum`` of a window of ``len`` bytes, return the
    checksum of the window obtained by dropping its first byte and
    appending one new byte, without re-reading the bytes in between.

    :param sum: checksum of the current window.
    :param remove: the byte leaving the window (its first byte), as a
        1-character string or as an integer byte value.
    :param add: the byte entering the window at its end, as a 1-character
        string or as an integer byte value.
    :param len: the window length in bytes.
    :returns: the checksum of the shifted window.

    >>> roll_checksum(checksum('XFOOBA'), 'X', 'R', 6) == checksum('FOOBAR')
    True
    """
    s1 = sum & 0xffff
    s2 = sum >> 16
    # Indexing bytes (Python 3) or a bytearray yields ints already; only
    # 1-character strings need converting.
    if not isinstance(add, int):
        add = ord(add)
    if not isinstance(remove, int):
        remove = ord(remove)
    s1 -= remove - add
    # The removed byte contributed (remove + 1) to s2 once per window
    # position; the new byte contributes the updated s1 exactly once.
    s2 -= len * (remove + 1) - s1
    return (s1 & 0xffff) + ((s2 & 0xffff) << 16)
class ChunkifyIter(object):
    """Iterator cutting the data read from a file object into chunks.

    A window of ``chunk_size`` bytes is moved over the data one byte at a
    time and its checksum (``checksum`` for a fresh window,
    ``roll_checksum`` afterwards) is looked up in ``chunks``:

    * a window whose checksum is in ``chunks`` is returned as a chunk of
      its own, after the unmatched bytes scanned before it (if any);
    * once ``chunk_size`` unmatched bytes have been scanned they are
      returned as a chunk;
    * at end of input the remaining bytes are returned; whenever unmatched
      bytes were pending, the last ``chunk_size`` bytes are kept together
      as the final chunk.

    :param fd: object with a ``read(n)`` method. A buffer that still holds
        less than one window after reading is treated as end of input.
    :param chunk_size: window size, which is also the maximum chunk length.
    :param chunks: container supporting ``in``, holding the checksums of
        the known chunks.
    """

    def __init__(self, fd, chunk_size, chunks):
        self.fd = fd
        self.chunk_size = chunk_size
        self.chunks = chunks
        # Set up the scanning state here as well, so that next() works even
        # when the caller never calls iter() on the object.
        self._reset()

    def _reset(self):
        """Put the scanner into its initial (nothing read yet) state."""
        self.data = ''          # buffered input
        self.i = 0              # start of the current window in self.data
        self.full_sum = True    # next checksum must be computed from scratch
        self.extra = None       # final chunk set aside for the following call
        self.done = False       # everything has been returned
        self.buf_size = self.chunk_size * 10

    def __iter__(self):
        self._reset()
        return self

    def next(self):
        """Return the next chunk; raise StopIteration when input is used up."""
        if self.done:
            raise StopIteration
        if self.extra:
            # The final window was set aside by the previous call.
            self.done = True
            return self.extra
        size = self.chunk_size
        # Unmatched bytes scanned during this call: data[i - pending:i].
        pending = 0
        while True:
            if self.i > self.buf_size - size:
                # Close to the end of the buffer: drop what was already
                # returned and keep the pending bytes onwards.
                self.data = self.data[self.i - pending:]
                self.i = pending
            if len(self.data) - self.i < size:
                block = self.fd.read(self.buf_size - len(self.data))
                # Adopt the type read() returns rather than forcing str, so
                # an empty '' buffer is never concatenated with bytes.
                self.data = self.data + block if self.data else block
            remaining = len(self.data) - self.i
            if remaining == 0 and pending == 0:
                # Nothing left at all. The pending check matters for
                # chunk_size == 1, where the last scanned byte would
                # otherwise be dropped without ever being returned.
                raise StopIteration
            if remaining < size:
                # Even after reading there is no full window: end of input.
                if pending == 1:
                    self.done = True
                    return self.data[self.i - 1:]
                elif pending > 1:
                    # Return the pending bytes now and keep the last
                    # chunk_size bytes together for the following call.
                    self.extra = self.data[-size:]
                    return self.data[-size - pending + 1:-size]
                else:
                    self.done = True
                    return self.data[self.i:]
            elif pending == size:
                # A full chunk of unmatched data has accumulated.
                return self.data[self.i - size:self.i]
            if self.full_sum:
                self.sum = checksum(self.data[self.i:self.i + size])
                self.full_sum = False
            else:
                self.sum = roll_checksum(self.sum, self.remove,
                                         self.data[self.i + size - 1], size)
            # First byte of the current window; it leaves on the next roll.
            self.remove = self.data[self.i]
            if self.sum in self.chunks:
                if pending > 0:
                    # Flush the unmatched bytes first; self.i is left alone
                    # so the following call matches this window again.
                    chunk = self.data[self.i - pending:self.i]
                else:
                    chunk = self.data[self.i:self.i + size]
                    self.i += size
                self.full_sum = True
                return chunk
            self.i += 1
            pending += 1

    # Python 3 spells the iterator protocol method __next__.
    __next__ = next
def chunkify(fd, chunk_size, chunks):
    """Return an iterator over the chunks of the data read from ``fd``.

    :param fd: object with a ``read(n)`` method supplying the data.
    :param chunk_size: size of the checksum window and maximum chunk length.
    :param chunks: container supporting ``in``, holding the checksums of
        already known chunks; a window with such a checksum becomes a
        chunk of its own.
    :returns: a ``ChunkifyIter`` over ``fd``.

    >>> try:
    ...     import StringIO
    ... except ImportError:  # Python 3: io provides StringIO.StringIO too
    ...     import io as StringIO
    >>> list(chunkify(StringIO.StringIO('A'), 4, {}))
    ['A']
    >>> list(chunkify(StringIO.StringIO('AB'), 4, {}))
    ['AB']
    >>> list(chunkify(StringIO.StringIO('ABC'), 4, {}))
    ['ABC']
    >>> list(chunkify(StringIO.StringIO('ABCD'), 4, {}))
    ['ABCD']
    >>> list(chunkify(StringIO.StringIO('ABCDE'), 4, {}))
    ['A', 'BCDE']
    >>> list(chunkify(StringIO.StringIO('ABCDEF'), 4, {}))
    ['AB', 'CDEF']
    >>> list(chunkify(StringIO.StringIO('ABCDEFG'), 4, {}))
    ['ABC', 'DEFG']
    >>> list(chunkify(StringIO.StringIO('ABCDEFGH'), 4, {}))
    ['ABCD', 'EFGH']
    >>> list(chunkify(StringIO.StringIO('ABCDEFGHI'), 4, {}))
    ['ABCD', 'E', 'FGHI']
    >>> list(chunkify(StringIO.StringIO('ABCDEFGHIJKLMN'), 4, {}))
    ['ABCD', 'EFGH', 'IJ', 'KLMN']
    >>> chunks = {44564754: True} # 'BCDE'
    >>> list(chunkify(StringIO.StringIO('ABCDEFGHIJKLMN'), 4, chunks))
    ['A', 'BCDE', 'FGHI', 'J', 'KLMN']
    >>> chunks = {44564754: True, 48496938: True} # 'BCDE', 'HIJK'
    >>> list(chunkify(StringIO.StringIO('ABCDEFGHIJKLMN'), 4, chunks))
    ['A', 'BCDE', 'FG', 'HIJK', 'LMN']
    >>> chunks = {43909390: True, 50463030: True} # 'ABCD', 'KLMN'
    >>> list(chunkify(StringIO.StringIO('ABCDEFGHIJKLMN'), 4, chunks))
    ['ABCD', 'EFGH', 'IJ', 'KLMN']
    """
    return ChunkifyIter(fd, chunk_size, chunks)
# Prefer the compiled implementations when the optional _speedups module can
# be imported; the pure-Python chunkify stays reachable as py_chunkify.
try:
    import _speedups
    checksum = _speedups.checksum
    roll_checksum = _speedups.roll_checksum
    py_chunkify = chunkify
    chunkify = _speedups.chunkify
except ImportError:
    # Single-argument print(...) emits the same text on Python 2 and 3; the
    # bare print statement is a SyntaxError on Python 3.
    print('Failed to load _speedups module, things will be slow')


if __name__ == '__main__':
    # Running the module as a script executes the doctests above.
    import doctest
    doctest.testmod()