Some chunker fixes.

This commit is contained in:
Jonas Borgström 2010-02-28 12:03:37 +01:00
parent 9f955bf9fb
commit 0cca830981
2 changed files with 19 additions and 11 deletions

View File

@ -73,11 +73,14 @@ class Cache(object):
#print 'chunk %d: %d' % (len(data), sum)
hash = struct.pack('I', sum) + hashlib.sha1(data).digest()
if not self.seen_chunk(hash):
self.store.put(NS_CHUNKS, hash, zlib.compress(data))
zdata = zlib.compress(data)
size = len(zdata)
self.store.put(NS_CHUNKS, hash, zdata)
else:
print 'seen chunk', hash.encode('hex')
size = 0
#print 'seen chunk', hash.encode('hex')
self.chunk_incref(hash)
return hash
return hash, size
def seen_chunk(self, hash):
    """Return True when the chunk map holds a positive value for *hash*.

    Hashes missing from the map are treated as 0, i.e. not seen.
    """
    count = self.chunkmap.get(hash, 0)
    return count > 0
@ -189,14 +192,20 @@ class Archiver(object):
return {'type': 'DIR', 'path': path}
def process_file(self, path, cache):
print 'Adding: %s...' % path,
sys.stdout.flush()
with open(path, 'rb') as fd:
size = 0
origsize = 0
compsize = 0
chunks = []
for chunk in chunker(fd, CHUNKSIZE, self.cache.summap):
size += len(chunk)
chunks.append(cache.add_chunk(chunk))
origsize += len(chunk)
id, size = cache.add_chunk(chunk)
compsize += size
chunks.append(id)
path = path.lstrip('/\\:')
print 'File: %s (%d chunks)' % (path, len(chunks))
ratio = origsize and compsize * 100 / origsize or 0
print '(%d chunks: %d%%)' % (len(chunks), ratio)
return {'type': 'FILE', 'path': path, 'size': size, 'chunks': chunks}
def run(self):

View File

@ -49,11 +49,11 @@ def chunker(fd, chunk_size, chunks):
>>> list(chunker(fd, 4, chunks))
['ABCD', 'EFGH', 'IJ', 'KLMN']
"""
data = 'X' + fd.read(chunk_size * 2)
data = 'X' + fd.read(chunk_size * 3)
i = 1
sum = checksum(data[:chunk_size])
while True:
if len(data) - i - 2 <= chunk_size:
if len(data) - i <= chunk_size * 2:
data += fd.read(chunk_size * 2)
if i == chunk_size + 1:
yield data[1:chunk_size + 1]
@ -62,14 +62,13 @@ def chunker(fd, chunk_size, chunks):
if len(data) - i <= chunk_size: # EOF?
if len(data) > chunk_size + 1:
yield data[1:len(data) - chunk_size]
yield data[-chunk_size:]
yield data[:chunk_size]
else:
yield data[1:]
return
sum = roll_checksum(sum, data[i - 1], data[i - 1 + chunk_size], chunk_size)
#print data[i:i + chunk_size], sum
if chunks.get(sum):
print 'Woot', i
if i > 1:
yield data[1:i]
yield data[i:i + chunk_size]