mirror of https://github.com/borgbackup/borg.git

Initial rough implementation of chunkification cache

Jonas Borgström 2010-10-25 22:31:18 +02:00
parent 00a98082ab
commit e181829365
2 changed files with 59 additions and 16 deletions
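With this change the archiver remembers, per file, which chunk ids the file produced the last time it was backed up, keyed by a hash of the path and validated against the file's inode, size and mtime. If the file looks unchanged and every cached chunk is still present, the file is not read or chunkified again. A minimal sketch of that fast path follows; backup_file, chunkify_file and the plain hash() call are illustrative stand-ins, not the API this commit adds.

# Hedged sketch of the fast path introduced here (illustrative names only):
# reuse cached chunk ids when the file appears unchanged, otherwise
# re-chunk the data and memorize the result for the next run.
def backup_file(path, st, cache, chunkify_file):
    path_hash = hash(path)                       # stands in for crypto.id_hash
    ids, size = cache.file_known_and_unchanged(path_hash, st)
    if ids is not None and all(cache.seen_chunk(i) for i in ids):
        return ids, size                         # cache hit: no read, no chunking
    ids, size = chunkify_file(path)              # cache miss: chunk the file data
    cache.memorize_file_chunks(path_hash, st, ids)
    return ids, size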

View file

@@ -243,6 +243,7 @@ def process_symlink(self, path, st):
        })
    def process_file(self, path, st, cache):
        safe_path = path.lstrip('/\\:')
        # Is it a hard link?
        if st.st_nlink > 1:
            source = self.hard_links.get((st.st_ino, st.st_dev))
            if (st.st_ino, st.st_dev) in self.hard_links:
@@ -252,18 +253,34 @@ def process_file(self, path, st, cache):
                return
            else:
                self.hard_links[st.st_ino, st.st_dev] = safe_path
        try:
            fd = open(path, 'rb')
        except IOError, e:
            logging.error(e)
            return
        with fd:
            logging.info(safe_path)
            chunks = []
            size = 0
            for chunk in chunkify(fd, CHUNK_SIZE, 30):
                chunks.append(self.process_chunk(chunk, cache))
                size += len(chunk)
        logging.info(safe_path)
        path_hash = self.crypto.id_hash(path.encode('utf-8'))
        ids, size = cache.file_known_and_unchanged(path_hash, st)
        if ids is not None:
            # Make sure all ids are available
            for id in ids:
                if not cache.seen_chunk(id):
                    ids = None
                    break
            else:
                chunks = [self.process_chunk2(id, cache) for id in ids]
        # Only chunkify the file if needed
        if ids is None:
            try:
                fd = open(path, 'rb')
            except IOError, e:
                logging.error(e)
                return
            with fd:
                size = 0
                ids = []
                chunks = []
                for chunk in chunkify(fd, CHUNK_SIZE, 30):
                    ids.append(self.crypto.id_hash(chunk))
                    chunks.append(chunk)
                    size += len(chunk)
            cache.memorize_file_chunks(path_hash, st, ids)
            chunks = [self.process_chunk(chunk, cache) for chunk in chunks]
        self.items.append({
            'type': 'FILE', 'path': safe_path, 'chunks': chunks, 'size': size,
            'mode': st.st_mode,
@@ -272,6 +289,16 @@ def process_file(self, path, st, cache):
            'ctime': st.st_ctime, 'mtime': st.st_mtime,
        })

    def process_chunk2(self, id, cache):
        try:
            return self.chunk_idx[id]
        except KeyError:
            idx = len(self.chunks)
            size = cache.chunk_incref(id)
            self.chunks.append((id, size))
            self.chunk_idx[id] = idx
            return idx

    def process_chunk(self, data, cache):
        id = self.crypto.id_hash(data)
        try:

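The availability check above leans on Python's for/else: the else branch runs only when the loop finishes without break, i.e. when every cached chunk id passed cache.seen_chunk. The new process_chunk2 then registers such a chunk in the archive by id alone, bumping its reference count with cache.chunk_incref instead of re-hashing any data. A tiny self-contained illustration of the for/else control flow (hypothetical helper, not part of the diff):

def chunks_if_all_available(ids, seen_chunk):
    # The else branch runs only if the loop completed without break,
    # mirroring the "ids = None; break" / else pattern in process_file.
    chunks = None
    for chunk_id in ids:
        if not seen_chunk(chunk_id):
            break
    else:
        chunks = list(ids)
    return chunks

print(chunks_if_all_available([1, 2], {1, 2, 3}.__contains__))   # [1, 2]
print(chunks_if_all_available([1, 9], {1, 2, 3}.__contains__))   # None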
View file

@@ -14,6 +14,7 @@ def __init__(self, store, crypto):
        self.path = os.path.join(os.path.expanduser('~'), '.dedupestore', 'cache',
                                 '%s.cache' % self.store.uuid)
        self.tid = -1
        self.file_chunks = {}
        self.open()
        if self.tid != self.store.tid:
            self.init(crypto)
@@ -22,13 +23,15 @@ def open(self):
        if not os.path.exists(self.path):
            return
        cache = msgpack.unpackb(open(self.path, 'rb').read())
        version = cache.get('version')
        if version != 1:
            logging.error('Unsupported cache version %r' % version)
            return
        assert cache['version'] == 1
        if cache['store'] != self.store.uuid:
            raise Exception('Cache UUID mismatch')
        self.chunkmap = cache['chunkmap']
        # Discard old file_chunks entries
        for hash, entry in cache['file_chunks'].iteritems():
            count = entry[0]
            if count < 8:
                self.file_chunks[hash] = [count + 1] + list(entry[1:])
        self.tid = cache['tid']

    def init(self, crypto):
@@ -56,6 +59,7 @@ def save(self):
            'store': self.store.uuid,
            'chunkmap': self.chunkmap,
            'tid': self.store.tid,
            'file_chunks': self.file_chunks,
        }
        data = msgpack.packb(cache)
        cachedir = os.path.dirname(self.path)
@@ -90,4 +94,16 @@ def chunk_decref(self, id):
        else:
            self.chunkmap[id] = (count - 1, size)

    def file_known_and_unchanged(self, path_hash, st):
        entry = self.file_chunks.get(path_hash)
        if (entry and entry[1] == st.st_ino
                and entry[2] == st.st_size and entry[3] == st.st_mtime):
            entry[0] = 0  # reset entry age
            return entry[4], entry[2]
        else:
            return None, 0

    def memorize_file_chunks(self, path_hash, st, ids):
        # Entry: Age, inode, size, mtime, chunk ids
        self.file_chunks[path_hash] = 0, st.st_ino, st.st_size, st.st_mtime, ids
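Each file_chunks entry stores (age, inode, size, mtime, chunk ids). open() ages every entry by one on each cache load and drops entries once the age reaches 8, while a successful lookup in file_known_and_unchanged resets the age to 0, so files that keep turning up unchanged stay cached and stale paths eventually expire. A small standalone sketch of that aging rule, assuming the entry layout above (names here are not the module's API):

MAX_AGE = 8   # matches the "count < 8" check in open()

def load_entries(saved):
    # Age every stored entry by one and drop those unused for MAX_AGE loads.
    kept = {}
    for key, (age, inode, size, mtime, ids) in saved.items():
        if age < MAX_AGE:
            kept[key] = [age + 1, inode, size, mtime, ids]
    return kept

def lookup(entries, key, inode, size, mtime):
    # Return the cached chunk ids only if the stat data still matches.
    entry = entries.get(key)
    if entry and entry[1:4] == [inode, size, mtime]:
        entry[0] = 0          # a hit resets the age, as in file_known_and_unchanged
        return entry[4]
    return None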