mirror of https://github.com/borgbackup/borg.git
Switched to using avro serialization for archives
This commit is contained in:
parent
5cd5d761cd
commit
6eb65d07f9
|
@ -1 +1,38 @@
|
|||
# This is a python package
|
||||
# This is a python package
|
||||
|
||||
ARCHIVE_SCHEMA = """
|
||||
{
|
||||
"name": "Archive",
|
||||
"type": "record",
|
||||
"fields" : [
|
||||
{ "name": "name", "type": "string" },
|
||||
{ "name": "ts", "type": "string" },
|
||||
{ "name": "chunks", "type": { "type": "array", "items":
|
||||
{ "type": "record",
|
||||
"name": "Chunk",
|
||||
"fields": [
|
||||
{ "name": "id", "type": {"type": "fixed", "size": 32, "name": "sha256" }},
|
||||
{ "name": "size", "type": "int" }
|
||||
]
|
||||
}
|
||||
}},
|
||||
{ "name": "items", "type": {"type": "array", "items":
|
||||
{
|
||||
"type": "record",
|
||||
"name": "Item",
|
||||
"fields": [
|
||||
{ "name": "type", "type":
|
||||
{ "name": "ItemType", "type": "enum", "symbols": ["FILE", "DIRECTORY"] } },
|
||||
{ "name": "path", "type": "string" },
|
||||
{ "name": "size", "type": ["null", "long"] },
|
||||
{ "name": "chunks", "type": ["null",
|
||||
{ "type": "array", "items": "int" }
|
||||
]}
|
||||
]
|
||||
}
|
||||
}}
|
||||
]
|
||||
}
|
||||
"""
|
||||
from avro import schema
|
||||
archive_schema = schema.parse(ARCHIVE_SCHEMA)
|
||||
|
|
|
@ -2,10 +2,14 @@ import os
|
|||
import hashlib
|
||||
import logging
|
||||
import zlib
|
||||
import cPickle
|
||||
import argparse
|
||||
import sys
|
||||
from cStringIO import StringIO
|
||||
from datetime import datetime
|
||||
|
||||
from avro import io
|
||||
|
||||
from dedupestore import archive_schema
|
||||
from chunkifier import chunkify
|
||||
from cache import Cache, NS_ARCHIVES, NS_CHUNKS
|
||||
from bandstore import BandStore
|
||||
|
@ -41,26 +45,39 @@ class Archive(object):
|
|||
data = self.store.get(NS_ARCHIVES, id)
|
||||
if hashlib.sha256(data).digest() != id:
|
||||
raise Exception('Archive hash did not match')
|
||||
archive = cPickle.loads(zlib.decompress(data))
|
||||
buffer = StringIO(zlib.decompress(data))
|
||||
reader = io.DatumReader(archive_schema)
|
||||
decoder = io.BinaryDecoder(buffer)
|
||||
archive = reader.read(decoder)
|
||||
self.items = archive['items']
|
||||
self.name = archive['name']
|
||||
self.chunks = archive['chunks']
|
||||
for i, (id, csize, osize) in enumerate(archive['chunks']):
|
||||
self.chunk_idx[i] = id
|
||||
for i, chunk in enumerate(archive['chunks']):
|
||||
self.chunk_idx[i] = chunk['id']
|
||||
|
||||
def save(self, name):
|
||||
archive = {'name': name, 'items': self.items, 'chunks': self.chunks}
|
||||
data = zlib.compress(cPickle.dumps(archive))
|
||||
archive = {
|
||||
'name': name,
|
||||
'ts': datetime.utcnow().isoformat(),
|
||||
'items': self.items,
|
||||
'chunks': self.chunks
|
||||
}
|
||||
writer = StringIO()
|
||||
encoder = io.BinaryEncoder(writer)
|
||||
datum_writer = io.DatumWriter(archive_schema)
|
||||
datum_writer.write(archive, encoder)
|
||||
data = zlib.compress(writer.getvalue())
|
||||
print 'archive size: %d' % len(data)
|
||||
self.id = hashlib.sha256(data).digest()
|
||||
self.store.put(NS_ARCHIVES, self.id, data)
|
||||
self.store.commit()
|
||||
|
||||
def add_chunk(self, id, csize, osize):
|
||||
def add_chunk(self, id, size):
|
||||
try:
|
||||
return self.chunk_idx[id]
|
||||
except KeyError:
|
||||
idx = len(self.chunks)
|
||||
self.chunks.append((id, csize, osize))
|
||||
self.chunks.append(dict(id=id, size=size))
|
||||
self.chunk_idx[id] = idx
|
||||
return idx
|
||||
|
||||
|
@ -77,10 +94,10 @@ class Archive(object):
|
|||
chunk_count.setdefault(id, 0)
|
||||
chunk_count[id] += 1
|
||||
for id, c in chunk_count.items():
|
||||
count, csize, osize = cache.chunkmap[id]
|
||||
total_csize += csize
|
||||
count, size = cache.chunkmap[id]
|
||||
total_csize += size
|
||||
if c == count:
|
||||
total_usize += csize
|
||||
total_usize += size
|
||||
return dict(osize=total_osize, csize=total_csize, usize=total_usize)
|
||||
|
||||
def list(self):
|
||||
|
@ -93,7 +110,7 @@ class Archive(object):
|
|||
assert item['path'][0] not in ('/', '\\', ':')
|
||||
path = os.path.join(dest, item['path'])
|
||||
logging.info(path)
|
||||
if item['type'] == 'DIR':
|
||||
if item['type'] == 'DIRECTORY':
|
||||
if not os.path.exists(path):
|
||||
os.makedirs(path)
|
||||
if item['type'] == 'FILE':
|
||||
|
@ -142,7 +159,7 @@ class Archive(object):
|
|||
if name in cache.archives:
|
||||
raise NameError('Archive already exists')
|
||||
for path in paths:
|
||||
for root, dirs, files in os.walk(path):
|
||||
for root, dirs, files in os.walk(unicode(path)):
|
||||
for d in dirs:
|
||||
p = os.path.join(root, d)
|
||||
self.items.append(self.process_dir(p, cache))
|
||||
|
@ -158,7 +175,7 @@ class Archive(object):
|
|||
def process_dir(self, path, cache):
|
||||
path = path.lstrip('/\\:')
|
||||
logging.info(path)
|
||||
return {'type': 'DIR', 'path': path}
|
||||
return {'type': 'DIRECTORY', 'path': path}
|
||||
|
||||
def process_file(self, path, cache):
|
||||
try:
|
||||
|
|
|
@ -1,8 +1,11 @@
|
|||
import cPickle
|
||||
import hashlib
|
||||
import os
|
||||
import sys
|
||||
import zlib
|
||||
from avro import io
|
||||
from cStringIO import StringIO
|
||||
import cPickle
|
||||
|
||||
from dedupestore import archive_schema
|
||||
|
||||
NS_ARCHIVES = 'ARCHIVES'
|
||||
NS_CHUNKS = 'CHUNKS'
|
||||
|
@ -48,13 +51,17 @@ class Cache(object):
|
|||
data = self.store.get(NS_ARCHIVES, id)
|
||||
if hashlib.sha256(data).digest() != id:
|
||||
raise Exception('Archive hash did not match')
|
||||
archive = cPickle.loads(zlib.decompress(data))
|
||||
|
||||
buffer = StringIO(zlib.decompress(data))
|
||||
reader = io.DatumReader(archive_schema)
|
||||
decoder = io.BinaryDecoder(buffer)
|
||||
archive = reader.read(decoder)
|
||||
self.archives[archive['name']] = id
|
||||
for id, csize, osize in archive['chunks']:
|
||||
for id, size in archive['chunks']:
|
||||
if self.seen_chunk(id):
|
||||
self.chunk_incref(id)
|
||||
else:
|
||||
self.init_chunk(id, csize, osize)
|
||||
self.init_chunk(id, size)
|
||||
|
||||
def save(self):
|
||||
assert self.store.state == self.store.OPEN
|
||||
|
@ -78,27 +85,27 @@ class Cache(object):
|
|||
data = hashlib.sha256(data).digest() + data
|
||||
csize = len(data)
|
||||
self.store.put(NS_CHUNKS, id, data)
|
||||
return self.init_chunk(id, csize, osize)
|
||||
return self.init_chunk(id, csize)
|
||||
|
||||
def init_chunk(self, id, csize, osize):
|
||||
self.chunkmap[id] = (1, csize, osize)
|
||||
return id, csize, osize
|
||||
def init_chunk(self, id, size):
|
||||
self.chunkmap[id] = (1, size)
|
||||
return id, size
|
||||
|
||||
def seen_chunk(self, id):
|
||||
count, csize, osize = self.chunkmap.get(id, (0, 0, 0))
|
||||
count, size = self.chunkmap.get(id, (0, 0))
|
||||
return count
|
||||
|
||||
def chunk_incref(self, id):
|
||||
count, csize, osize = self.chunkmap[id]
|
||||
self.chunkmap[id] = (count + 1, csize, osize)
|
||||
return id, csize, osize
|
||||
count, size = self.chunkmap[id]
|
||||
self.chunkmap[id] = (count + 1, size)
|
||||
return id, size
|
||||
|
||||
def chunk_decref(self, id):
|
||||
count, csize, osize = self.chunkmap[id]
|
||||
count, size = self.chunkmap[id]
|
||||
if count == 1:
|
||||
del self.chunkmap[id]
|
||||
self.store.delete(NS_CHUNKS, id)
|
||||
else:
|
||||
self.chunkmap[id] = (count - 1, csize, osize)
|
||||
self.chunkmap[id] = (count - 1, size)
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue