From 78121a8d04383df4c7d478a76b4716f196939a8a Mon Sep 17 00:00:00 2001
From: Thomas Waldmann
Date: Thu, 9 Jun 2016 23:38:12 +0200
Subject: [PATCH 1/2] store item_keys into manifest, fixes #1147

we need a list of valid item metadata keys. using a list stored in the
repo manifest is more future-proof than the hardcoded ITEM_KEYS in the
source code. keys that are in union(item_keys_from_repo,
item_keys_from_source) are considered valid.
---
 borg/archive.py           | 20 +++++++++++++-------
 borg/helpers.py           |  8 +++++++-
 borg/testsuite/archive.py |  2 +-
 3 files changed, 21 insertions(+), 9 deletions(-)
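As a quick illustration of the rule from the commit message, here is a minimal sketch of how the effective key set is derived (abbreviated, made-up key set and helper name; not borg's actual code):

# sketch only: valid keys = union(item_keys_from_repo, item_keys_from_source)
ITEM_KEYS = frozenset([b'path', b'mode', b'mtime'])  # stands in for the hardcoded source list

def effective_item_keys(manifest_dict):
    # keys recorded in the repo manifest; absent in repos written by older borg
    item_keys_from_repo = frozenset(manifest_dict.get(b'item_keys', []))
    return item_keys_from_repo | ITEM_KEYS

A repo written by a newer borg can thus carry keys this source does not know yet, and they still count as valid here.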
diff --git a/borg/archive.py b/borg/archive.py
index 5a5d6125c..8638bc22a 100644
--- a/borg/archive.py
+++ b/borg/archive.py
@@ -224,7 +224,7 @@ Number of files: {0.stats.nfiles}'''.format(
         yield item
 
     def add_item(self, item):
-        unknown_keys = set(item) - ITEM_KEYS
+        unknown_keys = set(item) - self.manifest.item_keys
         assert not unknown_keys, ('unknown item metadata keys detected, please update ITEM_KEYS: %s',
                                   ','.join(k.decode('ascii') for k in unknown_keys))
         if self.show_progress:
@@ -587,9 +587,9 @@ Number of files: {0.stats.nfiles}'''.format(
 
 # this set must be kept complete, otherwise the RobustUnpacker might malfunction:
-ITEM_KEYS = set([b'path', b'source', b'rdev', b'chunks',
-                 b'mode', b'user', b'group', b'uid', b'gid', b'mtime', b'atime', b'ctime',
-                 b'xattrs', b'bsdflags', b'acl_nfs4', b'acl_access', b'acl_default', b'acl_extended', ])
+ITEM_KEYS = frozenset([b'path', b'source', b'rdev', b'chunks',
+                       b'mode', b'user', b'group', b'uid', b'gid', b'mtime', b'atime', b'ctime',
+                       b'xattrs', b'bsdflags', b'acl_nfs4', b'acl_access', b'acl_default', b'acl_extended', ])
 
 
 def valid_msgpacked_item(d, item_keys_serialized):
     """check if the data looks like a msgpacked item metadata dict"""
@@ -623,9 +623,9 @@ def valid_msgpacked_item(d, item_keys_serialized):
 class RobustUnpacker:
     """A restartable/robust version of the streaming msgpack unpacker
     """
-    def __init__(self, validator):
+    def __init__(self, validator, item_keys):
         super().__init__()
-        self.item_keys = [msgpack.packb(name) for name in ITEM_KEYS]
+        self.item_keys = [msgpack.packb(name) for name in item_keys]
         self.validator = validator
         self._buffered_data = []
         self._resync = False
@@ -729,6 +729,11 @@ class ArchiveChecker:
         Iterates through all objects in the repository looking for archive metadata blocks.
         """
         logger.info('Rebuilding missing manifest, this might take some time...')
+        # as we have lost the manifest, we do not know any more what valid item keys we had.
+        # collecting any key we encounter in a damaged repo seems unwise, thus we just use
+        # the hardcoded list from the source code. thus, it is not recommended to rebuild a
+        # lost manifest on an older borg version than the most recent one that was ever used
+        # within this repository (assuming that newer borg versions support more item keys).
         manifest = Manifest(self.key, self.repository)
         for chunk_id, _ in self.chunks.iteritems():
             cdata = self.repository.get(chunk_id)
@@ -806,7 +811,8 @@ class ArchiveChecker:
 
         Missing item chunks will be skipped and the msgpack stream will be restarted
         """
-        unpacker = RobustUnpacker(lambda item: isinstance(item, dict) and b'path' in item)
+        unpacker = RobustUnpacker(lambda item: isinstance(item, dict) and b'path' in item,
+                                  self.manifest.item_keys)
         _state = 0
 
         def missing_chunk_detector(chunk_id):
diff --git a/borg/helpers.py b/borg/helpers.py
index aefcdffc5..e9b8d9a3a 100644
--- a/borg/helpers.py
+++ b/borg/helpers.py
@@ -85,16 +85,19 @@ class Manifest:
 
     MANIFEST_ID = b'\0' * 32
 
-    def __init__(self, key, repository):
+    def __init__(self, key, repository, item_keys=None):
+        from .archive import ITEM_KEYS
         self.archives = {}
         self.config = {}
         self.key = key
         self.repository = repository
+        self.item_keys = frozenset(item_keys) if item_keys is not None else ITEM_KEYS
 
     @classmethod
     def load(cls, repository, key=None):
         from .key import key_factory
         from .repository import Repository
+        from .archive import ITEM_KEYS
         try:
             cdata = repository.get(cls.MANIFEST_ID)
         except Repository.ObjectNotFound:
@@ -112,6 +115,8 @@ class Manifest:
         if manifest.timestamp:
             manifest.timestamp = manifest.timestamp.decode('ascii')
         manifest.config = m[b'config']
+        # valid item keys are whatever is known in the repo or every key we know
+        manifest.item_keys = frozenset(m.get(b'item_keys', [])) | ITEM_KEYS
         return manifest, key
 
     def write(self):
@@ -121,6 +126,7 @@ class Manifest:
             'archives': self.archives,
             'timestamp': self.timestamp,
             'config': self.config,
+            'item_keys': tuple(self.item_keys),
         }))
         self.id = self.key.id_hash(data)
         self.repository.put(self.MANIFEST_ID, self.key.encrypt(data))
diff --git a/borg/testsuite/archive.py b/borg/testsuite/archive.py
index 78ea25465..66676f334 100644
--- a/borg/testsuite/archive.py
+++ b/borg/testsuite/archive.py
@@ -68,7 +68,7 @@ class RobustUnpackerTestCase(BaseTestCase):
         return isinstance(value, dict) and value.get(b'path') in (b'foo', b'bar', b'boo', b'baz')
 
     def process(self, input):
-        unpacker = RobustUnpacker(validator=self._validator)
+        unpacker = RobustUnpacker(validator=self._validator, item_keys=ITEM_KEYS)
         result = []
         for should_sync, chunks in input:
             if should_sync:
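The round-trip of the new manifest field can be pictured with this self-contained sketch (simplified manifest dict; assumes the msgpack-python package). It shows why write() packs tuple(self.item_keys) and why load() reads with m.get(b'item_keys', []):

import msgpack

keys = frozenset([b'path', b'mtime'])
# write: a frozenset is not msgpack-serializable, hence tuple(self.item_keys)
data = msgpack.packb({b'item_keys': tuple(keys)})
# load: the .get() default tolerates manifests written before this change
m = msgpack.unpackb(data)
assert frozenset(m.get(b'item_keys', [])) == keys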
From a7b516514949db068fbc8747a8cec076c3a70f69 Mon Sep 17 00:00:00 2001
From: Thomas Waldmann
Date: Sun, 12 Jun 2016 00:10:39 +0200
Subject: [PATCH 2/2] better validation of item metadata dicts, fixes #1130

the previous check only verified that we got a dict, but it did not
validate the dict keys. this triggered issues with e.g. (invalid)
integer keys.

now it validates the keys:

- some required keys must be present
- the set of keys is a subset of all valid keys
---
 borg/archive.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/borg/archive.py b/borg/archive.py
index 8638bc22a..370134671 100644
--- a/borg/archive.py
+++ b/borg/archive.py
@@ -591,6 +591,9 @@ ITEM_KEYS = frozenset([b'path', b'source', b'rdev', b'chunks',
                        b'mode', b'user', b'group', b'uid', b'gid', b'mtime', b'atime', b'ctime',
                        b'xattrs', b'bsdflags', b'acl_nfs4', b'acl_access', b'acl_default', b'acl_extended', ])
 
+# this is the set of keys that are always present in items:
+REQUIRED_ITEM_KEYS = frozenset([b'path', b'mtime', ])
+
 
 def valid_msgpacked_item(d, item_keys_serialized):
     """check if the data looks like a msgpacked item metadata dict"""
@@ -811,8 +814,8 @@ class ArchiveChecker:
 
         Missing item chunks will be skipped and the msgpack stream will be restarted
         """
-        unpacker = RobustUnpacker(lambda item: isinstance(item, dict) and b'path' in item,
-                                  self.manifest.item_keys)
+        item_keys = self.manifest.item_keys
+        unpacker = RobustUnpacker(lambda item: isinstance(item, dict) and b'path' in item, item_keys)
         _state = 0
 
         def missing_chunk_detector(chunk_id):
@@ -827,6 +830,12 @@ class ArchiveChecker:
             self.error_found = True
             logger.error(msg)
 
+        def valid_item(obj):
+            if not isinstance(obj, StableDict):
+                return False
+            keys = set(obj)
+            return REQUIRED_ITEM_KEYS.issubset(keys) and keys.issubset(item_keys)
+
         i = 0
         for state, items in groupby(archive[b'items'], missing_chunk_detector):
             items = list(items)
@@ -841,7 +850,7 @@ class ArchiveChecker:
                 unpacker.feed(self.key.decrypt(chunk_id, cdata))
                 try:
                     for item in unpacker:
-                        if isinstance(item, dict):
+                        if valid_item(item):
                             yield item
                         else:
                             report('Did not get expected metadata dict when unpacking item metadata', chunk_id, i)
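Restated as a standalone sketch (a plain dict stands in for borg's StableDict, and both key sets are abbreviated), the new check accepts exactly the items the commit message describes; the last assert shows an (invalid) integer key being rejected:

REQUIRED_ITEM_KEYS = frozenset([b'path', b'mtime'])
ITEM_KEYS = frozenset([b'path', b'mtime', b'mode', b'user'])  # abbreviated

def valid_item(obj):
    # valid: a dict containing all required keys and no unknown keys
    if not isinstance(obj, dict):
        return False
    keys = set(obj)
    return REQUIRED_ITEM_KEYS.issubset(keys) and keys.issubset(ITEM_KEYS)

assert valid_item({b'path': b'/x', b'mtime': 0, b'mode': 0o644})
assert not valid_item({b'path': b'/x'})                         # required b'mtime' missing
assert not valid_item({b'path': b'/x', b'mtime': 0, 42: b'y'})  # unknown integer key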