diff --git a/docs/resources.rst b/docs/resources.rst
index 59fa0310a..4ae184946 100644
--- a/docs/resources.rst
+++ b/docs/resources.rst
@@ -38,4 +38,4 @@ Software
 - `BorgWeb - a very simple web UI for BorgBackup `_
 - some other stuff found at the `BorgBackup Github organisation `_
-- `atticmatic `_ (includes borgmatic)
+- `borgmatic `_ - simple wrapper script for BorgBackup that creates and prunes backups
diff --git a/src/borg/archive.py b/src/borg/archive.py
index c10d57d60..a362f9542 100644
--- a/src/borg/archive.py
+++ b/src/borg/archive.py
@@ -690,12 +690,40 @@ def _open_rb(path):
     return os.open(path, flags_normal)
 
 
+def valid_msgpacked_dict(d, keys_serialized):
+    """check if the data looks like a msgpacked dict"""
+    d_len = len(d)
+    if d_len == 0:
+        return False
+    if d[0] & 0xf0 == 0x80:  # object is a fixmap (up to 15 elements)
+        offs = 1
+    elif d[0] == 0xde:  # object is a map16 (up to 2^16-1 elements)
+        offs = 3
+    else:
+        # object is not a map (dict)
+        # note: we must not have dicts with > 2^16-1 elements
+        return False
+    if d_len <= offs:
+        return False
+    # is the first dict key a bytestring?
+    if d[offs] & 0xe0 == 0xa0:  # key is a small bytestring (up to 31 chars)
+        pass
+    elif d[offs] in (0xd9, 0xda, 0xdb):  # key is a str8, str16 or str32
+        pass
+    else:
+        # key is not a bytestring
+        return False
+    # is the bytestring any of the expected key names?
+    key_serialized = d[offs:]
+    return any(key_serialized.startswith(pattern) for pattern in keys_serialized)
+
+
 class RobustUnpacker:
     """A restartable/robust version of the streaming msgpack unpacker
     """
-    def __init__(self, validator):
+    def __init__(self, validator, item_keys):
         super().__init__()
-        self.item_keys = [msgpack.packb(name.encode()) for name in ITEM_KEYS]
+        self.item_keys = [msgpack.packb(name.encode()) for name in item_keys]
         self.validator = validator
         self._buffered_data = []
         self._resync = False
@@ -720,18 +748,10 @@ def __next__(self):
         while self._resync:
             if not data:
                 raise StopIteration
-            # Abort early if the data does not look like a serialized dict
-            if len(data) < 2 or ((data[0] & 0xf0) != 0x80) or ((data[1] & 0xe0) != 0xa0):
+            # Abort early if the data does not look like a serialized item dict
+            if not valid_msgpacked_dict(data, self.item_keys):
                 data = data[1:]
                 continue
-            # Make sure it looks like an item dict
-            for pattern in self.item_keys:
-                if data[1:].startswith(pattern):
-                    break
-            else:
-                data = data[1:]
-                continue
-
             self._unpacker = msgpack.Unpacker(object_hook=StableDict)
             self._unpacker.feed(data)
             try:
@@ -806,7 +826,12 @@ def init_chunks(self):
             self.chunks[id_] = init_entry
 
     def identify_key(self, repository):
-        cdata = repository.get(next(self.chunks.iteritems())[0])
+        try:
+            some_chunkid, _ = next(self.chunks.iteritems())
+        except StopIteration:
+            # repo is completely empty, no chunks
+            return None
+        cdata = repository.get(some_chunkid)
         return key_factory(repository, cdata)
 
     def verify_data(self):
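To make the byte-level checks in `valid_msgpacked_dict()` concrete, here is a minimal sketch of the msgpack encodings it relies on (assuming compat-mode serialization, `use_bin_type=False`, where bytestrings pack as raw/fixstr, which is what borg's usage corresponds to here):

```python
# Sketch: the msgpack headers that valid_msgpacked_dict() inspects.
# Assumption: compat-mode packing (use_bin_type=False), so bytes keys
# serialize as raw/fixstr.
import msgpack

packed = msgpack.packb({b'path': b'/etc/hosts'}, use_bin_type=False)
assert packed[0] & 0xf0 == 0x80   # fixmap marker: 0x80 | element count
assert packed[1] & 0xe0 == 0xa0   # first key is a fixstr (<= 31 bytes)
# the serialized first key can be matched by a simple prefix comparison:
assert packed[1:].startswith(msgpack.packb(b'path', use_bin_type=False))

# anything that is not a map is rejected on the first byte already,
# e.g. a fixarray starts with 0x90..0x9f:
assert msgpack.packb([b'path'], use_bin_type=False)[0] & 0xf0 == 0x90
```

This is why the resync loop above can simply skip forward one byte at a time until a plausible item start is found.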
@@ -834,13 +859,26 @@ def rebuild_manifest(self):
 
         Iterates through all objects in the repository looking for archive metadata blocks.
         """
+        required_archive_keys = frozenset(key.encode() for key in REQUIRED_ARCHIVE_KEYS)
+
+        def valid_archive(obj):
+            if not isinstance(obj, dict):
+                return False
+            keys = set(obj)
+            return required_archive_keys.issubset(keys)
+
         logger.info('Rebuilding missing manifest, this might take some time...')
+        # as we have lost the manifest, we do not know any more what valid item keys we had.
+        # collecting any key we encounter in a damaged repo seems unwise, thus we just use
+        # the hardcoded list from the source code. therefore, it is not recommended to rebuild
+        # a lost manifest on an older borg version than the most recent one that was ever used
+        # within this repository (assuming that newer borg versions support more item keys).
         manifest = Manifest(self.key, self.repository)
+        archive_keys_serialized = [msgpack.packb(name.encode()) for name in ARCHIVE_KEYS]
         for chunk_id, _ in self.chunks.iteritems():
             cdata = self.repository.get(chunk_id)
             _, data = self.key.decrypt(chunk_id, cdata)
-            # Some basic sanity checks of the payload before feeding it into msgpack
-            if len(data) < 2 or ((data[0] & 0xf0) != 0x80) or ((data[1] & 0xe0) != 0xa0):
+            if not valid_msgpacked_dict(data, archive_keys_serialized):
                 continue
             if b'cmdline' not in data or b'\xa7version\x01' not in data:
                 continue
@@ -850,7 +888,7 @@ def rebuild_manifest(self):
             # msgpack with invalid data
             except (TypeError, ValueError, StopIteration):
                 continue
-            if isinstance(archive, dict) and b'items' in archive and b'cmdline' in archive:
+            if valid_archive(archive):
                 logger.info('Found archive %s', archive[b'name'].decode('utf-8'))
                 manifest.archives[archive[b'name'].decode('utf-8')] = {b'id': chunk_id, b'time': archive[b'time']}
         logger.info('Manifest rebuild complete.')
@@ -912,7 +950,10 @@ def robust_iterator(archive):
             Missing item chunks will be skipped and the msgpack stream will be restarted
             """
-            unpacker = RobustUnpacker(lambda item: isinstance(item, dict) and 'path' in item)
+            item_keys = frozenset(key.encode() for key in self.manifest.item_keys)
+            required_item_keys = frozenset(key.encode() for key in REQUIRED_ITEM_KEYS)
+            unpacker = RobustUnpacker(lambda item: isinstance(item, dict) and 'path' in item,
+                                      self.manifest.item_keys)
             _state = 0
 
             def missing_chunk_detector(chunk_id):
@@ -927,6 +968,12 @@ def report(msg, chunk_id, chunk_no):
                 self.error_found = True
                 logger.error(msg)
 
+            def valid_item(obj):
+                if not isinstance(obj, StableDict):
+                    return False
+                keys = set(obj)
+                return required_item_keys.issubset(keys) and keys.issubset(item_keys)
+
             i = 0
             for state, items in groupby(archive[b'items'], missing_chunk_detector):
                 items = list(items)
@@ -942,7 +989,7 @@ def report(msg, chunk_id, chunk_no):
                     unpacker.feed(data)
                     try:
                         for item in unpacker:
-                            if isinstance(item, dict):
+                            if valid_item(item):
                                 yield Item(internal_dict=item)
                             else:
                                 report('Did not get expected metadata dict when unpacking item metadata', chunk_id, i)
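The new `valid_item()` check is two-sided: an item must carry all required keys and must not carry any key outside the set the manifest declares. A toy sketch of that logic (illustrative key sets only, not a real repository's):

```python
# Sketch of the two-sided key check valid_item() performs (toy key sets):
required_item_keys = frozenset([b'path', b'mtime'])             # must all be present
item_keys = frozenset([b'path', b'mtime', b'mode', b'chunks'])  # nothing else allowed

def looks_like_item(keys):
    keys = set(keys)
    return required_item_keys.issubset(keys) and keys.issubset(item_keys)

assert looks_like_item({b'path', b'mtime', b'mode'})      # plausible item
assert not looks_like_item({b'path'})                     # required 'mtime' missing
assert not looks_like_item({b'path', b'mtime', b'zzz'})   # unknown key, likely garbage
```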
diff --git a/src/borg/archiver.py b/src/borg/archiver.py
index f0087ceda..06d8d0085 100644
--- a/src/borg/archiver.py
+++ b/src/borg/archiver.py
@@ -26,7 +26,8 @@
 from .archive import Archive, ArchiveChecker, ArchiveRecreater, Statistics
 from .cache import Cache
 from .constants import *  # NOQA
-from .helpers import Error
+from .helpers import EXIT_SUCCESS, EXIT_WARNING, EXIT_ERROR
+from .helpers import Error, NoManifestError
 from .helpers import location_validator, archivename_validator, ChunkerParams, CompressionSpec
 from .helpers import ItemFormatter, format_time, format_file_size, format_archive
 from .helpers import safe_encode, remove_surrogates, bin_to_hex
@@ -665,10 +666,11 @@ def do_rename(self, args, repository, manifest, key, cache, archive):
         cache.commit()
         return self.exit_code
 
-    @with_repository(exclusive=True)
-    def do_delete(self, args, repository, manifest, key):
+    @with_repository(exclusive=True, manifest=False)
+    def do_delete(self, args, repository):
         """Delete an existing repository or archive"""
         if args.location.archive:
+            manifest, key = Manifest.load(repository)
             with Cache(repository, key, manifest, lock_wait=self.lock_wait) as cache:
                 archive = Archive(repository, key, manifest, args.location.archive, cache=cache)
                 stats = Statistics()
@@ -685,9 +687,15 @@ def do_delete(self, args, repository, manifest, key):
         else:
             if not args.cache_only:
                 msg = []
-                msg.append("You requested to completely DELETE the repository *including* all archives it contains:")
-                for archive_info in manifest.list_archive_infos(sort_by='ts'):
-                    msg.append(format_archive(archive_info))
+                try:
+                    manifest, key = Manifest.load(repository)
+                except NoManifestError:
+                    msg.append("You requested to completely DELETE the repository *including* all archives it may contain.")
+                    msg.append("This repository seems to have no manifest, so we can't tell anything about its contents.")
+                else:
+                    msg.append("You requested to completely DELETE the repository *including* all archives it contains:")
+                    for archive_info in manifest.list_archive_infos(sort_by='ts'):
+                        msg.append(format_archive(archive_info))
                 msg.append("Type 'YES' if you understand this and want to continue: ")
                 msg = '\n'.join(msg)
                 if not yes(msg, false_msg="Aborting.", truish=('YES', ),
diff --git a/src/borg/constants.py b/src/borg/constants.py
index 1a970887d..471cf8974 100644
--- a/src/borg/constants.py
+++ b/src/borg/constants.py
@@ -1,7 +1,18 @@
 # this set must be kept complete, otherwise the RobustUnpacker might malfunction:
-ITEM_KEYS = set(['path', 'source', 'rdev', 'chunks', 'hardlink_master',
-                 'mode', 'user', 'group', 'uid', 'gid', 'mtime', 'atime', 'ctime',
-                 'xattrs', 'bsdflags', 'acl_nfs4', 'acl_access', 'acl_default', 'acl_extended', ])
+ITEM_KEYS = frozenset(['path', 'source', 'rdev', 'chunks', 'hardlink_master',
+                       'mode', 'user', 'group', 'uid', 'gid', 'mtime', 'atime', 'ctime',
+                       'xattrs', 'bsdflags', 'acl_nfs4', 'acl_access', 'acl_default', 'acl_extended', ])
+
+# this is the set of keys that are always present in items:
+REQUIRED_ITEM_KEYS = frozenset(['path', 'mtime', ])
+
+# this set must be kept complete, otherwise rebuild_manifest might malfunction:
+ARCHIVE_KEYS = frozenset(['version', 'name', 'items', 'cmdline', 'hostname', 'username', 'time', 'time_end',
+                          'comment', 'chunker_params',
+                          'recreate_cmdline', 'recreate_source_id', 'recreate_args'])
+
+# this is the set of keys that are always present in archives:
+REQUIRED_ARCHIVE_KEYS = frozenset(['version', 'name', 'items', 'cmdline', 'time', ])
 
 ARCHIVE_TEXT_KEYS = (b'name', b'comment', b'hostname', b'username', b'time', b'time_end')
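These constants carry an implicit invariant: every "required" key must also be in the corresponding "complete" set, otherwise `valid_item()` / `valid_archive()` could never accept anything. A hypothetical sanity-check test (not part of this patch) would pin that down:

```python
# Hypothetical guard for the constants above (not in this patch): the
# "required" sets must be subsets of the "complete" sets, otherwise the
# subset checks in valid_item() / valid_archive() reject every object.
from borg.constants import (ITEM_KEYS, REQUIRED_ITEM_KEYS,
                            ARCHIVE_KEYS, REQUIRED_ARCHIVE_KEYS)

def test_required_keys_are_known_keys():
    assert REQUIRED_ITEM_KEYS <= ITEM_KEYS
    assert REQUIRED_ARCHIVE_KEYS <= ARCHIVE_KEYS
```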
diff --git a/src/borg/helpers.py b/src/borg/helpers.py
index 4176825d6..f3d7f0f6b 100644
--- a/src/borg/helpers.py
+++ b/src/borg/helpers.py
@@ -84,6 +84,10 @@ class ExtensionModuleError(Error):
     """The Borg binary extension modules do not seem to be properly installed"""
 
 
+class NoManifestError(Error):
+    """Repository has no manifest."""
+
+
 def check_extension_modules():
     from . import platform
     if hashindex.API_VERSION != 2:
@@ -100,11 +104,12 @@ class Manifest:
 
     MANIFEST_ID = b'\0' * 32
 
-    def __init__(self, key, repository):
+    def __init__(self, key, repository, item_keys=None):
         self.archives = {}
         self.config = {}
         self.key = key
         self.repository = repository
+        self.item_keys = frozenset(item_keys) if item_keys is not None else ITEM_KEYS
 
     @property
     def id_str(self):
@@ -113,7 +118,11 @@ def id_str(self):
     @classmethod
     def load(cls, repository, key=None):
         from .key import key_factory
-        cdata = repository.get(cls.MANIFEST_ID)
+        from .repository import Repository
+        try:
+            cdata = repository.get(cls.MANIFEST_ID)
+        except Repository.ObjectNotFound:
+            raise NoManifestError
         if not key:
             key = key_factory(repository, cdata)
         manifest = cls(key, repository)
@@ -127,6 +136,8 @@ def load(cls, repository, key=None):
         if manifest.timestamp:
             manifest.timestamp = manifest.timestamp.decode('ascii')
         manifest.config = m[b'config']
+        # valid item keys are whatever is known in the repo or every key we know
+        manifest.item_keys = ITEM_KEYS | frozenset(key.decode() for key in m.get(b'item_keys', []))
         return manifest, key
 
     def write(self):
@@ -136,6 +147,7 @@ def write(self):
             'archives': self.archives,
             'timestamp': self.timestamp,
             'config': self.config,
+            'item_keys': tuple(self.item_keys),
         }))
         self.id = self.key.id_hash(data)
         self.repository.put(self.MANIFEST_ID, self.key.encrypt(Chunk(data)))
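The net effect of the load/write changes is that `item_keys` can only grow: whatever key names the repository recorded are unioned with the hardcoded set. A minimal round-trip sketch (toy key set; assuming compat msgpack settings, `use_bin_type=False` on pack and `raw=True` on unpack, so keys come back as bytes as they do in borg):

```python
# Sketch of the item_keys round trip through the manifest (toy key set).
import msgpack

ITEM_KEYS = frozenset(['path', 'mtime', 'mode'])  # toy stand-in for the constant
# write side: Manifest.write() stores the current key names in the manifest
written = msgpack.packb({'item_keys': ('path', 'mtime', 'future_key')},
                        use_bin_type=False)
# read side: Manifest.load() unions them with the hardcoded set, so a repo
# written by a newer borg keeps its extra keys valid on load
m = msgpack.unpackb(written, raw=True)
item_keys = ITEM_KEYS | frozenset(key.decode() for key in m.get(b'item_keys', []))
assert 'future_key' in item_keys and ITEM_KEYS <= item_keys
```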
diff --git a/src/borg/testsuite/archive.py b/src/borg/testsuite/archive.py
index a2ee23f5e..70a03eb6b 100644
--- a/src/borg/testsuite/archive.py
+++ b/src/borg/testsuite/archive.py
@@ -6,7 +6,7 @@
 import pytest
 import msgpack
 
-from ..archive import Archive, CacheChunkBuffer, RobustUnpacker, Statistics
+from ..archive import Archive, CacheChunkBuffer, RobustUnpacker, valid_msgpacked_dict, ITEM_KEYS, Statistics
 from ..item import Item
 from ..key import PlaintextKey
 from ..helpers import Manifest
@@ -139,7 +139,7 @@ def _validator(self, value):
         return isinstance(value, dict) and value.get(b'path') in (b'foo', b'bar', b'boo', b'baz')
 
     def process(self, input):
-        unpacker = RobustUnpacker(validator=self._validator)
+        unpacker = RobustUnpacker(validator=self._validator, item_keys=ITEM_KEYS)
         result = []
         for should_sync, chunks in input:
             if should_sync:
@@ -184,3 +184,35 @@ def test_corrupt_chunk(self):
         input = [(False, chunks[:3]), (True, [b'gar', b'bage'] + chunks[3:])]
         result = self.process(input)
         self.assert_equal(result, [{b'path': b'foo'}, {b'path': b'boo'}, {b'path': b'baz'}])
+
+
+@pytest.fixture
+def item_keys_serialized():
+    return [msgpack.packb(name) for name in ITEM_KEYS]
+
+
+@pytest.mark.parametrize('packed',
+                         [b'', b'x', b'foobar', ] +
+                         [msgpack.packb(o) for o in (
+                             [None, 0, 0.0, False, '', {}, [], ()] +
+                             [42, 23.42, True, b'foobar', {b'foo': b'bar'}, [b'foo', b'bar'], (b'foo', b'bar')]
+                         )])
+def test_invalid_msgpacked_item(packed, item_keys_serialized):
+    assert not valid_msgpacked_dict(packed, item_keys_serialized)
+
+
+@pytest.mark.parametrize('packed',
+                         [msgpack.packb(o) for o in [
+                             {b'path': b'/a/b/c'},  # small (different msgpack mapping type!)
+                             dict((k, b'') for k in ITEM_KEYS),  # as big (key count) as it gets
+                             dict((k, b'x' * 1000) for k in ITEM_KEYS),  # as big (key count and volume) as it gets
+                         ]])
+def test_valid_msgpacked_items(packed, item_keys_serialized):
+    assert valid_msgpacked_dict(packed, item_keys_serialized)
+
+
+def test_key_length_msgpacked_items():
+    key = b'x' * 32  # 31 bytes is the limit for fixstr msgpack type
+    data = {key: b''}
+    item_keys_serialized = [msgpack.packb(key), ]
+    assert valid_msgpacked_dict(msgpack.packb(data), item_keys_serialized)
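For context on the last test: 31 bytes is the longest key that still fits the fixstr format; at 32 bytes msgpack must switch formats, which is exactly the case `valid_msgpacked_dict()` still has to accept via the str8/str16/str32 branch. A small sketch (assuming compat-mode packing, `use_bin_type=False`, in which msgpack-python jumps from fixstr straight to str16, since str8 belongs to the newer spec):

```python
# Sketch: the fixstr boundary exercised by test_key_length_msgpacked_items.
# Assumption: compat-mode packing (use_bin_type=False).
import msgpack

short = msgpack.packb(b'x' * 31, use_bin_type=False)
long_ = msgpack.packb(b'x' * 32, use_bin_type=False)
assert short[0] & 0xe0 == 0xa0          # fixstr: 0xa0 | length (max 31)
assert long_[0] == 0xda                 # 32 bytes no longer fit -> str16 here
assert long_[0] in (0xd9, 0xda, 0xdb)   # all three str formats are accepted
```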