From 918e0b2a5290cddc25988aa19da27681fa60d57f Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Thu, 9 Jun 2016 21:08:39 +0200 Subject: [PATCH] fix resync and msgpacked item qualifier, fixes #1135 when trying to resync and skip invalid data, borg tries to qualify a byte sequence as valid-looking msgpacked item metadata dict (or not) before even invoking msgpack's unpack. besides previously hard to understand code, there were 2 issues: - a missing check for map16 - this type is what msgpack uses if the dict has more than 15 items (could happen in future, not for 1.0.x). - missing checks for str8/16/32 - str16 is what msgpack uses if the bytestring has more than 31 bytes (borg does not have that long key names, thus this wasn't causing any harm) this misqualification (valid data considered invalid) could lead to a wrong resync, skipping valid items. added more comments and tests. --- borg/archive.py | 40 +++++++++++++++++++++++++++++---------- borg/testsuite/archive.py | 35 +++++++++++++++++++++++++++++++++- 2 files changed, 64 insertions(+), 11 deletions(-) diff --git a/borg/archive.py b/borg/archive.py index b10bac07a..93d90fc9c 100644 --- a/borg/archive.py +++ b/borg/archive.py @@ -592,6 +592,34 @@ ITEM_KEYS = set([b'path', b'source', b'rdev', b'chunks', b'xattrs', b'bsdflags', b'acl_nfs4', b'acl_access', b'acl_default', b'acl_extended', ]) +def valid_msgpacked_item(d, item_keys_serialized): + """check if the data looks like a msgpacked item metadata dict""" + d_len = len(d) + if d_len == 0: + return False + if d[0] & 0xf0 == 0x80: # object is a fixmap (up to 15 elements) + offs = 1 + elif d[0] == 0xde: # object is a map16 (up to 2^16-1 elements) + offs = 3 + else: + # object is not a map (dict) + # note: we must not have item dicts with > 2^16-1 elements + return False + if d_len <= offs: + return False + # is the first dict key a bytestring? + if d[offs] & 0xe0 == 0xa0: # key is a small bytestring (up to 31 chars) + pass + elif d[offs] in (0xd9, 0xda, 0xdb): # key is a str8, str16 or str32 + pass + else: + # key is not a bytestring + return False + # is the bytestring any of the expected key names? + key_serialized = d[offs:] + return any(key_serialized.startswith(pattern) for pattern in item_keys_serialized) + + class RobustUnpacker: """A restartable/robust version of the streaming msgpack unpacker """ @@ -622,18 +650,10 @@ class RobustUnpacker: while self._resync: if not data: raise StopIteration - # Abort early if the data does not look like a serialized dict - if len(data) < 2 or ((data[0] & 0xf0) != 0x80) or ((data[1] & 0xe0) != 0xa0): + # Abort early if the data does not look like a serialized item dict + if not valid_msgpacked_item(data, self.item_keys): data = data[1:] continue - # Make sure it looks like an item dict - for pattern in self.item_keys: - if data[1:].startswith(pattern): - break - else: - data = data[1:] - continue - self._unpacker = msgpack.Unpacker(object_hook=StableDict) self._unpacker.feed(data) try: diff --git a/borg/testsuite/archive.py b/borg/testsuite/archive.py index 662d776c9..78ea25465 100644 --- a/borg/testsuite/archive.py +++ b/borg/testsuite/archive.py @@ -2,8 +2,9 @@ from datetime import datetime, timezone from unittest.mock import Mock import msgpack +import pytest -from ..archive import Archive, CacheChunkBuffer, RobustUnpacker +from ..archive import Archive, CacheChunkBuffer, RobustUnpacker, valid_msgpacked_item, ITEM_KEYS from ..key import PlaintextKey from ..helpers import Manifest from . import BaseTestCase @@ -112,3 +113,35 @@ class RobustUnpackerTestCase(BaseTestCase): input = [(False, chunks[:3]), (True, [b'gar', b'bage'] + chunks[3:])] result = self.process(input) self.assert_equal(result, [{b'path': b'foo'}, {b'path': b'boo'}, {b'path': b'baz'}]) + + +@pytest.fixture +def item_keys_serialized(): + return [msgpack.packb(name) for name in ITEM_KEYS] + + +@pytest.mark.parametrize('packed', + [b'', b'x', b'foobar', ] + + [msgpack.packb(o) for o in ( + [None, 0, 0.0, False, '', {}, [], ()] + + [42, 23.42, True, b'foobar', {b'foo': b'bar'}, [b'foo', b'bar'], (b'foo', b'bar')] + )]) +def test_invalid_msgpacked_item(packed, item_keys_serialized): + assert not valid_msgpacked_item(packed, item_keys_serialized) + + +@pytest.mark.parametrize('packed', + [msgpack.packb(o) for o in [ + {b'path': b'/a/b/c'}, # small (different msgpack mapping type!) + dict((k, b'') for k in ITEM_KEYS), # as big (key count) as it gets + dict((k, b'x' * 1000) for k in ITEM_KEYS), # as big (key count and volume) as it gets + ]]) +def test_valid_msgpacked_items(packed, item_keys_serialized): + assert valid_msgpacked_item(packed, item_keys_serialized) + + +def test_key_length_msgpacked_items(): + key = b'x' * 32 # 31 bytes is the limit for fixstr msgpack type + data = {key: b''} + item_keys_serialized = [msgpack.packb(key), ] + assert valid_msgpacked_item(msgpack.packb(data), item_keys_serialized)