Mirror of https://github.com/borgbackup/borg.git (synced 2025-02-24 23:13:25 +00:00)
cleanup msgpack related str/bytes mess, fixes #968
See the ticket and the borg.helpers.msgpack docstring. This changeset implements the full migration to the msgpack 2.0 spec (use_bin_type=True, raw=False). Compatibility with data packed the old way is still provided via the want_bytes decoder in borg.item.
This commit is contained in:
parent f8dbe5b542
commit 8e87f1111b

11 changed files with 124 additions and 129 deletions
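For context, the sketch below (not borg code; it uses the upstream msgpack-python package directly) illustrates the ambiguity this changeset removes: with the old packing settings, str and bytes collapse into the same msgpack "raw" type, so a reader can only get bytes back, while the msgpack 2.0 spec settings (use_bin_type=True on pack, raw=False on unpack) let both types survive a round trip unchanged.

import msgpack

item = {'path': '/etc/hostname', 'data': b'\x00\x01'}

# Old way (borg < 1.3): use_bin_type=False packs both str and bytes as "raw",
# so unpacking (raw=True) cannot tell them apart and returns bytes everywhere.
old = msgpack.unpackb(msgpack.packb(item, use_bin_type=False), raw=True)
assert old == {b'path': b'/etc/hostname', b'data': b'\x00\x01'}

# New way (this changeset): use_bin_type=True packs str as "raw" and bytes as "bin",
# and raw=False unpacking restores each value to its original Python type.
new = msgpack.unpackb(msgpack.packb(item, use_bin_type=True), raw=False)
assert new == item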
@@ -1718,13 +1718,10 @@ def rebuild_manifest(self):
         Iterates through all objects in the repository looking for archive metadata blocks.
         """
-        required_archive_keys = frozenset(key.encode() for key in REQUIRED_ARCHIVE_KEYS)
-
         def valid_archive(obj):
             if not isinstance(obj, dict):
                 return False
-            keys = set(obj)
-            return required_archive_keys.issubset(keys)
+            return REQUIRED_ARCHIVE_KEYS.issubset(obj)

         logger.info('Rebuilding missing manifest, this might take some time...')
         # as we have lost the manifest, we do not know any more what valid item keys we had.

@@ -1904,7 +1901,7 @@ def list_keys_safe(keys):
 def valid_item(obj):
     if not isinstance(obj, StableDict):
         return False, 'not a dictionary'
-    keys = set(k.decode('utf-8', errors='replace') for k in obj)
+    keys = set(obj)
     if not required_item_keys.issubset(keys):
         return False, 'missing required keys: ' + list_keys_safe(required_item_keys - keys)
     if not keys.issubset(item_keys):

@@ -2331,7 +2331,7 @@ def output(fd):
     unpacker = msgpack.Unpacker(use_list=False, object_hook=StableDict)
     first = True
-    for item_id in archive_org_dict[b'items']:
+    for item_id in archive_org_dict['items']:
         data = key.decrypt(item_id, repository.get(item_id))
         unpacker.feed(data)
         for item in unpacker:
@@ -232,24 +232,24 @@ def unpack_and_verify_manifest(self, data, force_tam_not_required=False):
         unpacker = get_limited_unpacker('manifest')
         unpacker.feed(data)
         unpacked = unpacker.unpack()
-        if b'tam' not in unpacked:
+        if 'tam' not in unpacked:
             if tam_required:
                 raise TAMRequiredError(self.repository._location.canonical_path())
             else:
                 logger.debug('TAM not found and not required')
                 return unpacked, False
-        tam = unpacked.pop(b'tam', None)
+        tam = unpacked.pop('tam', None)
         if not isinstance(tam, dict):
             raise TAMInvalid()
-        tam_type = tam.get(b'type', b'<none>').decode('ascii', 'replace')
+        tam_type = tam.get('type', '<none>')
         if tam_type != 'HKDF_HMAC_SHA512':
             if tam_required:
                 raise TAMUnsupportedSuiteError(repr(tam_type))
             else:
                 logger.debug('Ignoring TAM made with unsupported suite, since TAM is not required: %r', tam_type)
                 return unpacked, False
-        tam_hmac = tam.get(b'hmac')
-        tam_salt = tam.get(b'salt')
+        tam_hmac = tam.get('hmac')
+        tam_salt = tam.get('salt')
         if not isinstance(tam_salt, bytes) or not isinstance(tam_hmac, bytes):
             raise TAMInvalid()
         offset = data.index(tam_hmac)
@@ -2,8 +2,7 @@
 wrapping msgpack
 ================

-Due to the planned breaking api changes in upstream msgpack, we wrap it the way we need it -
-to avoid having lots of clutter in the calling code. see tickets #968 and #3632.
+We wrap msgpack here the way we need it - to avoid having lots of clutter in the calling code.

 Packing
 -------

@@ -22,30 +21,27 @@

 Unpacking
 ---------
-- raw = True (the old way, used by borg <= 1.3)
-  This is currently still needed to not try to decode "raw" msgpack objects.
-  These could come either from str (new or old msgpack) or bytes (old msgpack).
-  Thus, we basically must know what we want and either keep the bytes we get
-  or decode them to str, if we want str.
-
-- raw = False (the new way)
-  This can be used in future, when we do not have to deal with data any more that was packed the old way.
+- raw = False (used by borg since borg 1.3)
+  We already can use this with borg 1.3 due to the want_bytes decoder.
+  This decoder can be removed in future, when we do not have to deal with data any more that was packed the old way.
   It will then unpack according to the msgpack 2.0 spec format and directly output bytes or str.

+- raw = True (the old way, used by borg < 1.3)
+
 - unicode_errors = 'surrogateescape' -> see description above (will be used when raw is False).

-As of borg 1.3, we have the first part on the way to fix the msgpack str/bytes mess, #968.
-borg now still needs to **read** old repos, archives, keys, ... so we can not yet fix it completely.
-But from now on, borg only **writes** new data according to the new msgpack spec,
-thus we can complete the fix for #968 in a later borg release.
+As of borg 1.3, we have fixed most of the msgpack str/bytes mess, #968.
+Borg now still needs to **read** old repos, archives, keys, ... so we can not yet fix it completely.
+But from now on, borg only **writes** new data according to the new msgpack 2.0 spec,
+thus we can remove some legacy support in a later borg release (some places are marked with "legacy").

 current way in msgpack terms
 ----------------------------

 - pack with use_bin_type=True (according to msgpack 2.0 spec)
   - packs str -> raw and bytes -> bin
 - unpack with raw=True (aka "the old way")
   - unpacks raw to bytes (thus we always need to decode manually if we want str)
 - unpack with raw=False (according to msgpack 2.0 spec, using unicode_errors='surrogateescape')
   - unpacks bin to bytes and raw to str (thus we need to re-encode manually if we want bytes from "raw")
 """

 from .datastruct import StableDict
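The pack/unpack combinations listed in that docstring can be demonstrated with a minimal sketch using the upstream msgpack package directly (not the wrapper defined in this module):

import msgpack

packed = msgpack.packb({'name': 'text', 'id': b'\x00\xff'}, use_bin_type=True)

# unpack with raw=True ("the old way"): str and bytes both come back as bytes.
assert msgpack.unpackb(packed, raw=True) == {b'name': b'text', b'id': b'\x00\xff'}

# unpack with raw=False (msgpack 2.0 spec): bin -> bytes, raw -> str.
assert msgpack.unpackb(packed, raw=False) == {'name': 'text', 'id': b'\x00\xff'}

# unicode_errors='surrogateescape' lets non-UTF-8 "raw" data (e.g. file names packed
# the old way) survive a bytes -> str -> bytes round trip without losing information.
legacy = msgpack.packb(b'\xe9tude', use_bin_type=False)
name = msgpack.unpackb(legacy, raw=False, unicode_errors='surrogateescape')
assert name.encode('utf-8', 'surrogateescape') == b'\xe9tude'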
@@ -66,8 +62,8 @@
 version = mp_version

 USE_BIN_TYPE = True
-RAW = True  # should become False later when we do not need to read old stuff any more
-UNICODE_ERRORS = 'surrogateescape'  # previously done by safe_encode, safe_decode
+RAW = False
+UNICODE_ERRORS = 'surrogateescape'


 class PackException(Exception):

@@ -161,7 +157,7 @@ def unpackb(packed, *, raw=RAW, unicode_errors=UNICODE_ERRORS,
 def unpack(stream, *, raw=RAW, unicode_errors=UNICODE_ERRORS,
            strict_map_key=False,
            **kwargs):
-    # assert raw == RAW
+    assert raw == RAW
     assert unicode_errors == UNICODE_ERRORS
     try:
         kw = dict(raw=raw, unicode_errors=unicode_errors,
@@ -60,6 +60,15 @@ def fix_tuple_of_str_and_int(t):
     return t


+def want_bytes(v):
+    """we know that we want bytes and the value should be bytes"""
+    # legacy support: it being str can be caused by msgpack unpack decoding old data that was packed with use_bin_type=False
+    if isinstance(v, str):
+        v = v.encode('utf-8', errors='surrogateescape')
+    assert isinstance(v, bytes)
+    return v
+
+
 class PropDict:
     """
     Manage a dictionary via properties.
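The decode=want_bytes arguments added in the hunks below plug this helper into the property machinery. A minimal, hypothetical sketch (a simplified stand-in, not borg's real PropDict._make_property) of how such a decode hook normalizes legacy str values to bytes on assignment:

def want_bytes(v):  # as defined above; legacy str values get re-encoded to bytes
    return v.encode('utf-8', 'surrogateescape') if isinstance(v, str) else v

def make_property(key, value_type, decode=None):
    # hypothetical, simplified property factory for illustration only
    def _set(self, value):
        if decode is not None:
            value = decode(value)
        if not isinstance(value, value_type):
            raise TypeError(f'{key} must be {value_type.__name__}')
        self.__dict__[key] = value
    return property(lambda self: self.__dict__[key], _set)

class DemoItem:
    hlid = make_property('hlid', bytes, decode=want_bytes)

item = DemoItem()
item.hlid = 'abc'           # legacy str value, as produced by unpacking old data with raw=False
assert item.hlid == b'abc'  # normalized to bytes by the decode hook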
@@ -204,10 +213,10 @@ class Item(PropDict):
     user = PropDict._make_property('user', (str, type(None)), 'surrogate-escaped str or None')
     group = PropDict._make_property('group', (str, type(None)), 'surrogate-escaped str or None')

-    acl_access = PropDict._make_property('acl_access', bytes)
-    acl_default = PropDict._make_property('acl_default', bytes)
-    acl_extended = PropDict._make_property('acl_extended', bytes)
-    acl_nfs4 = PropDict._make_property('acl_nfs4', bytes)
+    acl_access = PropDict._make_property('acl_access', bytes, decode=want_bytes)
+    acl_default = PropDict._make_property('acl_default', bytes, decode=want_bytes)
+    acl_extended = PropDict._make_property('acl_extended', bytes, decode=want_bytes)
+    acl_nfs4 = PropDict._make_property('acl_nfs4', bytes, decode=want_bytes)

     mode = PropDict._make_property('mode', int)
     uid = PropDict._make_property('uid', int)

@@ -224,7 +233,7 @@ class Item(PropDict):
     # compatibility note: this is a new feature, in old archives size will be missing.
     size = PropDict._make_property('size', int)

-    hlid = PropDict._make_property('hlid', bytes)  # hard link id: same value means same hard link.
+    hlid = PropDict._make_property('hlid', bytes, decode=want_bytes)  # hard link id: same value means same hard link.
     hardlink_master = PropDict._make_property('hardlink_master', bool)  # legacy

     chunks = PropDict._make_property('chunks', (list, type(None)), 'list or None')

@@ -363,9 +372,9 @@ class EncryptedKey(PropDict):
     version = PropDict._make_property('version', int)
     algorithm = PropDict._make_property('algorithm', str)
     iterations = PropDict._make_property('iterations', int)
-    salt = PropDict._make_property('salt', bytes)
-    hash = PropDict._make_property('hash', bytes)
-    data = PropDict._make_property('data', bytes)
+    salt = PropDict._make_property('salt', bytes, decode=want_bytes)
+    hash = PropDict._make_property('hash', bytes, decode=want_bytes)
+    data = PropDict._make_property('data', bytes, decode=want_bytes)
     argon2_time_cost = PropDict._make_property('argon2_time_cost', int)
     argon2_memory_cost = PropDict._make_property('argon2_memory_cost', int)
     argon2_parallelism = PropDict._make_property('argon2_parallelism', int)

@@ -399,10 +408,10 @@ class Key(PropDict):
     __slots__ = ("_dict", )  # avoid setting attributes not supported by properties

     version = PropDict._make_property('version', int)
-    repository_id = PropDict._make_property('repository_id', bytes)
-    enc_key = PropDict._make_property('enc_key', bytes)
-    enc_hmac_key = PropDict._make_property('enc_hmac_key', bytes)
-    id_key = PropDict._make_property('id_key', bytes)
+    repository_id = PropDict._make_property('repository_id', bytes, decode=want_bytes)
+    enc_key = PropDict._make_property('enc_key', bytes, decode=want_bytes)
+    enc_hmac_key = PropDict._make_property('enc_hmac_key', bytes, decode=want_bytes)
+    id_key = PropDict._make_property('id_key', bytes, decode=want_bytes)
     chunk_seed = PropDict._make_property('chunk_seed', int)
     tam_required = PropDict._make_property('tam_required', bool)

@@ -443,7 +452,7 @@ class ArchiveItem(PropDict):
     chunker_params = PropDict._make_property('chunker_params', tuple)
     recreate_cmdline = PropDict._make_property('recreate_cmdline', list)  # list of s-e-str
     # recreate_source_id, recreate_args, recreate_partial_chunks were used in 1.1.0b1 .. b2
-    recreate_source_id = PropDict._make_property('recreate_source_id', bytes)
+    recreate_source_id = PropDict._make_property('recreate_source_id', bytes, decode=want_bytes)
     recreate_args = PropDict._make_property('recreate_args', list)  # list of s-e-str
     recreate_partial_chunks = PropDict._make_property('recreate_partial_chunks', list)  # list of tuples
     size = PropDict._make_property('size', int)
@@ -38,8 +38,7 @@
 RPC_PROTOCOL_VERSION = 2
 BORG_VERSION = parse_version(__version__)
-MSGID, MSG, ARGS, RESULT = 'i', 'm', 'a', 'r'  # pack
-MSGIDB, MSGB, ARGSB, RESULTB = b'i', b'm', b'a', b'r'  # unpack
+MSGID, MSG, ARGS, RESULT = 'i', 'm', 'a', 'r'

 MAX_INFLIGHT = 100


@@ -139,10 +138,6 @@ def __init__(self, data):
         }


-def decode_keys(d):
-    return {k.decode(): d[k] for k in d}
-
-
 class RepositoryServer:  # pragma: no cover
     rpc_methods = (
         '__len__',

@@ -217,14 +212,13 @@ def serve(self):
             for unpacked in unpacker:
                 if isinstance(unpacked, dict):
                     dictFormat = True
-                    msgid = unpacked[MSGIDB]
-                    method = unpacked[MSGB].decode()
-                    args = decode_keys(unpacked[ARGSB])
+                    msgid = unpacked[MSGID]
+                    method = unpacked[MSG]
+                    args = unpacked[ARGS]
                 elif isinstance(unpacked, tuple) and len(unpacked) == 4:
                     dictFormat = False
                     # The first field 'type' was always 1 and has always been ignored
                     _, msgid, method, args = unpacked
-                    method = method.decode()
                     args = self.positional_to_named(method, args)
                 else:
                     if self.repository is not None:

@@ -308,7 +302,7 @@ def negotiate(self, client_data):
         # clients since 1.1.0b3 use a dict as client_data
         # clients since 1.1.0b6 support json log format from server
         if isinstance(client_data, dict):
-            self.client_version = client_data[b'client_version']
+            self.client_version = client_data['client_version']
             level = logging.getLevelName(logging.getLogger('').level)
             setup_logging(is_serve=True, json=True, level=level)
             logger.debug('Initialized logging system for JSON-based protocol')

@@ -370,7 +364,6 @@ def open(self, path, create=False, lock_wait=None, lock=True, exclusive=None, ap
         return self.repository.id

     def inject_exception(self, kind):
-        kind = kind.decode()
         s1 = 'test string'
         s2 = 'test string2'
         if kind == 'DoesNotExist':
@@ -484,35 +477,35 @@ class RemoteRepository:

     class RPCError(Exception):
         def __init__(self, unpacked):
-            # for borg < 1.1: unpacked only has b'exception_class' as key
-            # for borg 1.1+: unpacked has keys: b'exception_args', b'exception_full', b'exception_short', b'sysinfo'
+            # for borg < 1.1: unpacked only has 'exception_class' as key
+            # for borg 1.1+: unpacked has keys: 'exception_args', 'exception_full', 'exception_short', 'sysinfo'
             self.unpacked = unpacked

         def get_message(self):
-            if b'exception_short' in self.unpacked:
-                return b'\n'.join(self.unpacked[b'exception_short']).decode()
+            if 'exception_short' in self.unpacked:
+                return '\n'.join(self.unpacked['exception_short'])
             else:
                 return self.exception_class

         @property
         def traceback(self):
-            return self.unpacked.get(b'exception_trace', True)
+            return self.unpacked.get('exception_trace', True)

         @property
         def exception_class(self):
-            return self.unpacked[b'exception_class'].decode()
+            return self.unpacked['exception_class']

         @property
         def exception_full(self):
-            if b'exception_full' in self.unpacked:
-                return b'\n'.join(self.unpacked[b'exception_full']).decode()
+            if 'exception_full' in self.unpacked:
+                return '\n'.join(self.unpacked['exception_full'])
             else:
                 return self.get_message() + '\nRemote Exception (see remote log for the traceback)'

         @property
         def sysinfo(self):
-            if b'sysinfo' in self.unpacked:
-                return self.unpacked[b'sysinfo'].decode()
+            if 'sysinfo' in self.unpacked:
+                return self.unpacked['sysinfo']
             else:
                 return ''
@@ -577,9 +570,9 @@ def __init__(self, location, create=False, exclusive=False, lock_wait=None, lock
             raise ConnectionClosedWithHint('Is borg working on the server?') from None
         if version == RPC_PROTOCOL_VERSION:
             self.dictFormat = False
-        elif isinstance(version, dict) and b'server_version' in version:
+        elif isinstance(version, dict) and 'server_version' in version:
             self.dictFormat = True
-            self.server_version = version[b'server_version']
+            self.server_version = version['server_version']
         else:
             raise Exception('Server insisted on using unsupported protocol version %s' % version)

@@ -734,9 +727,9 @@ def pop_preload_msgid(chunkid):
             return msgid

         def handle_error(unpacked):
-            error = unpacked[b'exception_class'].decode()
-            old_server = b'exception_args' not in unpacked
-            args = unpacked.get(b'exception_args')
+            error = unpacked['exception_class']
+            old_server = 'exception_args' not in unpacked
+            args = unpacked.get('exception_args')

             if error == 'DoesNotExist':
                 raise Repository.DoesNotExist(self.location.processed)
@@ -748,29 +741,29 @@ def handle_error(unpacked):
                 if old_server:
                     raise IntegrityError('(not available)')
                 else:
-                    raise IntegrityError(args[0].decode())
+                    raise IntegrityError(args[0])
             elif error == 'AtticRepository':
                 if old_server:
                     raise Repository.AtticRepository('(not available)')
                 else:
-                    raise Repository.AtticRepository(args[0].decode())
+                    raise Repository.AtticRepository(args[0])
             elif error == 'PathNotAllowed':
                 if old_server:
                     raise PathNotAllowed('(unknown)')
                 else:
-                    raise PathNotAllowed(args[0].decode())
+                    raise PathNotAllowed(args[0])
             elif error == 'ParentPathDoesNotExist':
-                raise Repository.ParentPathDoesNotExist(args[0].decode())
+                raise Repository.ParentPathDoesNotExist(args[0])
             elif error == 'ObjectNotFound':
                 if old_server:
                     raise Repository.ObjectNotFound('(not available)', self.location.processed)
                 else:
-                    raise Repository.ObjectNotFound(args[0].decode(), self.location.processed)
+                    raise Repository.ObjectNotFound(args[0], self.location.processed)
             elif error == 'InvalidRPCMethod':
                 if old_server:
                     raise InvalidRPCMethod('(not available)')
                 else:
-                    raise InvalidRPCMethod(args[0].decode())
+                    raise InvalidRPCMethod(args[0])
             else:
                 raise self.RPCError(unpacked)
@@ -789,10 +782,10 @@ def handle_error(unpacked):
                 try:
                     unpacked = self.responses.pop(waiting_for[0])
                     waiting_for.pop(0)
-                    if b'exception_class' in unpacked:
+                    if 'exception_class' in unpacked:
                         handle_error(unpacked)
                     else:
-                        yield unpacked[RESULTB]
+                        yield unpacked[RESULT]
                         if not waiting_for and not calls:
                             return
                 except KeyError:

@@ -809,10 +802,10 @@ def handle_error(unpacked):
                     else:
                         return
                 else:
-                    if b'exception_class' in unpacked:
+                    if 'exception_class' in unpacked:
                         handle_error(unpacked)
                     else:
-                        yield unpacked[RESULTB]
+                        yield unpacked[RESULT]
             if self.to_send or ((calls or self.preload_ids) and len(waiting_for) < MAX_INFLIGHT):
                 w_fds = [self.stdin_fd]
             else:

@@ -829,26 +822,26 @@ def handle_error(unpacked):
                 self.unpacker.feed(data)
                 for unpacked in self.unpacker:
                     if isinstance(unpacked, dict):
-                        msgid = unpacked[MSGIDB]
+                        msgid = unpacked[MSGID]
                     elif isinstance(unpacked, tuple) and len(unpacked) == 4:
                         # The first field 'type' was always 1 and has always been ignored
                         _, msgid, error, res = unpacked
                         if error:
                             # ignore res, because it is only a fixed string anyway.
-                            unpacked = {MSGIDB: msgid, b'exception_class': error}
+                            unpacked = {MSGID: msgid, 'exception_class': error}
                         else:
-                            unpacked = {MSGIDB: msgid, RESULTB: res}
+                            unpacked = {MSGID: msgid, RESULT: res}
                     else:
                         raise UnexpectedRPCDataFormatFromServer(data)
                     if msgid in self.ignore_responses:
                         self.ignore_responses.remove(msgid)
                         # async methods never return values, but may raise exceptions.
-                        if b'exception_class' in unpacked:
+                        if 'exception_class' in unpacked:
                             self.async_responses[msgid] = unpacked
                         else:
                             # we currently do not have async result values except "None",
                             # so we do not add them into async_responses.
-                            if unpacked[RESULTB] is not None:
+                            if unpacked[RESULT] is not None:
                                 self.async_responses[msgid] = unpacked
                     else:
                         self.responses[msgid] = unpacked
@@ -516,16 +516,16 @@ def _read_integrity(self, transaction_id, key):
                 integrity = msgpack.unpack(fd)
         except FileNotFoundError:
             return
-        if integrity.get(b'version') != 2:
-            logger.warning('Unknown integrity data version %r in %s', integrity.get(b'version'), integrity_file)
+        if integrity.get('version') != 2:
+            logger.warning('Unknown integrity data version %r in %s', integrity.get('version'), integrity_file)
             return
-        return integrity[key].decode()
+        return integrity[key]

     def open_index(self, transaction_id, auto_recover=True):
         if transaction_id is None:
             return NSIndex()
         index_path = os.path.join(self.path, 'index.%d' % transaction_id)
-        integrity_data = self._read_integrity(transaction_id, b'index')
+        integrity_data = self._read_integrity(transaction_id, 'index')
         try:
             with IntegrityCheckedFile(index_path, write=False, integrity_data=integrity_data) as fd:
                 return NSIndex.read(fd)

@@ -575,7 +575,7 @@ def prepare_txn(self, transaction_id, do_cleanup=True):
                 self.io.cleanup(transaction_id)
             hints_path = os.path.join(self.path, 'hints.%d' % transaction_id)
             index_path = os.path.join(self.path, 'index.%d' % transaction_id)
-            integrity_data = self._read_integrity(transaction_id, b'hints')
+            integrity_data = self._read_integrity(transaction_id, 'hints')
             try:
                 with IntegrityCheckedFile(hints_path, write=False, integrity_data=integrity_data) as fd:
                     hints = msgpack.unpack(fd)
@@ -588,23 +588,23 @@ def prepare_txn(self, transaction_id, do_cleanup=True):
                 self.check_transaction()
                 self.prepare_txn(transaction_id)
                 return
-            if hints[b'version'] == 1:
+            if hints['version'] == 1:
                 logger.debug('Upgrading from v1 hints.%d', transaction_id)
-                self.segments = hints[b'segments']
+                self.segments = hints['segments']
                 self.compact = FreeSpace()
                 self.storage_quota_use = 0
                 self.shadow_index = {}
-                for segment in sorted(hints[b'compact']):
+                for segment in sorted(hints['compact']):
                     logger.debug('Rebuilding sparse info for segment %d', segment)
                     self._rebuild_sparse(segment)
                 logger.debug('Upgrade to v2 hints complete')
-            elif hints[b'version'] != 2:
-                raise ValueError('Unknown hints file version: %d' % hints[b'version'])
+            elif hints['version'] != 2:
+                raise ValueError('Unknown hints file version: %d' % hints['version'])
             else:
-                self.segments = hints[b'segments']
-                self.compact = FreeSpace(hints[b'compact'])
-                self.storage_quota_use = hints.get(b'storage_quota_use', 0)
-                self.shadow_index = hints.get(b'shadow_index', {})
+                self.segments = hints['segments']
+                self.compact = FreeSpace(hints['compact'])
+                self.storage_quota_use = hints.get('storage_quota_use', 0)
+                self.shadow_index = hints.get('shadow_index', {})
             self.log_storage_quota()
             # Drop uncommitted segments in the shadow index
             for key, shadowed_segments in self.shadow_index.items():
@@ -621,16 +621,16 @@ def rename_tmp(file):
             os.rename(file + '.tmp', file)

         hints = {
-            b'version': 2,
-            b'segments': self.segments,
-            b'compact': self.compact,
-            b'storage_quota_use': self.storage_quota_use,
-            b'shadow_index': self.shadow_index,
+            'version': 2,
+            'segments': self.segments,
+            'compact': self.compact,
+            'storage_quota_use': self.storage_quota_use,
+            'shadow_index': self.shadow_index,
         }
         integrity = {
             # Integrity version started at 2, the current hints version.
             # Thus, integrity version == hints version, for now.
-            b'version': 2,
+            'version': 2,
         }
         transaction_id = self.io.get_segments_transaction_id()
         assert transaction_id is not None

@@ -647,7 +647,7 @@ def rename_tmp(file):
         with IntegrityCheckedFile(hints_file + '.tmp', filename=hints_name, write=True) as fd:
             msgpack.pack(hints, fd)
             flush_and_sync(fd)
-        integrity[b'hints'] = fd.integrity_data
+        integrity['hints'] = fd.integrity_data

         # Write repository index
         index_name = 'index.%d' % transaction_id

@@ -656,7 +656,7 @@ def rename_tmp(file):
             # XXX: Consider using SyncFile for index write-outs.
             self.index.write(fd)
             flush_and_sync(fd)
-        integrity[b'index'] = fd.integrity_data
+        integrity['index'] = fd.integrity_data

         # Write integrity file, containing checksums of the hints and index files
         integrity_name = 'integrity.%d' % transaction_id
@@ -171,7 +171,7 @@ def make_chunks(self, items):
         return b''.join(msgpack.packb({'path': item}) for item in items)

     def _validator(self, value):
-        return isinstance(value, dict) and value.get(b'path') in (b'foo', b'bar', b'boo', b'baz')
+        return isinstance(value, dict) and value.get('path') in ('foo', 'bar', 'boo', 'baz')

     def process(self, input):
         unpacker = RobustUnpacker(validator=self._validator, item_keys=ITEM_KEYS)

@@ -190,10 +190,10 @@ def test_extra_garbage_no_sync(self):
                   (False, [b'garbage'] + [self.make_chunks(['boo', 'baz'])])]
         result = self.process(chunks)
         self.assert_equal(result, [
-            {b'path': b'foo'}, {b'path': b'bar'},
+            {'path': 'foo'}, {'path': 'bar'},
             103, 97, 114, 98, 97, 103, 101,
-            {b'path': b'boo'},
-            {b'path': b'baz'}])
+            {'path': 'boo'},
+            {'path': 'baz'}])

     def split(self, left, length):
         parts = []

@@ -206,19 +206,19 @@ def test_correct_stream(self):
         chunks = self.split(self.make_chunks(['foo', 'bar', 'boo', 'baz']), 2)
         input = [(False, chunks)]
         result = self.process(input)
-        self.assert_equal(result, [{b'path': b'foo'}, {b'path': b'bar'}, {b'path': b'boo'}, {b'path': b'baz'}])
+        self.assert_equal(result, [{'path': 'foo'}, {'path': 'bar'}, {'path': 'boo'}, {'path': 'baz'}])

     def test_missing_chunk(self):
         chunks = self.split(self.make_chunks(['foo', 'bar', 'boo', 'baz']), 4)
         input = [(False, chunks[:3]), (True, chunks[4:])]
         result = self.process(input)
-        self.assert_equal(result, [{b'path': b'foo'}, {b'path': b'boo'}, {b'path': b'baz'}])
+        self.assert_equal(result, [{'path': 'foo'}, {'path': 'boo'}, {'path': 'baz'}])

     def test_corrupt_chunk(self):
         chunks = self.split(self.make_chunks(['foo', 'bar', 'boo', 'baz']), 4)
         input = [(False, chunks[:3]), (True, [b'gar', b'bage'] + chunks[3:])]
         result = self.process(input)
-        self.assert_equal(result, [{b'path': b'foo'}, {b'path': b'boo'}, {b'path': b'baz'}])
+        self.assert_equal(result, [{'path': 'foo'}, {'path': 'boo'}, {'path': 'baz'}])


 @pytest.fixture
@@ -3623,14 +3623,14 @@ def test_init_defaults_to_argon2(self):
         self.cmd('init', '--encryption=repokey', self.repository_location)
         with Repository(self.repository_path) as repository:
             key = msgpack.unpackb(a2b_base64(repository.load_key()))
-        assert key[b'algorithm'] == b'argon2 chacha20-poly1305'
+        assert key['algorithm'] == 'argon2 chacha20-poly1305'

     def test_init_with_explicit_key_algorithm(self):
         """https://github.com/borgbackup/borg/issues/747#issuecomment-1076160401"""
         self.cmd('init', '--encryption=repokey', '--key-algorithm=pbkdf2', self.repository_location)
         with Repository(self.repository_path) as repository:
             key = msgpack.unpackb(a2b_base64(repository.load_key()))
-        assert key[b'algorithm'] == b'sha256'
+        assert key['algorithm'] == 'sha256'

     def verify_change_passphrase_does_not_change_algorithm(self, given_algorithm, expected_algorithm):
         self.cmd('init', '--encryption=repokey', '--key-algorithm', given_algorithm, self.repository_location)

@@ -3640,7 +3640,7 @@ def verify_change_passphrase_does_not_change_algorithm(self, given_algorithm, ex

         with Repository(self.repository_path) as repository:
             key = msgpack.unpackb(a2b_base64(repository.load_key()))
-        assert key[b'algorithm'] == expected_algorithm.encode()
+        assert key['algorithm'] == expected_algorithm

     def test_change_passphrase_does_not_change_algorithm_argon2(self):
         self.verify_change_passphrase_does_not_change_algorithm('argon2', 'argon2 chacha20-poly1305')

@@ -3655,7 +3655,7 @@ def verify_change_location_does_not_change_algorithm(self, given_algorithm, expe

         with Repository(self.repository_path) as repository:
             key = msgpack.unpackb(a2b_base64(repository.load_key()))
-        assert key[b'algorithm'] == expected_algorithm.encode()
+        assert key['algorithm'] == expected_algorithm

     def test_change_location_does_not_change_algorithm_argon2(self):
         self.verify_change_location_does_not_change_algorithm('argon2', 'argon2 chacha20-poly1305')

@@ -3969,7 +3969,7 @@ def test_not_required(self):
             key.change_passphrase(key._passphrase)

             manifest = msgpack.unpackb(key.decrypt(Manifest.MANIFEST_ID, repository.get(Manifest.MANIFEST_ID)))
-            del manifest[b'tam']
+            del manifest['tam']
             repository.put(Manifest.MANIFEST_ID, key.encrypt(Manifest.MANIFEST_ID, msgpack.packb(manifest)))
             repository.commit(compact=False)
         output = self.cmd('list', '--debug', self.repository_location)
@@ -360,23 +360,23 @@ def test_round_trip(self, key):
         assert blob.startswith(b'\x82')

         unpacked = msgpack.unpackb(blob)
-        assert unpacked[b'tam'][b'type'] == b'HKDF_HMAC_SHA512'
+        assert unpacked['tam']['type'] == 'HKDF_HMAC_SHA512'

         unpacked, verified = key.unpack_and_verify_manifest(blob)
         assert verified
-        assert unpacked[b'foo'] == b'bar'
-        assert b'tam' not in unpacked
+        assert unpacked['foo'] == 'bar'
+        assert 'tam' not in unpacked

-    @pytest.mark.parametrize('which', (b'hmac', b'salt'))
+    @pytest.mark.parametrize('which', ('hmac', 'salt'))
     def test_tampered(self, key, which):
         data = {'foo': 'bar'}
         blob = key.pack_and_authenticate_metadata(data)
         assert blob.startswith(b'\x82')

         unpacked = msgpack.unpackb(blob, object_hook=StableDict)
-        assert len(unpacked[b'tam'][which]) == 64
-        unpacked[b'tam'][which] = unpacked[b'tam'][which][0:32] + bytes(32)
-        assert len(unpacked[b'tam'][which]) == 64
+        assert len(unpacked['tam'][which]) == 64
+        unpacked['tam'][which] = unpacked['tam'][which][0:32] + bytes(32)
+        assert len(unpacked['tam'][which]) == 64
         blob = msgpack.packb(unpacked)

         with pytest.raises(TAMInvalid):

@@ -421,4 +421,4 @@ def to_dict(key):
         load_me = RepoKey.detect(repository, manifest_data=None)

         assert to_dict(load_me) == to_dict(save_me)
-        assert msgpack.unpackb(a2b_base64(saved))[b'algorithm'] == expected_algorithm.encode()
+        assert msgpack.unpackb(a2b_base64(saved))['algorithm'] == expected_algorithm
@@ -655,8 +655,8 @@ def _subtly_corrupted_hints_setup(self):
             hints = msgpack.unpack(fd)
             fd.seek(0)
             # Corrupt segment refcount
-            assert hints[b'segments'][2] == 1
-            hints[b'segments'][2] = 0
+            assert hints['segments'][2] == 1
+            hints['segments'][2] = 0
             msgpack.pack(hints, fd)
             fd.truncate()