import base64

import pytest


def binary_to_json(key, value):
    """
    Return a single-entry dict holding value base64-encoded under the key <key>_b64.

    :param key: field name (str); the only key of the result is key + "_b64".
    :param value: raw binary data (bytes), may be empty.
    :return: dict {key + "_b64": base64 text as a pure-ASCII str}, safe to feed into a JSON encoder.
    """
    assert isinstance(key, str)
    assert isinstance(value, bytes)
    return {key + "_b64": base64.b64encode(value).decode("ascii")}


def text_to_json(key, value):
    """
    Return a dict made from key/value that can be fed safely into a JSON encoder.

    JSON can only contain pure, valid unicode (but not: unicode with surrogate escapes).

    But sometimes we have to deal with such values and we do it like this:
    - <key>: value as pure unicode text (surrogate escapes, if any, replaced by ?)
    - <key>_b64: value as base64 encoded binary representation (only set if value has surrogate-escapes)

    :param key: field name (str).
    :param value: text (str), might contain surrogate escapes.
    :return: dict with <key> always present and <key>_b64 only present if value is not pure unicode.
    """
    coding = "utf-8"
    assert isinstance(key, str)
    assert isinstance(value, str)  # str might contain surrogate escapes
    data = {}
    try:
        value.encode(coding, errors="strict")  # check if pure unicode
    except UnicodeEncodeError:
        # value has surrogates: give a readable text with "?" replacements ...
        data[key] = value.encode(coding, errors="replace").decode(coding, errors="strict")
        # ... plus the binary representation, so no information is lost.
        try:
            value_bytes = value.encode(coding, errors="surrogateescape")
        except UnicodeEncodeError:
            # surrogateescape only covers U+DC80..U+DCFF (escaped undecodable bytes).
            # Any other lone surrogate would make it raise, so fall back to
            # surrogatepass, which can encode every surrogate code point.
            # NOTE(review): confirm consumers of <key>_b64 are fine with the
            # surrogatepass byte representation for such values.
            value_bytes = value.encode(coding, errors="surrogatepass")
        data.update(binary_to_json(key, value_bytes))
    else:
        # value is pure unicode
        data[key] = value
        # we do not give the b64 representation, not needed
    return data


@pytest.mark.parametrize(
    "key,value",
    [("key", b"\x00\x01\x02\x03"), ("key", b"\x00\x01\x02"), ("key", b"\x00\x01"), ("key", b"\x00"), ("key", b"")],
)
def test_binary_to_json(key, value):
    """binary_to_json must store value under <key>_b64 and base64-decode back to the same bytes."""
    key_b64 = key + "_b64"
    d = binary_to_json(key, value)
    assert key_b64 in d
    assert base64.b64decode(d[key_b64]) == value


@pytest.mark.parametrize(
    "key,value,strict",
    [
        ("key", "abc", True),
        ("key", "äöü", True),
        ("key", "", True),
        ("key", b"\x00\xff".decode("utf-8", errors="surrogateescape"), False),
        ("key", "äöü".encode("latin1").decode("utf-8", errors="surrogateescape"), False),
    ],
)
def test_text_to_json(key, value, strict):
    """Pure unicode yields only <key>; surrogate-escaped text yields <key> (replaced) plus <key>_b64."""
    key_b64 = key + "_b64"
    d = text_to_json(key, value)
    value_b = value.encode("utf-8", errors="surrogateescape")
    if strict:
        # no surrogate-escapes, just unicode text
        assert key in d
        assert d[key] == value_b.decode("utf-8", errors="strict")
        assert d[key].encode("utf-8", errors="strict") == value_b
        assert key_b64 not in d  # not needed. pure valid unicode.
    else:
        # requiring surrogate-escapes. text has replacement chars, base64 representation is present.
        assert key in d
        assert d[key] == value.encode("utf-8", errors="replace").decode("utf-8", errors="strict")
        assert d[key].encode("utf-8", errors="strict") == value.encode("utf-8", errors="replace")
        assert key_b64 in d
        assert base64.b64decode(d[key_b64]) == value_b


def test_text_to_json_lone_surrogate():
    """A surrogate outside the surrogateescape range must not make text_to_json raise."""
    value = "\ud800"
    d = text_to_json("key", value)
    assert d["key"] == "?"
    assert base64.b64decode(d["key_b64"]) == value.encode("utf-8", errors="surrogatepass")