implement text_to_json / binary_to_json, see #6151

binary bytes:
- json_key = <key>_b64
- json_value = base64(value)

text (potentially with surrogate escapes):
- json_key1 = <key>
- json_value1 = value_text (surrogate escapes replaced by ?)
- json_key2 = <key>_b64
- json_value2 = base64(value_binary)

json_key2 / json_value2 are only present if producing value_text
required replacing surrogate escapes (in which case value_text is
only an approximation of the original value).
value_binary then gives the original bytes value (e.g. a
non-utf8 bytes sequence).
This commit is contained in:
Thomas Waldmann 2022-12-29 00:38:42 +01:00
parent ca1f1281d5
commit 32d430a1b0
No known key found for this signature in database
GPG Key ID: 243ACFA951F78E01
3 changed files with 80 additions and 1 deletions

View File

@ -19,7 +19,8 @@ from .fs import HardLinkManager
from .misc import sysinfo, log_multi, consume
from .misc import ChunkIteratorFileWrapper, open_item, chunkit, iter_separated, ErrorIgnoringTextIOWrapper
from .parseformat import bin_to_hex, safe_encode, safe_decode
from .parseformat import remove_surrogates, eval_escapes, decode_dict, positive_int_validator, interval
from .parseformat import text_to_json, binary_to_json, remove_surrogates
from .parseformat import eval_escapes, decode_dict, positive_int_validator, interval
from .parseformat import SortBySpec, ChunkerParams, FilesCacheMode, partial_format, DatetimeWrapper
from .parseformat import format_file_size, parse_file_size, FileSize, parse_storage_quota
from .parseformat import sizeof_fmt, sizeof_fmt_iec, sizeof_fmt_decimal

View File

@ -1,4 +1,5 @@
import argparse
import base64
import hashlib
import json
import os
@ -50,6 +51,41 @@ def remove_surrogates(s, errors="replace"):
return s.encode("utf-8", errors).decode("utf-8")
def binary_to_json(key, value):
    """
    Return a single-item dict that carries binary *value* in a JSON-safe form.

    The item is stored under ``<key>_b64`` and holds the base64 encoding of
    *value* as an ASCII-only str.

    :param key: base name of the JSON key (str)
    :param value: the binary payload (bytes)
    :return: ``{<key>_b64: <base64 text>}``
    """
    assert isinstance(key, str)
    assert isinstance(value, bytes)
    b64_key = key + "_b64"
    b64_text = base64.b64encode(value).decode("ascii")
    return {b64_key: b64_text}
def text_to_json(key, value):
    """
    Build a dict from key/value that is safe to hand to a JSON encoder.

    JSON may only contain pure, valid unicode, but *value* might be a str
    that contains surrogate escapes. The returned dict therefore has:

    - <key>: value as pure unicode text (anything that can not be strictly
      encoded, i.e. surrogate escapes, is replaced by ?)
    - <key>_b64: base64 of the binary representation of value. This item is
      only present if <key> had to be approximated as described above.

    :param key: name of the JSON key (str)
    :param value: the text (str, potentially with surrogate escapes)
    :return: dict with one item (pure unicode) or two items (otherwise)
    """
    assert isinstance(key, str)
    assert isinstance(value, str)  # str might contain surrogate escapes
    encoding = "utf-8"
    try:
        # probe only: a strict encode succeeds if and only if value is pure unicode
        value.encode(encoding, errors="strict")
    except UnicodeEncodeError:
        # value is not pure unicode: give an approximated text plus the binary representation
        approximated = value.encode(encoding, errors="replace").decode(encoding, errors="strict")
        binary = value.encode(encoding, errors="surrogateescape")
        return {key: approximated, **binary_to_json(key, binary)}
    # value is pure unicode, the b64 representation is not needed
    return {key: value}
def eval_escapes(s):
    """Return *s* with literal escape sequences evaluated (eg `\\n` -> a newline character)."""
    # Non-ASCII characters are converted to backslash escapes first, so the
    # "unicode-escape" decoder gets pure ASCII and restores them together with
    # the escape sequences that were already spelled out in the input.
    ascii_escaped = s.encode("ascii", "backslashreplace")
    return ascii_escaped.decode("unicode-escape")

View File

@ -1,3 +1,4 @@
import base64
import errno
import getpass
import hashlib
@ -42,6 +43,7 @@ from ..helpers import dash_open
from ..helpers import iter_separated
from ..helpers import eval_escapes
from ..helpers import safe_unlink
from ..helpers import text_to_json, binary_to_json
from ..helpers.passphrase import Passphrase, PasswordRetriesExceeded
from ..platform import is_cygwin
@ -53,6 +55,46 @@ def test_bin_to_hex():
assert bin_to_hex(b"\x00\x01\xff") == "0001ff"
@pytest.mark.parametrize(
    ("key", "value"),
    [
        ("key", b"\x00\x01\x02\x03"),
        ("key", b"\x00\x01\x02"),
        ("key", b"\x00\x01"),
        ("key", b"\x00"),
        ("key", b""),
    ],
)
def test_binary_to_json(key, value):
    """binary_to_json must give a <key>_b64 item whose base64 content decodes back to value."""
    # payload lengths 4..0 exercise every base64 padding situation
    result = binary_to_json(key, value)
    b64_key = key + "_b64"
    assert b64_key in result
    decoded = base64.b64decode(result[b64_key])
    assert decoded == value
@pytest.mark.parametrize(
    ("key", "value", "strict"),
    [
        # pure unicode text
        ("key", "abc", True),
        ("key", "äöü", True),
        ("key", "", True),
        # text with surrogate escapes (made by decoding bytes that are not valid utf-8)
        ("key", b"\x00\xff".decode("utf-8", errors="surrogateescape"), False),
        ("key", "äöü".encode("latin1").decode("utf-8", errors="surrogateescape"), False),
    ],
)
def test_text_to_json(key, value, strict):
    """
    text_to_json must pass pure unicode through unchanged (without a <key>_b64 item) and
    must give replacement text plus a base64 item for text with surrogate escapes.
    """
    b64_key = key + "_b64"
    result = text_to_json(key, value)
    original_bytes = value.encode("utf-8", errors="surrogateescape")
    # the text item is expected in both cases
    assert key in result
    text = result[key]
    if not strict:
        # requiring surrogate-escapes. text has replacement chars, base64 representation is present.
        replaced_bytes = value.encode("utf-8", errors="replace")
        assert text == replaced_bytes.decode("utf-8", errors="strict")
        assert text.encode("utf-8", errors="strict") == value.encode("utf-8", errors="replace")
        assert b64_key in result
        assert base64.b64decode(result[b64_key]) == original_bytes
        return
    # no surrogate-escapes, just unicode text
    assert text == original_bytes.decode("utf-8", errors="strict")
    assert text.encode("utf-8", errors="strict") == original_bytes
    assert b64_key not in result  # not needed. pure valid unicode.
class TestLocationWithoutEnv:
@pytest.fixture
def keys_dir(self, tmpdir, monkeypatch):