mirror of https://github.com/borgbackup/borg.git
implement text_to_json / binary_to_json, see #6151
binary bytes:
- json_key = <key>_b64
- json_value = base64(value)

text (potentially with surrogate escapes):
- json_key1 = <key>
- json_value1 = value_text (surrogate escapes replaced by ?)
- json_key2 = <key>_b64
- json_value2 = base64(value_binary)

json_key2 / json_value2 are only present if value_text required replacement of surrogate escapes (and thus does not represent the original value, but just an approximation). value_binary then gives the original bytes value (e.g. a non-UTF-8 byte sequence).
This commit is contained in:
parent
ca1f1281d5
commit
32d430a1b0
|
@ -19,7 +19,8 @@ from .fs import HardLinkManager
|
|||
from .misc import sysinfo, log_multi, consume
|
||||
from .misc import ChunkIteratorFileWrapper, open_item, chunkit, iter_separated, ErrorIgnoringTextIOWrapper
|
||||
from .parseformat import bin_to_hex, safe_encode, safe_decode
|
||||
from .parseformat import remove_surrogates, eval_escapes, decode_dict, positive_int_validator, interval
|
||||
from .parseformat import text_to_json, binary_to_json, remove_surrogates
|
||||
from .parseformat import eval_escapes, decode_dict, positive_int_validator, interval
|
||||
from .parseformat import SortBySpec, ChunkerParams, FilesCacheMode, partial_format, DatetimeWrapper
|
||||
from .parseformat import format_file_size, parse_file_size, FileSize, parse_storage_quota
|
||||
from .parseformat import sizeof_fmt, sizeof_fmt_iec, sizeof_fmt_decimal
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
import argparse
|
||||
import base64
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
|
@ -50,6 +51,41 @@ def remove_surrogates(s, errors="replace"):
|
|||
return s.encode("utf-8", errors).decode("utf-8")
|
||||
|
||||
|
||||
def binary_to_json(key, value):
    """
    Return a one-entry dict holding a bytes value in a JSON-safe form.

    The entry is stored under ``<key>_b64`` and contains the base64 encoding
    of *value* as an ASCII str.

    :param key: base name of the JSON key (str)
    :param value: the binary value (bytes)
    :return: ``{<key>_b64: base64(value)}``
    """
    assert isinstance(key, str)
    assert isinstance(value, bytes)
    b64_key = f"{key}_b64"
    b64_text = str(base64.b64encode(value), "ascii")
    return {b64_key: b64_text}
|
||||
|
||||
|
||||
def text_to_json(key, value):
    """
    Build a dict from key/value that a JSON encoder can safely consume.

    JSON may only carry pure, valid unicode - not unicode containing surrogate escapes.
    Values that do contain them are represented by two entries:

    - <key>: the value as pure unicode text (surrogate escapes, if any, replaced by ?)
    - <key>_b64: the value's bytes, base64 encoded (only present if value has surrogate escapes)

    :param key: JSON key (str)
    :param value: text value (str), may contain surrogate escapes
    :return: dict with <key> and, only if needed, <key>_b64
    """
    assert isinstance(key, str)
    assert isinstance(value, str)  # str might contain surrogate escapes
    encoding = "utf-8"
    try:
        # a strict encode only succeeds for pure unicode; the result itself is not needed
        value.encode(encoding, errors="strict")
    except UnicodeEncodeError:
        # not pure unicode: emit a lossy text rendering plus the bytes as base64
        lossy_text = value.encode(encoding, errors="replace").decode(encoding, errors="strict")
        # NOTE(review): the surrogateescape handler only maps U+DC80..U+DCFF back to bytes,
        # any other lone surrogate would make this encode raise - confirm that inputs can
        # only ever carry surrogate escapes.
        raw_bytes = value.encode(encoding, errors="surrogateescape")
        return {key: lossy_text, **binary_to_json(key, raw_bytes)}
    # pure unicode: the text itself is sufficient, no b64 representation is given
    return {key: value}
|
||||
|
||||
|
||||
def eval_escapes(s):
    r"""
    Turn literal escape sequences in *s* into the characters they denote.

    E.g. the two characters backslash + ``n`` become a single newline character.

    *s* is first encoded to ASCII using ``backslashreplace`` (so non-ASCII characters
    become escape sequences themselves), then decoded with the ``unicode-escape`` codec.
    """
    escaped = s.encode("ascii", "backslashreplace")
    return escaped.decode("unicode-escape")
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
import base64
|
||||
import errno
|
||||
import getpass
|
||||
import hashlib
|
||||
|
@ -42,6 +43,7 @@ from ..helpers import dash_open
|
|||
from ..helpers import iter_separated
|
||||
from ..helpers import eval_escapes
|
||||
from ..helpers import safe_unlink
|
||||
from ..helpers import text_to_json, binary_to_json
|
||||
from ..helpers.passphrase import Passphrase, PasswordRetriesExceeded
|
||||
from ..platform import is_cygwin
|
||||
|
||||
|
@ -53,6 +55,46 @@ def test_bin_to_hex():
|
|||
assert bin_to_hex(b"\x00\x01\xff") == "0001ff"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "key,value",
    [
        ("key", b"\x00\x01\x02\x03"),
        ("key", b"\x00\x01\x02"),
        ("key", b"\x00\x01"),
        ("key", b"\x00"),
        ("key", b""),
    ],
)
def test_binary_to_json(key, value):
    """binary_to_json must store the value under <key>_b64 so that base64-decoding round-trips."""
    result = binary_to_json(key, value)
    b64_key = f"{key}_b64"
    assert b64_key in result
    decoded = base64.b64decode(result[b64_key])
    assert decoded == value
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "key,value,strict",
    [
        ("key", "abc", True),
        ("key", "äöü", True),
        ("key", "", True),
        ("key", b"\x00\xff".decode("utf-8", errors="surrogateescape"), False),
        ("key", "äöü".encode("latin1").decode("utf-8", errors="surrogateescape"), False),
    ],
)
def test_text_to_json(key, value, strict):
    """
    Check text_to_json for pure unicode values (strict=True) and for values
    that contain surrogate escapes (strict=False).
    """
    result = text_to_json(key, value)
    b64_key = f"{key}_b64"
    raw = value.encode("utf-8", errors="surrogateescape")
    # the text entry is expected in both cases
    assert key in result
    text = result[key]
    if not strict:
        # requiring surrogate-escapes. text has replacement chars, base64 representation is present.
        lossy = value.encode("utf-8", errors="replace")
        assert text == lossy.decode("utf-8", errors="strict")
        assert text.encode("utf-8", errors="strict") == lossy
        assert b64_key in result
        assert base64.b64decode(result[b64_key]) == raw
        return
    # no surrogate-escapes, just unicode text
    assert text == raw.decode("utf-8", errors="strict")
    assert text.encode("utf-8", errors="strict") == raw
    assert b64_key not in result  # not needed. pure valid unicode.
|
||||
|
||||
|
||||
class TestLocationWithoutEnv:
|
||||
@pytest.fixture
|
||||
def keys_dir(self, tmpdir, monkeypatch):
|
||||
|
|
Loading…
Reference in New Issue