mirror of
https://github.com/borgbackup/borg.git
synced 2025-02-26 07:53:58 +00:00
Merge pull request #7232 from ThomasWaldmann/json_b64
implement and use (text|binary)_to_json
This commit is contained in:
commit
d49665526c
7 changed files with 141 additions and 27 deletions
|
@ -29,6 +29,42 @@ On POSIX systems, you can usually set environment vars to choose a UTF-8 locale:
|
|||
export LC_CTYPE=en_US.UTF-8
|
||||
|
||||
|
||||
Dealing with non-unicode byte sequences and JSON limitations
|
||||
------------------------------------------------------------
|
||||
|
||||
Paths on POSIX systems can have arbitrary bytes in them (except 0x00 which is used as string terminator in C).
|
||||
|
||||
Nowadays, UTF-8 encoded paths (which decode to valid unicode) are the usual thing, but a lot of systems
|
||||
still have paths from the past, when other, non-unicode codings were used. Especially old Samba shares often
|
||||
have wild mixtures of misc. encodings, sometimes even very broken stuff.
|
||||
|
||||
borg deals with such non-unicode paths ("with funny/broken characters") by decoding such byte sequences using
|
||||
UTF-8 coding and "surrogateescape" error handling mode, which maps invalid bytes to special unicode code points
|
||||
(surrogate escapes). When encoding such a unicode string back to a byte sequence, the original byte sequence
|
||||
will be reproduced exactly.
|
||||
|
||||
JSON should only contain valid unicode text without any surrogate escapes, so we can't just directly have a
|
||||
surrogate-escaped path in JSON ("path" is only one example, this also affects other text-like content).
|
||||
|
||||
Borg deals with this situation like this (since borg 2.0):
|
||||
|
||||
For a valid unicode path (no surrogate escapes), the JSON will only have "path": path.
|
||||
|
||||
For a non-unicode path (with surrogate escapes), the JSON will have 2 entries:
|
||||
|
||||
- "path": path_approximation (pure valid unicode, all invalid bytes will show up as "?")
|
||||
- "path_b64": path_bytes_base64_encoded (if you decode the base64, you get the original path byte string)
|
||||
|
||||
JSON users need to pick whatever suits their needs best. The suggested procedure (shown for "path") is:
|
||||
|
||||
- check if there is a "path_b64" key.
|
||||
- if it is there, you will know that the original bytes path did not cleanly UTF-8-decode into unicode (has
|
||||
some invalid bytes) and that the string given by the "path" key is only an approximation, but not the precise
|
||||
path. If you need precision, you must base64-decode the value of "path_b64" and deal with the arbitrary byte
|
||||
string you'll get. If an approximation is fine, use the value of the "path" key.
|
||||
- if it is not there, the value of the "path" key is all you need (the original bytes path is its UTF-8 encoding).
|
||||
|
||||
|
||||
Logging
|
||||
-------
|
||||
|
||||
|
@ -40,8 +76,6 @@ where each line is a JSON object. The *type* key of the object determines its ot
|
|||
parsing error will be printed in plain text, because logging set-up happens after all arguments are
|
||||
parsed.
|
||||
|
||||
Since JSON can only encode text, any string representing a file system path may miss non-text parts.
|
||||
|
||||
The following types are in use. Progress information is governed by the usual rules for progress information,
|
||||
it is not produced unless ``--progress`` is specified.
|
||||
|
||||
|
|
|
@ -32,7 +32,7 @@
|
|||
from .platform import uid2user, user2uid, gid2group, group2gid
|
||||
from .helpers import parse_timestamp, archive_ts_now
|
||||
from .helpers import OutputTimestamp, format_timedelta, format_file_size, file_status, FileSize
|
||||
from .helpers import safe_encode, make_path_safe, remove_surrogates
|
||||
from .helpers import safe_encode, make_path_safe, remove_surrogates, text_to_json
|
||||
from .helpers import StableDict
|
||||
from .helpers import bin_to_hex
|
||||
from .helpers import safe_ns
|
||||
|
@ -165,7 +165,8 @@ def show_progress(self, item=None, final=False, stream=None, dt=None):
|
|||
if self.output_json:
|
||||
if not final:
|
||||
data = self.as_dict()
|
||||
data["path"] = remove_surrogates(item.path if item else "")
|
||||
if item:
|
||||
data.update(text_to_json("path", item.path))
|
||||
else:
|
||||
data = {}
|
||||
data.update({"time": time.time(), "type": "archive_progress", "finished": final})
|
||||
|
|
|
@ -26,7 +26,7 @@
|
|||
from ..helpers import EXIT_SUCCESS, EXIT_WARNING, EXIT_ERROR, EXIT_SIGNAL_BASE
|
||||
from ..helpers import Error, set_ec
|
||||
from ..helpers import format_file_size
|
||||
from ..helpers import remove_surrogates
|
||||
from ..helpers import remove_surrogates, text_to_json
|
||||
from ..helpers import DatetimeWrapper, replace_placeholders
|
||||
from ..helpers import check_python, check_extension_modules
|
||||
from ..helpers import is_slow_msgpack, is_supported_msgpack, sysinfo
|
||||
|
@ -139,10 +139,9 @@ def print_file_status(self, status, path):
|
|||
# if we get called with status == None, the final file status was already printed
|
||||
if self.output_list and status is not None and (self.output_filter is None or status in self.output_filter):
|
||||
if self.log_json:
|
||||
print(
|
||||
json.dumps({"type": "file_status", "status": status, "path": remove_surrogates(path)}),
|
||||
file=sys.stderr,
|
||||
)
|
||||
json_data = {"type": "file_status", "status": status}
|
||||
json_data.update(text_to_json("path", path))
|
||||
print(json.dumps(json_data), file=sys.stderr)
|
||||
else:
|
||||
logging.getLogger("borg.output.list").info("%1s %s", status, remove_surrogates(path))
|
||||
|
||||
|
|
|
@ -863,7 +863,7 @@ def create_master_idx(chunk_idx):
|
|||
)
|
||||
archive_ids_to_names = get_archive_ids_to_names(archive_ids)
|
||||
for archive_id, archive_name in archive_ids_to_names.items():
|
||||
pi.show(info=[remove_surrogates(archive_name)])
|
||||
pi.show(info=[remove_surrogates(archive_name)]) # legacy. borg2 always has pure unicode arch names.
|
||||
if self.do_cache:
|
||||
if archive_id in cached_ids:
|
||||
archive_chunk_idx = read_archive_index(archive_id, archive_name)
|
||||
|
|
|
@ -19,7 +19,8 @@
|
|||
from .misc import sysinfo, log_multi, consume
|
||||
from .misc import ChunkIteratorFileWrapper, open_item, chunkit, iter_separated, ErrorIgnoringTextIOWrapper
|
||||
from .parseformat import bin_to_hex, safe_encode, safe_decode
|
||||
from .parseformat import remove_surrogates, eval_escapes, decode_dict, positive_int_validator, interval
|
||||
from .parseformat import text_to_json, binary_to_json, remove_surrogates
|
||||
from .parseformat import eval_escapes, decode_dict, positive_int_validator, interval
|
||||
from .parseformat import SortBySpec, ChunkerParams, FilesCacheMode, partial_format, DatetimeWrapper
|
||||
from .parseformat import format_file_size, parse_file_size, FileSize, parse_storage_quota
|
||||
from .parseformat import sizeof_fmt, sizeof_fmt_iec, sizeof_fmt_decimal
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
import argparse
|
||||
import base64
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
|
@ -50,6 +51,41 @@ def remove_surrogates(s, errors="replace"):
|
|||
return s.encode("utf-8", errors).decode("utf-8")
|
||||
|
||||
|
||||
def binary_to_json(key, value):
    """Return a JSON-safe dict mapping ``<key>_b64`` to the base64 encoding of *value* (bytes)."""
    assert isinstance(key, str)
    assert isinstance(value, bytes)
    encoded = base64.b64encode(value).decode("ascii")
    return {f"{key}_b64": encoded}
|
||||
|
||||
|
||||
def text_to_json(key, value):
    """
    Build a JSON-encodable dict from key/value.

    JSON must only contain pure valid unicode, but *value* (a str) may carry
    surrogate escapes from decoding arbitrary bytes. The result therefore has:

    - <key>: always present, pure unicode (surrogate escapes, if any, show up as "?")
    - <key>_b64: present only when *value* has surrogate escapes; base64 of the
      original byte sequence (UTF-8 + surrogateescape encoding of *value*)
    """
    assert isinstance(key, str)
    assert isinstance(value, str)  # str might contain surrogate escapes
    try:
        value.encode("utf-8", errors="strict")  # raises iff value is not pure unicode
    except UnicodeEncodeError:
        # value has surrogate escape sequences: give a "?"-approximation plus exact bytes
        approximation = value.encode("utf-8", errors="replace").decode("utf-8", errors="strict")
        original_bytes = value.encode("utf-8", errors="surrogateescape")
        return {
            key: approximation,
            key + "_b64": base64.b64encode(original_bytes).decode("ascii"),
        }
    # value is pure unicode - the b64 representation is not needed
    return {key: value}
|
||||
|
||||
|
||||
def eval_escapes(s):
    """Interpret literal backslash escape sequences in *s*, e.g. a two-char ``\\n`` becomes a newline."""
    # backslashreplace keeps non-ascii chars representable, unicode-escape then evaluates all escapes
    escaped = s.encode("ascii", "backslashreplace")
    return escaped.decode("unicode-escape")
|
||||
|
@ -681,7 +717,7 @@ def __init__(self, format, repository, manifest, key, *, json=False, iec=False):
|
|||
self.call_keys = {
|
||||
"hostname": partial(self.get_meta, "hostname", rs=True),
|
||||
"username": partial(self.get_meta, "username", rs=True),
|
||||
"comment": partial(self.get_meta, "comment", rs=True),
|
||||
"comment": partial(self.get_meta, "comment", rs=False),
|
||||
"end": self.get_ts_end,
|
||||
"command_line": self.get_cmdline,
|
||||
}
|
||||
|
@ -702,8 +738,8 @@ def get_item_data(self, archive_info):
|
|||
item_data.update(self.item_data)
|
||||
item_data.update(
|
||||
{
|
||||
"name": remove_surrogates(archive_info.name),
|
||||
"archive": remove_surrogates(archive_info.name),
|
||||
"name": archive_info.name,
|
||||
"archive": archive_info.name,
|
||||
"id": bin_to_hex(archive_info.id),
|
||||
"time": self.format_time(archive_info.ts),
|
||||
"start": self.format_time(archive_info.ts),
|
||||
|
@ -840,31 +876,32 @@ def format_item_json(self, item):
|
|||
def get_item_data(self, item):
|
||||
item_data = {}
|
||||
item_data.update(self.item_data)
|
||||
mode = stat.filemode(item.mode)
|
||||
item_type = mode[0]
|
||||
|
||||
item_data.update(text_to_json("path", item.path))
|
||||
source = item.get("source", "")
|
||||
extra = ""
|
||||
if source:
|
||||
source = remove_surrogates(source)
|
||||
extra = " -> %s" % source
|
||||
item_data.update(text_to_json("source", source))
|
||||
item_data.update(text_to_json("linktarget", source))
|
||||
if not self.json_lines:
|
||||
item_data["extra"] = "" if not source else f" -> {item_data['source']}"
|
||||
|
||||
hlid = item.get("hlid")
|
||||
hlid = bin_to_hex(hlid) if hlid else ""
|
||||
item_data["hlid"] = hlid
|
||||
|
||||
mode = stat.filemode(item.mode)
|
||||
item_type = mode[0]
|
||||
item_data["type"] = item_type
|
||||
item_data["mode"] = mode
|
||||
item_data["user"] = item.get("user", str(item.uid))
|
||||
item_data["group"] = item.get("group", str(item.gid))
|
||||
|
||||
item_data.update(text_to_json("user", item.get("user", str(item.uid))))
|
||||
item_data.update(text_to_json("group", item.get("group", str(item.gid))))
|
||||
item_data["uid"] = item.uid
|
||||
item_data["gid"] = item.gid
|
||||
item_data["path"] = remove_surrogates(item.path)
|
||||
|
||||
if self.json_lines:
|
||||
item_data["healthy"] = "chunks_healthy" not in item
|
||||
else:
|
||||
item_data["extra"] = extra
|
||||
item_data["health"] = "broken" if "chunks_healthy" in item else "healthy"
|
||||
item_data["source"] = source
|
||||
item_data["linktarget"] = source
|
||||
item_data["hlid"] = hlid
|
||||
item_data["flags"] = item.get("bsdflags") # int if flags known, else (if flags unknown) None
|
||||
for key in self.used_call_keys:
|
||||
item_data[key] = self.call_keys[key](item)
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
import base64
|
||||
import errno
|
||||
import getpass
|
||||
import hashlib
|
||||
|
@ -42,6 +43,7 @@
|
|||
from ..helpers import iter_separated
|
||||
from ..helpers import eval_escapes
|
||||
from ..helpers import safe_unlink
|
||||
from ..helpers import text_to_json, binary_to_json
|
||||
from ..helpers.passphrase import Passphrase, PasswordRetriesExceeded
|
||||
from ..platform import is_cygwin
|
||||
|
||||
|
@ -53,6 +55,46 @@ def test_bin_to_hex():
|
|||
assert bin_to_hex(b"\x00\x01\xff") == "0001ff"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "key,value",
    [("key", b"\x00\x01\x02\x03"), ("key", b"\x00\x01\x02"), ("key", b"\x00\x01"), ("key", b"\x00"), ("key", b"")],
)
def test_binary_to_json(key, value):
    # the b64 entry must be present and must round-trip back to the original bytes
    result = binary_to_json(key, value)
    b64_key = key + "_b64"
    assert b64_key in result
    assert base64.b64decode(result[b64_key]) == value
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "key,value,strict",
    [
        ("key", "abc", True),
        ("key", "äöü", True),
        ("key", "", True),
        ("key", b"\x00\xff".decode("utf-8", errors="surrogateescape"), False),
        ("key", "äöü".encode("latin1").decode("utf-8", errors="surrogateescape"), False),
    ],
)
def test_text_to_json(key, value, strict):
    result = text_to_json(key, value)
    b64_key = key + "_b64"
    as_bytes = value.encode("utf-8", errors="surrogateescape")
    assert key in result
    if strict:
        # pure unicode text: round-trips exactly, no b64 entry needed
        assert result[key] == as_bytes.decode("utf-8", errors="strict")
        assert result[key].encode("utf-8", errors="strict") == as_bytes
        assert b64_key not in result
    else:
        # surrogate escapes present: text entry has replacement chars, b64 entry restores the bytes
        replaced = value.encode("utf-8", errors="replace")
        assert result[key] == replaced.decode("utf-8", errors="strict")
        assert result[key].encode("utf-8", errors="strict") == replaced
        assert b64_key in result
        assert base64.b64decode(result[b64_key]) == as_bytes
|
||||
|
||||
|
||||
class TestLocationWithoutEnv:
|
||||
@pytest.fixture
|
||||
def keys_dir(self, tmpdir, monkeypatch):
|
||||
|
|
Loading…
Reference in a new issue