diff --git a/docs/internals/frontends.rst b/docs/internals/frontends.rst index 42c6c67aa..7f2af1e5b 100644 --- a/docs/internals/frontends.rst +++ b/docs/internals/frontends.rst @@ -29,6 +29,42 @@ On POSIX systems, you can usually set environment vars to choose a UTF-8 locale: export LC_CTYPE=en_US.UTF-8 +Dealing with non-unicode byte sequences and JSON limitations +------------------------------------------------------------ + +Paths on POSIX systems can have arbitrary bytes in them (except 0x00 which is used as string terminator in C). + +Nowadays, UTF-8 encoded paths (which decode to valid unicode) are the usual thing, but a lot of systems +still have paths from the past, when other, non-unicode encodings were used. Especially old Samba shares often +have wild mixtures of misc. encodings, sometimes even very broken stuff. + +borg deals with such non-unicode paths ("with funny/broken characters") by decoding such byte sequences using +UTF-8 encoding and "surrogateescape" error handling mode, which maps invalid bytes to special unicode code points +(surrogate escapes). When encoding such a unicode string back to a byte sequence, the original byte sequence +will be reproduced exactly. + +JSON should only contain valid unicode text without any surrogate escapes, so we can't just directly have a +surrogate-escaped path in JSON ("path" is only one example, this also affects other text-like content). + +Borg deals with this situation like this (since borg 2.0): + +For a valid unicode path (no surrogate escapes), the JSON will only have "path": path. + +For a non-unicode path (with surrogate escapes), the JSON will have 2 entries: + +- "path": path_approximation (pure valid unicode, all invalid bytes will show up as "?") +- "path_b64": path_bytes_base64_encoded (if you decode the base64, you get the original path byte string) + +JSON users need to pick whatever suits their needs best. The suggested procedure (shown for "path") is: + +- check if there is a "path_b64" key. 
+- if it is there, you will know that the original bytes path did not cleanly UTF-8-decode into unicode (has + some invalid bytes) and that the string given by the "path" key is only an approximation, but not the precise + path. if you need precision, you must base64-decode the value of "path_b64" and deal with the arbitrary byte + string you'll get. if an approximation is fine, use the value of the "path" key. +- if it is not there, the value of the "path" key is all you need (the original bytes path is its UTF-8 encoding). + + Logging ------- @@ -40,8 +76,6 @@ where each line is a JSON object. The *type* key of the object determines its ot parsing error will be printed in plain text, because logging set-up happens after all arguments are parsed. -Since JSON can only encode text, any string representing a file system path may miss non-text parts. - The following types are in use. Progress information is governed by the usual rules for progress information, it is not produced unless ``--progress`` is specified. 
diff --git a/src/borg/archive.py b/src/borg/archive.py index b581f785e..8a7564b6f 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -32,7 +32,7 @@ from .platform import uid2user, user2uid, gid2group, group2gid from .helpers import parse_timestamp, archive_ts_now from .helpers import OutputTimestamp, format_timedelta, format_file_size, file_status, FileSize -from .helpers import safe_encode, make_path_safe, remove_surrogates +from .helpers import safe_encode, make_path_safe, remove_surrogates, text_to_json from .helpers import StableDict from .helpers import bin_to_hex from .helpers import safe_ns @@ -165,7 +165,8 @@ def show_progress(self, item=None, final=False, stream=None, dt=None): if self.output_json: if not final: data = self.as_dict() - data["path"] = remove_surrogates(item.path if item else "") + if item: + data.update(text_to_json("path", item.path)) else: data = {} data.update({"time": time.time(), "type": "archive_progress", "finished": final}) diff --git a/src/borg/archiver/__init__.py b/src/borg/archiver/__init__.py index 1549f2af7..ad74665aa 100644 --- a/src/borg/archiver/__init__.py +++ b/src/borg/archiver/__init__.py @@ -26,7 +26,7 @@ from ..helpers import EXIT_SUCCESS, EXIT_WARNING, EXIT_ERROR, EXIT_SIGNAL_BASE from ..helpers import Error, set_ec from ..helpers import format_file_size - from ..helpers import remove_surrogates + from ..helpers import remove_surrogates, text_to_json from ..helpers import DatetimeWrapper, replace_placeholders from ..helpers import check_python, check_extension_modules from ..helpers import is_slow_msgpack, is_supported_msgpack, sysinfo @@ -139,10 +139,9 @@ def print_file_status(self, status, path): # if we get called with status == None, the final file status was already printed if self.output_list and status is not None and (self.output_filter is None or status in self.output_filter): if self.log_json: - print( - json.dumps({"type": "file_status", "status": status, "path": remove_surrogates(path)}), - 
file=sys.stderr, - ) + json_data = {"type": "file_status", "status": status} + json_data.update(text_to_json("path", path)) + print(json.dumps(json_data), file=sys.stderr) else: logging.getLogger("borg.output.list").info("%1s %s", status, remove_surrogates(path)) diff --git a/src/borg/cache.py b/src/borg/cache.py index fc438d50a..90ded1e40 100644 --- a/src/borg/cache.py +++ b/src/borg/cache.py @@ -863,7 +863,7 @@ def create_master_idx(chunk_idx): ) archive_ids_to_names = get_archive_ids_to_names(archive_ids) for archive_id, archive_name in archive_ids_to_names.items(): - pi.show(info=[remove_surrogates(archive_name)]) + pi.show(info=[remove_surrogates(archive_name)]) # legacy. borg2 always has pure unicode arch names. if self.do_cache: if archive_id in cached_ids: archive_chunk_idx = read_archive_index(archive_id, archive_name) diff --git a/src/borg/helpers/__init__.py b/src/borg/helpers/__init__.py index 36f6aa5c0..bef660894 100644 --- a/src/borg/helpers/__init__.py +++ b/src/borg/helpers/__init__.py @@ -19,7 +19,8 @@ from .misc import sysinfo, log_multi, consume from .misc import ChunkIteratorFileWrapper, open_item, chunkit, iter_separated, ErrorIgnoringTextIOWrapper from .parseformat import bin_to_hex, safe_encode, safe_decode -from .parseformat import remove_surrogates, eval_escapes, decode_dict, positive_int_validator, interval +from .parseformat import text_to_json, binary_to_json, remove_surrogates +from .parseformat import eval_escapes, decode_dict, positive_int_validator, interval from .parseformat import SortBySpec, ChunkerParams, FilesCacheMode, partial_format, DatetimeWrapper from .parseformat import format_file_size, parse_file_size, FileSize, parse_storage_quota from .parseformat import sizeof_fmt, sizeof_fmt_iec, sizeof_fmt_decimal diff --git a/src/borg/helpers/parseformat.py b/src/borg/helpers/parseformat.py index 686431931..f1142871d 100644 --- a/src/borg/helpers/parseformat.py +++ b/src/borg/helpers/parseformat.py @@ -1,4 +1,5 @@ import argparse 
+import base64 import hashlib import json import os @@ -50,6 +51,41 @@ def remove_surrogates(s, errors="replace"): return s.encode("utf-8", errors).decode("utf-8") +def binary_to_json(key, value): + assert isinstance(key, str) + assert isinstance(value, bytes) + return {key + "_b64": base64.b64encode(value).decode("ascii")} + + +def text_to_json(key, value): + """ + Return a dict made from key/value that can be fed safely into a JSON encoder. + + JSON can only contain pure, valid unicode (but not: unicode with surrogate escapes). + + But sometimes we have to deal with such values and we do it like this: + - <key>: value as pure unicode text (surrogate escapes, if any, replaced by ?) + - <key>_b64: value as base64 encoded binary representation (only set if value has surrogate-escapes) + """ + coding = "utf-8" + assert isinstance(key, str) + assert isinstance(value, str) # str might contain surrogate escapes + data = {} + try: + value.encode(coding, errors="strict") # check if pure unicode + except UnicodeEncodeError: + # value has surrogate escape sequences + value_replace_encoded = value.encode(coding, errors="replace") + data[key] = value_replace_encoded.decode(coding, errors="strict") + value_bytes = value.encode(coding, errors="surrogateescape") + data.update(binary_to_json(key, value_bytes)) + else: + # value is pure unicode + data[key] = value + # we do not give the b64 representation, not needed + return data + + def eval_escapes(s): """Evaluate literal escape sequences in a string (eg `\\n` -> `\n`).""" return s.encode("ascii", "backslashreplace").decode("unicode-escape") @@ -681,7 +717,7 @@ def __init__(self, format, repository, manifest, key, *, json=False, iec=False): self.call_keys = { "hostname": partial(self.get_meta, "hostname", rs=True), "username": partial(self.get_meta, "username", rs=True), - "comment": partial(self.get_meta, "comment", rs=True), + "comment": partial(self.get_meta, "comment", rs=False), "end": self.get_ts_end, "command_line": 
self.get_cmdline, } @@ -702,8 +738,8 @@ def get_item_data(self, archive_info): item_data.update(self.item_data) item_data.update( { - "name": remove_surrogates(archive_info.name), - "archive": remove_surrogates(archive_info.name), + "name": archive_info.name, + "archive": archive_info.name, "id": bin_to_hex(archive_info.id), "time": self.format_time(archive_info.ts), "start": self.format_time(archive_info.ts), @@ -840,31 +876,32 @@ def format_item_json(self, item): def get_item_data(self, item): item_data = {} item_data.update(self.item_data) - mode = stat.filemode(item.mode) - item_type = mode[0] + item_data.update(text_to_json("path", item.path)) source = item.get("source", "") - extra = "" - if source: - source = remove_surrogates(source) - extra = " -> %s" % source + item_data.update(text_to_json("source", source)) + item_data.update(text_to_json("linktarget", source)) + if not self.json_lines: + item_data["extra"] = "" if not source else f" -> {item_data['source']}" + hlid = item.get("hlid") hlid = bin_to_hex(hlid) if hlid else "" + item_data["hlid"] = hlid + + mode = stat.filemode(item.mode) + item_type = mode[0] item_data["type"] = item_type item_data["mode"] = mode - item_data["user"] = item.get("user", str(item.uid)) - item_data["group"] = item.get("group", str(item.gid)) + + item_data.update(text_to_json("user", item.get("user", str(item.uid)))) + item_data.update(text_to_json("group", item.get("group", str(item.gid)))) item_data["uid"] = item.uid item_data["gid"] = item.gid - item_data["path"] = remove_surrogates(item.path) + if self.json_lines: item_data["healthy"] = "chunks_healthy" not in item else: - item_data["extra"] = extra item_data["health"] = "broken" if "chunks_healthy" in item else "healthy" - item_data["source"] = source - item_data["linktarget"] = source - item_data["hlid"] = hlid item_data["flags"] = item.get("bsdflags") # int if flags known, else (if flags unknown) None for key in self.used_call_keys: item_data[key] = 
self.call_keys[key](item) diff --git a/src/borg/testsuite/helpers.py b/src/borg/testsuite/helpers.py index b8764b008..7b0830d9a 100644 --- a/src/borg/testsuite/helpers.py +++ b/src/borg/testsuite/helpers.py @@ -1,3 +1,4 @@ +import base64 import errno import getpass import hashlib @@ -42,6 +43,7 @@ from ..helpers import iter_separated from ..helpers import eval_escapes from ..helpers import safe_unlink +from ..helpers import text_to_json, binary_to_json from ..helpers.passphrase import Passphrase, PasswordRetriesExceeded from ..platform import is_cygwin @@ -53,6 +55,46 @@ def test_bin_to_hex(): assert bin_to_hex(b"\x00\x01\xff") == "0001ff" +@pytest.mark.parametrize( + "key,value", + [("key", b"\x00\x01\x02\x03"), ("key", b"\x00\x01\x02"), ("key", b"\x00\x01"), ("key", b"\x00"), ("key", b"")], +) +def test_binary_to_json(key, value): + key_b64 = key + "_b64" + d = binary_to_json(key, value) + assert key_b64 in d + assert base64.b64decode(d[key_b64]) == value + + +@pytest.mark.parametrize( + "key,value,strict", + [ + ("key", "abc", True), + ("key", "äöü", True), + ("key", "", True), + ("key", b"\x00\xff".decode("utf-8", errors="surrogateescape"), False), + ("key", "äöü".encode("latin1").decode("utf-8", errors="surrogateescape"), False), + ], +) +def test_text_to_json(key, value, strict): + key_b64 = key + "_b64" + d = text_to_json(key, value) + value_b = value.encode("utf-8", errors="surrogateescape") + if strict: + # no surrogate-escapes, just unicode text + assert key in d + assert d[key] == value_b.decode("utf-8", errors="strict") + assert d[key].encode("utf-8", errors="strict") == value_b + assert key_b64 not in d # not needed. pure valid unicode. + else: + # requiring surrogate-escapes. text has replacement chars, base64 representation is present. 
+ assert key in d + assert d[key] == value.encode("utf-8", errors="replace").decode("utf-8", errors="strict") + assert d[key].encode("utf-8", errors="strict") == value.encode("utf-8", errors="replace") + assert key_b64 in d + assert base64.b64decode(d[key_b64]) == value_b + + class TestLocationWithoutEnv: @pytest.fixture def keys_dir(self, tmpdir, monkeypatch):