From 32d430a1b0e1909a042d6363b0c5cf9a92698471 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Thu, 29 Dec 2022 00:38:42 +0100 Subject: [PATCH 1/4] implement text_to_json / binary_to_json, see #6151 binary bytes: - json_key = <key>_b64 - json_value == base64(value) text (potentially with surrogate escapes): - json_key1 = <key> - json_value1 = value_text (s-e replaced by ?) - json_key2 = <key>_b64 - json_value2 = base64(value_binary) json_key2/_value2 is only present if value_text required replacement of surrogate escapes (and thus does not represent the original value, but just an approximation). value_binary then gives the original bytes value (e.g. a non-utf8 bytes sequence). --- src/borg/helpers/__init__.py | 3 ++- src/borg/helpers/parseformat.py | 36 ++++++++++++++++++++++++++++ src/borg/testsuite/helpers.py | 42 +++++++++++++++++++++++++++++++++ 3 files changed, 80 insertions(+), 1 deletion(-) diff --git a/src/borg/helpers/__init__.py b/src/borg/helpers/__init__.py index 36f6aa5c0..bef660894 100644 --- a/src/borg/helpers/__init__.py +++ b/src/borg/helpers/__init__.py @@ -19,7 +19,8 @@ from .misc import sysinfo, log_multi, consume from .misc import ChunkIteratorFileWrapper, open_item, chunkit, iter_separated, ErrorIgnoringTextIOWrapper from .parseformat import bin_to_hex, safe_encode, safe_decode -from .parseformat import remove_surrogates, eval_escapes, decode_dict, positive_int_validator, interval +from .parseformat import text_to_json, binary_to_json, remove_surrogates +from .parseformat import eval_escapes, decode_dict, positive_int_validator, interval from .parseformat import SortBySpec, ChunkerParams, FilesCacheMode, partial_format, DatetimeWrapper from .parseformat import format_file_size, parse_file_size, FileSize, parse_storage_quota from .parseformat import sizeof_fmt, sizeof_fmt_iec, sizeof_fmt_decimal diff --git a/src/borg/helpers/parseformat.py b/src/borg/helpers/parseformat.py index 686431931..d9f8b22e6 100644 --- a/src/borg/helpers/parseformat.py +++ 
b/src/borg/helpers/parseformat.py @@ -1,4 +1,5 @@ import argparse +import base64 import hashlib import json import os @@ -50,6 +51,41 @@ def remove_surrogates(s, errors="replace"): return s.encode("utf-8", errors).decode("utf-8") +def binary_to_json(key, value): + assert isinstance(key, str) + assert isinstance(value, bytes) + return {key + "_b64": base64.b64encode(value).decode("ascii")} + + +def text_to_json(key, value): + """ + Return a dict made from key/value that can be fed safely into a JSON encoder. + + JSON can only contain pure, valid unicode (but not: unicode with surrogate escapes). + + But sometimes we have to deal with such values and we do it like this: + - <key>: value as pure unicode text (surrogate escapes, if any, replaced by ?) + - <key>_b64: value as base64 encoded binary representation (only set if value has surrogate-escapes) + """ + coding = "utf-8" + assert isinstance(key, str) + assert isinstance(value, str) # str might contain surrogate escapes + data = {} + try: + value.encode(coding, errors="strict") # check if pure unicode + except UnicodeEncodeError: + # value has surrogate escape sequences + value_replace_encoded = value.encode(coding, errors="replace") + data[key] = value_replace_encoded.decode(coding, errors="strict") + value_bytes = value.encode(coding, errors="surrogateescape") + data.update(binary_to_json(key, value_bytes)) + else: + # value is pure unicode + data[key] = value + # we do not give the b64 representation, not needed + return data + + def eval_escapes(s): """Evaluate literal escape sequences in a string (eg `\\n` -> `\n`).""" return s.encode("ascii", "backslashreplace").decode("unicode-escape") diff --git a/src/borg/testsuite/helpers.py b/src/borg/testsuite/helpers.py index b8764b008..7b0830d9a 100644 --- a/src/borg/testsuite/helpers.py +++ b/src/borg/testsuite/helpers.py @@ -1,3 +1,4 @@ +import base64 import errno import getpass import hashlib @@ -42,6 +43,7 @@ from ..helpers import iter_separated from ..helpers import 
eval_escapes from ..helpers import safe_unlink +from ..helpers import text_to_json, binary_to_json from ..helpers.passphrase import Passphrase, PasswordRetriesExceeded from ..platform import is_cygwin @@ -53,6 +55,46 @@ def test_bin_to_hex(): assert bin_to_hex(b"\x00\x01\xff") == "0001ff" +@pytest.mark.parametrize( + "key,value", + [("key", b"\x00\x01\x02\x03"), ("key", b"\x00\x01\x02"), ("key", b"\x00\x01"), ("key", b"\x00"), ("key", b"")], +) +def test_binary_to_json(key, value): + key_b64 = key + "_b64" + d = binary_to_json(key, value) + assert key_b64 in d + assert base64.b64decode(d[key_b64]) == value + + +@pytest.mark.parametrize( + "key,value,strict", + [ + ("key", "abc", True), + ("key", "äöü", True), + ("key", "", True), + ("key", b"\x00\xff".decode("utf-8", errors="surrogateescape"), False), + ("key", "äöü".encode("latin1").decode("utf-8", errors="surrogateescape"), False), + ], +) +def test_text_to_json(key, value, strict): + key_b64 = key + "_b64" + d = text_to_json(key, value) + value_b = value.encode("utf-8", errors="surrogateescape") + if strict: + # no surrogate-escapes, just unicode text + assert key in d + assert d[key] == value_b.decode("utf-8", errors="strict") + assert d[key].encode("utf-8", errors="strict") == value_b + assert key_b64 not in d # not needed. pure valid unicode. + else: + # requiring surrogate-escapes. text has replacement chars, base64 representation is present. 
+ assert key in d + assert d[key] == value.encode("utf-8", errors="replace").decode("utf-8", errors="strict") + assert d[key].encode("utf-8", errors="strict") == value.encode("utf-8", errors="replace") + assert key_b64 in d + assert base64.b64decode(d[key_b64]) == value_b + + class TestLocationWithoutEnv: @pytest.fixture def keys_dir(self, tmpdir, monkeypatch): From e63cfcd70804be6eb62aa26d2895549bb78408ce Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Thu, 29 Dec 2022 20:03:46 +0100 Subject: [PATCH 2/4] json output: use text_to_json, fixes #6151 item: path, source, user, group for non-unicode stuff borg 1.2 had "bpath". now we have: path - unicode approximation (invalid stuff replaced by ?) path_b64 - base64(path_bytes) # only if needed source has the same issue as path and is now covered also. user and group are usually unicode or even pure ASCII, but we rather are cautious and cover them also. --- src/borg/archive.py | 5 +++-- src/borg/archiver/__init__.py | 9 ++++----- src/borg/helpers/parseformat.py | 27 ++++++++++++++------------- 3 files changed, 21 insertions(+), 20 deletions(-) diff --git a/src/borg/archive.py b/src/borg/archive.py index 3fb51b141..fc71ab06d 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -32,7 +32,7 @@ from .platform import uid2user, user2uid, gid2group, group2gid from .helpers import parse_timestamp, archive_ts_now from .helpers import OutputTimestamp, format_timedelta, format_file_size, file_status, FileSize -from .helpers import safe_encode, make_path_safe, remove_surrogates +from .helpers import safe_encode, make_path_safe, remove_surrogates, text_to_json from .helpers import StableDict from .helpers import bin_to_hex from .helpers import safe_ns @@ -165,7 +165,8 @@ def show_progress(self, item=None, final=False, stream=None, dt=None): if self.output_json: if not final: data = self.as_dict() - data["path"] = remove_surrogates(item.path if item else "") + if item: + data.update(text_to_json("path", item.path)) else: 
data = {} data.update({"time": time.time(), "type": "archive_progress", "finished": final}) diff --git a/src/borg/archiver/__init__.py b/src/borg/archiver/__init__.py index 1549f2af7..ad74665aa 100644 --- a/src/borg/archiver/__init__.py +++ b/src/borg/archiver/__init__.py @@ -26,7 +26,7 @@ from ..helpers import EXIT_SUCCESS, EXIT_WARNING, EXIT_ERROR, EXIT_SIGNAL_BASE from ..helpers import Error, set_ec from ..helpers import format_file_size - from ..helpers import remove_surrogates + from ..helpers import remove_surrogates, text_to_json from ..helpers import DatetimeWrapper, replace_placeholders from ..helpers import check_python, check_extension_modules from ..helpers import is_slow_msgpack, is_supported_msgpack, sysinfo @@ -139,10 +139,9 @@ def print_file_status(self, status, path): # if we get called with status == None, the final file status was already printed if self.output_list and status is not None and (self.output_filter is None or status in self.output_filter): if self.log_json: - print( - json.dumps({"type": "file_status", "status": status, "path": remove_surrogates(path)}), - file=sys.stderr, - ) + json_data = {"type": "file_status", "status": status} + json_data.update(text_to_json("path", path)) + print(json.dumps(json_data), file=sys.stderr) else: logging.getLogger("borg.output.list").info("%1s %s", status, remove_surrogates(path)) diff --git a/src/borg/helpers/parseformat.py b/src/borg/helpers/parseformat.py index d9f8b22e6..b08626b50 100644 --- a/src/borg/helpers/parseformat.py +++ b/src/borg/helpers/parseformat.py @@ -876,31 +876,32 @@ def format_item_json(self, item): def get_item_data(self, item): item_data = {} item_data.update(self.item_data) - mode = stat.filemode(item.mode) - item_type = mode[0] + item_data.update(text_to_json("path", item.path)) source = item.get("source", "") - extra = "" - if source: - source = remove_surrogates(source) - extra = " -> %s" % source + item_data.update(text_to_json("source", source)) + 
item_data.update(text_to_json("linktarget", source)) + if not self.json_lines: + item_data["extra"] = "" if not source else f" -> {item_data['source']}" + hlid = item.get("hlid") hlid = bin_to_hex(hlid) if hlid else "" + item_data["hlid"] = hlid + + mode = stat.filemode(item.mode) + item_type = mode[0] item_data["type"] = item_type item_data["mode"] = mode - item_data["user"] = item.get("user", str(item.uid)) - item_data["group"] = item.get("group", str(item.gid)) + + item_data.update(text_to_json("user", item.get("user", str(item.uid)))) + item_data.update(text_to_json("group", item.get("group", str(item.gid)))) item_data["uid"] = item.uid item_data["gid"] = item.gid - item_data["path"] = remove_surrogates(item.path) + if self.json_lines: item_data["healthy"] = "chunks_healthy" not in item else: - item_data["extra"] = extra item_data["health"] = "broken" if "chunks_healthy" in item else "healthy" - item_data["source"] = source - item_data["linktarget"] = source - item_data["hlid"] = hlid item_data["flags"] = item.get("bsdflags") # int if flags known, else (if flags unknown) None for key in self.used_call_keys: item_data[key] = self.call_keys[key](item) From 8765e62bcd6be1675b64ad3ae4a721137145bf77 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Thu, 29 Dec 2022 22:16:33 +0100 Subject: [PATCH 3/4] document how borg deals with non-unicode bytes in JSON output --- docs/internals/frontends.rst | 38 ++++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/docs/internals/frontends.rst b/docs/internals/frontends.rst index 42c6c67aa..7f2af1e5b 100644 --- a/docs/internals/frontends.rst +++ b/docs/internals/frontends.rst @@ -29,6 +29,42 @@ On POSIX systems, you can usually set environment vars to choose a UTF-8 locale: export LC_CTYPE=en_US.UTF-8 +Dealing with non-unicode byte sequences and JSON limitations +------------------------------------------------------------ + +Paths on POSIX systems can have arbitrary bytes in them 
(except 0x00 which is used as string terminator in C). + +Nowadays, UTF-8 encoded paths (which decode to valid unicode) are the usual thing, but a lot of systems +still have paths from the past, when other, non-unicode codings were used. Especially old Samba shares often +have wild mixtures of misc. encodings, sometimes even very broken stuff. + +borg deals with such non-unicode paths ("with funny/broken characters") by decoding such byte sequences using +UTF-8 coding and "surrogateescape" error handling mode, which maps invalid bytes to special unicode code points +(surrogate escapes). When encoding such a unicode string back to a byte sequence, the original byte sequence +will be reproduced exactly. + +JSON should only contain valid unicode text without any surrogate escapes, so we can't just directly have a +surrogate-escaped path in JSON ("path" is only one example, this also affects other text-like content). + +Borg deals with this situation like this (since borg 2.0): + +For a valid unicode path (no surrogate escapes), the JSON will only have "path": path. + +For a non-unicode path (with surrogate escapes), the JSON will have 2 entries: + +- "path": path_approximation (pure valid unicode, all invalid bytes will show up as "?") +- "path_b64": path_bytes_base64_encoded (if you decode the base64, you get the original path byte string) + +JSON users need to pick whatever suits their needs best. The suggested procedure (shown for "path") is: + +- check if there is a "path_b64" key. +- if it is there, you will know that the original bytes path did not cleanly UTF-8-decode into unicode (has + some invalid bytes) and that the string given by the "path" key is only an approximation, but not the precise + path. if you need precision, you must base64-decode the value of "path_b64" and deal with the arbitrary byte + string you'll get. if an approximation is fine, use the value of the "path" key. 
+- if it is not there, the value of the "path" key is all you need (the original bytes path is its UTF-8 encoding). + + Logging ------- @@ -40,8 +76,6 @@ where each line is a JSON object. The *type* key of the object determines its ot parsing error will be printed in plain text, because logging set-up happens after all arguments are parsed. -Since JSON can only encode text, any string representing a file system path may miss non-text parts. - The following types are in use. Progress information is governed by the usual rules for progress information, it is not produced unless ``--progress`` is specified. From 491f898612e614d29e3b293282e6db1ea0de7427 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sun, 8 Jan 2023 20:11:01 +0100 Subject: [PATCH 4/4] borg2 archive names and comments are always pure unicode --- src/borg/cache.py | 2 +- src/borg/helpers/parseformat.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/borg/cache.py b/src/borg/cache.py index fc438d50a..90ded1e40 100644 --- a/src/borg/cache.py +++ b/src/borg/cache.py @@ -863,7 +863,7 @@ def create_master_idx(chunk_idx): ) archive_ids_to_names = get_archive_ids_to_names(archive_ids) for archive_id, archive_name in archive_ids_to_names.items(): - pi.show(info=[remove_surrogates(archive_name)]) + pi.show(info=[remove_surrogates(archive_name)]) # legacy. borg2 always has pure unicode arch names. 
if self.do_cache: if archive_id in cached_ids: archive_chunk_idx = read_archive_index(archive_id, archive_name) diff --git a/src/borg/helpers/parseformat.py b/src/borg/helpers/parseformat.py index b08626b50..f1142871d 100644 --- a/src/borg/helpers/parseformat.py +++ b/src/borg/helpers/parseformat.py @@ -717,7 +717,7 @@ def __init__(self, format, repository, manifest, key, *, json=False, iec=False): self.call_keys = { "hostname": partial(self.get_meta, "hostname", rs=True), "username": partial(self.get_meta, "username", rs=True), - "comment": partial(self.get_meta, "comment", rs=True), + "comment": partial(self.get_meta, "comment", rs=False), "end": self.get_ts_end, "command_line": self.get_cmdline, } @@ -738,8 +738,8 @@ def get_item_data(self, archive_info): item_data.update(self.item_data) item_data.update( { - "name": remove_surrogates(archive_info.name), - "archive": remove_surrogates(archive_info.name), + "name": archive_info.name, + "archive": archive_info.name, "id": bin_to_hex(archive_info.id), "time": self.format_time(archive_info.ts), "start": self.format_time(archive_info.ts),