From 32d430a1b0e1909a042d6363b0c5cf9a92698471 Mon Sep 17 00:00:00 2001
From: Thomas Waldmann <tw@waldmann-edv.de>
Date: Thu, 29 Dec 2022 00:38:42 +0100
Subject: [PATCH 1/4] implement text_to_json / binary_to_json, see #6151

binary bytes:
- json_key = <key>_b64
- json_value = base64(value)

text (potentially with surrogate escapes):
- json_key1 = <key>
- json_value1 = value_text (s-e replaced by ?)
- json_key2 = <key>_b64
- json_value2 = base64(value_binary)

json_key2/json_value2 are only present if value_text required
replacement of surrogate escapes (and thus does not represent
the original value, but just an approximation).
value_binary then gives the original bytes value (e.g. a
non-utf8 byte sequence).
---
 src/borg/helpers/__init__.py    |  3 ++-
 src/borg/helpers/parseformat.py | 36 ++++++++++++++++++++++++++++
 src/borg/testsuite/helpers.py   | 42 +++++++++++++++++++++++++++++++++
 3 files changed, 80 insertions(+), 1 deletion(-)
diff --git a/src/borg/helpers/__init__.py b/src/borg/helpers/__init__.py
index 36f6aa5c0..bef660894 100644
--- a/src/borg/helpers/__init__.py
+++ b/src/borg/helpers/__init__.py
@@ -19,7 +19,8 @@
 from .misc import sysinfo, log_multi, consume
 from .misc import ChunkIteratorFileWrapper, open_item, chunkit, iter_separated, ErrorIgnoringTextIOWrapper
 from .parseformat import bin_to_hex, safe_encode, safe_decode
-from .parseformat import remove_surrogates, eval_escapes, decode_dict, positive_int_validator, interval
+from .parseformat import text_to_json, binary_to_json, remove_surrogates
+from .parseformat import eval_escapes, decode_dict, positive_int_validator, interval
 from .parseformat import SortBySpec, ChunkerParams, FilesCacheMode, partial_format, DatetimeWrapper
 from .parseformat import format_file_size, parse_file_size, FileSize, parse_storage_quota
 from .parseformat import sizeof_fmt, sizeof_fmt_iec, sizeof_fmt_decimal
diff --git a/src/borg/helpers/parseformat.py b/src/borg/helpers/parseformat.py
index 686431931..d9f8b22e6 100644
--- a/src/borg/helpers/parseformat.py
+++ b/src/borg/helpers/parseformat.py
@@ -1,4 +1,5 @@
 import argparse
+import base64
 import hashlib
 import json
 import os
@@ -50,6 +51,41 @@ def remove_surrogates(s, errors="replace"):
     return s.encode("utf-8", errors).decode("utf-8")
 
 
+def binary_to_json(key, value):
+    assert isinstance(key, str)
+    assert isinstance(value, bytes)
+    return {key + "_b64": base64.b64encode(value).decode("ascii")}
+
+
+def text_to_json(key, value):
+    """
+    Return a dict made from key/value that can be fed safely into a JSON encoder.
+
+    JSON can only contain pure, valid unicode (but not: unicode with surrogate escapes).
+
+    But sometimes we have to deal with such values and we do it like this:
+    - <key>: value as pure unicode text (surrogate escapes, if any, replaced by ?)
+    - <key>_b64: value as base64 encoded binary representation (only set if value has surrogate-escapes)
+    """
+    coding = "utf-8"
+    assert isinstance(key, str)
+    assert isinstance(value, str)  # str might contain surrogate escapes
+    data = {}
+    try:
+        value.encode(coding, errors="strict")  # check if pure unicode
+    except UnicodeEncodeError:
+        # value has surrogate escape sequences
+        value_replace_encoded = value.encode(coding, errors="replace")
+        data[key] = value_replace_encoded.decode(coding, errors="strict")
+        value_bytes = value.encode(coding, errors="surrogateescape")
+        data.update(binary_to_json(key, value_bytes))
+    else:
+        # value is pure unicode
+        data[key] = value
+        # we do not give the b64 representation, not needed
+    return data
+
+
 def eval_escapes(s):
     """Evaluate literal escape sequences in a string (eg `\\n` -> `\n`)."""
     return s.encode("ascii", "backslashreplace").decode("unicode-escape")
diff --git a/src/borg/testsuite/helpers.py b/src/borg/testsuite/helpers.py
index b8764b008..7b0830d9a 100644
--- a/src/borg/testsuite/helpers.py
+++ b/src/borg/testsuite/helpers.py
@@ -1,3 +1,4 @@
+import base64
 import errno
 import getpass
 import hashlib
@@ -42,6 +43,7 @@
 from ..helpers import iter_separated
 from ..helpers import eval_escapes
 from ..helpers import safe_unlink
+from ..helpers import text_to_json, binary_to_json
 from ..helpers.passphrase import Passphrase, PasswordRetriesExceeded
 from ..platform import is_cygwin
 
@@ -53,6 +55,46 @@ def test_bin_to_hex():
     assert bin_to_hex(b"\x00\x01\xff") == "0001ff"
 
 
+@pytest.mark.parametrize(
+    "key,value",
+    [("key", b"\x00\x01\x02\x03"), ("key", b"\x00\x01\x02"), ("key", b"\x00\x01"), ("key", b"\x00"), ("key", b"")],
+)
+def test_binary_to_json(key, value):
+    key_b64 = key + "_b64"
+    d = binary_to_json(key, value)
+    assert key_b64 in d
+    assert base64.b64decode(d[key_b64]) == value
+
+
+@pytest.mark.parametrize(
+    "key,value,strict",
+    [
+        ("key", "abc", True),
+        ("key", "äöü", True),
+        ("key", "", True),
+        ("key", b"\x00\xff".decode("utf-8", errors="surrogateescape"), False),
+        ("key", "äöü".encode("latin1").decode("utf-8", errors="surrogateescape"), False),
+    ],
+)
+def test_text_to_json(key, value, strict):
+    key_b64 = key + "_b64"
+    d = text_to_json(key, value)
+    value_b = value.encode("utf-8", errors="surrogateescape")
+    if strict:
+        # no surrogate-escapes, just unicode text
+        assert key in d
+        assert d[key] == value_b.decode("utf-8", errors="strict")
+        assert d[key].encode("utf-8", errors="strict") == value_b
+        assert key_b64 not in d  # not needed. pure valid unicode.
+    else:
+        # requiring surrogate-escapes. text has replacement chars, base64 representation is present.
+        assert key in d
+        assert d[key] == value.encode("utf-8", errors="replace").decode("utf-8", errors="strict")
+        assert d[key].encode("utf-8", errors="strict") == value.encode("utf-8", errors="replace")
+        assert key_b64 in d
+        assert base64.b64decode(d[key_b64]) == value_b
+
+
 class TestLocationWithoutEnv:
     @pytest.fixture
     def keys_dir(self, tmpdir, monkeypatch):

From e63cfcd70804be6eb62aa26d2895549bb78408ce Mon Sep 17 00:00:00 2001
From: Thomas Waldmann <tw@waldmann-edv.de>
Date: Thu, 29 Dec 2022 20:03:46 +0100
Subject: [PATCH 2/4] json output: use text_to_json, fixes #6151

item: path, source, user, group

for non-unicode stuff borg 1.2 had "bpath".

now we have:
path - unicode approximation (invalid stuff replaced by ?)
path_b64 - base64(path_bytes)  # only if needed

source (and linktarget, which carries the same value) has the
same issue as path and is now covered as well.

user and group are usually unicode or even pure ASCII,
but to be cautious we cover them as well.
---
 src/borg/archive.py             |  5 +++--
 src/borg/archiver/__init__.py   |  9 ++++-----
 src/borg/helpers/parseformat.py | 27 ++++++++++++++-------------
 3 files changed, 21 insertions(+), 20 deletions(-)

diff --git a/src/borg/archive.py b/src/borg/archive.py
index 3fb51b141..fc71ab06d 100644
--- a/src/borg/archive.py
+++ b/src/borg/archive.py
@@ -32,7 +32,7 @@
 from .platform import uid2user, user2uid, gid2group, group2gid
 from .helpers import parse_timestamp, archive_ts_now
 from .helpers import OutputTimestamp, format_timedelta, format_file_size, file_status, FileSize
-from .helpers import safe_encode, make_path_safe, remove_surrogates
+from .helpers import safe_encode, make_path_safe, remove_surrogates, text_to_json
 from .helpers import StableDict
 from .helpers import bin_to_hex
 from .helpers import safe_ns
@@ -165,7 +165,8 @@ def show_progress(self, item=None, final=False, stream=None, dt=None):
             if self.output_json:
                 if not final:
                     data = self.as_dict()
-                    data["path"] = remove_surrogates(item.path if item else "")
+                    if item:
+                        data.update(text_to_json("path", item.path))
                 else:
                     data = {}
                 data.update({"time": time.time(), "type": "archive_progress", "finished": final})
diff --git a/src/borg/archiver/__init__.py b/src/borg/archiver/__init__.py
index 1549f2af7..ad74665aa 100644
--- a/src/borg/archiver/__init__.py
+++ b/src/borg/archiver/__init__.py
@@ -26,7 +26,7 @@
     from ..helpers import EXIT_SUCCESS, EXIT_WARNING, EXIT_ERROR, EXIT_SIGNAL_BASE
     from ..helpers import Error, set_ec
     from ..helpers import format_file_size
-    from ..helpers import remove_surrogates
+    from ..helpers import remove_surrogates, text_to_json
     from ..helpers import DatetimeWrapper, replace_placeholders
     from ..helpers import check_python, check_extension_modules
     from ..helpers import is_slow_msgpack, is_supported_msgpack, sysinfo
@@ -139,10 +139,9 @@ def print_file_status(self, status, path):
         # if we get called with status == None, the final file status was already printed
         if self.output_list and status is not None and (self.output_filter is None or status in self.output_filter):
             if self.log_json:
-                print(
-                    json.dumps({"type": "file_status", "status": status, "path": remove_surrogates(path)}),
-                    file=sys.stderr,
-                )
+                json_data = {"type": "file_status", "status": status}
+                json_data.update(text_to_json("path", path))
+                print(json.dumps(json_data), file=sys.stderr)
             else:
                 logging.getLogger("borg.output.list").info("%1s %s", status, remove_surrogates(path))
 
diff --git a/src/borg/helpers/parseformat.py b/src/borg/helpers/parseformat.py
index d9f8b22e6..b08626b50 100644
--- a/src/borg/helpers/parseformat.py
+++ b/src/borg/helpers/parseformat.py
@@ -876,31 +876,32 @@ def format_item_json(self, item):
     def get_item_data(self, item):
         item_data = {}
         item_data.update(self.item_data)
-        mode = stat.filemode(item.mode)
-        item_type = mode[0]
 
+        item_data.update(text_to_json("path", item.path))
         source = item.get("source", "")
-        extra = ""
-        if source:
-            source = remove_surrogates(source)
-            extra = " -> %s" % source
+        item_data.update(text_to_json("source", source))
+        item_data.update(text_to_json("linktarget", source))
+        if not self.json_lines:
+            item_data["extra"] = "" if not source else f" -> {item_data['source']}"
+
         hlid = item.get("hlid")
         hlid = bin_to_hex(hlid) if hlid else ""
+        item_data["hlid"] = hlid
+
+        mode = stat.filemode(item.mode)
+        item_type = mode[0]
         item_data["type"] = item_type
         item_data["mode"] = mode
-        item_data["user"] = item.get("user", str(item.uid))
-        item_data["group"] = item.get("group", str(item.gid))
+
+        item_data.update(text_to_json("user", item.get("user", str(item.uid))))
+        item_data.update(text_to_json("group", item.get("group", str(item.gid))))
         item_data["uid"] = item.uid
         item_data["gid"] = item.gid
-        item_data["path"] = remove_surrogates(item.path)
+
         if self.json_lines:
             item_data["healthy"] = "chunks_healthy" not in item
         else:
-            item_data["extra"] = extra
             item_data["health"] = "broken" if "chunks_healthy" in item else "healthy"
-        item_data["source"] = source
-        item_data["linktarget"] = source
-        item_data["hlid"] = hlid
         item_data["flags"] = item.get("bsdflags")  # int if flags known, else (if flags unknown) None
         for key in self.used_call_keys:
             item_data[key] = self.call_keys[key](item)

From 8765e62bcd6be1675b64ad3ae4a721137145bf77 Mon Sep 17 00:00:00 2001
From: Thomas Waldmann <tw@waldmann-edv.de>
Date: Thu, 29 Dec 2022 22:16:33 +0100
Subject: [PATCH 3/4] document how borg deals with non-unicode bytes in JSON
 output

---
 docs/internals/frontends.rst | 38 ++++++++++++++++++++++++++++++++++--
 1 file changed, 36 insertions(+), 2 deletions(-)

diff --git a/docs/internals/frontends.rst b/docs/internals/frontends.rst
index 42c6c67aa..7f2af1e5b 100644
--- a/docs/internals/frontends.rst
+++ b/docs/internals/frontends.rst
@@ -29,6 +29,42 @@ On POSIX systems, you can usually set environment vars to choose a UTF-8 locale:
     export LC_CTYPE=en_US.UTF-8
 
 
+Dealing with non-unicode byte sequences and JSON limitations
+------------------------------------------------------------
+
+Paths on POSIX systems can have arbitrary bytes in them (except 0x00 which is used as string terminator in C).
+
+Nowadays, UTF-8 encoded paths (which decode to valid unicode) are the norm, but many systems still have
+paths from the past, when other, non-unicode encodings were used. Old Samba shares in particular often
+contain a wild mixture of encodings, sometimes including outright broken byte sequences.
+
+Borg deals with such non-unicode paths ("with funny/broken characters") by decoding such byte sequences using
+UTF-8 coding and "surrogateescape" error handling mode, which maps invalid bytes to special unicode code points
+(surrogate escapes). When such a unicode string is encoded back to a byte sequence the same way (UTF-8 with
+"surrogateescape"), the original byte sequence is reproduced exactly.
+
+JSON should only contain valid unicode text without any surrogate escapes, so we can't just directly have a
+surrogate-escaped path in JSON ("path" is only one example, this also affects other text-like content).
+
+Borg deals with this situation like this (since borg 2.0):
+
+For a valid unicode path (no surrogate escapes), the JSON will only have "path": path.
+
+For a non-unicode path (with surrogate escapes), the JSON will have 2 entries:
+
+- "path": path_approximation (pure valid unicode, all invalid bytes will show up as "?")
+- "path_b64": path_bytes_base64_encoded (if you decode the base64, you get the original path byte string)
+
+JSON users need to pick whatever suits their needs best. The suggested procedure (shown for "path") is:
+
+- Check if there is a "path_b64" key.
+- If it is there, the original bytes path did not cleanly UTF-8-decode into unicode (it has some invalid
+  bytes) and the string given by the "path" key is only an approximation, not the precise path. If you
+  need precision, you must base64-decode the value of "path_b64" and deal with the arbitrary byte string
+  you get. If an approximation is fine, use the value of the "path" key.
+- If it is not there, the value of the "path" key is all you need (the original bytes path is its UTF-8 encoding).
+
+
 Logging
 -------
 
@@ -40,8 +76,6 @@ where each line is a JSON object. The *type* key of the object determines its ot
     parsing error will be printed in plain text, because logging set-up happens after all arguments are
     parsed.
 
-Since JSON can only encode text, any string representing a file system path may miss non-text parts.
-
 The following types are in use. Progress information is governed by the usual rules for progress information,
 it is not produced unless ``--progress`` is specified.
 

From 491f898612e614d29e3b293282e6db1ea0de7427 Mon Sep 17 00:00:00 2001
From: Thomas Waldmann <tw@waldmann-edv.de>
Date: Sun, 8 Jan 2023 20:11:01 +0100
Subject: [PATCH 4/4] borg2 archive names and comments are always pure unicode

---
 src/borg/cache.py               | 2 +-
 src/borg/helpers/parseformat.py | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/borg/cache.py b/src/borg/cache.py
index fc438d50a..90ded1e40 100644
--- a/src/borg/cache.py
+++ b/src/borg/cache.py
@@ -863,7 +863,7 @@ def create_master_idx(chunk_idx):
                 )
                 archive_ids_to_names = get_archive_ids_to_names(archive_ids)
                 for archive_id, archive_name in archive_ids_to_names.items():
-                    pi.show(info=[remove_surrogates(archive_name)])
+                    pi.show(info=[remove_surrogates(archive_name)])  # legacy. borg2 always has pure unicode arch names.
                     if self.do_cache:
                         if archive_id in cached_ids:
                             archive_chunk_idx = read_archive_index(archive_id, archive_name)
diff --git a/src/borg/helpers/parseformat.py b/src/borg/helpers/parseformat.py
index b08626b50..f1142871d 100644
--- a/src/borg/helpers/parseformat.py
+++ b/src/borg/helpers/parseformat.py
@@ -717,7 +717,7 @@ def __init__(self, format, repository, manifest, key, *, json=False, iec=False):
         self.call_keys = {
             "hostname": partial(self.get_meta, "hostname", rs=True),
             "username": partial(self.get_meta, "username", rs=True),
-            "comment": partial(self.get_meta, "comment", rs=True),
+            "comment": partial(self.get_meta, "comment", rs=False),
             "end": self.get_ts_end,
             "command_line": self.get_cmdline,
         }
@@ -738,8 +738,8 @@ def get_item_data(self, archive_info):
         item_data.update(self.item_data)
         item_data.update(
             {
-                "name": remove_surrogates(archive_info.name),
-                "archive": remove_surrogates(archive_info.name),
+                "name": archive_info.name,
+                "archive": archive_info.name,
                 "id": bin_to_hex(archive_info.id),
                 "time": self.format_time(archive_info.ts),
                 "start": self.format_time(archive_info.ts),