1
0
Fork 0
mirror of https://github.com/borgbackup/borg.git synced 2025-02-26 16:04:06 +00:00

Merge pull request #7232 from ThomasWaldmann/json_b64

implement and use (text|binary)_to_json
This commit is contained in:
TW 2023-01-16 18:10:52 +01:00 committed by GitHub
commit d49665526c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 141 additions and 27 deletions

View file

@ -29,6 +29,42 @@ On POSIX systems, you can usually set environment vars to choose a UTF-8 locale:
export LC_CTYPE=en_US.UTF-8
Dealing with non-unicode byte sequences and JSON limitations
------------------------------------------------------------
Paths on POSIX systems can have arbitrary bytes in them (except 0x00 which is used as string terminator in C).
Nowadays, UTF-8 encoded paths (which decode to valid unicode) are the usual thing, but a lot of systems
still have paths from the past, when other, non-unicode encodings were used. Old Samba shares in particular often
contain wild mixtures of miscellaneous encodings, sometimes even badly broken byte sequences.
borg deals with such non-unicode paths ("with funny/broken characters") by decoding their byte sequences using the
UTF-8 encoding and the "surrogateescape" error handling mode, which maps invalid bytes to special unicode code points
(surrogate escapes). When encoding such a unicode string back to a byte sequence, the original byte sequence
will be reproduced exactly.
JSON should only contain valid unicode text without any surrogate escapes, so we can't just directly have a
surrogate-escaped path in JSON ("path" is only one example, this also affects other text-like content).
Borg deals with this situation like this (since borg 2.0):
For a valid unicode path (no surrogate escapes), the JSON will only have "path": path.
For a non-unicode path (with surrogate escapes), the JSON will have 2 entries:
- "path": path_approximation (pure valid unicode, all invalid bytes will show up as "?")
- "path_b64": path_bytes_base64_encoded (if you decode the base64, you get the original path byte string)
JSON users need to pick whatever suits their needs best. The suggested procedure (shown for "path") is:
- check if there is a "path_b64" key.
- if it is there, you will know that the original bytes path did not cleanly UTF-8-decode into unicode (has
some invalid bytes) and that the string given by the "path" key is only an approximation, but not the precise
path. if you need precision, you must base64-decode the value of "path_b64" and deal with the arbitrary byte
string you'll get. if an approximation is fine, use the value of the "path" key.
- if it is not there, the value of the "path" key is all you need (the original bytes path is its UTF-8 encoding).
Logging
-------
@ -40,8 +76,6 @@ where each line is a JSON object. The *type* key of the object determines its ot
parsing error will be printed in plain text, because logging set-up happens after all arguments are
parsed.
Since JSON can only encode text, any string representing a file system path may miss non-text parts.
The following types are in use. Progress information is governed by the usual rules for progress information,
it is not produced unless ``--progress`` is specified.

View file

@ -32,7 +32,7 @@
from .platform import uid2user, user2uid, gid2group, group2gid
from .helpers import parse_timestamp, archive_ts_now
from .helpers import OutputTimestamp, format_timedelta, format_file_size, file_status, FileSize
from .helpers import safe_encode, make_path_safe, remove_surrogates
from .helpers import safe_encode, make_path_safe, remove_surrogates, text_to_json
from .helpers import StableDict
from .helpers import bin_to_hex
from .helpers import safe_ns
@ -165,7 +165,8 @@ def show_progress(self, item=None, final=False, stream=None, dt=None):
if self.output_json:
if not final:
data = self.as_dict()
data["path"] = remove_surrogates(item.path if item else "")
if item:
data.update(text_to_json("path", item.path))
else:
data = {}
data.update({"time": time.time(), "type": "archive_progress", "finished": final})

View file

@ -26,7 +26,7 @@
from ..helpers import EXIT_SUCCESS, EXIT_WARNING, EXIT_ERROR, EXIT_SIGNAL_BASE
from ..helpers import Error, set_ec
from ..helpers import format_file_size
from ..helpers import remove_surrogates
from ..helpers import remove_surrogates, text_to_json
from ..helpers import DatetimeWrapper, replace_placeholders
from ..helpers import check_python, check_extension_modules
from ..helpers import is_slow_msgpack, is_supported_msgpack, sysinfo
@ -139,10 +139,9 @@ def print_file_status(self, status, path):
# if we get called with status == None, the final file status was already printed
if self.output_list and status is not None and (self.output_filter is None or status in self.output_filter):
if self.log_json:
print(
json.dumps({"type": "file_status", "status": status, "path": remove_surrogates(path)}),
file=sys.stderr,
)
json_data = {"type": "file_status", "status": status}
json_data.update(text_to_json("path", path))
print(json.dumps(json_data), file=sys.stderr)
else:
logging.getLogger("borg.output.list").info("%1s %s", status, remove_surrogates(path))

View file

@ -863,7 +863,7 @@ def create_master_idx(chunk_idx):
)
archive_ids_to_names = get_archive_ids_to_names(archive_ids)
for archive_id, archive_name in archive_ids_to_names.items():
pi.show(info=[remove_surrogates(archive_name)])
pi.show(info=[remove_surrogates(archive_name)]) # legacy. borg2 always has pure unicode arch names.
if self.do_cache:
if archive_id in cached_ids:
archive_chunk_idx = read_archive_index(archive_id, archive_name)

View file

@ -19,7 +19,8 @@
from .misc import sysinfo, log_multi, consume
from .misc import ChunkIteratorFileWrapper, open_item, chunkit, iter_separated, ErrorIgnoringTextIOWrapper
from .parseformat import bin_to_hex, safe_encode, safe_decode
from .parseformat import remove_surrogates, eval_escapes, decode_dict, positive_int_validator, interval
from .parseformat import text_to_json, binary_to_json, remove_surrogates
from .parseformat import eval_escapes, decode_dict, positive_int_validator, interval
from .parseformat import SortBySpec, ChunkerParams, FilesCacheMode, partial_format, DatetimeWrapper
from .parseformat import format_file_size, parse_file_size, FileSize, parse_storage_quota
from .parseformat import sizeof_fmt, sizeof_fmt_iec, sizeof_fmt_decimal

View file

@ -1,4 +1,5 @@
import argparse
import base64
import hashlib
import json
import os
@ -50,6 +51,41 @@ def remove_surrogates(s, errors="replace"):
return s.encode("utf-8", errors).decode("utf-8")
def binary_to_json(key, value):
    """Return a JSON-safe dict mapping <key>_b64 to the base64 text form of the bytes *value*."""
    assert isinstance(key, str)
    assert isinstance(value, bytes)
    encoded = base64.b64encode(value).decode("ascii")
    return {f"{key}_b64": encoded}
def text_to_json(key, value):
    """
    Return a dict made from key/value that can be fed safely into a JSON encoder.

    JSON can only contain pure, valid unicode (but not: unicode with surrogate escapes).
    But sometimes we have to deal with such values and we do it like this:

    - <key>: value as pure unicode text (surrogate escapes, if any, replaced by ?)
    - <key>_b64: value as base64 encoded binary representation (only set if value has surrogate-escapes)
    """
    assert isinstance(key, str)
    assert isinstance(value, str)  # str might contain surrogate escapes
    encoding = "utf-8"
    try:
        value.encode(encoding, errors="strict")  # raises iff surrogate escapes are present
    except UnicodeEncodeError:
        # value is not pure unicode: give a pure-unicode approximation under <key>
        # and the exact original bytes, base64 encoded, under <key>_b64.
        approximation = value.encode(encoding, errors="replace").decode(encoding, errors="strict")
        original_bytes = value.encode(encoding, errors="surrogateescape")
        return {key: approximation, key + "_b64": base64.b64encode(original_bytes).decode("ascii")}
    # value is pure unicode: no b64 representation needed.
    return {key: value}
def eval_escapes(s):
    """Evaluate literal escape sequences in a string (eg `\\n` -> `\n`)."""
    # round-trip through bytes so that "unicode-escape" decoding interprets the sequences
    escaped = s.encode("ascii", "backslashreplace")
    return escaped.decode("unicode-escape")
@ -681,7 +717,7 @@ def __init__(self, format, repository, manifest, key, *, json=False, iec=False):
self.call_keys = {
"hostname": partial(self.get_meta, "hostname", rs=True),
"username": partial(self.get_meta, "username", rs=True),
"comment": partial(self.get_meta, "comment", rs=True),
"comment": partial(self.get_meta, "comment", rs=False),
"end": self.get_ts_end,
"command_line": self.get_cmdline,
}
@ -702,8 +738,8 @@ def get_item_data(self, archive_info):
item_data.update(self.item_data)
item_data.update(
{
"name": remove_surrogates(archive_info.name),
"archive": remove_surrogates(archive_info.name),
"name": archive_info.name,
"archive": archive_info.name,
"id": bin_to_hex(archive_info.id),
"time": self.format_time(archive_info.ts),
"start": self.format_time(archive_info.ts),
@ -840,31 +876,32 @@ def format_item_json(self, item):
def get_item_data(self, item):
item_data = {}
item_data.update(self.item_data)
mode = stat.filemode(item.mode)
item_type = mode[0]
item_data.update(text_to_json("path", item.path))
source = item.get("source", "")
extra = ""
if source:
source = remove_surrogates(source)
extra = " -> %s" % source
item_data.update(text_to_json("source", source))
item_data.update(text_to_json("linktarget", source))
if not self.json_lines:
item_data["extra"] = "" if not source else f" -> {item_data['source']}"
hlid = item.get("hlid")
hlid = bin_to_hex(hlid) if hlid else ""
item_data["hlid"] = hlid
mode = stat.filemode(item.mode)
item_type = mode[0]
item_data["type"] = item_type
item_data["mode"] = mode
item_data["user"] = item.get("user", str(item.uid))
item_data["group"] = item.get("group", str(item.gid))
item_data.update(text_to_json("user", item.get("user", str(item.uid))))
item_data.update(text_to_json("group", item.get("group", str(item.gid))))
item_data["uid"] = item.uid
item_data["gid"] = item.gid
item_data["path"] = remove_surrogates(item.path)
if self.json_lines:
item_data["healthy"] = "chunks_healthy" not in item
else:
item_data["extra"] = extra
item_data["health"] = "broken" if "chunks_healthy" in item else "healthy"
item_data["source"] = source
item_data["linktarget"] = source
item_data["hlid"] = hlid
item_data["flags"] = item.get("bsdflags") # int if flags known, else (if flags unknown) None
for key in self.used_call_keys:
item_data[key] = self.call_keys[key](item)

View file

@ -1,3 +1,4 @@
import base64
import errno
import getpass
import hashlib
@ -42,6 +43,7 @@
from ..helpers import iter_separated
from ..helpers import eval_escapes
from ..helpers import safe_unlink
from ..helpers import text_to_json, binary_to_json
from ..helpers.passphrase import Passphrase, PasswordRetriesExceeded
from ..platform import is_cygwin
@ -53,6 +55,46 @@ def test_bin_to_hex():
assert bin_to_hex(b"\x00\x01\xff") == "0001ff"
@pytest.mark.parametrize(
    "key,value",
    [("key", b"\x00\x01\x02\x03"), ("key", b"\x00\x01\x02"), ("key", b"\x00\x01"), ("key", b"\x00"), ("key", b"")],
)
def test_binary_to_json(key, value):
    # binary values must round-trip through the base64 text under the <key>_b64 key
    result = binary_to_json(key, value)
    b64_key = f"{key}_b64"
    assert b64_key in result
    assert base64.b64decode(result[b64_key]) == value
@pytest.mark.parametrize(
    "key,value,strict",
    [
        ("key", "abc", True),
        ("key", "äöü", True),
        ("key", "", True),
        ("key", b"\x00\xff".decode("utf-8", errors="surrogateescape"), False),
        ("key", "äöü".encode("latin1").decode("utf-8", errors="surrogateescape"), False),
    ],
)
def test_text_to_json(key, value, strict):
    b64_key = key + "_b64"
    result = text_to_json(key, value)
    original_bytes = value.encode("utf-8", errors="surrogateescape")
    if strict:
        # pure unicode: only <key> is present and its value round-trips exactly
        assert key in result
        assert result[key] == original_bytes.decode("utf-8", errors="strict")
        assert result[key].encode("utf-8", errors="strict") == original_bytes
        assert b64_key not in result  # not needed. pure valid unicode.
    else:
        # surrogate escapes present: <key> holds replacement chars, <key>_b64 the exact bytes
        assert key in result
        assert result[key] == value.encode("utf-8", errors="replace").decode("utf-8", errors="strict")
        assert result[key].encode("utf-8", errors="strict") == value.encode("utf-8", errors="replace")
        assert b64_key in result
        assert base64.b64decode(result[b64_key]) == original_bytes
class TestLocationWithoutEnv:
@pytest.fixture
def keys_dir(self, tmpdir, monkeypatch):