mirror of https://github.com/morpheus65535/bazarr
738 lines
27 KiB
Python
738 lines
27 KiB
Python
"""
|
||
ftfy: fixes text for you
|
||
|
||
This is a module for making text less broken. See the `fix_text` function
|
||
for more information.
|
||
"""
|
||
|
||
import unicodedata
|
||
import warnings
|
||
from typing import List, NamedTuple, Optional, Tuple, Union, no_type_check
|
||
|
||
from ftfy import bad_codecs
|
||
from ftfy import chardata, fixes
|
||
from ftfy.badness import is_bad
|
||
from ftfy.formatting import display_ljust
|
||
|
||
# Version string of this copy of the ftfy package.
__version__ = "6.1.1"


# Though this function does nothing, it lets linters know that we're using
# ftfy.bad_codecs. See the docstring in `bad_codecs/__init__.py` for more.
# NOTE(review): several functions below decode with "utf-8-variants" and
# "sloppy-windows-1252"; presumably importing ftfy.bad_codecs is what makes
# those codec names available -- confirm in bad_codecs/__init__.py.
bad_codecs.ok()
|
||
|
||
|
||
class ExplainedText(NamedTuple):
    """
    Result pair returned by the ftfy functions that can "explain" what they
    did, such as :func:`fix_and_explain()`.

    Fields:

    - `text`: the text after fixing.
    - `explanation`: a list of `(operation, argument)` steps that were applied
      to produce `text`.

    When the 'explain' option is disabled, these functions still return this
    same type, but `explanation` is None.
    """

    text: str
    explanation: Optional[List[Tuple[str, str]]]
|
||
|
||
|
||
class TextFixerConfig(NamedTuple):
    r"""
    Configuration options for ftfy, stored as an immutable record.

    This is a namedtuple in which every field has a default, so you only
    pass the options you want to change, as keyword arguments. For example,
    this disables 'unescape_html' and keeps every other default::

        TextFixerConfig(unescape_html=False)

    The options, with their default values, are:

    - `unescape_html`: "auto"

      Whether to replace HTML entities such as ``&amp;`` with the character
      they represent. "auto" means to do this by default, but to disable it
      when a literal ``<`` character appears, which indicates that the input
      is actual HTML whose entities should be preserved. The value can also
      be True, to always enable this fixer, or False, to always disable it.

    - `remove_terminal_escapes`: True

      Remove "ANSI" terminal escapes, such as the ones that change the color
      of text in a terminal window.

    - `fix_encoding`: True

      Detect mojibake and attempt to fix it by decoding the text in a
      different encoding standard.

      The following four options affect how `fix_encoding` works, and do
      nothing if `fix_encoding` is False:

      - `restore_byte_a0`: True

        Allow a literal space (U+20) to be interpreted as a non-breaking
        space (U+A0) when that would make it part of a fixable mojibake
        string.

        Because spaces are very common characters, this could lead to false
        positives, but we try to apply it only when there's strong evidence
        for mojibake. Disabling `restore_byte_a0` is safer from false
        positives, but creates false negatives.

      - `replace_lossy_sequences`: True

        Detect mojibake that has been partially replaced by the characters
        '�' (U+FFFD, the replacement character) or '?'. If the mojibake
        could be decoded otherwise, replace the detected sequence with '�'.

      - `decode_inconsistent_utf8`: True

        When we see sequences that distinctly look like UTF-8 mojibake, but
        there's no consistent way to reinterpret the whole string in a new
        encoding, replace the mojibake with the appropriate UTF-8 characters
        anyway.

        This helps to decode strings that are concatenated from different
        encodings.

      - `fix_c1_controls`: True

        Replace C1 control characters (the useless control characters that
        come from Latin-1) with their Windows-1252 equivalents, like HTML5
        does, even if the whole string doesn't decode as Latin-1.

    - `fix_latin_ligatures`: True

      Replace common Latin-alphabet ligatures, such as ``ﬁ`` (U+FB01), with
      the letters they're made of.

    - `fix_character_width`: True

      Replace fullwidth Latin characters and halfwidth Katakana with their
      more standard widths.

    - `uncurl_quotes`: True

      Replace curly quotes with straight quotes.

    - `fix_line_breaks`: True

      Replace various forms of line breaks with the standard Unix line
      break, ``\n``.

    - `fix_surrogates`: True

      Replace sequences of UTF-16 surrogate codepoints with the character
      they were meant to encode. This fixes text that was decoded with the
      obsolete UCS-2 standard, and allows it to support high-numbered
      codepoints such as emoji.

    - `remove_control_chars`: True

      Remove certain control characters that have no displayed effect on
      text.

    - `normalization`: "NFC"

      The kind of Unicode normalization to apply. Usually we apply NFC
      normalization, so that letters followed by combining characters become
      single combined characters.

      Changing this to "NFKC" applies more compatibility conversions, such
      as replacing the 'micro sign' with a standard Greek lowercase mu,
      which looks identical. However, some NFKC normalizations change the
      meaning of text, such as converting "10³" to "103".

      `normalization` can be None, to apply no normalization.

    - `max_decode_length`: 1_000_000

      The maximum size of "segment" that ftfy will try to fix all at once.

    - `explain`: True

      Whether to compute 'explanations', lists describing what ftfy changed.
      When this is False, the explanation will be None, and the code that
      builds the explanation will be skipped, possibly saving time.

      Functions that accept TextFixerConfig and don't return an explanation
      will automatically set `explain` to False.
    """

    # HTML handling and terminal escapes
    unescape_html: Union[str, bool] = "auto"
    remove_terminal_escapes: bool = True

    # Mojibake repair and the options that tune it
    fix_encoding: bool = True
    restore_byte_a0: bool = True
    replace_lossy_sequences: bool = True
    decode_inconsistent_utf8: bool = True
    fix_c1_controls: bool = True

    # Character-level clean-ups
    fix_latin_ligatures: bool = True
    fix_character_width: bool = True
    uncurl_quotes: bool = True
    fix_line_breaks: bool = True
    fix_surrogates: bool = True
    remove_control_chars: bool = True

    # Normalization, segment size, and explanation bookkeeping
    normalization: Optional[str] = "NFC"
    max_decode_length: int = 1_000_000
    explain: bool = True
|
||
|
||
|
||
def _config_from_kwargs(config: TextFixerConfig, kwargs: dict) -> TextFixerConfig:
|
||
"""
|
||
Handle parameters provided as keyword arguments to ftfy's top-level
|
||
functions, converting them into a TextFixerConfig.
|
||
"""
|
||
if "fix_entities" in kwargs:
|
||
warnings.warn(
|
||
"`fix_entities` has been renamed to `unescape_html`", DeprecationWarning
|
||
)
|
||
kwargs = kwargs.copy()
|
||
kwargs["unescape_html"] = kwargs["fix_entities"]
|
||
del kwargs["fix_entities"]
|
||
config = config._replace(**kwargs)
|
||
return config
|
||
|
||
|
||
# Registry of the fixer functions that can be referred to by name, either by
# a same-named TextFixerConfig option or by a step in a plan. Each name maps
# to the attribute of that name in the `fixes` module.
FIXERS = {
    fixer_name: getattr(fixes, fixer_name)
    for fixer_name in (
        "unescape_html",
        "remove_terminal_escapes",
        "restore_byte_a0",
        "replace_lossy_sequences",
        "decode_inconsistent_utf8",
        "fix_c1_controls",
        "fix_latin_ligatures",
        "fix_character_width",
        "uncurl_quotes",
        "fix_line_breaks",
        "fix_surrogates",
        "remove_control_chars",
    )
}
|
||
|
||
|
||
BYTES_ERROR_TEXT = """Hey wait, this isn't Unicode.
|
||
|
||
ftfy is designed to fix problems with text. Treating bytes like they're
|
||
interchangeable with Unicode text is usually something that introduces
|
||
problems with text.
|
||
|
||
You should first decode these bytes from the encoding you think they're in.
|
||
If you're not sure what encoding they're in:
|
||
|
||
- First, try to find out. 'utf-8' is a good assumption.
|
||
- If the encoding is simply unknowable, try running your bytes through
|
||
ftfy.guess_bytes. As the name implies, this may not always be accurate.
|
||
|
||
For more information on the distinction between bytes and text, read the
|
||
Python Unicode HOWTO:
|
||
|
||
http://docs.python.org/3/howto/unicode.html
|
||
"""
|
||
|
||
|
||
def _try_fix(
    fixer_name: str, text: str, config: TextFixerConfig, steps: Optional[list]
) -> str:
    """
    Run the fixer called `fixer_name` on `text` if the configuration option
    of the same name is enabled.

    When `steps` is a list and the fixer changed the text, the step
    `("apply", fixer_name)` is appended to it. When `steps` is None, nothing
    is recorded.

    Returns the fixed text, or `text` itself if the fixer is disabled.
    """
    if not getattr(config, fixer_name):
        # This fixer is switched off in the configuration.
        return text

    result = FIXERS[fixer_name](text)
    if steps is not None and result != text:
        steps.append(("apply", fixer_name))
    return result
|
||
|
||
|
||
def fix_text(text: str, config: Optional[TextFixerConfig] = None, **kwargs) -> str:
    r"""
    Given Unicode text as input, fix inconsistencies and glitches in it,
    such as mojibake (text that was decoded in the wrong encoding).

    Let's start with some examples:

        >>> fix_text('âœ” No problems')
        '✔ No problems'

        >>> fix_text('ＬＯＵＤ　ＮＯＩＳＥＳ')
        'LOUD NOISES'

    ftfy applies a number of different fixes to the text, and can accept
    configuration to select which fixes to apply.

    The configuration takes the form of a :class:`TextFixerConfig` object,
    and you can see a description of the options in that class's docstring
    or in the full documentation at ftfy.readthedocs.org.

    For convenience and backward compatibility, the configuration can also
    take the form of keyword arguments, which will set the equivalently-named
    fields of the TextFixerConfig object.

    For example, here are two ways to fix text but skip the "uncurl_quotes"
    step::

        fix_text(text, TextFixerConfig(uncurl_quotes=False))
        fix_text(text, uncurl_quotes=False)

    This function fixes text in independent segments, which are usually lines
    of text, or arbitrarily broken up every 1 million codepoints (configurable
    with `config.max_decode_length`) if there aren't enough line breaks. The
    bound on segment lengths helps to avoid unbounded slowdowns.

    When `unescape_html` is "auto", the first segment that contains a literal
    ``<`` turns HTML unescaping off for that segment and for every segment
    after it.

    ftfy can also provide an 'explanation', a list of transformations it applied
    to the text that would fix more text like it. This function doesn't provide
    explanations (because there may be different fixes for different segments
    of text).

    To get an explanation, use the :func:`fix_and_explain()` function, which
    fixes the string in one segment and explains what it fixed.

    Raises UnicodeError if `text` is bytes, and ValueError if there is text
    to fix but `max_decode_length` is less than 1.
    """

    if config is None:
        config = TextFixerConfig(explain=False)
    config = _config_from_kwargs(config, kwargs)
    if isinstance(text, bytes):
        raise UnicodeError(BYTES_ERROR_TEXT)

    segment_limit = config.max_decode_length
    if text and segment_limit < 1:
        # With a limit below 1 the loop below would never advance `pos`,
        # so it would never terminate. Refuse such a configuration.
        raise ValueError(
            "max_decode_length must be at least 1, got %r" % (segment_limit,)
        )

    out = []
    pos = 0
    text_length = len(text)
    while pos < text_length:
        # A segment runs through the next line break (inclusive), or to the
        # end of the text if there is none, capped at `segment_limit`.
        newline_index = text.find("\n", pos)
        textbreak = text_length if newline_index == -1 else newline_index + 1
        textbreak = min(textbreak, pos + segment_limit)

        segment = text[pos:textbreak]
        if config.unescape_html == "auto" and "<" in segment:
            # Rebinding `config` here is deliberate: the change carries over
            # to all later segments.
            config = config._replace(unescape_html=False)
        fixed_segment, _ = fix_and_explain(segment, config)
        out.append(fixed_segment)
        pos = textbreak
    return "".join(out)
|
||
|
||
|
||
def fix_and_explain(
    text: str, config: Optional[TextFixerConfig] = None, **kwargs
) -> ExplainedText:
    """
    Fix text as a single segment, returning the fixed text and an explanation
    of what was fixed.

    `config` is a TextFixerConfig (the defaults are used if it is None), and
    keyword arguments override individual fields of it.

    The explanation is a list of steps that can be applied with
    :func:`apply_plan`, or if config.explain is False, it will be None.

    The fixes are applied repeatedly, in a fixed order, until a full pass
    leaves the text unchanged.

    Raises UnicodeError if `text` is bytes.
    """
    if config is None:
        config = TextFixerConfig()
    if isinstance(text, bytes):
        raise UnicodeError(BYTES_ERROR_TEXT)
    config = _config_from_kwargs(config, kwargs)

    if config.unescape_html == "auto" and "<" in text:
        # A literal "<" suggests real HTML, whose entities should be kept.
        config = config._replace(unescape_html=False)

    # If explanations aren't desired, `steps` will be None
    steps: Optional[List[Tuple[str, str]]] = [] if config.explain else None

    # Fixers that run after the encoding fix, in this order.
    later_fixers = (
        "fix_c1_controls",
        "fix_latin_ligatures",
        "fix_character_width",
        "uncurl_quotes",
        "fix_line_breaks",
        "fix_surrogates",
        "remove_terminal_escapes",
        "remove_control_chars",
    )

    while True:
        origtext = text

        text = _try_fix("unescape_html", text, config, steps)

        if config.fix_encoding:
            if steps is None:
                # Pass `config` along, so that the options that tune the
                # encoding fix (restore_byte_a0, replace_lossy_sequences,
                # decode_inconsistent_utf8, fix_c1_controls) are honored
                # whether or not an explanation is being collected.
                text = fix_encoding(text, config)
            else:
                text, encoding_steps = fix_encoding_and_explain(text, config)
                if encoding_steps is not None:
                    steps.extend(encoding_steps)

        for fixer in later_fixers:
            text = _try_fix(fixer, text, config, steps)

        if config.normalization is not None:
            fixed = unicodedata.normalize(config.normalization, text)
            if steps is not None and fixed != text:
                steps.append(("normalize", config.normalization))
            text = fixed

        if text == origtext:
            # A whole pass changed nothing, so we have reached a fixed point.
            return ExplainedText(text, steps)
|
||
|
||
|
||
def fix_encoding_and_explain(
    text: str, config: Optional[TextFixerConfig] = None, **kwargs
) -> ExplainedText:
    """
    Run only the mojibake-detecting and mojibake-fixing part of ftfy, and
    return the fixed text together with a list explaining what was fixed.

    This includes fixing text by encoding and decoding it in different encodings,
    as well as the subordinate fixes `restore_byte_a0`, `replace_lossy_sequences`,
    `decode_inconsistent_utf8`, and `fix_c1_controls`.

    Single fixing steps are repeated until one of them leaves the text
    unchanged; the explanations of all steps are concatenated. If
    `config.fix_encoding` is False, the text is returned unchanged with an
    empty explanation.

    Raises UnicodeError if `text` is bytes.

    Examples::

        >>> fix_encoding_and_explain("sÃ³")
        ExplainedText(text='só', explanation=[('encode', 'latin-1'), ('decode', 'utf-8')])

        >>> result = fix_encoding_and_explain("voilÃ le travail")
        >>> result.text
        'voilà le travail'
        >>> result.explanation
        [('encode', 'latin-1'), ('transcode', 'restore_byte_a0'), ('decode', 'utf-8')]

    """
    if config is None:
        config = TextFixerConfig()
    if isinstance(text, bytes):
        raise UnicodeError(BYTES_ERROR_TEXT)
    config = _config_from_kwargs(config, kwargs)

    if not config.fix_encoding:
        # A weird trivial case: we're asked to fix the encoding, but skip
        # fixing the encoding
        return ExplainedText(text, [])

    accumulated_plan: List[Tuple[str, str]] = []
    current = text
    while True:
        step = _fix_encoding_one_step_and_explain(current, config)
        if step.explanation is not None:
            accumulated_plan += step.explanation
        if step.text == current:
            # Nothing changed in this step, so there is nothing left to fix.
            return ExplainedText(step.text, accumulated_plan)
        current = step.text
|
||
|
||
|
||
def _fix_encoding_one_step_and_explain(
    text: str, config: TextFixerConfig
) -> ExplainedText:
    """
    Perform one step of fixing the encoding of text.

    Returns an ExplainedText holding the resulting text and the list of plan
    steps taken in this one step (an empty list when the text is returned
    untouched).

    The strategies below are tried in order, and the first that applies
    returns immediately:

    1. Return empty text, text that could be ASCII, and text that `is_bad`
       does not flag, unchanged.
    2. For each encoding in `chardata.CHARMAP_ENCODINGS` that can represent
       the text, encode the text with it and try to decode the bytes as
       "utf-8" or "utf-8-variants", after the byte-level repairs
       `restore_byte_a0` and `replace_lossy_sequences` if they are enabled
       and applicable.
    3. If enabled, apply `decode_inconsistent_utf8` to the text.
    4. If the text fits Latin-1 but not Windows-1252, re-decode its Latin-1
       bytes as Windows-1252.
    5. If enabled, apply `fix_c1_controls` to the text.

    If none of these applies, the text is returned unchanged.
    """
    if config is None:
        config = TextFixerConfig()

    if len(text) == 0:
        return ExplainedText(text, [])

    # The first plan is to return ASCII text unchanged, as well as text
    # that doesn't look like it contains mojibake
    if chardata.possible_encoding(text, "ascii") or not is_bad(text):
        return ExplainedText(text, [])

    # As we go through the next step, remember the possible encodings
    # that we encounter but don't successfully fix yet. We may need them
    # later.
    possible_1byte_encodings = []

    # Suppose the text was supposed to be UTF-8, but it was decoded using
    # a single-byte encoding instead. When these cases can be fixed, they
    # are usually the correct thing to do, so try them next.
    for encoding in chardata.CHARMAP_ENCODINGS:
        if chardata.possible_encoding(text, encoding):
            possible_1byte_encodings.append(encoding)
            encoded_bytes = text.encode(encoding)
            encode_step = ("encode", encoding)
            transcode_steps = []

            # Now, find out if it's UTF-8 (or close enough). Otherwise,
            # remember the encoding for later.
            try:
                decoding = "utf-8"
                # Check encoded_bytes for sequences that would be UTF-8,
                # except they have b' ' where b'\xa0' would belong.
                if config.restore_byte_a0 and chardata.ALTERED_UTF8_RE.search(
                    encoded_bytes
                ):
                    replaced_bytes = fixes.restore_byte_a0(encoded_bytes)
                    if replaced_bytes != encoded_bytes:
                        transcode_steps.append(("transcode", "restore_byte_a0"))
                        encoded_bytes = replaced_bytes

                # Replace sequences where information has been lost
                if config.replace_lossy_sequences and encoding.startswith("sloppy"):
                    replaced_bytes = fixes.replace_lossy_sequences(encoded_bytes)
                    if replaced_bytes != encoded_bytes:
                        transcode_steps.append(("transcode", "replace_lossy_sequences"))
                        encoded_bytes = replaced_bytes

                # Bytes 0xED and 0xC0 select the more permissive
                # "utf-8-variants" decoder instead of plain "utf-8".
                if 0xED in encoded_bytes or 0xC0 in encoded_bytes:
                    decoding = "utf-8-variants"

                decode_step = ("decode", decoding)
                steps = [encode_step] + transcode_steps + [decode_step]
                # If this decode fails, we fall through to the `except`
                # below and move on to the next candidate encoding.
                fixed = encoded_bytes.decode(decoding)
                return ExplainedText(fixed, steps)

            except UnicodeDecodeError:
                pass

    # Look for a-hat-euro sequences that remain, and fix them in isolation.
    if config.decode_inconsistent_utf8 and chardata.UTF8_DETECTOR_RE.search(text):
        steps = [("apply", "decode_inconsistent_utf8")]
        fixed = fixes.decode_inconsistent_utf8(text)
        # Only report this step if it actually changed something; otherwise
        # keep trying the strategies below.
        if fixed != text:
            return ExplainedText(fixed, steps)

    # The next most likely case is that this is Latin-1 that was intended to
    # be read as Windows-1252, because those two encodings in particular are
    # easily confused.
    if "latin-1" in possible_1byte_encodings:
        if "windows-1252" in possible_1byte_encodings:
            # This text is in the intersection of Latin-1 and
            # Windows-1252, so it's probably legit.
            return ExplainedText(text, [])
        else:
            # Otherwise, it means we have characters that are in Latin-1 but
            # not in Windows-1252. Those are C1 control characters. Nobody
            # wants those. Assume they were meant to be Windows-1252.
            try:
                fixed = text.encode("latin-1").decode("windows-1252")
                if fixed != text:
                    steps = [("encode", "latin-1"), ("decode", "windows-1252")]
                    return ExplainedText(fixed, steps)
            except UnicodeDecodeError:
                # Some byte has no Windows-1252 meaning; try the next
                # strategy instead.
                pass

    # Fix individual characters of Latin-1 with a less satisfying explanation
    if config.fix_c1_controls and chardata.C1_CONTROL_RE.search(text):
        steps = [("transcode", "fix_c1_controls")]
        fixed = fixes.fix_c1_controls(text)
        return ExplainedText(fixed, steps)

    # The cases that remain are mixups between two different single-byte
    # encodings, and not the common case of Latin-1 vs. Windows-1252.
    #
    # With the new heuristic in 6.0, it's possible that we're closer to solving
    # these in some cases. It would require a lot of testing and tuning, though.
    # For now, we leave the text unchanged in these cases.
    return ExplainedText(text, [])
|
||
|
||
|
||
def fix_encoding(
    text: str, config: Optional[TextFixerConfig] = None, **kwargs
) -> str:
    """
    Apply just the encoding-fixing steps of ftfy to this text. Returns the
    fixed text, discarding the explanation.

    `config` is a TextFixerConfig (defaults are used if it is None), and
    keyword arguments override individual fields of it. Other fixes, such as
    unescaping HTML entities, are not applied.

        >>> fix_encoding("Ã³")
        'ó'
        >>> fix_encoding("&ATILDE;&SUP3;")
        '&ATILDE;&SUP3;'
    """
    if config is None:
        config = TextFixerConfig(explain=False)
    config = _config_from_kwargs(config, kwargs)
    # Only the text is wanted here; the explanation is dropped.
    return fix_encoding_and_explain(text, config).text
|
||
|
||
|
||
# An alternate name for the main function: `ftfy` is the same callable as
# `fix_text`.
ftfy = fix_text
|
||
|
||
|
||
def fix_text_segment(
    text: str, config: Optional[TextFixerConfig] = None, **kwargs
) -> str:
    """
    Fix text as a single segment, with a consistent sequence of steps that
    are applied to fix the text. Discard the explanation.

    `config` is a TextFixerConfig (defaults are used if it is None), and
    keyword arguments override individual fields of it.

    Returns the fixed text.
    """
    if config is None:
        config = TextFixerConfig(explain=False)
    config = _config_from_kwargs(config, kwargs)
    # Only the text is wanted here; the explanation is dropped.
    return fix_and_explain(text, config).text
|
||
|
||
|
||
def fix_file(input_file, encoding=None, config=None, **kwargs):
    """
    Fix text that is found in a file, one line at a time.

    `input_file` is iterated to obtain lines. Lines that are already Unicode
    text are used as they are. Lines that are bytes are decoded with
    `encoding` if one was supplied. If not, unfortunately, we have to guess
    what encoding it is: the guess is made by `guess_bytes` on the first
    bytes line, and that encoding is then used for the lines after it. We
    make no promises about the guess; see `guess_bytes` for how it is done.

    `config` is a TextFixerConfig (defaults are used if it is None), and
    keyword arguments override individual fields of it. When `unescape_html`
    is "auto", the first line containing a literal ``<`` turns HTML
    unescaping off for that line and for all lines after it.

    The output is a stream (generator) of fixed lines of text.
    """
    settings = TextFixerConfig() if config is None else config
    settings = _config_from_kwargs(settings, kwargs)

    for raw_line in input_file:
        if not isinstance(raw_line, bytes):
            line = raw_line
        elif encoding is None:
            # Remember the guessed encoding for the remaining lines.
            line, encoding = guess_bytes(raw_line)
        else:
            line = raw_line.decode(encoding)

        if settings.unescape_html == "auto" and "<" in line:
            settings = settings._replace(unescape_html=False)

        yield fix_and_explain(line, settings).text
|
||
|
||
|
||
def guess_bytes(bstring):
    """
    NOTE: Using `guess_bytes` is not the recommended way of using ftfy. ftfy
    is not designed to be an encoding detector.

    In the unfortunate situation that you have some bytes in an unknown
    encoding, ftfy can guess a reasonable strategy for decoding them, by trying
    a few common encodings that can be distinguished from each other.

    Unlike the rest of ftfy, this may not be accurate, and it may *create*
    Unicode problems instead of solving them!

    The encodings we try here are, in order:

    - UTF-16 with a byte order mark, because a UTF-16 byte order mark looks
      like nothing else
    - UTF-8, because it's the global standard, which has been used by a
      majority of the Web since 2008
    - "utf-8-variants", or buggy implementations of UTF-8
    - MacRoman, because Microsoft Office thinks it's still a thing, and it
      can be distinguished by its line breaks. (If there are no line breaks in
      the string, though, you're out of luck.)
    - "sloppy-windows-1252", the Latin-1-like encoding that is the most common
      single-byte encoding.

    Returns a pair `(decoded_text, encoding_name)`.

    Raises UnicodeError if given a str instead of bytes.
    """
    if isinstance(bstring, str):
        raise UnicodeError(
            "This string was already decoded as Unicode. You should pass "
            "bytes to guess_bytes, not Unicode."
        )

    # Either byte order mark means UTF-16.
    if bstring.startswith((b"\xfe\xff", b"\xff\xfe")):
        return bstring.decode("utf-16"), "utf-16"

    byte_values = set(bstring)

    # Byte 0xed can be used to encode a range of codepoints that
    # are UTF-16 surrogates. UTF-8 does not use UTF-16 surrogates,
    # so when we see 0xed, it's very likely we're being asked to
    # decode CESU-8, the variant that encodes UTF-16 surrogates
    # instead of the original characters themselves.
    #
    # This will occasionally trigger on standard UTF-8, as there
    # are some Korean characters that also use byte 0xed, but that's
    # not harmful because standard UTF-8 characters will decode the
    # same way in our 'utf-8-variants' codec.
    #
    # Byte 0xc0 is impossible because, numerically, it would only
    # encode characters lower than U+0040. Those already have
    # single-byte representations, and UTF-8 requires using the
    # shortest possible representation. However, Java hides the null
    # codepoint, U+0000, in a non-standard longer representation -- it
    # encodes it as 0xc0 0x80 instead of 0x00, guaranteeing that 0x00
    # will never appear in the encoded bytes.
    #
    # The 'utf-8-variants' decoder can handle both of these cases, as
    # well as standard UTF-8, at the cost of a bit of speed.
    if byte_values & {0xED, 0xC0}:
        utf8_flavor = "utf-8-variants"
    else:
        utf8_flavor = "utf-8"

    try:
        return bstring.decode(utf8_flavor), utf8_flavor
    except UnicodeDecodeError:
        # Not UTF-8 of either flavor; fall back to single-byte encodings.
        pass

    if 0x0D in byte_values and 0x0A not in byte_values:
        # Files that contain CR and not LF are likely to be MacRoman.
        return bstring.decode("macroman"), "macroman"

    return bstring.decode("sloppy-windows-1252"), "sloppy-windows-1252"
|
||
|
||
|
||
@no_type_check
def apply_plan(text: str, plan: List[Tuple[str, str]]):
    """
    Apply a plan for fixing the encoding of text.

    The plan is a list of tuples of the form (operation, arg).

    `operation` is one of:

    - `'encode'`: convert a string to bytes, using `arg` as the encoding
    - `'decode'`: convert bytes to a string, using `arg` as the encoding
    - `'transcode'`: convert bytes to bytes, using the function named `arg`
    - `'apply'`: convert a string to a string, using the function named `arg`
    - `'normalize'`: apply Unicode normalization to a string, using `arg` as
      the normalization form (such as `'NFC'`)

    The functions that can be applied by 'transcode' and 'apply' are
    specifically those that appear in the dictionary named `FIXERS`. They
    can also be imported from the `ftfy.fixes` module.

    Returns the result of applying every step in order, starting from `text`.

    Raises ValueError if a step has an unknown operation, or names a function
    that is not in `FIXERS`.

    Example::

        >>> mojibake = "schÃ¶n"
        >>> text, plan = fix_and_explain(mojibake)
        >>> apply_plan(mojibake, plan)
        'schön'
    """
    obj = text
    for operation, arg in plan:
        if operation == "encode":
            obj = obj.encode(arg)
        elif operation == "decode":
            obj = obj.decode(arg)
        elif operation == "normalize":
            # Explanations can contain ("normalize", form) steps, so a plan
            # taken from an explanation must be able to replay them.
            obj = unicodedata.normalize(arg, obj)
        elif operation in ("transcode", "apply"):
            if arg not in FIXERS:
                raise ValueError("Unknown function to apply: %s" % arg)
            obj = FIXERS[arg](obj)
        else:
            raise ValueError("Unknown plan step: %s" % operation)

    return obj
|
||
|
||
|
||
def explain_unicode(text: str):
    """
    A utility method that's useful for debugging mysterious Unicode.

    It breaks down a string, printing one line per codepoint that shows its
    number in hexadecimal, its glyph, its category in the Unicode standard,
    and its name in the Unicode standard (or ``<unknown>`` if it has none).
    Characters that are not printable are shown as an escape sequence
    instead of a glyph.

    For example, ``explain_unicode('(╯°□°)╯︵ ┻━┻')`` prints lines that
    begin like this::

        U+0028 ( [Ps] LEFT PARENTHESIS
        U+256F ╯ [So] BOX DRAWINGS LIGHT ARC UP AND LEFT
        U+00B0 ° [So] DEGREE SIGN
        U+25A1 □ [So] WHITE SQUARE

    (The glyph column is passed through `display_ljust(..., 7)`, presumably
    to pad it; that padding is not reproduced in the lines above.)

    Nothing is returned; the lines are written with `print`.
    """
    for char in text:
        if char.isprintable():
            glyph = char
        else:
            glyph = char.encode("unicode-escape").decode("ascii")

        padded_glyph = display_ljust(glyph, 7)
        codepoint = ord(char)
        category = unicodedata.category(char)
        name = unicodedata.name(char, "<unknown>")
        # NOTE(review): the spacing inside this template may have been
        # collapsed in the copy under review -- confirm against upstream.
        print(f"U+{codepoint:04X} {padded_glyph} [{category}] {name}")
|