"""
|
|
|
|
|
ftfy: fixes text for you
|
|
|
|
|
|
|
|
|
|
This is a module for making text less broken. See the `fix_text` function
|
|
|
|
|
for more information.
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
import unicodedata
|
2022-01-24 04:07:52 +00:00
|
|
|
|
import warnings
|
|
|
|
|
from typing import List, NamedTuple, Optional, Tuple, Union
|
|
|
|
|
|
|
|
|
|
from ftfy import bad_codecs
|
|
|
|
|
from ftfy import chardata, fixes
|
|
|
|
|
from ftfy.badness import is_bad
|
2018-10-31 16:08:29 +00:00
|
|
|
|
from ftfy.formatting import display_ljust
|
|
|
|
|
|
2022-01-24 04:07:52 +00:00
|
|
|
|
__version__ = "6.0.3"


# Though this function does nothing, it lets linters know that we're using
# ftfy.bad_codecs. See the docstring in `bad_codecs/__init__.py` for more.
bad_codecs.ok()


class ExplainedText(NamedTuple):
    """
    The return type from ftfy's functions that provide an "explanation" of which
    steps it applied to fix the text, such as :func:`fix_and_explain()`.

    When the 'explain' option is disabled, these functions return the same
    type, but the `explanation` will be None.
    """
    text: str
    explanation: Optional[List[Tuple[str, str]]]


class TextFixerConfig(NamedTuple):
    r"""
    A TextFixerConfig object stores configuration options for ftfy.

    It's implemented as a namedtuple with defaults, so you can instantiate
    it by providing the values to change from their defaults as keyword arguments.
    For example, to disable 'unescape_html' and keep the rest of the defaults::

        TextFixerConfig(unescape_html=False)

    Here are the options and their default values:

    - `unescape_html`: "auto"

      Configures whether to replace HTML entities such as &amp; with the character
      they represent. "auto" says to do this by default, but disable it when a
      literal < character appears, indicating that the input is actual HTML and
      entities should be preserved. The value can be True, to always enable this
      fixer, or False, to always disable it.

    - `remove_terminal_escapes`: True

      Removes "ANSI" terminal escapes, such as for changing the color of text in a
      terminal window.

    - `fix_encoding`: True

      Detect mojibake and attempt to fix it by decoding the text in a different
      encoding standard.

      The following four options affect how `fix_encoding` works, and do nothing if
      `fix_encoding` is False:

    - `restore_byte_a0`: True

      Allow a literal space (U+20) to be interpreted as a non-breaking space
      (U+A0) when that would make it part of a fixable mojibake string.

      Because spaces are very common characters, this could lead to false
      positives, but we try to apply it only when there's strong evidence for
      mojibake. Disabling `restore_byte_a0` is safer from false positives,
      but creates false negatives.

    - `replace_lossy_sequences`: True

      Detect mojibake that has been partially replaced by the characters
      '�' or '?'. If the mojibake could be decoded otherwise, replace the
      detected sequence with '�'.

    - `decode_inconsistent_utf8`: True

      When we see sequences that distinctly look like UTF-8 mojibake, but
      there's no consistent way to reinterpret the string in a new encoding,
      replace the mojibake with the appropriate UTF-8 characters anyway.

      This helps to decode strings that are concatenated from different
      encodings.

    - `fix_c1_controls`: True

      Replace C1 control characters (the useless characters U+80 - U+9B that
      come from Latin-1) with their Windows-1252 equivalents, like HTML5 does,
      even if the whole string doesn't decode as Latin-1.

    - `fix_latin_ligatures`: True

      Replace common Latin-alphabet ligatures, such as ``ﬁ``, with the
      letters they're made of.

    - `fix_character_width`: True

      Replace fullwidth Latin characters and halfwidth Katakana with
      their more standard widths.

    - `uncurl_quotes`: True

      Replace curly quotes with straight quotes.

    - `fix_line_breaks`: True

      Replace various forms of line breaks with the standard Unix line
      break, ``\n``.

    - `fix_surrogates`: True

      Replace sequences of UTF-16 surrogate codepoints with the character
      they were meant to encode. This fixes text that was decoded with the
      obsolete UCS-2 standard, and allows it to support high-numbered
      codepoints such as emoji.

    - `remove_control_chars`: True

      Remove certain control characters that have no displayed effect on text.

    - `normalization`: "NFC"

      Choose what kind of Unicode normalization is applied. Usually, we apply
      NFC normalization, so that letters followed by combining characters become
      single combined characters.

      Changing this to "NFKC" applies more compatibility conversions, such as
      replacing the 'micro sign' with a standard Greek lowercase mu, which looks
      identical. However, some NFKC normalizations change the meaning of text,
      such as converting "10³" to "103".

      `normalization` can be None, to apply no normalization.

    - `max_decode_length`: 1_000_000

      The maximum size of "segment" that ftfy will try to fix all at once.

    - `explain`: True

      Whether to compute 'explanations', lists describing what ftfy changed.
      When this is False, the explanation will be None, and the code that
      builds the explanation will be skipped, possibly saving time.

      Functions that accept TextFixerConfig and don't return an explanation
      will automatically set `explain` to False.
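
    As a small illustration, an option set this way simply reads back as an
    attribute of the namedtuple::

        >>> TextFixerConfig(explain=False).explain
        False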
    """
    unescape_html: Union[str, bool] = "auto"
    remove_terminal_escapes: bool = True
    fix_encoding: bool = True
    restore_byte_a0: bool = True
    replace_lossy_sequences: bool = True
    decode_inconsistent_utf8: bool = True
    fix_c1_controls: bool = True
    fix_latin_ligatures: bool = True
    fix_character_width: bool = True
    uncurl_quotes: bool = True
    fix_line_breaks: bool = True
    fix_surrogates: bool = True
    remove_control_chars: bool = True
    normalization: Optional[str] = "NFC"
    max_decode_length: int = 1000000
    explain: bool = True


def _config_from_kwargs(config: TextFixerConfig, kwargs: dict):
    """
    Handle parameters provided as keyword arguments to ftfy's top-level
    functions, converting them into a TextFixerConfig.
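
    For example, a keyword argument overrides the matching config field::

        >>> config = _config_from_kwargs(TextFixerConfig(), {'uncurl_quotes': False})
        >>> config.uncurl_quotes
        False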
    """
    if 'fix_entities' in kwargs:
        warnings.warn(
            "`fix_entities` has been renamed to `unescape_html`",
            DeprecationWarning
        )
        kwargs = kwargs.copy()
        kwargs['unescape_html'] = kwargs['fix_entities']
        del kwargs['fix_entities']
    config = config._replace(**kwargs)
    return config


FIXERS = {
    "unescape_html": fixes.unescape_html,
    "remove_terminal_escapes": fixes.remove_terminal_escapes,
    "restore_byte_a0": fixes.restore_byte_a0,
    "replace_lossy_sequences": fixes.replace_lossy_sequences,
    "decode_inconsistent_utf8": fixes.decode_inconsistent_utf8,
    "fix_c1_controls": fixes.fix_c1_controls,
    "fix_latin_ligatures": fixes.fix_latin_ligatures,
    "fix_character_width": fixes.fix_character_width,
    "uncurl_quotes": fixes.uncurl_quotes,
    "fix_line_breaks": fixes.fix_line_breaks,
    "fix_surrogates": fixes.fix_surrogates,
    "remove_control_chars": fixes.remove_control_chars,
}


BYTES_ERROR_TEXT = """Hey wait, this isn't Unicode.

ftfy is designed to fix problems with text. Treating bytes like they're
interchangeable with Unicode text is usually something that introduces
problems with text.

You should first decode these bytes from the encoding you think they're in.
If you're not sure what encoding they're in:

- First, try to find out. 'utf-8' is a good assumption.
- If the encoding is simply unknowable, try running your bytes through
  ftfy.guess_bytes. As the name implies, this may not always be accurate.

For more information on the distinction between bytes and text, read the
Python Unicode HOWTO:

    http://docs.python.org/3/howto/unicode.html
"""


def _try_fix(
    fixer_name: str, text: str, config: TextFixerConfig, steps: Optional[list]
) -> str:
    """
    A helper function used across several 'fixer' steps, deciding whether to
    apply the fix and whether to record the fix in `steps`.
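
    For example, applying one enabled fixer and recording it::

        >>> steps = []
        >>> _try_fix("uncurl_quotes", "‘hi’", TextFixerConfig(), steps)
        "'hi'"
        >>> steps
        [('apply', 'uncurl_quotes')]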
    """
    if getattr(config, fixer_name):
        fixer = FIXERS[fixer_name]
        fixed = fixer(text)
        if steps is not None and fixed != text:
            steps.append(("apply", fixer_name))
        return fixed

    return text


def fix_text(text: str, config: Optional[TextFixerConfig] = None, **kwargs) -> str:
    r"""
    Given Unicode text as input, fix inconsistencies and glitches in it,
    such as mojibake (text that was decoded in the wrong encoding).

    Let's start with some examples:

    >>> fix_text('âœ” No problems')
    '✔ No problems'

    >>> print(fix_text("¯\\_(ã\x83\x84)_/¯"))
    ¯\_(ツ)_/¯

    >>> fix_text('Broken text&hellip; it&#x2019;s flubberific!')
    "Broken text... it's flubberific!"

    >>> fix_text('LOUD NOISES')
    'LOUD NOISES'

    ftfy applies a number of different fixes to the text, and can accept
    configuration to select which fixes to apply.

    The configuration takes the form of a :class:`TextFixerConfig` object,
    and you can see a description of the options in that class's docstring
    or in the full documentation at ftfy.readthedocs.org.

    For convenience and backward compatibility, the configuration can also
    take the form of keyword arguments, which will set the equivalently-named
    fields of the TextFixerConfig object.

    For example, here are two ways to fix text but skip the "uncurl_quotes"
    step::

        fix_text(text, TextFixerConfig(uncurl_quotes=False))
        fix_text(text, uncurl_quotes=False)

    This function fixes text in independent segments, which are usually lines
    of text, or arbitrarily broken up every 1 million codepoints (configurable
    with `config.max_decode_length`) if there aren't enough line breaks. The
    bound on segment lengths helps to avoid unbounded slowdowns.

    ftfy can also provide an 'explanation', a list of transformations it applied
    to the text that would fix more text like it. This function doesn't provide
    explanations (because there may be different fixes for different segments
    of text).

    To get an explanation, use the :func:`fix_and_explain()` function, which
    fixes the string in one segment and explains what it fixed.
    """
    if config is None:
        config = TextFixerConfig(explain=False)
    config = _config_from_kwargs(config, kwargs)
    if isinstance(text, bytes):
        raise UnicodeError(BYTES_ERROR_TEXT)

    out = []
    pos = 0
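    # Fix the text one segment at a time: each segment normally ends just
    # after a newline, and is capped at config.max_decode_length codepoints
    # when no newline arrives sooner.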
    while pos < len(text):
        textbreak = text.find("\n", pos) + 1
        if textbreak == 0:
            textbreak = len(text)
        if (textbreak - pos) > config.max_decode_length:
            textbreak = pos + config.max_decode_length

        segment = text[pos:textbreak]
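        # In "auto" mode, a literal "<" suggests the input is actual HTML, so
        # HTML entities are left alone from this point on.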
        if config.unescape_html == "auto" and "<" in segment:
            config = config._replace(unescape_html=False)
        fixed_segment, _ = fix_and_explain(segment, config)
        out.append(fixed_segment)
        pos = textbreak

    return "".join(out)


def fix_and_explain(
    text: str, config: Optional[TextFixerConfig] = None, **kwargs
) -> ExplainedText:
    """
    Fix text as a single segment, returning the fixed text and an explanation
    of what was fixed.

    The explanation is a list of steps that can be applied with
    :func:`apply_plan`, or if config.explain is False, it will be None.
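
    Example, using the same mojibake as the :func:`apply_plan` doctest::

        >>> fix_and_explain("schÃ¶n")
        ExplainedText(text='schön', explanation=[('encode', 'latin-1'), ('decode', 'utf-8')])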
    """
    if config is None:
        config = TextFixerConfig()
    if isinstance(text, bytes):
        raise UnicodeError(BYTES_ERROR_TEXT)
    config = _config_from_kwargs(config, kwargs)

    if config.unescape_html == "auto" and "<" in text:
        config = config._replace(unescape_html=False)

    if config.explain:
        steps: Optional[List[Tuple[str, str]]] = []
    else:
        # If explanations aren't desired, `steps` will be None
        steps = None
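
    # Run the pipeline of fixers repeatedly until the text stops changing,
    # so that one fix can expose text that another fixer can then repair.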
    while True:
        origtext = text

        text = _try_fix("unescape_html", text, config, steps)

        if config.fix_encoding:
            if steps is None:
                text = fix_encoding(text)
            else:
                text, encoding_steps = fix_encoding_and_explain(text, config)
                steps.extend(encoding_steps)

        for fixer in [
            "fix_c1_controls",
            "fix_latin_ligatures",
            "fix_character_width",
            "uncurl_quotes",
            "fix_line_breaks",
            "fix_surrogates",
            "remove_terminal_escapes",
            "remove_control_chars",
        ]:
            text = _try_fix(fixer, text, config, steps)

        if config.normalization is not None:
            fixed = unicodedata.normalize(config.normalization, text)
            if steps is not None and fixed != text:
                steps.append(("normalize", config.normalization))
            text = fixed

        if text == origtext:
            return ExplainedText(text, steps)


def fix_encoding_and_explain(
    text: str, config: Optional[TextFixerConfig] = None, **kwargs
) -> ExplainedText:
    """
    Apply the steps of ftfy that detect mojibake and fix it. Returns the fixed
    text and a list explaining what was fixed.

    This includes fixing text by encoding and decoding it in different encodings,
    as well as the subordinate fixes `restore_byte_a0`, `replace_lossy_sequences`,
    `decode_inconsistent_utf8`, and `fix_c1_controls`.

    Examples::

        >>> fix_encoding_and_explain("sÃ³")
        ExplainedText(text='só', explanation=[('encode', 'latin-1'), ('decode', 'utf-8')])

        >>> result = fix_encoding_and_explain("voilÃ  le travail")
        >>> result.text
        'voilà le travail'
        >>> result.explanation
        [('encode', 'latin-1'), ('transcode', 'restore_byte_a0'), ('decode', 'utf-8')]
    """
    if config is None:
        config = TextFixerConfig()
    if isinstance(text, bytes):
        raise UnicodeError(BYTES_ERROR_TEXT)
    config = _config_from_kwargs(config, kwargs)

    if not config.fix_encoding:
        # A weird trivial case: we're asked to fix the encoding, but skip
        # fixing the encoding
        return ExplainedText(text, [])

    plan_so_far: List[Tuple[str, str]] = []
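    # Repeat single steps of encoding repair until the text reaches a fixed
    # point and no further step changes it.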
    while True:
        prevtext = text
        text, plan = _fix_encoding_one_step_and_explain(text, config)
        plan_so_far.extend(plan)
        if text == prevtext:
            return ExplainedText(text, plan_so_far)


def _fix_encoding_one_step_and_explain(
    text: str, config: TextFixerConfig
) -> ExplainedText:
    """
    Perform one step of fixing the encoding of text.
    """
    if config is None:
        config = TextFixerConfig()

    if len(text) == 0:
        return ExplainedText(text, [])

    # The first plan is to return ASCII text unchanged, as well as text
    # that doesn't look like it contains mojibake
    if chardata.possible_encoding(text, "ascii") or not is_bad(text):
        return ExplainedText(text, [])

    # As we go through the next step, remember the possible encodings
    # that we encounter but don't successfully fix yet. We may need them
    # later.
    possible_1byte_encodings = []

    # Suppose the text was supposed to be UTF-8, but it was decoded using
    # a single-byte encoding instead. When these cases can be fixed, they
    # are usually the correct thing to do, so try them next.
    for encoding in chardata.CHARMAP_ENCODINGS:
        if chardata.possible_encoding(text, encoding):
            possible_1byte_encodings.append(encoding)
            encoded_bytes = text.encode(encoding)
            encode_step = ("encode", encoding)
            transcode_steps = []

            # Now, find out if it's UTF-8 (or close enough). Otherwise,
            # remember the encoding for later.
            try:
                decoding = "utf-8"
                # Check encoded_bytes for sequences that would be UTF-8,
                # except they have b' ' where b'\xa0' would belong.
                if config.restore_byte_a0 and chardata.ALTERED_UTF8_RE.search(
                    encoded_bytes
                ):
                    replaced_bytes = fixes.restore_byte_a0(encoded_bytes)
                    if replaced_bytes != encoded_bytes:
                        transcode_steps.append(("transcode", "restore_byte_a0"))
                        encoded_bytes = replaced_bytes

                # Replace sequences where information has been lost
                if config.replace_lossy_sequences and encoding.startswith("sloppy"):
                    replaced_bytes = fixes.replace_lossy_sequences(encoded_bytes)
                    if replaced_bytes != encoded_bytes:
                        transcode_steps.append(("transcode", "replace_lossy_sequences"))
                        encoded_bytes = replaced_bytes

                if 0xED in encoded_bytes or 0xC0 in encoded_bytes:
                    decoding = "utf-8-variants"

                decode_step = ("decode", decoding)
                steps = [encode_step] + transcode_steps + [decode_step]
                fixed = encoded_bytes.decode(decoding)
                return ExplainedText(fixed, steps)

            except UnicodeDecodeError:
                pass

    # Look for a-hat-euro sequences that remain, and fix them in isolation.
    if config.decode_inconsistent_utf8 and chardata.UTF8_DETECTOR_RE.search(text):
        steps = [("apply", "decode_inconsistent_utf8")]
        fixed = fixes.decode_inconsistent_utf8(text)
        if fixed != text:
            return ExplainedText(fixed, steps)

    # The next most likely case is that this is Latin-1 that was intended to
    # be read as Windows-1252, because those two encodings in particular are
    # easily confused.
    if "latin-1" in possible_1byte_encodings:
        if "windows-1252" in possible_1byte_encodings:
            # This text is in the intersection of Latin-1 and
            # Windows-1252, so it's probably legit.
            return ExplainedText(text, [])
        else:
            # Otherwise, it means we have characters that are in Latin-1 but
            # not in Windows-1252. Those are C1 control characters. Nobody
            # wants those. Assume they were meant to be Windows-1252.
            try:
                fixed = text.encode("latin-1").decode("windows-1252")
                if fixed != text:
                    steps = [("encode", "latin-1"), ("decode", "windows-1252")]
                    return ExplainedText(fixed, steps)
            except UnicodeDecodeError:
                pass

    # Fix individual characters of Latin-1 with a less satisfying explanation
    if config.fix_c1_controls and chardata.C1_CONTROL_RE.search(text):
        steps = [("transcode", "fix_c1_controls")]
        fixed = fixes.fix_c1_controls(text)
        return ExplainedText(fixed, steps)

    # The cases that remain are mixups between two different single-byte
    # encodings, and not the common case of Latin-1 vs. Windows-1252.
    #
    # With the new heuristic in 6.0, it's possible that we're closer to solving
    # these in some cases. It would require a lot of testing and tuning, though.
    # For now, we leave the text unchanged in these cases.
    return ExplainedText(text, [])


def fix_encoding(text: str, config: Optional[TextFixerConfig] = None, **kwargs):
    """
    Apply just the encoding-fixing steps of ftfy to this text. Returns the
    fixed text, discarding the explanation.

    >>> fix_encoding("Ã³")
    'ó'
    >>> fix_encoding("&ATILDE;&SUP3;")
    '&ATILDE;&SUP3;'
    """
    if config is None:
        config = TextFixerConfig(explain=False)
    config = _config_from_kwargs(config, kwargs)
    fixed, _explan = fix_encoding_and_explain(text, config)
    return fixed


# Some alternate names for the main functions
ftfy = fix_text


def fix_text_segment(text: str, config: Optional[TextFixerConfig] = None, **kwargs):
    """
    Fix text as a single segment, with a consistent sequence of steps that
    are applied to fix the text. Discard the explanation.
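
    Example, mirroring the first :func:`fix_text` doctest::

        >>> fix_text_segment('âœ” No problems')
        '✔ No problems'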
    """
    if config is None:
        config = TextFixerConfig(explain=False)
    config = _config_from_kwargs(config, kwargs)
    fixed, _explan = fix_and_explain(text, config)
    return fixed


def fix_file(input_file, encoding=None, config=None, **kwargs):
    """
    Fix text that is found in a file.

    If the file is being read as Unicode text, use that. If it's being read as
    bytes, then we hope an encoding was supplied. If not, unfortunately, we
    have to guess what encoding it is. We'll try a few common encodings, but we
    make no promises. See the `guess_bytes` function for how this is done.

    The output is a stream of fixed lines of text.
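
    For example, with an in-memory file::

        >>> import io
        >>> list(fix_file(io.StringIO("sÃ³")))
        ['só']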
    """
    if config is None:
        config = TextFixerConfig()
    config = _config_from_kwargs(config, kwargs)

    for line in input_file:
        if isinstance(line, bytes):
            if encoding is None:
                line, encoding = guess_bytes(line)
            else:
                line = line.decode(encoding)
        if config.unescape_html == "auto" and "<" in line:
            config = config._replace(unescape_html=False)

        fixed_line, _explan = fix_and_explain(line, config)
        yield fixed_line


def guess_bytes(bstring):
    """
    NOTE: Using `guess_bytes` is not the recommended way of using ftfy. ftfy
    is not designed to be an encoding detector.

    In the unfortunate situation that you have some bytes in an unknown
    encoding, ftfy can guess a reasonable strategy for decoding them, by trying
    a few common encodings that can be distinguished from each other.

    Unlike the rest of ftfy, this may not be accurate, and it may *create*
    Unicode problems instead of solving them!

    The encodings we try here are:

    - UTF-16 with a byte order mark, because a UTF-16 byte order mark looks
      like nothing else
    - UTF-8, because it's the global standard, which has been used by a
      majority of the Web since 2008
    - "utf-8-variants", or buggy implementations of UTF-8
    - MacRoman, because Microsoft Office thinks it's still a thing, and it
      can be distinguished by its line breaks. (If there are no line breaks in
      the string, though, you're out of luck.)
    - "sloppy-windows-1252", the Latin-1-like encoding that is the most common
      single-byte encoding.
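
    A small illustration::

        >>> guess_bytes(b'\\xe2\\x9c\\x94 Unicode like this')
        ('✔ Unicode like this', 'utf-8')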
    """
    if isinstance(bstring, str):
        raise UnicodeError(
            "This string was already decoded as Unicode. You should pass "
            "bytes to guess_bytes, not Unicode."
        )

    if bstring.startswith(b"\xfe\xff") or bstring.startswith(b"\xff\xfe"):
        return bstring.decode("utf-16"), "utf-16"

    byteset = set(bstring)
    try:
        if 0xED in byteset or 0xC0 in byteset:
            # Byte 0xed can be used to encode a range of codepoints that
            # are UTF-16 surrogates. UTF-8 does not use UTF-16 surrogates,
            # so when we see 0xed, it's very likely we're being asked to
            # decode CESU-8, the variant that encodes UTF-16 surrogates
            # instead of the original characters themselves.
            #
            # This will occasionally trigger on standard UTF-8, as there
            # are some Korean characters that also use byte 0xed, but that's
            # not harmful because standard UTF-8 characters will decode the
            # same way in our 'utf-8-variants' codec.
            #
            # Byte 0xc0 is impossible because, numerically, it would only
            # encode characters lower than U+0040. Those already have
            # single-byte representations, and UTF-8 requires using the
            # shortest possible representation. However, Java hides the null
            # codepoint, U+0000, in a non-standard longer representation -- it
            # encodes it as 0xc0 0x80 instead of 0x00, guaranteeing that 0x00
            # will never appear in the encoded bytes.
            #
            # The 'utf-8-variants' decoder can handle both of these cases, as
            # well as standard UTF-8, at the cost of a bit of speed.
            return bstring.decode("utf-8-variants"), "utf-8-variants"
        else:
            return bstring.decode("utf-8"), "utf-8"
    except UnicodeDecodeError:
        pass

    if 0x0D in byteset and 0x0A not in byteset:
        # Files that contain CR and not LF are likely to be MacRoman.
        return bstring.decode("macroman"), "macroman"

    return bstring.decode("sloppy-windows-1252"), "sloppy-windows-1252"


def apply_plan(text: str, plan: List[Tuple[str, str]]):
    """
    Apply a plan for fixing the encoding of text.

    The plan is a list of tuples of the form (operation, arg).

    `operation` is one of:

    - `'encode'`: convert a string to bytes, using `arg` as the encoding
    - `'decode'`: convert bytes to a string, using `arg` as the encoding
    - `'transcode'`: convert bytes to bytes, using the function named `arg`
    - `'apply'`: convert a string to a string, using the function named `arg`

    The functions that can be applied by 'transcode' and 'apply' are
    specifically those that appear in the dictionary named `FIXERS`. They
    can also be imported from the `ftfy.fixes` module.

    Example::

        >>> mojibake = "schÃ¶n"
        >>> text, plan = fix_and_explain(mojibake)
        >>> apply_plan(mojibake, plan)
        'schön'
    """
    obj = text
    for operation, encoding in plan:
        if operation == "encode":
            obj = obj.encode(encoding)
        elif operation == "decode":
            obj = obj.decode(encoding)
        elif operation in ("transcode", "apply"):
            if encoding in FIXERS:
                obj = FIXERS[encoding](obj)
            else:
                raise ValueError("Unknown function to apply: %s" % encoding)
        else:
            raise ValueError("Unknown plan step: %s" % operation)

    return obj


def explain_unicode(text: str):
    """
    A utility function that's useful for debugging mysterious Unicode.

    It breaks down a string, showing you for each codepoint its number in
    hexadecimal, its glyph, its category in the Unicode standard, and its name
    in the Unicode standard.

    >>> explain_unicode('(╯°□°)╯︵ ┻━┻')
    U+0028 (       [Ps] LEFT PARENTHESIS
    U+256F ╯       [So] BOX DRAWINGS LIGHT ARC UP AND LEFT
    U+00B0 °       [So] DEGREE SIGN
    U+25A1 □       [So] WHITE SQUARE
    U+00B0 °       [So] DEGREE SIGN
    U+0029 )       [Pe] RIGHT PARENTHESIS
    U+256F ╯       [So] BOX DRAWINGS LIGHT ARC UP AND LEFT
    U+FE35 ︵      [Ps] PRESENTATION FORM FOR VERTICAL LEFT PARENTHESIS
    U+0020         [Zs] SPACE
    U+253B ┻       [So] BOX DRAWINGS HEAVY UP AND HORIZONTAL
    U+2501 ━       [So] BOX DRAWINGS HEAVY HORIZONTAL
    U+253B ┻       [So] BOX DRAWINGS HEAVY UP AND HORIZONTAL
    """
    for char in text:
        if char.isprintable():
            display = char
        else:
            display = char.encode("unicode-escape").decode("ascii")
        print(
            "U+{code:04X} {display} [{category}] {name}".format(
                display=display_ljust(display, 7),
                code=ord(char),
                category=unicodedata.category(char),
                name=unicodedata.name(char, "<unknown>"),
            )
        )