mirror of https://github.com/morpheus65535/bazarr
316 lines
12 KiB
Python
316 lines
12 KiB
Python
"""
|
||
This gives other modules access to the gritty details about characters and the
|
||
encodings that use them.
|
||
"""
|
||
|
||
import html
|
||
import itertools
|
||
import re
|
||
import unicodedata
|
||
|
||
|
||
# These are the encodings we will try to fix in ftfy, in the
# order that they should be tried.
#
# NOTE(review): the "sloppy-*" names are not standard-library codecs; they are
# presumably registered elsewhere in the package before this list is used to
# decode anything -- confirm against the codec registration code.
CHARMAP_ENCODINGS = [
    "latin-1",
    "sloppy-windows-1252",
    "sloppy-windows-1251",
    "sloppy-windows-1250",
    "sloppy-windows-1253",
    "sloppy-windows-1254",
    "iso-8859-2",
    "macroman",
    "cp437",
]
|
||
|
||
# Matches one typographic single-quote-like character: U+02BC MODIFIER LETTER
# APOSTROPHE, or anything in the range U+2018..U+201B.
SINGLE_QUOTE_RE = re.compile("[\u02bc\u2018-\u201b]")
# Matches one typographic double-quote character in the range U+201C..U+201F.
DOUBLE_QUOTE_RE = re.compile("[\u201c-\u201f]")
|
||
|
||
|
||
def _build_regexes():
    """
    Build the table of per-encoding detector regexes.

    ENCODING_REGEXES contain reasonably fast ways to detect if we
    could represent a given string in a given encoding. The simplest one is
    the 'ascii' detector, which of course just determines if all characters
    are between U+0000 and U+007F.

    Returns a dict whose keys are "ascii" plus every name in
    CHARMAP_ENCODINGS, and whose values are compiled patterns that match a
    string only if every character in it is allowed by that encoding.
    """
    # Define a regex that matches ASCII text.
    encoding_regexes = {"ascii": re.compile("^[\x00-\x7f]*$")}

    # Make the sequence of bytes whose decoding we need to look up in each
    # encoding: bytes \x80 to \xFF, as well as byte \x1A, which is used to
    # represent the replacement character (U+FFFD) in the sloppy-* encodings.
    # The byte sequence is the same for every encoding, so build it once.
    byte_range = bytes(list(range(0x80, 0x100)) + [0x1A])

    for encoding in CHARMAP_ENCODINGS:
        # The characters those bytes decode to in this particular encoding.
        charlist = byte_range.decode(encoding)

        # The rest of the ASCII bytes -- bytes \x00 to \x19 and \x1B
        # to \x7F -- will decode as those ASCII characters in any encoding we
        # support, so we can just include them as ranges. This also lets us
        # not worry about escaping regex special characters, because all of
        # them are in the \x1B to \x7F range.
        regex = "^[\x00-\x19\x1b-\x7f{0}]*$".format(charlist)
        encoding_regexes[encoding] = re.compile(regex)
    return encoding_regexes
|
||
|
||
|
||
# Built once at import time: maps "ascii" and each CHARMAP_ENCODINGS name to
# its compiled detector regex. possible_encoding() looks patterns up here.
ENCODING_REGEXES = _build_regexes()
|
||
|
||
|
||
def _build_html_entities():
    """
    Build a dict mapping HTML entity strings (such as "&ntilde;") to the
    text they stand for.

    Only entity names that end with a semicolon are included. For names that
    are entirely lowercase, an all-uppercase spelling is added as well, mapped
    to the uppercased replacement text, unless `html.unescape` already
    assigns a meaning to that uppercase spelling.
    """
    entities = {}
    # Create a dictionary based on the built-in HTML5 entity dictionary.
    # Add a limited set of HTML entities that we'll also decode if they've
    # been case-folded to uppercase, such as decoding &NTILDE; as "Ñ".
    for name, char in html.entities.html5.items():
        if name.endswith(";"):
            entities["&" + name] = char

            # Restrict the set of characters we can attempt to decode if their
            # name has been uppercased. If we tried to handle all entity names,
            # the results would be ambiguous.
            if name == name.lower():
                name_upper = name.upper()
                entity_upper = "&" + name_upper
                # If unescaping leaves the uppercase form untouched, HTML
                # itself gives it no meaning, so we are free to define one.
                if html.unescape(entity_upper) == entity_upper:
                    entities[entity_upper] = char.upper()
    return entities
|
||
|
||
|
||
# Matches text shaped like an HTML entity: "&", an optional "#", 1 to 24
# ASCII letters or digits, and a closing ";".
HTML_ENTITY_RE = re.compile(r"&#?[0-9A-Za-z]{1,24};")
# Lookup table from named entity text to its replacement, built at import time.
HTML_ENTITIES = _build_html_entities()
|
||
|
||
|
||
def possible_encoding(text, encoding):
    """
    Given text and a single-byte encoding, check whether that text could have
    been decoded from that single-byte encoding.

    In other words, check whether it can be encoded in that encoding, possibly
    sloppily.

    `encoding` must be a key of ENCODING_REGEXES ("ascii" or one of the
    CHARMAP_ENCODINGS names); any other name raises KeyError. Returns a bool.
    """
    return bool(ENCODING_REGEXES[encoding].match(text))
|
||
|
||
|
||
def _build_control_char_mapping():
    """
    Build a translate mapping that strips likely-unintended control characters.

    See :func:`ftfy.fixes.remove_control_chars` for a description of these
    codepoint ranges and why they should be removed.

    Returns a dict mapping each affected codepoint (an int) to None, which is
    the form `str.translate` uses to delete a character.
    """
    # Tab (0x09), line feed (0x0A), form feed (0x0C) and carriage return
    # (0x0D) are deliberately absent from these ranges, so they are kept.
    codepoints = itertools.chain(
        range(0x00, 0x09),
        [0x0B],
        range(0x0E, 0x20),
        [0x7F],
        range(0x206A, 0x2070),
        [0xFEFF],
        range(0xFFF9, 0xFFFD),
    )
    # dict.fromkeys gives every key the value None.
    return dict.fromkeys(codepoints)
|
||
|
||
|
||
# Built once at import time: a codepoint -> None table for str.translate that
# deletes the control characters chosen by _build_control_char_mapping().
CONTROL_CHARS = _build_control_char_mapping()
|
||
|
||
|
||
# Recognize UTF-8 sequences that would be valid if it weren't for a b'\xa0'
# that some Windows-1252 program converted to a plain space.
#
# The smaller values are included on a case-by-case basis, because we don't want
# to decode likely input sequences to unlikely characters. These are the ones
# that *do* form likely characters before 0xa0:
#
#   0xc2 -> U+A0 NO-BREAK SPACE
#   0xc3 -> U+E0 LATIN SMALL LETTER A WITH GRAVE
#   0xc5 -> U+160 LATIN CAPITAL LETTER S WITH CARON
#   0xce -> U+3A0 GREEK CAPITAL LETTER PI
#   0xd0 -> U+420 CYRILLIC CAPITAL LETTER ER
#   0xd9 -> U+660 ARABIC-INDIC DIGIT ZERO
#
# In three-character sequences, we exclude some lead bytes in some cases.
#
# When the lead byte is immediately followed by 0xA0, we shouldn't accept
# a space there, because it leads to some less-likely character ranges:
#
#   0xe0 -> Samaritan script
#   0xe1 -> Mongolian script (corresponds to Latin-1 'á' which is too common)
#
# We accept 0xe2 and 0xe3, which cover many scripts. Bytes 0xe4 and
# higher point mostly to CJK characters, which we generally don't want to
# decode near Latin lowercase letters.
#
# In four-character sequences, the lead byte must be F0, because that accounts
# for almost all of the usage of high-numbered codepoints (tag characters whose
# UTF-8 starts with the byte F3 are only used in some rare new emoji sequences).
#
# This is meant to be applied to encodings of text that tests true for `is_bad`.
# Any of these could represent characters that legitimately appear surrounded by
# spaces, particularly U+C5 (Å), which is a word in multiple languages!
#
# We should consider checking for b'\x85' being converted to ... in the future.
# I've seen it once, but the text still wasn't recoverable.

# Each alternative below is one sequence length with the space (standing in
# for the lost 0xA0 byte) at one particular position.
ALTERED_UTF8_RE = re.compile(
    b"[\xc2\xc3\xc5\xce\xd0\xd9][ ]"
    b"|[\xe2\xe3][ ][\x80-\x84\x86-\x9f\xa1-\xbf]"
    b"|[\xe0-\xe3][\x80-\x84\x86-\x9f\xa1-\xbf][ ]"
    b"|[\xf0][ ][\x80-\xbf][\x80-\xbf]"
    b"|[\xf0][\x80-\xbf][ ][\x80-\xbf]"
    b"|[\xf0][\x80-\xbf][\x80-\xbf][ ]"
)
|
||
|
||
|
||
# This expression matches UTF-8 and CESU-8 sequences where some of the
# continuation bytes have been lost. The byte 0x1a (sometimes written as ^Z) is
# used within ftfy to represent a byte that produced the replacement character
# \ufffd. We don't know which byte it was, but we can at least decode the UTF-8
# sequence as \ufffd instead of failing to re-decode it at all.
#
# In some cases, we allow the ASCII '?' in place of \ufffd, but at most once per
# sequence.
#
# The final alternative matches a lone 0x1a byte that is not part of any
# longer sequence.
LOSSY_UTF8_RE = re.compile(
    b"[\xc2-\xdf][\x1a]"
    b"|[\xc2-\xc3][?]"
    b"|\xed[\xa0-\xaf][\x1a?]\xed[\xb0-\xbf][\x1a?\x80-\xbf]"
    b"|\xed[\xa0-\xaf][\x1a?\x80-\xbf]\xed[\xb0-\xbf][\x1a?]"
    b"|[\xe0-\xef][\x1a?][\x1a\x80-\xbf]"
    b"|[\xe0-\xef][\x1a\x80-\xbf][\x1a?]"
    b"|[\xf0-\xf4][\x1a?][\x1a\x80-\xbf][\x1a\x80-\xbf]"
    b"|[\xf0-\xf4][\x1a\x80-\xbf][\x1a?][\x1a\x80-\xbf]"
    b"|[\xf0-\xf4][\x1a\x80-\xbf][\x1a\x80-\xbf][\x1a?]"
    b"|\x1a"
)
|
||
|
||
|
||
# This regex matches C1 control characters (U+0080 to U+009F), which occupy
# some of the positions in the Latin-1 character map that Windows assigns to
# other characters instead.
C1_CONTROL_RE = re.compile(r"[\x80-\x9f]")
|
||
|
||
|
||
# A translate mapping that breaks ligatures made of Latin letters. While
# ligatures may be important to the representation of other languages, in Latin
# letters they tend to represent a copy/paste error. It omits ligatures such
# as æ that are frequently used intentionally.
#
# This list additionally includes some Latin digraphs that represent two
# characters for legacy encoding reasons, not for typographical reasons.
#
# Ligatures and digraphs may also be separated by NFKC normalization, but that
# is sometimes more normalization than you want.
#
# The keys are written as explicit codepoints rather than literal characters:
# each key must be exactly one codepoint, and a literal ligature is easily
# expanded into its component letters by text-cleaning tools, which would
# silently break the table. Non-ASCII replacement text is escaped for the
# same reason.
LIGATURES = {
    # Dutch ligatures
    0x0132: "IJ",  # LATIN CAPITAL LIGATURE IJ
    0x0133: "ij",  # LATIN SMALL LIGATURE IJ
    # Afrikaans digraph meant to avoid auto-curled quote
    0x0149: "\u02bcn",  # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
    # Serbian/Croatian digraphs for Cyrillic conversion
    0x01F1: "DZ",  # LATIN CAPITAL LETTER DZ
    0x01F2: "Dz",  # LATIN CAPITAL LETTER D WITH SMALL LETTER Z
    0x01F3: "dz",  # LATIN SMALL LETTER DZ
    0x01C4: "D\u017d",  # LATIN CAPITAL LETTER DZ WITH CARON
    0x01C5: "D\u017e",  # LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON
    0x01C6: "d\u017e",  # LATIN SMALL LETTER DZ WITH CARON
    0x01C7: "LJ",  # LATIN CAPITAL LETTER LJ
    0x01C8: "Lj",  # LATIN CAPITAL LETTER L WITH SMALL LETTER J
    0x01C9: "lj",  # LATIN SMALL LETTER LJ
    0x01CA: "NJ",  # LATIN CAPITAL LETTER NJ
    0x01CB: "Nj",  # LATIN CAPITAL LETTER N WITH SMALL LETTER J
    0x01CC: "nj",  # LATIN SMALL LETTER NJ
    # Latin typographical ligatures
    0xFB00: "ff",  # LATIN SMALL LIGATURE FF
    0xFB01: "fi",  # LATIN SMALL LIGATURE FI
    0xFB02: "fl",  # LATIN SMALL LIGATURE FL
    0xFB03: "ffi",  # LATIN SMALL LIGATURE FFI
    0xFB04: "ffl",  # LATIN SMALL LIGATURE FFL
    0xFB05: "\u017ft",  # LATIN SMALL LIGATURE LONG S T
    0xFB06: "st",  # LATIN SMALL LIGATURE ST
}
|
||
|
||
|
||
def _build_width_map():
    """
    Build a translate mapping that replaces halfwidth and fullwidth forms
    with their standard-width forms.

    Returns a dict from codepoint (an int) to replacement string, suitable
    for `str.translate`. Codepoints in the scanned range that NFKC
    normalization leaves unchanged are not included.
    """
    # Though it's not listed as a fullwidth character, we'll want to convert
    # U+3000 IDEOGRAPHIC SPACE to U+20 SPACE on the same principle, so start
    # with that in the dictionary.
    width_map = {0x3000: " "}
    for codepoint in range(0xFF01, 0xFFF0):
        char = chr(codepoint)
        alternate = unicodedata.normalize("NFKC", char)
        # Only record characters that normalization actually changes.
        if alternate != char:
            width_map[codepoint] = alternate
    return width_map
|
||
|
||
|
||
# Built once at import time: a codepoint -> replacement-string table for
# str.translate, as produced by _build_width_map().
WIDTH_MAP = _build_width_map()
|
||
|
||
|
||
# Character classes that help us pinpoint embedded mojibake. These can
# include common characters, because we'll also check them for 'badness'.
#
# Each value is the *inside* of a regex character class (it is substituted
# between square brackets), so "\x80-\xbf" below is a range, not three
# literal characters.
UTF8_CLUES = {
    # Letters that decode to 0xC2 - 0xDF in a Latin-1-like encoding
    "utf8_first_of_2": (
        "ÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßĂĆČĎĐĘĚĞİĹŃŇŐŘŞŢŮŰ"
        "ΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩΪΫάέήίВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ"
    ),
    # Letters that decode to 0xE0 - 0xEF in a Latin-1-like encoding
    "utf8_first_of_3": ("àáâãäåæçèéêëìíîïăćčďęěĺŕΰαβγδεζηθικλμνξοабвгдежзийклмноп"),
    # Letters that decode to 0xF0 or 0xF3 in a Latin-1-like encoding.
    # (Other leading bytes correspond only to unassigned codepoints)
    "utf8_first_of_4": ("ðóđğπσру"),
    # Letters that decode to 0x80 - 0xBF in a Latin-1-like encoding,
    # including a space standing in for 0xA0
    "utf8_continuation": (
        "\x80-\xbf"
        "ĄąĽľŁłŒœŚśŞşŠšŤťŸŹźŻżŽžƒˆˇ˘˛˜˝΄΅"
        "ΆΈΉΊΌΎΏЁЂЃЄЅІЇЈЉЊЋЌЎЏёђѓєѕіїјљњћќўџҐґ"
        "–—―‘’‚“”„†‡•…‰‹›€№™"
        " "
    ),
    # Letters that decode to 0x80 - 0xBF in a Latin-1-like encoding,
    # and don't usually stand for themselves when adjacent to mojibake.
    # This excludes spaces, dashes, quotation marks, and ellipses.
    "utf8_continuation_strict": (
        "\x80-\xbf"
        "ĄąĽľŁłŒœŚśŞşŠšŤťŸŹźŻżŽžƒˆˇ˘˛˜˝΄΅"
        "ΆΈΉΊΌΎΏЁЂЃЄЅІЇЈЉЊЋЌЎЏёђѓєѕіїјљњћќўџҐґ"
        "†‡•‰‹›€№™"
    ),
}
|
||
|
||
# This regex uses UTF8_CLUES to find sequences of likely mojibake.
# It matches them with + so that several adjacent UTF-8-looking sequences
# get coalesced into one, allowing them to be fixed more efficiently
# and not requiring every individual subsequence to be detected as 'badness'.
#
# We accept spaces in place of "utf8_continuation", because spaces might have
# been intended to be U+A0 NO-BREAK SPACE.
#
# We do a lookbehind to make sure the previous character isn't a
# "utf8_continuation_strict" character, so that we don't fix just a few
# characters in a huge garble and make the situation worse.
#
# Unfortunately, the matches to this regular expression won't show their
# surrounding context, and including context would make the expression much
# less efficient. The 'badness' rules that require context, such as a preceding
# lowercase letter, will prevent some cases of inconsistent UTF-8 from being
# fixed when they don't see it.
#
# The pattern is compiled with re.VERBOSE, so whitespace outside the square
# brackets is only layout. Doubled braces ({{2}}, {{3}}) are literal repeat
# counts that survive str.format.
UTF8_DETECTOR_RE = re.compile(
    """
    (?<! [{utf8_continuation_strict}])
    (
        [{utf8_first_of_2}] [{utf8_continuation}]
        |
        [{utf8_first_of_3}] [{utf8_continuation}]{{2}}
        |
        [{utf8_first_of_4}] [{utf8_continuation}]{{3}}
    )+
    """.format(
        **UTF8_CLUES
    ),
    re.VERBOSE,
)
|