bazarr/libs/ftfy/chardata.py

215 lines
7.7 KiB
Python
Raw Permalink Normal View History

2018-10-31 16:08:29 +00:00
# -*- coding: utf-8 -*-
"""
This gives other modules access to the gritty details about characters and the
encodings that use them.
"""
import re
import zlib
import unicodedata
import itertools
from pkg_resources import resource_string
from ftfy.compatibility import unichr
# These are the single-byte ("charmap") encodings we will try to fix in ftfy,
# in the order that they should be tried. One detector regex is compiled for
# each entry by _build_regexes().
#
# NOTE(review): the 'sloppy-*' names are not standard-library codecs; they are
# presumably registered elsewhere in ftfy before this module is imported --
# confirm.
CHARMAP_ENCODINGS = [
    u'latin-1',
    u'sloppy-windows-1252',
    u'sloppy-windows-1250',
    u'iso-8859-2',
    u'sloppy-windows-1251',
    u'macroman',
    u'cp437',
]
def _build_regexes():
    """
    Build the ENCODING_REGEXES table.

    Each regex is a reasonably fast way to detect whether a given string
    could be represented in a given encoding. The simplest one is the
    'ascii' detector, which just checks that every character is between
    U+0000 and U+007F.

    Returns a dict mapping an encoding name to a compiled regex that matches
    only strings made up entirely of characters that encoding can produce.
    """
    # The ASCII detector needs no decoding table at all.
    detectors = {u'ascii': re.compile('^[\x00-\x7f]*$')}

    # Bytes \x80 to \xFF, plus byte \x1A, which is used to represent the
    # replacement character (U+FFFD) in the sloppy-* encodings. The byte
    # string is the same for every encoding; only its decoding differs.
    high_bytes = (
        u''.join(map(unichr, range(128, 256))) + '\x1a'
    ).encode(u'latin-1')

    for name in CHARMAP_ENCODINGS:
        decoded = high_bytes.decode(name)
        # The remaining ASCII bytes -- \x00 to \x19 and \x1B to \x7F --
        # decode as themselves in every encoding we support, so they can be
        # written as plain ranges. That also means regex special characters
        # never need escaping, because all of them fall in \x1B to \x7F.
        detectors[name] = re.compile(
            u'^[\x00-\x19\x1b-\x7f{0}]*$'.format(decoded)
        )
    return detectors


ENCODING_REGEXES = _build_regexes()
def _build_utf8_punct_regex():
    """
    Build a regex for UTF-8 mojibake that is blatant enough to fix even when
    the rest of the string doesn't decode as UTF-8.

    Specifically, it recognizes UTF-8 sequences for the 'General Punctuation'
    characters U+2000 to U+2040 that were re-encoded in Windows-1252. These
    are recognizable by the distinctive two-character prefix (bytes E2 80
    read as Windows-1252) that they all begin with.

    Returns the compiled regex: that prefix followed by any one character
    that a UTF-8 continuation byte turns into under sloppy-windows-1252.
    """
    # We need every byte from 0x80 to 0xbf in one character class.
    # "Couldn't this have just said [\x80-\xbf]?", you might ask. However,
    # once those bytes are decoded as Windows-1252, the resulting characters
    # aren't even remotely contiguous, so each one is listed explicitly.
    continuation_bytes = bytes(bytearray(range(0x80, 0xc0)))
    continuation_chars = continuation_bytes.decode(u'sloppy-windows-1252')
    return re.compile(u'â€[' + continuation_chars + u']')


PARTIAL_UTF8_PUNCT_RE = _build_utf8_punct_regex()
# Recognize UTF-8 sequences that would be valid if it weren't for a b'\xa0'
# that some Windows-1252 program converted to a plain space.
#
# The smaller values are included on a case-by-case basis, because we don't want
# to decode likely input sequences to unlikely characters. These are the ones
# that *do* form likely characters before 0xa0:
#
# 0xc2 -> U+A0 NO-BREAK SPACE
# 0xc3 -> U+E0 LATIN SMALL LETTER A WITH GRAVE
# 0xc5 -> U+160 LATIN CAPITAL LETTER S WITH CARON
# 0xce -> U+3A0 GREEK CAPITAL LETTER PI
# 0xd0 -> U+420 CYRILLIC CAPITAL LETTER ER
#
# These still need to come with a cost, so that they only get converted when
# there's evidence that it fixes other things. Any of these could represent
# characters that legitimately appear surrounded by spaces, particularly U+C5
# (Å), which is a word in multiple languages!
#
# We should consider checking for b'\x85' being converted to '...' (three ASCII
# dots; 0x85 is the horizontal ellipsis in Windows-1252) in the future.
# I've seen it once, but the text still wasn't recoverable.
# Each alternative is a UTF-8 sequence in which exactly one byte position
# holds a plain space where a continuation byte would normally be.
ALTERED_UTF8_RE = re.compile(b'|'.join([
    # Two-byte sequences: only the hand-picked lead bytes are accepted.
    b'[\xc2\xc3\xc5\xce\xd0][ ]',
    # Three-byte sequences: the space may be the 2nd or 3rd byte.
    b'[\xe0-\xef][ ][\x80-\xbf]',
    b'[\xe0-\xef][\x80-\xbf][ ]',
    # Four-byte sequences: the space may be the 2nd, 3rd, or 4th byte.
    b'[\xf0-\xf4][ ][\x80-\xbf][\x80-\xbf]',
    b'[\xf0-\xf4][\x80-\xbf][ ][\x80-\xbf]',
    b'[\xf0-\xf4][\x80-\xbf][\x80-\xbf][ ]',
]))
# This expression matches UTF-8 and CESU-8 sequences where some of the
# continuation bytes have been lost. The byte 0x1a (sometimes written as ^Z) is
# used within ftfy to represent a byte that produced the replacement character
# \ufffd. We don't know which byte it was, but we can at least decode the UTF-8
# sequence as \ufffd instead of failing to re-decode it at all.
# In every alternative, at least one continuation-byte position holds the
# placeholder byte 0x1a instead of a real continuation byte.
LOSSY_UTF8_RE = re.compile(b'|'.join([
    # Two-byte sequence whose continuation byte was lost.
    b'[\xc2-\xdf][\x1a]',
    # CESU-8 surrogate pairs (two \xed-led triples) with a lost byte.
    b'\xed[\xa0-\xaf][\x1a]\xed[\xb0-\xbf][\x1a\x80-\xbf]',
    b'\xed[\xa0-\xaf][\x1a\x80-\xbf]\xed[\xb0-\xbf][\x1a]',
    # Three-byte sequences with the 2nd or 3rd byte lost.
    b'[\xe0-\xef][\x1a][\x1a\x80-\xbf]',
    b'[\xe0-\xef][\x1a\x80-\xbf][\x1a]',
    # Four-byte sequences with the 2nd, 3rd, or 4th byte lost.
    b'[\xf0-\xf4][\x1a][\x1a\x80-\xbf][\x1a\x80-\xbf]',
    b'[\xf0-\xf4][\x1a\x80-\xbf][\x1a][\x1a\x80-\xbf]',
    b'[\xf0-\xf4][\x1a\x80-\xbf][\x1a\x80-\xbf][\x1a]',
    # A placeholder byte standing entirely on its own.
    b'\x1a',
]))
# Regexes for the Unicode variations on quotation marks: the first matches
# any one character in U+2018..U+201B (single quotes), the second any one
# character in U+201C..U+201F (double quotes).
SINGLE_QUOTE_RE, DOUBLE_QUOTE_RE = map(
    re.compile,
    (u'[\u2018-\u201b]', u'[\u201c-\u201f]'),
)
def possible_encoding(text, encoding):
    """
    Report whether `text` could have been decoded from `encoding`.

    `encoding` must be one of the keys of ENCODING_REGEXES (a single-byte
    encoding name, or 'ascii'); any other name raises KeyError. The result
    is True when every character of `text` is one that the encoding can
    produce -- in other words, when the text could be encoded in that
    encoding, possibly sloppily.
    """
    detector = ENCODING_REGEXES[encoding]
    return detector.match(text) is not None
# Load the character-class table at import time: read the 'char_classes.dat'
# resource for this module through pkg_resources, zlib-decompress it, and
# decode the result as ASCII. chars_to_classes() passes this string to
# str.translate(), so it is indexed by code point.
CHAR_CLASS_STRING = zlib.decompress(
    resource_string(__name__, 'char_classes.dat')
).decode(u'ascii')
def chars_to_classes(string):
    """
    Map every character of `string` to the letter naming its character class.

    The lookup table is CHAR_CLASS_STRING, applied with `translate`, so the
    result has one class letter per input character.

    See build_data.py for where this data comes from and what it means.
    """
    class_table = CHAR_CLASS_STRING
    return string.translate(class_table)
def _build_control_char_mapping():
"""
Build a translate mapping that strips likely-unintended control characters.
See :func:`ftfy.fixes.remove_control_chars` for a description of these
codepoint ranges and why they should be removed.
"""
control_chars = {}
for i in itertools.chain(
range(0x00, 0x09), [0x0b],
range(0x0e, 0x20), [0x7f],
range(0x206a, 0x2070),
[0xfeff],
range(0xfff9, 0xfffd),
range(0x1d173, 0x1d17b),
range(0xe0000, 0xe0080)
):
control_chars[i] = None
return control_chars
CONTROL_CHARS = _build_control_char_mapping()
# A translate mapping that breaks ligatures made of Latin letters. While
# ligatures may be important to the representation of other languages, in
# Latin letters they tend to represent a copy/paste error.
#
# Ligatures may also be separated by NFKC normalization, but that is sometimes
# more normalization than you want.
# Maps the code point of each Latin ligature to the plain letters it stands
# for, in the form expected by `translate`.
#
# The keys are written as explicit code points rather than as literal
# ligature characters: the literals are easily lost or normalized away when
# the file passes through tools that are not Unicode-clean, and `ord()` of an
# empty or two-character string raises TypeError at import time.
LIGATURES = {
    0x0132: u'IJ',        # LATIN CAPITAL LIGATURE IJ
    0x0133: u'ij',        # LATIN SMALL LIGATURE IJ
    0xfb00: u'ff',        # LATIN SMALL LIGATURE FF
    0xfb01: u'fi',        # LATIN SMALL LIGATURE FI
    0xfb02: u'fl',        # LATIN SMALL LIGATURE FL
    0xfb03: u'ffi',       # LATIN SMALL LIGATURE FFI
    0xfb04: u'ffl',       # LATIN SMALL LIGATURE FFL
    0xfb05: u'\u017ft',   # LATIN SMALL LIGATURE LONG S T -> long s + t
    0xfb06: u'st',        # LATIN SMALL LIGATURE ST
}
def _build_width_map():
"""
Build a translate mapping that replaces halfwidth and fullwidth forms
with their standard-width forms.
"""
# Though it's not listed as a fullwidth character, we'll want to convert
# U+3000 IDEOGRAPHIC SPACE to U+20 SPACE on the same principle, so start
# with that in the dictionary.
width_map = {0x3000: u' '}
for i in range(0xff01, 0xfff0):
char = unichr(i)
alternate = unicodedata.normalize(u'NFKC', char)
if alternate != char:
width_map[i] = alternate
return width_map
WIDTH_MAP = _build_width_map()