mirror of https://github.com/morpheus65535/bazarr
215 lines
7.7 KiB
Python
215 lines
7.7 KiB
Python
# -*- coding: utf-8 -*-
"""
This gives other modules access to the gritty details about characters and the
encodings that use them.
"""
|
||
|
||
import re
|
||
import zlib
|
||
import unicodedata
|
||
import itertools
|
||
from pkg_resources import resource_string
|
||
from ftfy.compatibility import unichr
|
||
|
||
# These are the single-byte encodings we will try to fix in ftfy, in the
# order that they should be tried.
CHARMAP_ENCODINGS = [
    u'latin-1',
    u'sloppy-windows-1252',
    u'sloppy-windows-1250',
    u'iso-8859-2',
    u'sloppy-windows-1251',
    u'macroman',
    u'cp437',
]
|
||
|
||
|
||
def _build_regexes():
    """
    Build the mapping that is stored in ENCODING_REGEXES.

    ENCODING_REGEXES contain reasonably fast ways to detect if we
    could represent a given string in a given encoding. The simplest one is
    the 'ascii' detector, which of course just determines if all characters
    are between U+0000 and U+007F.

    Returns a dict whose keys are 'ascii' plus every name in
    CHARMAP_ENCODINGS, and whose values are compiled regexes that match a
    string only if every character in it can come from that encoding.
    """
    # Define a regex that matches ASCII text.
    encoding_regexes = {u'ascii': re.compile('^[\x00-\x7f]*$')}

    # The bytes \x80 to \xFF, as well as byte \x1A, which is used to
    # represent the replacement character U+FFFD in the sloppy-* encodings.
    # This table is the same for every encoding, so it is built only once.
    high_bytes = (
        u''.join(unichr(i) for i in range(128, 256)) + u'\x1a'
    ).encode(u'latin-1')

    for encoding in CHARMAP_ENCODINGS:
        # The sequence of characters that those bytes decode to in this
        # particular encoding.
        charlist = high_bytes.decode(encoding)

        # The rest of the ASCII bytes -- bytes \x00 to \x19 and \x1B
        # to \x7F -- will decode as those ASCII characters in any encoding we
        # support, so we can just include them as ranges. This also lets us
        # not worry about escaping regex special characters, because all of
        # them are in the \x1B to \x7F range.
        regex = u'^[\x00-\x19\x1b-\x7f{0}]*$'.format(charlist)
        encoding_regexes[encoding] = re.compile(regex)
    return encoding_regexes


ENCODING_REGEXES = _build_regexes()
|
||
|
||
|
||
def _build_utf8_punct_regex():
    """
    Recognize UTF-8 mojibake that's so blatant that we can fix it even when the
    rest of the string doesn't decode as UTF-8 -- namely, UTF-8 sequences for
    the 'General Punctuation' characters U+2000 to U+2040, re-encoded in
    Windows-1252.

    These are recognizable by the distinctive 'â€' ('\xe2\x80') sequence they
    all begin with when decoded as Windows-1252.

    Returns a compiled regex that matches 'â€' followed by one character that
    a UTF-8 continuation byte (0x80 to 0xbf) decodes to in
    sloppy-windows-1252.
    """
    # We're making a regex that has all the literal bytes from 0x80 to 0xbf in
    # a range. "Couldn't this have just said [\x80-\xbf]?", you might ask.
    # However, when we decode the regex as Windows-1252, the resulting
    # characters won't even be remotely contiguous.
    #
    # Unrelatedly, the expression that generates these bytes will be so much
    # prettier when we deprecate Python 2.
    continuation_char_list = ''.join(
        unichr(i) for i in range(0x80, 0xc0)
    ).encode(u'latin-1')
    obvious_utf8 = (u'â€['
                    + continuation_char_list.decode(u'sloppy-windows-1252')
                    + u']')
    return re.compile(obvious_utf8)


PARTIAL_UTF8_PUNCT_RE = _build_utf8_punct_regex()
|
||
|
||
|
||
# Recognize UTF-8 sequences that would be valid if it weren't for a b'\xa0'
# that some Windows-1252 program converted to a plain space.
#
# The smaller values are included on a case-by-case basis, because we don't want
# to decode likely input sequences to unlikely characters. These are the ones
# that *do* form likely characters before 0xa0:
#
#   0xc2 -> U+A0 NO-BREAK SPACE
#   0xc3 -> U+E0 LATIN SMALL LETTER A WITH GRAVE
#   0xc5 -> U+160 LATIN CAPITAL LETTER S WITH CARON
#   0xce -> U+3A0 GREEK CAPITAL LETTER PI
#   0xd0 -> U+420 CYRILLIC CAPITAL LETTER ER
#
# These still need to come with a cost, so that they only get converted when
# there's evidence that it fixes other things. Any of these could represent
# characters that legitimately appear surrounded by spaces, particularly U+C5
# (Å), which is a word in multiple languages!
#
# We should consider checking for b'\x85' being converted to ... in the future.
# I've seen it once, but the text still wasn't recoverable.
#
# The first alternative covers the two-byte sequences listed above; the
# remaining ones cover three- and four-byte sequences in which exactly one
# continuation byte has been replaced by a space.
ALTERED_UTF8_RE = re.compile(
    b'[\xc2\xc3\xc5\xce\xd0][ ]'
    b'|[\xe0-\xef][ ][\x80-\xbf]'
    b'|[\xe0-\xef][\x80-\xbf][ ]'
    b'|[\xf0-\xf4][ ][\x80-\xbf][\x80-\xbf]'
    b'|[\xf0-\xf4][\x80-\xbf][ ][\x80-\xbf]'
    b'|[\xf0-\xf4][\x80-\xbf][\x80-\xbf][ ]'
)
|
||
|
||
# This expression matches UTF-8 and CESU-8 sequences where some of the
# continuation bytes have been lost. The byte 0x1a (sometimes written as ^Z) is
# used within ftfy to represent a byte that produced the replacement character
# \ufffd. We don't know which byte it was, but we can at least decode the UTF-8
# sequence as \ufffd instead of failing to re-decode it at all.
#
# The final alternative matches a lone 0x1a that is not part of any longer
# sequence matched by the earlier alternatives.
LOSSY_UTF8_RE = re.compile(
    b'[\xc2-\xdf][\x1a]'
    b'|\xed[\xa0-\xaf][\x1a]\xed[\xb0-\xbf][\x1a\x80-\xbf]'
    b'|\xed[\xa0-\xaf][\x1a\x80-\xbf]\xed[\xb0-\xbf][\x1a]'
    b'|[\xe0-\xef][\x1a][\x1a\x80-\xbf]'
    b'|[\xe0-\xef][\x1a\x80-\xbf][\x1a]'
    b'|[\xf0-\xf4][\x1a][\x1a\x80-\xbf][\x1a\x80-\xbf]'
    b'|[\xf0-\xf4][\x1a\x80-\xbf][\x1a][\x1a\x80-\xbf]'
    b'|[\xf0-\xf4][\x1a\x80-\xbf][\x1a\x80-\xbf][\x1a]'
    b'|\x1a'
)
|
||
|
||
# These regexes match various Unicode variations on single and double quotes:
# U+2018 to U+201B for single quotes, U+201C to U+201F for double quotes.
SINGLE_QUOTE_RE = re.compile(u'[\u2018-\u201b]')
DOUBLE_QUOTE_RE = re.compile(u'[\u201c-\u201f]')
|
||
|
||
|
||
def possible_encoding(text, encoding):
    """
    Given text and a single-byte encoding, check whether that text could have
    been decoded from that single-byte encoding.

    In other words, check whether it can be encoded in that encoding, possibly
    sloppily.

    `encoding` must be one of the keys of ENCODING_REGEXES; any other name
    raises KeyError. Returns True or False.
    """
    return bool(ENCODING_REGEXES[encoding].match(text))
|
||
|
||
|
||
# The character-class table used by chars_to_classes(). It is stored
# zlib-compressed in the package resource 'char_classes.dat' and decodes to a
# plain ASCII string.
CHAR_CLASS_STRING = zlib.decompress(
    resource_string(__name__, 'char_classes.dat')
).decode(u'ascii')
|
||
|
||
def chars_to_classes(string):
    """
    Convert each Unicode character to a letter indicating which of many
    classes it's in.

    The conversion is a single `translate` call that uses CHAR_CLASS_STRING
    as the translation table.

    See build_data.py for where this data comes from and what it means.
    """
    return string.translate(CHAR_CLASS_STRING)
|
||
|
||
|
||
def _build_control_char_mapping():
    """
    Build a translate mapping that strips likely-unintended control characters.
    See :func:`ftfy.fixes.remove_control_chars` for a description of these
    codepoint ranges and why they should be removed.

    Returns a dict that maps every affected codepoint to None, which is the
    value that makes `translate` delete a character.
    """
    codepoints = itertools.chain(
        range(0x00, 0x09), [0x0b],
        range(0x0e, 0x20), [0x7f],
        range(0x206a, 0x2070),
        [0xfeff],
        range(0xfff9, 0xfffd),
        range(0x1d173, 0x1d17b),
        range(0xe0000, 0xe0080),
    )
    # dict.fromkeys gives every key the default value None.
    return dict.fromkeys(codepoints)


CONTROL_CHARS = _build_control_char_mapping()
|
||
|
||
|
||
# A translate mapping that breaks ligatures made of Latin letters. While
# ligatures may be important to the representation of other languages, in
# Latin letters they tend to represent a copy/paste error.
#
# Ligatures may also be separated by NFKC normalization, but that is sometimes
# more normalization than you want.
#
# The keys are written as explicit code points rather than as literal ligature
# characters: a literal ligature is easily decomposed into several letters by
# tools that normalize text, and ord() of a multi-character string raises
# TypeError.
LIGATURES = {
    0x0132: u'IJ',        # LATIN CAPITAL LIGATURE IJ
    0x0133: u'ij',        # LATIN SMALL LIGATURE IJ
    0xfb00: u'ff',        # LATIN SMALL LIGATURE FF
    0xfb01: u'fi',        # LATIN SMALL LIGATURE FI
    0xfb02: u'fl',        # LATIN SMALL LIGATURE FL
    0xfb03: u'ffi',       # LATIN SMALL LIGATURE FFI
    0xfb04: u'ffl',       # LATIN SMALL LIGATURE FFL
    0xfb05: u'\u017ft',   # LATIN SMALL LIGATURE LONG S T -> long s + t
    0xfb06: u'st',        # LATIN SMALL LIGATURE ST
}
|
||
|
||
|
||
def _build_width_map():
    """
    Build a translate mapping that replaces halfwidth and fullwidth forms
    with their standard-width forms.

    Returns a dict from codepoint to replacement string. It covers U+3000 and
    every codepoint from U+FF01 to U+FFEF that NFKC normalization changes.
    """
    # Though it's not listed as a fullwidth character, we'll want to convert
    # U+3000 IDEOGRAPHIC SPACE to U+20 SPACE on the same principle, so start
    # with that in the dictionary.
    width_map = {0x3000: u' '}

    # Pair each codepoint in the halfwidth/fullwidth range with its NFKC form.
    candidates = (
        (codepoint, unicodedata.normalize(u'NFKC', unichr(codepoint)))
        for codepoint in range(0xff01, 0xfff0)
    )
    # Keep only the characters that normalization actually changes.
    width_map.update(
        (codepoint, alternate)
        for codepoint, alternate in candidates
        if alternate != unichr(codepoint)
    )
    return width_map


WIDTH_MAP = _build_width_map()
|