mirror of
https://github.com/morpheus65535/bazarr
synced 2024-12-25 17:17:12 +00:00
164 lines
6.5 KiB
Python
164 lines
6.5 KiB
Python
# coding: utf-8
|
||
r"""
|
||
Decodes single-byte encodings, filling their "holes" in the same messy way that
|
||
everyone else does.
|
||
|
||
A single-byte encoding maps each byte to a Unicode character, except that some
|
||
bytes are left unmapped. In the commonly-used Windows-1252 encoding, for
|
||
example, bytes 0x81 and 0x8D, among others, have no meaning.
|
||
|
||
Python, wanting to preserve some sense of decorum, will handle these bytes
|
||
as errors. But Windows knows that 0x81 and 0x8D are possible bytes and they're
|
||
different from each other. It just hasn't defined what they are in terms of
|
||
Unicode.
|
||
|
||
Software that has to interoperate with Windows-1252 and Unicode -- such as all
|
||
the common Web browsers -- will pick some Unicode characters for them to map
|
||
to, and the characters they pick are the Unicode characters with the same
|
||
numbers: U+0081 and U+008D. This is the same as what Latin-1 does, and the
|
||
resulting characters tend to fall into a range of Unicode that's set aside for
|
||
obsolete Latin-1 control characters anyway.
|
||
|
||
These sloppy codecs let Python do the same thing, thus interoperating with
|
||
other software that works this way. This module defines a sloppy version of many
|
||
single-byte encodings with holes. (There is no need for a sloppy version of
|
||
an encoding without holes: for example, there is no such thing as
|
||
sloppy-iso-8859-2 or sloppy-macroman.)
|
||
|
||
The following encodings will become defined:
|
||
|
||
- sloppy-windows-1250 (Central European, sort of based on ISO-8859-2)
|
||
- sloppy-windows-1251 (Cyrillic)
|
||
- sloppy-windows-1252 (Western European, based on Latin-1)
|
||
- sloppy-windows-1253 (Greek, sort of based on ISO-8859-7)
|
||
- sloppy-windows-1254 (Turkish, based on ISO-8859-9)
|
||
- sloppy-windows-1255 (Hebrew, based on ISO-8859-8)
|
||
- sloppy-windows-1256 (Arabic)
|
||
- sloppy-windows-1257 (Baltic, based on ISO-8859-13)
|
||
- sloppy-windows-1258 (Vietnamese)
|
||
- sloppy-cp874 (Thai, based on ISO-8859-11)
|
||
- sloppy-iso-8859-3 (Maltese and Esperanto, I guess)
|
||
- sloppy-iso-8859-6 (different Arabic)
|
||
- sloppy-iso-8859-7 (Greek)
|
||
- sloppy-iso-8859-8 (Hebrew)
|
||
- sloppy-iso-8859-11 (Thai)
|
||
|
||
Aliases such as "sloppy-cp1252" for "sloppy-windows-1252" will also be
|
||
defined.
|
||
|
||
Only sloppy-windows-1251 and sloppy-windows-1252 are used by the rest of ftfy;
|
||
the rest are rather uncommon.
|
||
|
||
Here are some examples, using `ftfy.explain_unicode` to illustrate how
|
||
sloppy-windows-1252 merges Windows-1252 with Latin-1:
|
||
|
||
>>> from ftfy import explain_unicode
|
||
>>> some_bytes = b'\x80\x81\x82'
|
||
>>> explain_unicode(some_bytes.decode('latin-1'))
|
||
U+0080 \x80 [Cc] <unknown>
|
||
U+0081 \x81 [Cc] <unknown>
|
||
U+0082 \x82 [Cc] <unknown>
|
||
|
||
>>> explain_unicode(some_bytes.decode('windows-1252', 'replace'))
|
||
U+20AC € [Sc] EURO SIGN
|
||
U+FFFD <20> [So] REPLACEMENT CHARACTER
|
||
U+201A ‚ [Ps] SINGLE LOW-9 QUOTATION MARK
|
||
|
||
>>> explain_unicode(some_bytes.decode('sloppy-windows-1252'))
|
||
U+20AC € [Sc] EURO SIGN
|
||
U+0081 \x81 [Cc] <unknown>
|
||
U+201A ‚ [Ps] SINGLE LOW-9 QUOTATION MARK
|
||
"""
|
||
from __future__ import unicode_literals
|
||
import codecs
|
||
from encodings import normalize_encoding
|
||
import sys
|
||
|
||
# U+FFFD, the character that decoding with the 'replace' error handler puts in
# place of a byte that has no assigned meaning.
REPLACEMENT_CHAR = '\ufffd'

# True only when the running interpreter is Python 2.6.
PY26 = (2, 6) == sys.version_info[:2]
|
||
|
||
def make_sloppy_codec(encoding):
    """
    Build the 'sloppy' counterpart of the single-byte codec named `encoding`.

    The sloppy codec decodes each byte the way `encoding` does, except that
    bytes which `encoding` leaves unassigned decode to their Latin-1
    character (the Unicode character with the same number) instead of being
    treated as errors. Byte 0x1A is special-cased to stand for U+FFFD, the
    Unicode replacement character.

    The standard library defines single-byte encodings as a set of
    boilerplate classes wrapped around `codecs.charmap_decode` and
    `codecs.charmap_encode`. This function creates those classes for the
    sloppy encoding and returns them bundled in a `codecs.CodecInfo` whose
    name is 'sloppy-' followed by `encoding`.
    """
    # One of each possible byte, in order, so that a position in any string
    # decoded from it is also the value of the byte it came from.
    every_byte = bytearray(range(256))

    # Start from the Latin-1 decoding of all 256 bytes. Entries are
    # overwritten below wherever the real encoding assigns a character.
    charmap = list(every_byte.decode('latin-1'))

    # Decode the same bytes in the requested encoding, letting unassigned
    # bytes come back as the replacement character. On Python 2.6 the error
    # handler is passed positionally (presumably because its decode() does
    # not take keyword arguments).
    if PY26:
        native_chars = every_byte.decode(encoding, 'replace')
    else:
        native_chars = every_byte.decode(encoding, errors='replace')

    # Overlay the real decoding onto the Latin-1 one. Bytes that could not be
    # decoded are skipped, which leaves their Latin-1 character in place.
    for position, native_char in enumerate(native_chars):
        if native_char == REPLACEMENT_CHAR:
            continue
        charmap[position] = native_char

    # For ftfy's own purposes, byte 1A (the "Substitute" control code) is
    # what encodes the Unicode replacement character U+FFFD.
    charmap[0x1a] = REPLACEMENT_CHAR

    # The lookup structures the charmap functions need in each direction.
    decoding_table = ''.join(charmap)
    encoding_table = codecs.charmap_build(decoding_table)

    # The class boilerplate follows the pattern of the standard library's
    # `encodings.cp1252` module, only tidied up to follow pep8.
    class Codec(codecs.Codec):
        # Stateless conversion; each call returns (output, length consumed).
        def encode(self, input, errors='strict'):
            return codecs.charmap_encode(input, errors, encoding_table)

        def decode(self, input, errors='strict'):
            return codecs.charmap_decode(input, errors, decoding_table)

    class IncrementalEncoder(codecs.IncrementalEncoder):
        # Only the converted output is returned, not the consumed length.
        def encode(self, input, final=False):
            converted = codecs.charmap_encode(
                input, self.errors, encoding_table
            )
            return converted[0]

    class IncrementalDecoder(codecs.IncrementalDecoder):
        # Only the converted output is returned, not the consumed length.
        def decode(self, input, final=False):
            converted = codecs.charmap_decode(
                input, self.errors, decoding_table
            )
            return converted[0]

    class StreamWriter(Codec, codecs.StreamWriter):
        pass

    class StreamReader(Codec, codecs.StreamReader):
        pass

    # Codec keeps no state, so one instance can supply both bound methods.
    stateless_codec = Codec()
    return codecs.CodecInfo(
        name='sloppy-' + encoding,
        encode=stateless_codec.encode,
        decode=stateless_codec.decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )
|
||
|
||
# CODECS maps the normalized name of each sloppy codec to its CodecInfo. The
# resulting dictionary can be used by the main module of ftfy.bad_codecs.
CODECS = {}

# Every encoding that gets a sloppy version, in registration order: the
# Windows code pages under their 'windows-' names, the ISO-8859 variants,
# then the same Windows code pages under their 'cp' names, plus cp874.
INCOMPLETE_ENCODINGS = []
INCOMPLETE_ENCODINGS += ['windows-%s' % num for num in range(1250, 1259)]
INCOMPLETE_ENCODINGS += ['iso-8859-%s' % num for num in (3, 6, 7, 8, 11)]
INCOMPLETE_ENCODINGS += ['cp%s' % num for num in range(1250, 1259)]
INCOMPLETE_ENCODINGS += ['cp874']

# Build one sloppy codec per listed encoding and file it under the normalized
# form of its 'sloppy-' name.
for _encoding in INCOMPLETE_ENCODINGS:
    _new_name = normalize_encoding('sloppy-' + _encoding)
    CODECS[_new_name] = make_sloppy_codec(_encoding)
|