# -*- coding: utf-8 -*-
"""
ftfy: fixes text for you

This is a module for making text less broken. See the `fix_text` function
for more information.
"""

from __future__ import unicode_literals

import unicodedata

import ftfy.bad_codecs
from ftfy import fixes
from ftfy.formatting import display_ljust
from ftfy.compatibility import is_printable

__version__ = '4.4.3'


# See the docstring for ftfy.bad_codecs to see what we're doing here.
ftfy.bad_codecs.ok()


def fix_text(text,
             fix_entities='auto',
             remove_terminal_escapes=True,
             fix_encoding=True,
             fix_latin_ligatures=True,
             fix_character_width=True,
             uncurl_quotes=True,
             fix_line_breaks=True,
             fix_surrogates=True,
             remove_control_chars=True,
             remove_bom=True,
             normalization='NFC',
             max_decode_length=10**6):
    r"""
    Given Unicode text as input, fix inconsistencies and glitches in it,
    such as mojibake.

    Let's start with some examples:

        >>> print(fix_text('uÌˆnicode'))
        ünicode

        >>> print(fix_text('Broken text… it’s flubberific!',
        ...                normalization='NFKC'))
        Broken text... it's flubberific!

        >>> print(fix_text('HTML entities &lt;3'))
        HTML entities <3

        >>> print(fix_text('<em>HTML entities &lt;3</em>'))
        <em>HTML entities &lt;3</em>

        >>> print(fix_text("¯\\_(ã\x83\x84)_/¯"))
        ¯\_(ツ)_/¯

        >>> # This example string starts with a byte-order mark, even if
        >>> # you can't see it on the Web.
        >>> print(fix_text('\ufeffParty like\nit’s 1999!'))
        Party like
        it's 1999!

        >>> print(fix_text('ＬＯＵＤ　ＮＯＩＳＥＳ'))
        LOUD NOISES

        >>> len(fix_text('ﬁ' * 100000))
        200000

        >>> len(fix_text(''))
        0

    Based on the options you provide, ftfy applies these steps in order:

    - If `remove_terminal_escapes` is True, remove sequences of bytes that
      are instructions for Unix terminals, such as the codes that make text
      appear in different colors.
    - If `fix_encoding` is True, look for common mistakes that come from
      encoding or decoding Unicode text incorrectly, and fix them if they
      are reasonably fixable. See `fixes.fix_encoding` for details.
    - If `fix_entities` is True, replace HTML entities with their equivalent
      characters. If it's "auto" (the default), then consider replacing HTML
      entities, but don't do so in text where you have seen a pair of actual
      angle brackets (that's probably actually HTML and you shouldn't mess
      with the entities).
    - If `fix_latin_ligatures` is True, then ligatures made of Latin letters,
      such as `ﬁ`, will be separated into individual letters. These ligatures
      are usually not meaningful outside of font rendering, and often
      represent copy-and-paste errors.
    - If `fix_character_width` is True, half-width and full-width characters
      will be replaced by their standard-width form.
    - If `uncurl_quotes` is True, replace various curly quotation marks with
      plain-ASCII straight quotes.
    - If `fix_line_breaks` is True, convert all line breaks to Unix style
      (CRLF and CR line breaks become LF line breaks).
    - If `fix_surrogates` is True, ensure that there are no UTF-16 surrogates
      in the resulting string, by converting them to the correct characters
      when they're appropriately paired, or replacing them with \ufffd
      otherwise.
    - If `remove_control_chars` is True, remove control characters that are
      not suitable for use in text. This includes most of the ASCII control
      characters, plus some Unicode controls such as the byte order mark
      (U+FEFF). Useful control characters, such as Tab, Line Feed, and
      bidirectional marks, are left as they are.
    - If `remove_bom` is True, remove the Byte-Order Mark at the start of
      the string if it exists. (This is largely redundant, because it's a
      special case of `remove_control_chars`.
      This option will become deprecated in a later version.)
    - If `normalization` is not None, apply the specified form of Unicode
      normalization, which can be one of 'NFC', 'NFKC', 'NFD', or 'NFKD'.
      - The default normalization, NFC, combines characters and diacritics
        that are written using separate code points, such as converting "e"
        plus an acute accent modifier into "é", or converting "ka" (か) plus
        a dakuten into the single character "ga" (が). Unicode can be
        converted to NFC form without any change in its meaning.
      - If you ask for NFKC normalization, it will apply additional
        normalizations that can change the meanings of characters. For
        example, ellipsis characters will be replaced with three periods,
        all ligatures will be replaced with the individual characters that
        make them up, and characters that differ in font style will be
        converted to the same character.
    - If anything was changed, repeat all the steps, so that the function is
      idempotent. "&amp;amp;" will become "&", for example, not "&amp;".

    `fix_text` will work one line at a time, with the possibility that some
    lines are in different encodings, allowing it to fix text that has been
    concatenated together from different sources.

    When it encounters lines longer than `max_decode_length` (1 million
    codepoints by default), it will not run the `fix_encoding` step, to
    avoid unbounded slowdowns.

    If you're certain that any decoding errors in the text would have
    affected the entire text in the same way, and you don't mind operations
    that scale with the length of the text, you can use `fix_text_segment`
    directly to fix the whole string in one batch.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    out = []
    pos = 0
    while pos < len(text):
        textbreak = text.find('\n', pos) + 1
        fix_encoding_this_time = fix_encoding
        if textbreak == 0:
            textbreak = len(text)
        if (textbreak - pos) > max_decode_length:
            fix_encoding_this_time = False

        substring = text[pos:textbreak]

        if fix_entities == 'auto' and '<' in substring and '>' in substring:
            # we see angle brackets together; this could be HTML
            fix_entities = False

        out.append(
            fix_text_segment(
                substring,
                fix_entities=fix_entities,
                remove_terminal_escapes=remove_terminal_escapes,
                fix_encoding=fix_encoding_this_time,
                uncurl_quotes=uncurl_quotes,
                fix_latin_ligatures=fix_latin_ligatures,
                fix_character_width=fix_character_width,
                fix_line_breaks=fix_line_breaks,
                fix_surrogates=fix_surrogates,
                remove_control_chars=remove_control_chars,
                remove_bom=remove_bom,
                normalization=normalization
            )
        )
        pos = textbreak

    return ''.join(out)


# Some alternate names for the main functions
ftfy = fix_text
fix_encoding = fixes.fix_encoding
fix_text_encoding = fixes.fix_text_encoding  # deprecated


def fix_file(input_file,
             encoding=None,
             fix_entities='auto',
             remove_terminal_escapes=True,
             fix_encoding=True,
             fix_latin_ligatures=True,
             fix_character_width=True,
             uncurl_quotes=True,
             fix_line_breaks=True,
             fix_surrogates=True,
             remove_control_chars=True,
             remove_bom=True,
             normalization='NFC'):
    """
    Fix text that is found in a file.

    If the file is being read as Unicode text, use that. If it's being read
    as bytes, then we hope an encoding was supplied. If not, unfortunately,
    we have to guess what encoding it is. We'll try a few common encodings,
    but we make no promises. See the `guess_bytes` function for how this is
    done.

    The output is a stream of fixed lines of text.
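
    A minimal usage sketch (``broken.txt`` and ``handle_line`` are
    hypothetical stand-ins, not part of ftfy)::

        import io
        with io.open('broken.txt', 'rb') as input_file:
            for fixed_line in fix_file(input_file, encoding='utf-8'):
                handle_line(fixed_line)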
""" entities = fix_entities for line in input_file: if isinstance(line, bytes): if encoding is None: line, encoding = guess_bytes(line) else: line = line.decode(encoding) if fix_entities == 'auto' and '<' in line and '>' in line: entities = False yield fix_text_segment( line, fix_entities=entities, remove_terminal_escapes=remove_terminal_escapes, fix_encoding=fix_encoding, fix_latin_ligatures=fix_latin_ligatures, fix_character_width=fix_character_width, uncurl_quotes=uncurl_quotes, fix_line_breaks=fix_line_breaks, fix_surrogates=fix_surrogates, remove_control_chars=remove_control_chars, remove_bom=remove_bom, normalization=normalization ) def fix_text_segment(text, fix_entities='auto', remove_terminal_escapes=True, fix_encoding=True, fix_latin_ligatures=True, fix_character_width=True, uncurl_quotes=True, fix_line_breaks=True, fix_surrogates=True, remove_control_chars=True, remove_bom=True, normalization='NFC'): """ Apply fixes to text in a single chunk. This could be a line of text within a larger run of `fix_text`, or it could be a larger amount of text that you are certain is in a consistent encoding. See `fix_text` for a description of the parameters. """ if isinstance(text, bytes): raise UnicodeError(fixes.BYTES_ERROR_TEXT) if fix_entities == 'auto' and '<' in text and '>' in text: fix_entities = False while True: origtext = text if remove_terminal_escapes: text = fixes.remove_terminal_escapes(text) if fix_encoding: text = fixes.fix_encoding(text) if fix_entities: text = fixes.unescape_html(text) if fix_latin_ligatures: text = fixes.fix_latin_ligatures(text) if fix_character_width: text = fixes.fix_character_width(text) if uncurl_quotes: text = fixes.uncurl_quotes(text) if fix_line_breaks: text = fixes.fix_line_breaks(text) if fix_surrogates: text = fixes.fix_surrogates(text) if remove_control_chars: text = fixes.remove_control_chars(text) if remove_bom and not remove_control_chars: # Skip this step if we've already done `remove_control_chars`, # because it would be redundant. text = fixes.remove_bom(text) if normalization is not None: text = unicodedata.normalize(normalization, text) if text == origtext: return text def guess_bytes(bstring): """ NOTE: Using `guess_bytes` is not the recommended way of using ftfy. ftfy is not designed to be an encoding detector. In the unfortunate situation that you have some bytes in an unknown encoding, ftfy can guess a reasonable strategy for decoding them, by trying a few common encodings that can be distinguished from each other. Unlike the rest of ftfy, this may not be accurate, and it may *create* Unicode problems instead of solving them! It doesn't try East Asian encodings at all, and if you have East Asian text that you don't know how to decode, you are somewhat out of luck. East Asian encodings require some serious statistics to distinguish from each other, so we can't support them without decreasing the accuracy of ftfy. If you don't know which encoding you have at all, I recommend trying the 'chardet' module, and being appropriately skeptical about its results. The encodings we try here are: - UTF-16 with a byte order mark, because a UTF-16 byte order mark looks like nothing else - UTF-8, because it's the global standard, which has been used by a majority of the Web since 2008 - "utf-8-variants", because it's what people actually implement when they think they're doing UTF-8 - MacRoman, because Microsoft Office thinks it's still a thing, and it can be distinguished by its line breaks. 
      (If there are no line breaks in the string, though, you're out of
      luck.)
    - "sloppy-windows-1252", the Latin-1-like encoding that is the most
      common single-byte encoding
    """
    if type(bstring) == type(''):
        raise UnicodeError(
            "This string was already decoded as Unicode. You should pass "
            "bytes to guess_bytes, not Unicode."
        )

    if bstring.startswith(b'\xfe\xff') or bstring.startswith(b'\xff\xfe'):
        return bstring.decode('utf-16'), 'utf-16'

    byteset = set(bytes(bstring))
    # Unpacking a bytes literal yields ints on Python 3 and one-character
    # strings on Python 2, matching whatever `byteset` contains on that
    # version of Python.
    byte_ed, byte_c0, byte_CR, byte_LF = b'\xed\xc0\r\n'

    try:
        if byte_ed in byteset or byte_c0 in byteset:
            # Byte 0xed can be used to encode a range of codepoints that
            # are UTF-16 surrogates. UTF-8 does not use UTF-16 surrogates,
            # so when we see 0xed, it's very likely we're being asked to
            # decode CESU-8, the variant that encodes UTF-16 surrogates
            # instead of the original characters themselves.
            #
            # This will occasionally trigger on standard UTF-8, as there
            # are some Korean characters that also use byte 0xed, but that's
            # not harmful.
            #
            # Byte 0xc0 is impossible because, numerically, it would only
            # encode characters lower than U+0040. Those already have
            # single-byte representations, and UTF-8 requires using the
            # shortest possible representation. However, Java hides the null
            # codepoint, U+0000, in a non-standard longer representation -- it
            # encodes it as 0xc0 0x80 instead of 0x00, guaranteeing that 0x00
            # will never appear in the encoded bytes.
            #
            # The 'utf-8-variants' decoder can handle both of these cases, as
            # well as standard UTF-8, at the cost of a bit of speed.
            return bstring.decode('utf-8-variants'), 'utf-8-variants'
        else:
            return bstring.decode('utf-8'), 'utf-8'
    except UnicodeDecodeError:
        pass

    if byte_CR in bstring and byte_LF not in bstring:
        return bstring.decode('macroman'), 'macroman'
    else:
        return bstring.decode('sloppy-windows-1252'), 'sloppy-windows-1252'


def explain_unicode(text):
    """
    A utility function that's useful for debugging mysterious Unicode.

    It breaks down a string, showing you for each codepoint its number in
    hexadecimal, its glyph, its category in the Unicode standard, and its
    name in the Unicode standard.

        >>> explain_unicode('(╯°□°)╯︵ ┻━┻')
        U+0028 (       [Ps] LEFT PARENTHESIS
        U+256F ╯       [So] BOX DRAWINGS LIGHT ARC UP AND LEFT
        U+00B0 °       [So] DEGREE SIGN
        U+25A1 □       [So] WHITE SQUARE
        U+00B0 °       [So] DEGREE SIGN
        U+0029 )       [Pe] RIGHT PARENTHESIS
        U+256F ╯       [So] BOX DRAWINGS LIGHT ARC UP AND LEFT
        U+FE35 ︵      [Ps] PRESENTATION FORM FOR VERTICAL LEFT PARENTHESIS
        U+0020         [Zs] SPACE
        U+253B ┻       [So] BOX DRAWINGS HEAVY UP AND HORIZONTAL
        U+2501 ━       [So] BOX DRAWINGS HEAVY HORIZONTAL
        U+253B ┻       [So] BOX DRAWINGS HEAVY UP AND HORIZONTAL
    """
    for char in text:
        if is_printable(char):
            display = char
        else:
            display = char.encode('unicode-escape').decode('ascii')
        print('U+{code:04X} {display} [{category}] {name}'.format(
            display=display_ljust(display, 7),
            code=ord(char),
            category=unicodedata.category(char),
            name=unicodedata.name(char, '')
        ))
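

# The demo below is an illustrative sketch, not part of ftfy's public API.
# It shows the typical round trip: guess an encoding for raw bytes with
# `guess_bytes`, then clean up mojibake with `fix_text`. The sample strings
# are made up for this example.
if __name__ == '__main__':
    # Guessing the encoding of raw bytes:
    text, codec = guess_bytes('schön'.encode('utf-8'))
    print(codec)    # prints: utf-8
    print(text)     # prints: schön

    # Repairing text that was decoded with the wrong codec upstream:
    print(fix_text('This text is schÃ¶n, isnâ€™t it?'))
    # prints: This text is schön, isn't it?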