# -*- coding: utf-8 -*-
"""
A script to make the char_classes.dat file.

This never needs to run in normal usage. It needs to be run if the character
classes we care about change, or if a new version of Python supports a new
Unicode standard and we want it to affect our string decoding.

The file that we generate is based on Unicode 9.0, as supported by Python 3.6.
You can certainly use it in earlier versions. This simply makes sure that we
get consistent results from running ftfy on different versions of Python.

The file will be written to the current directory.
"""
from __future__ import unicode_literals
import unicodedata
import sys
import zlib

if sys.hexversion >= 0x03000000:
    # Python 3 has no `unichr`; `chr` already covers the full Unicode range.
    unichr = chr


# One-character class codes stored in the data file, one per codepoint:
#
# L = Latin capital letter
# l = Latin lowercase letter (any cased Latin letter that is not 'Lu')
# i = IPA symbol or modifier (U+0250 to U+02FF, except the schwa)
# A = Non-latin capital or title-case letter
# a = Non-latin lowercase letter
# C = Non-cased letter (Lo)
# X = Control character (Cc)
# m = Letter modifier (Lm)
# M = Mark (Mc, Me, Mn)
# N = Miscellaneous numbers (No)
# P = Private use (Co)
# 1 = Math symbol (Sm) or currency symbol (Sc)
# 2 = Symbol modifier (Sk)
# 3 = Other symbol (So); also variation selectors, skin-tone modifiers, and
#     unassigned codepoints in U+1F000 to U+1FFFF
# S = UTF-16 surrogate
# _ = Unassigned character
# (space) = Whitespace
# o = Other

# Unicode categories that map to a class code by exact match, with no
# further conditions.
_SIMPLE_CATEGORY_CLASSES = {
    'No': 'N',
    'Sm': '1',
    'Sc': '1',
    'Sk': '2',
    'So': '3',
    'Cc': 'X',
    'Cs': 'S',
    'Co': 'P',
}

# Class codes for non-Latin letters, keyed by Unicode category.
_NON_LATIN_LETTER_CLASSES = {
    'Lu': 'A',
    'Lt': 'A',
    'Ll': 'a',
    'Lo': 'C',
    'Lm': 'm',
}


def _classify_codepoint(codepoint):
    """
    Return the one-character class code for a single codepoint.

    The checks are order-sensitive: the IPA range is tested before the
    generic letter test, and the variation-selector / skin-tone ranges are
    tested before the generic mark and symbol tests.

    Raises ValueError if a codepoint has a letter category ('L*') that is
    not one of Lu, Lt, Ll, Lo, or Lm.
    """
    char = unichr(codepoint)
    category = unicodedata.category(char)

    if 0x250 <= codepoint < 0x300 and char != 'ə':
        # IPA symbols and modifiers.
        #
        # This category excludes the schwa (ə), which is used as a normal
        # Latin letter in some languages.
        return 'i'

    if category.startswith('L'):
        # Letters: Latin ones are split only into "capital" and "everything
        # else"; other scripts are split by their exact category.
        if unicodedata.name(char, '').startswith('LATIN'):
            return 'L' if category == 'Lu' else 'l'
        try:
            return _NON_LATIN_LETTER_CLASSES[category]
        except KeyError:
            raise ValueError('got some weird kind of letter')

    if 0xfe00 <= codepoint <= 0xfe0f or 0x1f3fb <= codepoint <= 0x1f3ff:
        # Variation selectors and skin-tone modifiers have the category
        # of non-spacing marks, but they act like symbols
        return '3'

    if category.startswith('M'):
        # Marks (Mc, Me, Mn)
        return 'M'

    simple_class = _SIMPLE_CATEGORY_CLASSES.get(category)
    if simple_class is not None:
        return simple_class

    if category.startswith('Z'):
        # Separators count as whitespace
        return ' '

    if 0x1f000 <= codepoint <= 0x1ffff:
        # This range is rapidly having emoji added to it. Assume that
        # an unassigned codepoint in this range is just a symbol we
        # don't know yet.
        return '3'

    if category == 'Cn':
        return '_'

    return 'o'


def make_char_data_file(do_it_anyway=False):
    """
    Build the compressed data file 'char_classes.dat' and write it to the
    current directory.

    The file contains one ASCII class code per codepoint, for every codepoint
    from U+0000 to U+10FFFF, joined into a single string and compressed with
    zlib.

    If you run this, run it in Python 3.6 or later. It will run in earlier
    versions, but you won't get the Unicode 9 standard, leading to inconsistent
    behavior.

    To protect against this, running this in the wrong version of Python will
    raise a RuntimeError unless you pass `do_it_anyway=True`.
    """
    if sys.hexversion < 0x03060000 and not do_it_anyway:
        raise RuntimeError(
            "This function should be run in Python 3.6 or later."
        )

    cclasses = [_classify_codepoint(codepoint)
                for codepoint in range(0x0, 0x110000)]

    # Mark whitespace control characters (tab, line feed, form feed,
    # carriage return) as whitespace
    for codepoint in (9, 10, 12, 13):
        cclasses[codepoint] = ' '

    # Some other exceptions for characters that are more commonly used as
    # punctuation or decoration than for their ostensible purpose.
    # For example, tilde is not usually a "math symbol", and the accents
    # `´ are much more like quotation marks than modifiers.
    #
    # NOTE(review): '^' and '`' each appear twice in this string. The
    # repetition is harmless, but the second pair may originally have been
    # different (e.g. fullwidth) characters -- confirm against upstream.
    for char in "^~`´˝^`":
        cclasses[ord(char)] = 'o'

    # Build the whole payload before touching the filesystem, so a failure
    # while encoding or compressing cannot leave a truncated file behind,
    # and let the context manager close the file even if the write fails.
    compressed = zlib.compress(''.join(cclasses).encode('ascii'))
    with open('char_classes.dat', 'wb') as out:
        out.write(compressed)


if __name__ == '__main__':
    make_char_data_file()