diff options
Diffstat (limited to 'libs/ftfy/build_data.py')
-rw-r--r-- | libs/ftfy/build_data.py | 132 |
1 files changed, 0 insertions, 132 deletions
# Reconstructed from the deletion diff of libs/ftfy/build_data.py
# (blob 8269d2ee1, 132 lines).
"""
A script to make the char_classes.dat file.

This never needs to run in normal usage. It needs to be run if the character
classes we care about change, or if a new version of Python supports a new
Unicode standard and we want it to affect our string decoding.

The file that we generate is based on Unicode 9.0, as supported by Python 3.6.
You can certainly use it in earlier versions. This simply makes sure that we
get consistent results from running ftfy on different versions of Python.

The file will be written to the current directory.
"""
from __future__ import unicode_literals
import unicodedata
import sys
import zlib
if sys.hexversion >= 0x03000000:
    unichr = chr

# One class character is stored per codepoint:
#
# L = Latin capital letter
# l = Latin lowercase letter
# A = Non-latin capital or title-case letter
# a = Non-latin lowercase letter
# C = Non-cased letter (Lo)
# X = Control character (Cc)
# m = Letter modifier (Lm)
# M = Mark (Mc, Me, Mn)
# N = Miscellaneous numbers (No)
# P = Private use (Co)
# i = IPA symbols and modifiers (U+0250 to U+02FF, except the schwa)
# 1 = Math symbol (Sm) or currency symbol (Sc)
# 2 = Symbol modifier (Sk)
# 3 = Other symbol (So)
# S = UTF-16 surrogate
# _ = Unassigned character
# (space) = Whitespace
# o = Other

# Classes that depend on nothing but the Unicode general category. Letters
# (L*) are not listed because they also depend on the character name.
_CLASS_BY_CATEGORY = {
    'Mc': 'M',
    'Me': 'M',
    'Mn': 'M',
    'No': 'N',
    'Sm': '1',
    'Sc': '1',
    'Sk': '2',
    'So': '3',
    'Cc': 'X',
    'Cs': 'S',
    'Co': 'P',
    'Zs': ' ',
    'Zl': ' ',
    'Zp': ' ',
}

# Classes for letters whose name does not start with 'LATIN'.
_NON_LATIN_LETTER_CLASS = {
    'Lu': 'A',
    'Lt': 'A',
    'Ll': 'a',
    'Lo': 'C',
    'Lm': 'm',
}


def _classify_codepoint(codepoint):
    """
    Return the one-character class for a single codepoint.

    The checks run in a fixed order, because several ranges deliberately
    override what the general category alone would give.

    Raises ValueError for a non-Latin letter whose category is not one of
    Lu, Lt, Ll, Lo or Lm.
    """
    char = unichr(codepoint)
    category = unicodedata.category(char)

    if (0x250 <= codepoint < 0x300) and char != 'ə':
        # IPA symbols and modifiers.
        #
        # This category excludes the schwa (ə), which is used as a normal
        # Latin letter in some languages.
        return 'i'

    if category.startswith('L'):  # letters
        if unicodedata.name(char, '').startswith('LATIN'):
            # Every Latin letter that is not 'Lu' counts as lowercase.
            return 'L' if category == 'Lu' else 'l'
        try:
            return _NON_LATIN_LETTER_CLASS[category]
        except KeyError:
            raise ValueError('got some weird kind of letter')

    if 0xfe00 <= codepoint <= 0xfe0f or 0x1f3fb <= codepoint <= 0x1f3ff:
        # Variation selectors and skin-tone modifiers have the category
        # of non-spacing marks, but they act like symbols
        return '3'

    by_category = _CLASS_BY_CATEGORY.get(category)
    if by_category is not None:
        return by_category

    if 0x1f000 <= codepoint <= 0x1ffff:
        # This range is rapidly having emoji added to it. Assume that
        # an unassigned codepoint in this range is just a symbol we
        # don't know yet.
        return '3'

    if category == 'Cn':
        return '_'
    return 'o'


def make_char_data_file(do_it_anyway=False):
    """
    Build the compressed data file 'char_classes.dat' and write it to the
    current directory.

    The file holds one ASCII class character for every codepoint from 0 to
    0x10FFFF, joined into a single string and compressed with zlib.

    If you run this, run it in Python 3.6 or later. It will run in earlier
    versions, but you won't get the Unicode 9 standard, leading to inconsistent
    behavior.

    To protect against this, running this in the wrong version of Python will
    raise a RuntimeError unless you pass `do_it_anyway=True`.
    """
    if sys.hexversion < 0x03060000 and not do_it_anyway:
        raise RuntimeError(
            "This function should be run in Python 3.6 or later."
        )

    cclasses = [_classify_codepoint(cp) for cp in range(0x0, 0x110000)]

    # Mark whitespace control characters as whitespace
    cclasses[9] = cclasses[10] = cclasses[12] = cclasses[13] = ' '

    # Some other exceptions for characters that are more commonly used as
    # punctuation or decoration than for their ostensible purpose.
    # For example, tilde is not usually a "math symbol", and the accents
    # `´ are much more like quotation marks than modifiers.
    #
    # NOTE(review): '^' and '`' each appear twice in this string as seen
    # here. The repeat is a harmless no-op, but confirm against upstream
    # whether the trailing pair was meant to be different characters.
    for char in "^~`´˝^`":
        cclasses[ord(char)] = 'o'

    # Build the whole payload before touching the file, so a failure while
    # encoding or compressing cannot leave a truncated data file behind.
    payload = zlib.compress(''.join(cclasses).encode('ascii'))
    with open('char_classes.dat', 'wb') as out:
        out.write(payload)


if __name__ == '__main__':
    make_char_data_file()