summaryrefslogtreecommitdiffhomepage
path: root/libs/ftfy/build_data.py
diff options
context:
space:
mode:
Diffstat (limited to 'libs/ftfy/build_data.py')
-rw-r--r--libs/ftfy/build_data.py132
1 files changed, 0 insertions, 132 deletions
diff --git a/libs/ftfy/build_data.py b/libs/ftfy/build_data.py
deleted file mode 100644
index 8269d2ee1..000000000
--- a/libs/ftfy/build_data.py
+++ /dev/null
@@ -1,132 +0,0 @@
-"""
-A script to make the char_classes.dat file.
-
-This never needs to run in normal usage. It needs to be run if the character
-classes we care about change, or if a new version of Python supports a new
-Unicode standard and we want it to affect our string decoding.
-
-The file that we generate is based on Unicode 9.0, as supported by Python 3.6.
-You can certainly use it in earlier versions. This simply makes sure that we
-get consistent results from running ftfy on different versions of Python.
-
-The file will be written to the current directory.
-"""
-from __future__ import unicode_literals
-import unicodedata
-import sys
-import zlib
-if sys.hexversion >= 0x03000000:
- unichr = chr
-
-# L = Latin capital letter
-# l = Latin lowercase letter
-# A = Non-latin capital or title-case letter
-# a = Non-latin lowercase letter
-# C = Non-cased letter (Lo)
-# X = Control character (Cc)
-# m = Letter modifier (Lm)
-# M = Mark (Mc, Me, Mn)
-# N = Miscellaneous numbers (No)
-# P = Private use (Co)
-# 1 = Math symbol (Sm) or currency symbol (Sc)
-# 2 = Symbol modifier (Sk)
-# 3 = Other symbol (So)
-# S = UTF-16 surrogate
-# _ = Unassigned character
-# = Whitespace
-# o = Other
-
-
-def make_char_data_file(do_it_anyway=False):
- """
- Build the compressed data file 'char_classes.dat' and write it to the
- current directory.
-
- If you run this, run it in Python 3.6 or later. It will run in earlier
- versions, but you won't get the Unicode 9 standard, leading to inconsistent
- behavior.
-
- To protect against this, running this in the wrong version of Python will
- raise an error unless you pass `do_it_anyway=True`.
- """
- if sys.hexversion < 0x03060000 and not do_it_anyway:
- raise RuntimeError(
- "This function should be run in Python 3.6 or later."
- )
-
- cclasses = [None] * 0x110000
- for codepoint in range(0x0, 0x110000):
- char = unichr(codepoint)
- category = unicodedata.category(char)
-
- if (0x250 <= codepoint < 0x300) and char != 'ə':
- # IPA symbols and modifiers.
- #
- # This category excludes the schwa (ə), which is used as a normal
- # Latin letter in some languages.
- cclasses[codepoint] = 'i'
- elif category.startswith('L'): # letters
- if unicodedata.name(char, '').startswith('LATIN'):
- if category == 'Lu':
- cclasses[codepoint] = 'L'
- else:
- cclasses[codepoint] = 'l'
- else:
- if category == 'Lu' or category == 'Lt':
- cclasses[codepoint] = 'A'
- elif category == 'Ll':
- cclasses[codepoint] = 'a'
- elif category == 'Lo':
- cclasses[codepoint] = 'C'
- elif category == 'Lm':
- cclasses[codepoint] = 'm'
- else:
- raise ValueError('got some weird kind of letter')
- elif 0xfe00 <= codepoint <= 0xfe0f or 0x1f3fb <= codepoint <= 0x1f3ff:
- # Variation selectors and skin-tone modifiers have the category
- # of non-spacing marks, but they act like symbols
- cclasses[codepoint] = '3'
- elif category.startswith('M'): # marks
- cclasses[codepoint] = 'M'
- elif category == 'No':
- cclasses[codepoint] = 'N'
- elif category == 'Sm' or category == 'Sc':
- cclasses[codepoint] = '1'
- elif category == 'Sk':
- cclasses[codepoint] = '2'
- elif category == 'So':
- cclasses[codepoint] = '3'
- elif category == 'Cc':
- cclasses[codepoint] = 'X'
- elif category == 'Cs':
- cclasses[codepoint] = 'S'
- elif category == 'Co':
- cclasses[codepoint] = 'P'
- elif category.startswith('Z'):
- cclasses[codepoint] = ' '
- elif 0x1f000 <= codepoint <= 0x1ffff:
- # This range is rapidly having emoji added to it. Assume that
- # an unassigned codepoint in this range is just a symbol we
- # don't know yet.
- cclasses[codepoint] = '3'
- elif category == 'Cn':
- cclasses[codepoint] = '_'
- else:
- cclasses[codepoint] = 'o'
-
- # Mark whitespace control characters as whitespace
- cclasses[9] = cclasses[10] = cclasses[12] = cclasses[13] = ' '
-
- # Some other exceptions for characters that are more commonly used as
- # punctuation or decoration than for their ostensible purpose.
- # For example, tilde is not usually a "math symbol", and the accents
- # `´ are much more like quotation marks than modifiers.
- for char in "^~`´˝^`":
- cclasses[ord(char)] = 'o'
-
- out = open('char_classes.dat', 'wb')
- out.write(zlib.compress(''.join(cclasses).encode('ascii')))
- out.close()
-
-if __name__ == '__main__':
- make_char_data_file()