1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
|
"""Case-folding and whitespace normalization"""
# Unicode Case Folding table has been derived from the following work:
#
# CaseFolding-12.0.0.txt
# Date: 2019-01-22, 08:18:22 GMT
# (c) 2019 Unicode(R) Inc.
# Unicode and the Unicode Logo are registered trademarks
# of Unicode, Inc. in the U.S. and other countries.
# For terms of use, see http://www.unicode.org/terms_of_use.html
#
# Unicode Character Database
# For documentation, see http://www.unicode.org/reports/tr44/
import re
import sys
from builtins import str, chr
__all__ = ["normalize_reference"]
if sys.version_info < (3,) and sys.maxunicode <= 0xffff:
# shim for Python 2.x UCS2 build
_unichr = chr
def chr(cdp):
if 0x10000 <= cdp < 0x110000:
cdp -= 0x10000
return (_unichr(0xd800 | (cdp >> 10)) +
_unichr(0xdc00 | (cdp & 0x3ff)))
return _unichr(cdp)
def _parse_table(tbl):
xlat = {}
cur_i, cur_j = -1, 0
for entry in tbl.split(';'):
arr = entry.split(',')
info = [int(x, 36) if x else 0 for x in arr[0].split(':')]
arr = [int(x, 36) for x in arr[1:]]
assert not any(x in xlat for x in arr)
sfx = ''.join(map(chr, arr))
streak, stride = 0, 1
if len(info) == 2:
fdt, delta = info
elif len(info) == 3:
fdt, streak, delta = info
else:
fdt, streak, delta, stride = info
assert streak >= 0 and stride >= 1
cur_i += fdt + 1
cur_j -= delta
assert cur_j != 0
i = cur_i
last = cur_i + streak
while i <= last:
# uniqueness and idempotency
assert i not in xlat and i + cur_j not in xlat
assert i not in arr
xlat[i] = chr(i + cur_j) + sfx
i += stride
return xlat
XLAT = _parse_table(
# ===== Start of Unicode Case Folding table =====
'1t:p:-w;37:-kn;a:m:kn;n:6:;6:3w,37;w:1a:-31:2;1b:5k,lj;1:4:-5k:2;6:e::'
'2;f:-aa,32;:18:aa:2;19:3e;:4:-3e:2;5:7h;1:-da;:2:5t:2;3:-5p;:5p;1:1:-5'
'o;1:5o;2:-26;:-3f;:-1;:5m;1:-5o;:-2;1:-4;:2;:5s;3:-5u;:-2;1:-1;:4:5x:2'
';5:-61;:61;1:-61;2:61;1:-61;:61;1:1:-60;1:2:60:2;3:-62;:4:62:4;b:-1;:1'
';1:-1;:1;1:-1;:g:1:2;i:g::2;h:av,lo;:-aw;:2:1:2;3:2q;:-15;:12:-1l:2;13'
':3n;1:g:-3n:2;n:-8bu;:8bu;1:4k;:-8gb;2:8br;1:5g;:-7c;:-2;:8:1y:2;72:-3'
'7;16:2:37:2;5:;8:-37;6:26;1:2:1;3:-r;1:1:1;1:m,lk,ld;:g:9;h:8:;c:b,lk,'
'ld;h:k;c:-7;:12;:-5;3:-a;:7;1:m:-n:2;n:1j;:-6;2:c;:4;1:-1t;1:8;:-8;2:2'
':3n;2:f:-5u;f:v:1c;27:w:v:2;15:1g::2;1h:-e;:c:e:2;e:2m::2;2o:11:-1b;2d'
':2a,136;26w:11:-5mq;12:6::6;mo:5:5m0;1on:4sm;:-1;:-9;:1:-2;1:1;:-7;:-o'
';:-vzb;7:16:tj7;18:2:;8y:44:-2bl:2;45:5yn,mp;:-b,lk;:-2,lm;:-1,lm;:p,j'
'i;:-5xb;2:5wx,37;1:2m:-5yk:2;2v:7:9;f:5:;f:7:;f:7:;f:5:;7:5fn,lv;1:2,l'
'v,lc;1:2,lv,ld;1:2,lv,n6;2:6:-5ft:2;e:7:;n:7:3c,qh;7:7:8,qh;7:7:-o,qh;'
'7:7:8,qh;7:7:-1k,qh;7:7:8,qh;9:-6,qh;:5hc,qh;:6,qh;1:-3,n6;:1,n6,qh;:1'
':-5j2;1:1:1u;1:5hd,qh;1:-6;3:-5h3,qh;:5ha,qh;:a,qh;1:-7,n6;:1,n6,qh;:3'
':-5h6;3:5hb,qh;5:4,lk,lc;:1,lk,ld;2:3,n6;:1,lk,n6;:1:-5jq;1:1:2k;7:5h5'
',lk,lc;:1,lk,ld;:5,lv;1:-2,n6;:1,lk,n6;:1:-5ju;1:1:2w;1:-2x;5:33,qh;:5'
'h0,qh;:-4,qh;1:7,n6;:1,n6,qh;:1:-5gu;1:1:-2;1:5h1,qh;89:8a;3:o2;:-3d;6'
':-6ea;19:f:c;y:f;mq:p:-p;1ft:1a:-m;2n:1b;1:8ag;:-5ch;:5c1;2:4:-8a0:2;5'
':8bh;:-v;:y;:-1;1:3:-8bj:3;b:1:8cg;1:2q:-8cg:2;2y:2::2;6:nym::nym;nyn:'
'16::2;1p:q::2;4h:c::2;f:1o::2;1y:2::2;3:r9h;:8:-r9h:2;c:;1:wmh;2:2:-wm'
'h:2;5:i::2;j:wn9;:b;:-4;:-a;:3;1:-1e;:o;:-l;:-xbp;:a:pr:2;d:;1:1d;:wlv'
';:-5cb;q1:27:2oo;fpr:jii,2u;:1,2x;:1,30;:1,2u,2x;:1,2u,30;:-c,38;:1,38'
';c:-z8,12u;:1,12d;:1,12j;:-9,12u;:b,12l;sp:p:-1cjn;ym:13:-8;4v:z:;1jj:'
'1e:-o;2e7:v:w;gwv:v:;o8v:x:-2'
# ===== End of Unicode Case Folding table =====
)
def _check_native(tbl):
"""
Determine if Python's own native implementation
subsumes the supplied case folding table
"""
try:
for i in tbl:
stv = chr(i)
if stv.casefold() == stv:
return False
except AttributeError:
return False
return True
# Hoist version check out of function for performance
SPACE_RE = re.compile(r'[ \t\r\n]+')
if _check_native(XLAT):
def normalize_reference(string):
"""
Normalize reference label: collapse internal whitespace
to single space, remove leading/trailing whitespace, case fold.
"""
return SPACE_RE.sub(' ', string[1:-1].strip()).casefold()
elif sys.version_info >= (3,) or sys.maxunicode > 0xffff:
def normalize_reference(string):
"""
Normalize reference label: collapse internal whitespace
to single space, remove leading/trailing whitespace, case fold.
"""
return SPACE_RE.sub(' ', string[1:-1].strip()).translate(XLAT)
else:
def _get_smp_regex():
xls = sorted(x - 0x10000 for x in XLAT if x >= 0x10000)
xls.append(-1)
fmt, (dsh, opn, pip, cse) = str('\\u%04x'), str('-[|]')
rga, srk, erk = [str(r'[ \t\r\n]+')], 0, -2
for k in xls:
new_hir = (erk ^ k) >> 10 != 0
if new_hir or erk + 1 != k:
if erk >= 0 and srk != erk:
if srk + 1 != erk:
rga.append(dsh)
rga.append(fmt % (0xdc00 + (erk & 0x3ff)))
if new_hir:
if erk >= 0:
rga.append(cse)
if k < 0:
break
rga.append(pip)
rga.append(fmt % (0xd800 + (k >> 10)))
rga.append(opn)
srk = k
rga.append(fmt % (0xdc00 + (srk & 0x3ff)))
erk = k
return re.compile(str().join(rga))
def _subst_handler(matchobj):
src = matchobj.group(0)
hiv = ord(src[0])
if hiv < 0xd800:
return ' '
return XLAT[0x10000 + ((hiv & 0x3ff) << 10) | (ord(src[1]) & 0x3ff)]
SMP_RE = _get_smp_regex()
def normalize_reference(string):
"""
Normalize reference label: collapse internal whitespace
to single space, remove leading/trailing whitespace, case fold.
"""
return SMP_RE.sub(_subst_handler, string[1:-1].strip()).translate(XLAT)
|