1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
|
.global _rsqrt_inaccurate
.global rsqrt_inaccurate
.global _rsqrt_full
.global rsqrt_full
.global _rsqrt_full_gpr
.global rsqrt_full_gpr
.global _rsqrt_full_nb
.global rsqrt_full_nb
.global _rsqrt_full_nb2
.global rsqrt_full_nb2
.global _rsqrt_full_nb_gpr
.global rsqrt_full_nb_gpr
.global _rsqrt_newton
.global rsqrt_newton
.global _rsqrt_hack
.global rsqrt_hack
.global _rsqrt_fallback
.text
.intel_syntax noprefix
# Constant table shared by the rsqrt_* routines below.
# Every entry is padded to 16 bytes so that each label stays 16-byte
# aligned; the legacy-SSE instructions that take these labels as 128-bit
# memory operands (pand/por/paddd/pcmpgtd/movaps) require that alignment.
# .p2align is used rather than .align because the argument of .align is a
# byte count on some targets and a power of two on others.
.p2align 4
# Threshold for the range check on the masked input. 0x00800000 is the bit
# pattern of the smallest positive normal float, despite the label name.
min_pos_denorm:
.long 0x00800000,0,0,0
# Bit 15; OR-ed into the input after top_mask has been applied.
penultimate_bit:
.long 0x00008000,0,0,0
# Bit 14; integer-added to a result before top_mask is applied, which
# rounds the bit pattern at bit 15 instead of truncating it.
ultimate_bit:
.long 0x00004000,0,0,0
# Keeps bits 31..15 (sign, exponent, top 8 mantissa bits), clears bits 14..0.
top_mask:
.long 0xFFFF8000,0,0,0
# 1.0f
one:
.long 0x3f800000,0,0,0
# 0.5f
half:
.long 0x3f000000,0,0,0
# 1.5f
one_point_five:
.long 0x3fc00000,0,0,0
# magic1..magic3 are referenced only by rsqrt_hack's integer correction
# step in the code visible here; their derivation is not documented in
# this file.
magic1:
.long 0x60000000,0,0,0
magic2:
.long 0x3c000000,0,0,0
magic3:
.long 0x000047ff,0,0,0
# rsqrt_inaccurate: raw hardware reciprocal-square-root estimate.
# In:       edi = input bit pattern (interpreted as a single-precision float)
# Out:      eax = bit pattern produced by rsqrtss, returned unmodified
# Clobbers: xmm0
# No masking, rounding or special-case handling is applied here.
_rsqrt_inaccurate:
rsqrt_inaccurate:
movd xmm0, edi
rsqrtss xmm0, xmm0
movd eax, xmm0
ret
# rsqrt_full: 1/sqrt(x) computed with sqrtss followed by divss.
# In:       edi = input bit pattern (single-precision float)
# Out:      eax = result bit pattern with bits 14..0 cleared
# Clobbers: xmm0, xmm1, flags
# Inputs that fail the range check are handed to rsqrt_full_bad.
_rsqrt_full:
rsqrt_full:
movd xmm0, edi
pand xmm0, [rip + top_mask] # keep bits 31..15 of the input
por xmm0, [rip + penultimate_bit] # force bit 15 to 1
# Low lane of xmm1 becomes all-ones when the masked input is not greater
# than the threshold, or when the comparison is unordered (NaN). The upper
# lanes are copied from xmm0, which are zero here, so the 128-bit ptest
# effectively examines only the low lane.
vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm]
ptest xmm1, xmm1
jnz rsqrt_full_bad
sqrtss xmm0, xmm0
movd xmm1, [rip + one] # xmm1 = 1.0f
divss xmm1, xmm0 # xmm1 = 1.0f / sqrt(masked input)
paddd xmm1, [rip + ultimate_bit] # integer add of bit 14 on the bit pattern...
pand xmm1, [rip + top_mask] # ...then clear bits 14..0 (round at bit 15)
movd eax, xmm1
ret
# rsqrt_full_gpr: same computation as rsqrt_full, but the masking and the
# final rounding are done with integer instructions on eax instead of SSE
# instructions with memory operands.
# In:       edi = input bit pattern (single-precision float)
# Out:      eax = result bit pattern with bits 14..0 cleared
#           (xmm0 also receives a copy of the result via the trailing movd)
# Clobbers: xmm0, xmm1, flags
# Inputs that fail the range check are handed to rsqrt_full_bad.
_rsqrt_full_gpr:
rsqrt_full_gpr:
# The value read from xmm0 here is overwritten by the next instruction, so
# it has no effect on the result; the move exists only for its cost.
movd eax, xmm0 # Emulate regalloc mov
mov eax, edi
and eax, 0xFFFF8000 # keep bits 31..15 (same value as top_mask)
or eax, 0x00008000 # force bit 15 to 1 (same value as penultimate_bit)
movd xmm0, eax
# Low lane of xmm1 = all-ones when the masked input is not greater than the
# threshold or the comparison is unordered; upper lanes come from xmm0 and
# are zero after the movd above.
vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm]
ptest xmm1, xmm1
jnz rsqrt_full_bad
sqrtss xmm0, xmm0
movd xmm1, [rip + one] # xmm1 = 1.0f
divss xmm1, xmm0 # xmm1 = 1.0f / sqrt(masked input)
movd eax, xmm1
add eax, 0x00004000 # add bit 14 (same value as ultimate_bit)...
and eax, 0xffff8000 # ...then clear bits 14..0 (round at bit 15)
movd xmm0, eax # Emulate regalloc mov
ret
# rsqrt_full_nb2: same main path as rsqrt_full, but the range check uses
# ucomiss + jna instead of vcmpngt_uqss + ptest, and rejected inputs go to
# rsqrt_full_bad_new1 (defined after rsqrt_full_nb) instead of
# rsqrt_full_bad.
# In:       edi = input bit pattern (single-precision float)
# Out:      eax = result bit pattern with bits 14..0 cleared
# Clobbers: xmm0, xmm1, flags
_rsqrt_full_nb2:
rsqrt_full_nb2:
movd xmm0, edi
pand xmm0, [rip + top_mask] # keep bits 31..15 of the input
por xmm0, [rip + penultimate_bit] # force bit 15 to 1
# ucomiss sets CF and/or ZF for "below", "equal" and "unordered", so jna is
# taken exactly when the masked input is not greater than the threshold.
ucomiss xmm0, [rip + min_pos_denorm]
jna rsqrt_full_bad_new1
sqrtss xmm0, xmm0
movd xmm1, [rip + one] # xmm1 = 1.0f
divss xmm1, xmm0 # xmm1 = 1.0f / sqrt(masked input)
paddd xmm1, [rip + ultimate_bit] # integer add of bit 14 on the bit pattern...
pand xmm1, [rip + top_mask] # ...then clear bits 14..0 (round at bit 15)
movd eax, xmm1
ret
# rsqrt_full_nb: same main path as rsqrt_full, but rejected inputs are
# handled by rsqrt_full_bad_new1, which lets rsqrtss produce the special
# results instead of classifying the input with a chain of branches.
# In:       edi = input bit pattern (single-precision float)
# Out:      eax = result bit pattern
# Clobbers: xmm0, xmm1, flags (plus whatever _rsqrt_fallback clobbers)
_rsqrt_full_nb:
rsqrt_full_nb:
movd xmm0, edi
pand xmm0, [rip + top_mask] # keep bits 31..15 of the input
por xmm0, [rip + penultimate_bit] # force bit 15 to 1
# Low lane of xmm1 = all-ones when the masked input is not greater than the
# threshold or the comparison is unordered; upper lanes come from xmm0 and
# are zero here, so ptest effectively examines only the low lane.
vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm]
ptest xmm1, xmm1
jnz rsqrt_full_bad_new1
sqrtss xmm0, xmm0
movd xmm1, [rip + one] # xmm1 = 1.0f
divss xmm1, xmm0 # xmm1 = 1.0f / sqrt(masked input)
paddd xmm1, [rip + ultimate_bit] # integer add of bit 14 on the bit pattern...
pand xmm1, [rip + top_mask] # ...then clear bits 14..0 (round at bit 15)
movd eax, xmm1
ret
# Slow path, also entered from rsqrt_full_nb2. edi still holds the
# unmodified input.
rsqrt_full_bad_new1:
# Unsigned compare: bit patterns below 0x00800000 (zero exponent field,
# sign bit clear) are delegated to the external fallback.
cmp edi, 0x00800000
jb rsqrt_full_bad_new_fallback1
movd xmm0, edi
rsqrtss xmm1, xmm0
ucomiss xmm1, xmm1 # PF=1 only when the rsqrtss result is NaN
jp rsqrt_full_bad_new1_nan
movd eax, xmm1 # non-NaN result: return it as produced
ret
rsqrt_full_bad_new_fallback1:
# Tail call. Nothing has been pushed since entry, so a `call` here would
# be issued with rsp % 16 == 8 and break the 16-byte stack alignment the
# SysV ABI requires at call sites. Jumping keeps the caller's alignment
# and lets the fallback return straight to our caller with edi intact.
jmp _rsqrt_fallback
rsqrt_full_bad_new1_nan:
ucomiss xmm0, xmm0 # PF=1 only when the input itself is NaN
jp rsqrt_full_bad_new1_nan_ret
mov eax, 0x7FC00000 # NaN result from a non-NaN input: fixed quiet NaN
ret
rsqrt_full_bad_new1_nan_ret:
# NaN input: return it with the quiet bit set, the same value
# rsqrt_full_nan produces. eax has not been written on this path before
# this point, so it must be set explicitly.
mov eax, edi
or eax, 0x00400000
ret
# rsqrt_full_nb_gpr: rsqrt_full_gpr's integer-register main path combined
# with a reduced-branch slow path (rsqrt_full_bad_new2).
# In:       edi = input bit pattern (single-precision float)
# Out:      eax = result bit pattern
#           (on the main path xmm0 also receives a copy via the final movd)
# Clobbers: xmm0, xmm1, flags (plus whatever _rsqrt_fallback clobbers)
_rsqrt_full_nb_gpr:
rsqrt_full_nb_gpr:
# The value read from xmm0 here is overwritten by the next instruction, so
# it has no effect on the result; the move exists only for its cost.
movd eax, xmm0 # Emulate regalloc mov
mov eax, edi
and eax, 0xFFFF8000 # keep bits 31..15 (same value as top_mask)
or eax, 0x00008000 # force bit 15 to 1 (same value as penultimate_bit)
movd xmm0, eax
# Low lane of xmm1 = all-ones when the masked input is not greater than the
# threshold or the comparison is unordered; upper lanes come from xmm0 and
# are zero after the movd above.
vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm]
ptest xmm1, xmm1
jnz rsqrt_full_bad_new2
sqrtss xmm0, xmm0
movd xmm1, [rip + one] # xmm1 = 1.0f
divss xmm1, xmm0 # xmm1 = 1.0f / sqrt(masked input)
movd eax, xmm1
add eax, 0x00004000 # add bit 14 (same value as ultimate_bit)...
and eax, 0xffff8000 # ...then clear bits 14..0 (round at bit 15)
movd xmm0, eax # Emulate regalloc mov
ret
# Slow path. edi still holds the unmodified input.
rsqrt_full_bad_new2:
# Unsigned compare: bit patterns below 0x00800000 (zero exponent field,
# sign bit clear) are delegated to the external fallback.
cmp edi, 0x00800000
jb rsqrt_full_bad_new_fallback2
movd xmm0, edi
rsqrtss xmm1, xmm0
# Any input with the sign bit set returns the fixed quiet NaN and the
# rsqrtss result is discarded.
# NOTE(review): this includes -0.0, for which rsqrt_full_bad returns -inf
# instead - confirm the difference is intended.
test edi, edi
js rsqrt_full_bad_new2_nan
movd eax, xmm1 # sign bit clear: return the rsqrtss result as produced
ret
rsqrt_full_bad_new_fallback2:
# Tail call. Nothing has been pushed since entry, so a `call` here would
# be issued with rsp % 16 == 8 and break the 16-byte stack alignment the
# SysV ABI requires at call sites. Jumping keeps the caller's alignment
# and lets the fallback return straight to our caller with edi intact.
jmp _rsqrt_fallback
rsqrt_full_bad_new2_nan:
mov eax, 0x7FC00000
ret
# rsqrt_full_bad: shared slow path for inputs rejected by the range check
# in rsqrt_full, rsqrt_full_gpr, rsqrt_newton and rsqrt_hack.
# In:       edi = unmodified input bit pattern (single-precision float)
# Out:      eax = result bit pattern
# Clobbers: xmm0, xmm1, flags (plus whatever _rsqrt_fallback clobbers)
rsqrt_full_bad:
xorps xmm1, xmm1 # xmm1 = +0.0
movd xmm0, edi
# ucomiss input, 0.0: PF=1 -> unordered (NaN), ZF=1 -> equal to zero,
# CF=1 -> below zero. The branches are tested in that order because an
# unordered result sets all three flags.
ucomiss xmm0, xmm1
jp rsqrt_full_nan
je rsqrt_full_zero
jc rsqrt_full_neg
cmp edi, 0x7F800000 # +infinity bit pattern
je rsqrt_full_inf
# TODO: Full Denormal Implementation
# Tail call. Nothing has been pushed since entry, so a `call` here would
# be issued with rsp % 16 == 8 and break the 16-byte stack alignment the
# SysV ABI requires at call sites. Jumping keeps the caller's alignment
# and lets the fallback return straight to our caller with edi intact.
jmp _rsqrt_fallback
rsqrt_full_neg:
mov eax, 0x7FC00000 # negative input: fixed quiet NaN
ret
rsqrt_full_inf:
xor eax, eax # +infinity input: +0.0
ret
rsqrt_full_nan:
mov eax, edi
or eax, 0x00400000 # NaN input: return it with the quiet bit set
ret
rsqrt_full_zero:
mov eax, edi
or eax, 0x7F800000 # zero input: infinity carrying the input's sign bit
ret
# rsqrt_newton: hardware rsqrt estimate refined by one Newton-Raphson step,
#     y1 = y0 * (1.5 - (0.5 * x) * y0 * y0)
# where x is the masked input and y0 the rsqrtps estimate.
# In:       edi = input bit pattern (single-precision float)
# Out:      eax = result bit pattern with bits 14..0 cleared
# Clobbers: xmm0, xmm1, xmm2, flags
# Inputs that fail the range check are handed to rsqrt_full_bad.
_rsqrt_newton:
rsqrt_newton:
movd xmm0, edi
pand xmm0, [rip + top_mask] # keep bits 31..15 of the input
por xmm0, [rip + penultimate_bit] # force bit 15 to 1
# Low lane of xmm1 = all-ones when the masked input is not greater than the
# threshold or the comparison is unordered; upper lanes come from xmm0 and
# are zero here, so ptest effectively examines only the low lane.
vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm]
ptest xmm1, xmm1
jnz rsqrt_full_bad
# Packed form: all four lanes are estimated, but only the low lane is
# used by the scalar instructions and the final movd below.
rsqrtps xmm1, xmm0 # xmm1 = y0
mulss xmm0, [rip + half] # xmm0 = 0.5 * x
vmulss xmm2, xmm1, xmm1 # xmm2 = y0 * y0
mulss xmm2, xmm0 # xmm2 = 0.5 * x * y0 * y0
movaps xmm0, [rip + one_point_five] # xmm0 = 1.5 (aligned 16-byte load)
subss xmm0, xmm2 # xmm0 = 1.5 - 0.5 * x * y0 * y0
mulss xmm0, xmm1 # xmm0 = y1
paddd xmm0, [rip + ultimate_bit] # integer add of bit 14 on the bit pattern...
pand xmm0, [rip + top_mask] # ...then clear bits 14..0 (round at bit 15)
movd eax, xmm0
ret
# rsqrt_hack: hardware rsqrt estimate adjusted by an integer correction
# term derived from the raw input bits, then masked to bits 31..15.
# In:       edi = input bit pattern (single-precision float)
# Out:      eax = result bit pattern with bits 14..0 cleared
# Clobbers: xmm0, xmm1, xmm2, xmm9, flags
# Inputs that fail the range check are handed to rsqrt_full_bad.
# NOTE(review): the derivation of magic1/magic2/magic3 is not shown in this
# file; the comments below describe only the bit manipulation performed.
_rsqrt_hack:
rsqrt_hack:
movd xmm9, edi # xmm9 keeps the unmasked input for the correction step
vpand xmm0, xmm9, [rip + top_mask] # keep bits 31..15 of the input
por xmm0, [rip + penultimate_bit] # force bit 15 to 1
# detect NaNs, negatives, zeros, denormals and infinities
vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm]
ptest xmm1, xmm1
jnz rsqrt_full_bad
# calculate x64 estimate
# (packed form; only the low lane is read by the final movd)
rsqrtps xmm0, xmm0
# calculate correction factor
vpslld xmm1, xmm9, 8 # input bit 23 (lowest exponent bit) moves to bit 31
vpsrad xmm2, xmm1, 31 # xmm2 = all-ones if that bit was set, else 0
paddd xmm1, [rip + magic1]
pcmpgtd xmm1, [rip + magic2] # signed compare: xmm1 = all-ones or 0
pxor xmm1, xmm2 # invert the compare mask when input bit 23 was set
movaps xmm2, [rip + magic3]
psubd xmm2, xmm1 # low lane = 0x47ff (mask 0) or 0x4800 (mask -1)
# correct x64 estimate
paddd xmm0, xmm2
pand xmm0, [rip + top_mask] # clear bits 14..0 of the corrected estimate
movd eax, xmm0
ret
|