# path: root/tests/rsqrt_test_fn.s
# blob: fa8dadee01d9c3fb44fbef183dc5ffb456412fd6
.global _rsqrt_inaccurate
.global rsqrt_inaccurate
.global _rsqrt_full
.global rsqrt_full
.global _rsqrt_full_gpr
.global rsqrt_full_gpr
.global _rsqrt_full_nb
.global rsqrt_full_nb
.global _rsqrt_full_nb2
.global rsqrt_full_nb2
.global _rsqrt_full_nb_gpr
.global rsqrt_full_nb_gpr
.global _rsqrt_newton
.global rsqrt_newton
.global _rsqrt_hack
.global rsqrt_hack
.global _rsqrt_fallback

.text
.intel_syntax noprefix

# Shared constants. Every entry is one dword followed by three zero dwords,
# i.e. exactly 16 bytes, so each label stays 16-byte aligned and can be used
# as a 128-bit memory operand of the legacy SSE instructions in this file
# (pand / por / paddd / movaps), which require an aligned operand.
#
# .p2align 4 requests 2^4 = 16-byte alignment on every target. The meaning
# of a plain ".align 16" depends on the object format (byte count on some
# targets, power of two on others).
.p2align 4
# Threshold for the fast-path range check: exponent field 1, mantissa 0.
min_pos_denorm:
.long 0x00800000,0,0,0
# Bit 15: the lowest bit kept by top_mask.
penultimate_bit:
.long 0x00008000,0,0,0
# Bit 14: half of the lowest kept bit, added before masking to round.
ultimate_bit:
.long 0x00004000,0,0,0
# Keeps bits 31..15 and clears bits 14..0.
top_mask:
.long 0xFFFF8000,0,0,0
# 1.0f
one:
.long 0x3f800000,0,0,0
# 0.5f
half:
.long 0x3f000000,0,0,0
# 1.5f
one_point_five:
.long 0x3fc00000,0,0,0
# magic1..magic3 are the constants of the correction step in rsqrt_hack.
magic1:
.long 0x60000000,0,0,0
magic2:
.long 0x3c000000,0,0,0
magic3:
.long 0x000047ff,0,0,0

# rsqrt_inaccurate: raw hardware reciprocal-square-root estimate.
# In:    edi = single-precision bit pattern of the input
# Out:   eax = bit pattern of the RSQRTSS result (no refinement, no rounding)
# Clobb: xmm0
_rsqrt_inaccurate:
rsqrt_inaccurate:
    movd xmm0, edi

    rsqrtss xmm0, xmm0

    movd eax, xmm0
    ret

# rsqrt_full: 1/sqrt(x) via sqrtss + divss on a truncated input, with the
# result rounded to the bits kept by top_mask.
# In:    edi = single-precision bit pattern of x
# Out:   eax = bit pattern of the result
# Clobb: xmm0, xmm1, flags
# Inputs that fail the range check are handed to rsqrt_full_bad.
_rsqrt_full:
rsqrt_full:
    movd xmm0, edi

    # keep bits 31..15 of the input and force bit 15 on
    pand xmm0, [rip + top_mask]
    por xmm0, [rip + penultimate_bit]

    # low dword of xmm1 = all-ones when xmm0 is not greater than the
    # threshold or the compare is unordered (NaN); the upper lanes are
    # copied from xmm0, which movd zeroed, so ptest only sees that mask
    vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm]
    ptest xmm1, xmm1
    jnz rsqrt_full_bad

    sqrtss xmm0, xmm0

    # xmm1 = 1.0 / sqrt(x)
    movd xmm1, [rip + one]
    divss xmm1, xmm0

    # integer-add half of the lowest kept bit, then clear bits 14..0:
    # rounds the bit pattern to the precision kept by top_mask
    paddd xmm1, [rip + ultimate_bit]
    pand xmm1, [rip + top_mask]

    movd eax, xmm1
    ret

# rsqrt_full_gpr: same computation as rsqrt_full, but the input masking and
# the final rounding are done in eax with immediates instead of in xmm
# registers with memory constants.
# In:    edi = single-precision bit pattern of x
# Out:   eax = bit pattern of the result (also copied into xmm0)
# Clobb: xmm0, xmm1, flags
# Inputs that fail the range check are handed to rsqrt_full_bad.
_rsqrt_full_gpr:
rsqrt_full_gpr:
    # the value moved here is overwritten by the next instruction; the move
    # exists only to model an extra xmm->gpr transfer
    movd eax, xmm0 # Emulate regalloc mov

    # keep bits 31..15 of the input and force bit 15 on
    mov eax, edi
    and eax, 0xFFFF8000
    or eax, 0x00008000

    # take the slow path when the masked value is not greater than the
    # threshold, or is a NaN
    movd xmm0, eax
    vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm]
    ptest xmm1, xmm1
    jnz rsqrt_full_bad

    sqrtss xmm0, xmm0

    # xmm1 = 1.0 / sqrt(x)
    movd xmm1, [rip + one]
    divss xmm1, xmm0
    movd eax, xmm1

    # add half of the lowest kept bit, then clear bits 14..0
    add eax, 0x00004000
    and eax, 0xffff8000

    movd xmm0, eax # Emulate regalloc mov
    ret

# rsqrt_full_nb2: same fast path as rsqrt_full_nb, but the range check uses
# ucomiss + jna instead of vcmpngt_uqss + ptest + jnz.
# In:    edi = single-precision bit pattern of x
# Out:   eax = bit pattern of the result
# Clobb: xmm0, xmm1, flags
# Inputs that fail the range check are handed to rsqrt_full_bad_new1.
_rsqrt_full_nb2:
rsqrt_full_nb2:
    movd xmm0, edi

    # keep bits 31..15 of the input and force bit 15 on
    pand xmm0, [rip + top_mask]
    por xmm0, [rip + penultimate_bit]

    # jna is taken on CF=1 or ZF=1: masked value below or equal to the
    # threshold, or unordered (NaN)
    ucomiss xmm0, [rip + min_pos_denorm]
    jna rsqrt_full_bad_new1

    sqrtss xmm0, xmm0

    # xmm1 = 1.0 / sqrt(x)
    movd xmm1, [rip + one]
    divss xmm1, xmm0

    # integer-add half of the lowest kept bit, then clear bits 14..0
    paddd xmm1, [rip + ultimate_bit]
    pand xmm1, [rip + top_mask]

    movd eax, xmm1
    ret

# rsqrt_full_nb: same fast path as rsqrt_full; only the slow-path target
# differs (rsqrt_full_bad_new1 instead of rsqrt_full_bad).
# In:    edi = single-precision bit pattern of x
# Out:   eax = bit pattern of the result
# Clobb: xmm0, xmm1, flags
_rsqrt_full_nb:
rsqrt_full_nb:
    movd xmm0, edi

    # keep bits 31..15 of the input and force bit 15 on
    pand xmm0, [rip + top_mask]
    por xmm0, [rip + penultimate_bit]

    # low dword of xmm1 = all-ones when the masked value is not greater
    # than the threshold or is a NaN; any set bit selects the slow path
    vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm]
    ptest xmm1, xmm1
    jnz rsqrt_full_bad_new1

    sqrtss xmm0, xmm0

    # xmm1 = 1.0 / sqrt(x)
    movd xmm1, [rip + one]
    divss xmm1, xmm0

    # integer-add half of the lowest kept bit, then clear bits 14..0
    paddd xmm1, [rip + ultimate_bit]
    pand xmm1, [rip + top_mask]

    movd eax, xmm1
    ret

# rsqrt_full_bad_new1: slow path shared by rsqrt_full_nb and rsqrt_full_nb2.
# Reached by a jump, so the stack still looks as it did on entry to the
# exported function.
# In:    edi = original (unmasked) input bit pattern
# Out:   eax = result bit pattern
# Clobb: xmm0, xmm1, flags, plus whatever _rsqrt_fallback clobbers
rsqrt_full_bad_new1:
    # unsigned compare: only +0.0 and positive denormals are below 0x00800000
    cmp edi, 0x00800000
    jb rsqrt_full_bad_new_fallback1

    movd xmm0, edi
    rsqrtss xmm1, xmm0

    # PF=1 after comparing a value with itself <=> the estimate is a NaN
    ucomiss xmm1, xmm1
    jp rsqrt_full_bad_new1_nan

    movd eax, xmm1
    ret

rsqrt_full_bad_new_fallback1:
    # Tail call. A plain `call` here would run with rsp % 16 == 8, which
    # violates the 16-byte stack alignment required at a call site; jumping
    # keeps the caller's alignment and lets _rsqrt_fallback return directly
    # to our caller with its result in eax.
    jmp _rsqrt_fallback

rsqrt_full_bad_new1_nan:
    # The estimate is a NaN. If the input itself was a NaN, return the
    # estimate; otherwise return the fixed quiet NaN 0x7FC00000.
    ucomiss xmm0, xmm0
    jp rsqrt_full_bad_new1_nan_ret

    mov eax, 0x7FC00000
    ret

rsqrt_full_bad_new1_nan_ret:
    # eax has not been written on this path yet; load the NaN that rsqrtss
    # produced so the caller does not receive a stale register value
    movd eax, xmm1
    ret

# rsqrt_full_nb_gpr: same fast path as rsqrt_full_gpr (masking and rounding
# done in eax); only the slow-path target differs (rsqrt_full_bad_new2).
# In:    edi = single-precision bit pattern of x
# Out:   eax = bit pattern of the result (also copied into xmm0)
# Clobb: xmm0, xmm1, flags
_rsqrt_full_nb_gpr:
rsqrt_full_nb_gpr:
    # the value moved here is overwritten by the next instruction; the move
    # exists only to model an extra xmm->gpr transfer
    movd eax, xmm0 # Emulate regalloc mov

    # keep bits 31..15 of the input and force bit 15 on
    mov eax, edi
    and eax, 0xFFFF8000
    or eax, 0x00008000

    # take the slow path when the masked value is not greater than the
    # threshold, or is a NaN
    movd xmm0, eax
    vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm]
    ptest xmm1, xmm1
    jnz rsqrt_full_bad_new2

    sqrtss xmm0, xmm0

    # xmm1 = 1.0 / sqrt(x)
    movd xmm1, [rip + one]
    divss xmm1, xmm0
    movd eax, xmm1

    # add half of the lowest kept bit, then clear bits 14..0
    add eax, 0x00004000
    and eax, 0xffff8000

    movd xmm0, eax # Emulate regalloc mov
    ret

# rsqrt_full_bad_new2: slow path of rsqrt_full_nb_gpr.
# Reached by a jump, so the stack still looks as it did on entry to the
# exported function.
# In:    edi = original (unmasked) input bit pattern
# Out:   eax = result bit pattern
# Clobb: xmm0, xmm1, flags, plus whatever _rsqrt_fallback clobbers
rsqrt_full_bad_new2:
    # unsigned compare: only +0.0 and positive denormals are below 0x00800000
    cmp edi, 0x00800000
    jb rsqrt_full_bad_new_fallback2

    movd xmm0, edi
    rsqrtss xmm1, xmm0

    # Any input with the sign bit set gets the fixed quiet NaN; the estimate
    # computed above is discarded in that case.
    # NOTE(review): this includes -0.0 and sign-bit-set NaNs, for which
    # rsqrt_full_bad returns a signed infinity / the quieted input NaN
    # instead - confirm the difference is intended.
    test edi, edi
    js rsqrt_full_bad_new2_nan

    movd eax, xmm1
    ret

rsqrt_full_bad_new_fallback2:
    # Tail call. A plain `call` here would run with rsp % 16 == 8, which
    # violates the 16-byte stack alignment required at a call site; jumping
    # keeps the caller's alignment and lets _rsqrt_fallback return directly
    # to our caller with its result in eax.
    jmp _rsqrt_fallback

rsqrt_full_bad_new2_nan:
    mov eax, 0x7FC00000
    ret

# rsqrt_full_bad: slow path shared by rsqrt_full, rsqrt_full_gpr,
# rsqrt_newton and rsqrt_hack. Classifies the original input and returns a
# fixed result for NaN, zero, negative and +infinity; everything else is
# forwarded to _rsqrt_fallback.
# Reached by a jump, so the stack still looks as it did on entry to the
# exported function.
# In:    edi = original (unmasked) input bit pattern
# Out:   eax = result bit pattern
# Clobb: xmm0, xmm1, flags, plus whatever _rsqrt_fallback clobbers
rsqrt_full_bad:
    # compare the input against 0.0:
    #   PF=1 -> unordered (NaN), ZF=1 -> +/-0.0, CF=1 -> negative
    # the order of the three branches matters because an unordered compare
    # sets ZF and CF as well
    xorps xmm1, xmm1
    movd xmm0, edi
    ucomiss xmm0, xmm1
    jp rsqrt_full_nan
    je rsqrt_full_zero
    jc rsqrt_full_neg

    # positive from here on
    cmp edi, 0x7F800000
    je rsqrt_full_inf

    # TODO: Full Denormal Implementation
    # Tail call. A plain `call` here would run with rsp % 16 == 8, which
    # violates the 16-byte stack alignment required at a call site; jumping
    # keeps the caller's alignment and lets _rsqrt_fallback return directly
    # to our caller with its result in eax.
    jmp _rsqrt_fallback

rsqrt_full_neg:
    # negative input -> fixed quiet NaN
    mov eax, 0x7FC00000
    ret

rsqrt_full_inf:
    # +infinity -> +0.0
    xor eax, eax
    ret

rsqrt_full_nan:
    # NaN input -> same NaN with bit 22 (the quiet bit) set
    mov eax, edi
    or eax, 0x00400000
    ret

rsqrt_full_zero:
    # +/-0.0 -> infinity carrying the sign bit of the input
    mov eax, edi
    or eax, 0x7F800000
    ret

# rsqrt_newton: hardware rsqrt estimate refined by one Newton-Raphson step,
# then rounded to the bits kept by top_mask.
# In:    edi = single-precision bit pattern of x
# Out:   eax = bit pattern of the result
# Clobb: xmm0, xmm1, xmm2, flags
# Inputs that fail the range check are handed to rsqrt_full_bad.
_rsqrt_newton:
rsqrt_newton:
    movd xmm0, edi

    # keep bits 31..15 of the input and force bit 15 on
    pand xmm0, [rip + top_mask]
    por xmm0, [rip + penultimate_bit]

    # take the slow path when the masked value is not greater than the
    # threshold, or is a NaN
    vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm]
    ptest xmm1, xmm1
    jnz rsqrt_full_bad

    # y0 = estimate; result = y0 * (1.5 - 0.5 * x * y0 * y0)
    # (only the low lane is used; x is the masked input)
    rsqrtps xmm1, xmm0
    # xmm0 = 0.5 * x
    mulss xmm0, [rip + half]
    # xmm2 = y0 * y0
    vmulss xmm2, xmm1, xmm1
    # xmm2 = 0.5 * x * y0 * y0
    mulss xmm2, xmm0
    # xmm0 = 1.5 - 0.5 * x * y0 * y0
    movaps xmm0, [rip + one_point_five]
    subss xmm0, xmm2
    # xmm0 = y0 * (1.5 - 0.5 * x * y0 * y0)
    mulss xmm0, xmm1

    # integer-add half of the lowest kept bit, then clear bits 14..0
    paddd xmm0, [rip + ultimate_bit]
    pand xmm0, [rip + top_mask]

    movd eax, xmm0
    ret

# rsqrt_hack: hardware rsqrt estimate adjusted by an integer correction
# derived from the input bits, then truncated to the bits kept by top_mask.
# In:    edi = single-precision bit pattern of x
# Out:   eax = bit pattern of the result
# Clobb: xmm0, xmm1, xmm2, xmm9, flags
# Inputs that fail the range check are handed to rsqrt_full_bad.
_rsqrt_hack:
rsqrt_hack:
    # xmm9 keeps the unmasked input for the correction step below
    movd xmm9, edi

    # xmm0 = input with bits 14..0 cleared and bit 15 forced on
    vpand xmm0, xmm9, [rip + top_mask]
    por xmm0, [rip + penultimate_bit]

    # detect NaNs, negatives, zeros, denormals and infinities
    vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm]
    ptest xmm1, xmm1
    jnz rsqrt_full_bad

    # calculate x64 estimate
    rsqrtps xmm0, xmm0

    # calculate correction factor
    # xmm1 = input << 8: input bit 23 (lowest exponent bit) lands in bit 31
    vpslld xmm1, xmm9, 8
    # xmm2 = all-ones when input bit 23 is set, else zero
    vpsrad xmm2, xmm1, 31
    paddd xmm1, [rip + magic1]
    # xmm1 = all-ones when (shifted input + magic1) > magic2 (signed), else 0
    pcmpgtd xmm1, [rip + magic2]
    # flip that result when input bit 23 is set
    pxor xmm1, xmm2
    # xmm2 = magic3 - (0 or -1) = 0x47ff or 0x4800
    movaps xmm2, [rip + magic3]
    psubd xmm2, xmm1

    # correct x64 estimate
    # integer-add the correction to the estimate's bit pattern, then clear
    # bits 14..0
    paddd xmm0, xmm2
    pand xmm0, [rip + top_mask]

    movd eax, xmm0
    ret