@@ -83,24 +83,24 @@ static inline float32x4_t log_ps(float32x4_t x)
83
83
float32x4_t z = vmulq_f32 (x , x );
84
84
85
85
float32x4_t y = vdupq_n_f32 (c_cephes_log_p0 );
86
- y = vmlaq_f32 (vdupq_n_f32 (c_cephes_log_p1 ), y , x );
87
- y = vmlaq_f32 (vdupq_n_f32 (c_cephes_log_p2 ), y , x );
88
- y = vmlaq_f32 (vdupq_n_f32 (c_cephes_log_p3 ), y , x );
89
- y = vmlaq_f32 (vdupq_n_f32 (c_cephes_log_p4 ), y , x );
90
- y = vmlaq_f32 (vdupq_n_f32 (c_cephes_log_p5 ), y , x );
91
- y = vmlaq_f32 (vdupq_n_f32 (c_cephes_log_p6 ), y , x );
92
- y = vmlaq_f32 (vdupq_n_f32 (c_cephes_log_p7 ), y , x );
93
- y = vmlaq_f32 (vdupq_n_f32 (c_cephes_log_p8 ), y , x );
86
+ y = vfmaq_f32 (vdupq_n_f32 (c_cephes_log_p1 ), y , x );
87
+ y = vfmaq_f32 (vdupq_n_f32 (c_cephes_log_p2 ), y , x );
88
+ y = vfmaq_f32 (vdupq_n_f32 (c_cephes_log_p3 ), y , x );
89
+ y = vfmaq_f32 (vdupq_n_f32 (c_cephes_log_p4 ), y , x );
90
+ y = vfmaq_f32 (vdupq_n_f32 (c_cephes_log_p5 ), y , x );
91
+ y = vfmaq_f32 (vdupq_n_f32 (c_cephes_log_p6 ), y , x );
92
+ y = vfmaq_f32 (vdupq_n_f32 (c_cephes_log_p7 ), y , x );
93
+ y = vfmaq_f32 (vdupq_n_f32 (c_cephes_log_p8 ), y , x );
94
94
y = vmulq_f32 (y , x );
95
95
96
96
y = vmulq_f32 (y , z );
97
97
98
- y = vmlaq_f32 (y , e , vdupq_n_f32 (c_cephes_log_q1 ));
98
+ y = vfmaq_f32 (y , e , vdupq_n_f32 (c_cephes_log_q1 ));
99
99
100
- y = vmlsq_f32 (y , z , vdupq_n_f32 (0.5f ));
100
+ y = vfmsq_f32 (y , z , vdupq_n_f32 (0.5f ));
101
101
102
102
x = vaddq_f32 (x , y );
103
- x = vmlaq_f32 (x , e , vdupq_n_f32 (c_cephes_log_q2 ));
103
+ x = vfmaq_f32 (x , e , vdupq_n_f32 (c_cephes_log_q2 ));
104
104
x = vreinterpretq_f32_u32 (vorrq_u32 (vreinterpretq_u32_f32 (x ), invalid_mask )); // negative arg will be NAN
105
105
return x ;
106
106
}
@@ -129,7 +129,7 @@ static inline float32x4_t exp_ps(float32x4_t x)
129
129
x = vmaxq_f32 (x , vdupq_n_f32 (c_exp_lo ));
130
130
131
131
/* express exp(x) as exp(g + n*log(2)) */
132
- fx = vmlaq_f32 (vdupq_n_f32 (0.5f ), x , vdupq_n_f32 (c_cephes_LOG2EF ));
132
+ fx = vfmaq_f32 (vdupq_n_f32 (0.5f ), x , vdupq_n_f32 (c_cephes_LOG2EF ));
133
133
134
134
/* perform a floorf */
135
135
tmp = vcvtq_f32_s32 (vcvtq_s32_f32 (fx ));
@@ -148,13 +148,13 @@ static inline float32x4_t exp_ps(float32x4_t x)
148
148
z = vmulq_f32 (x , x );
149
149
150
150
float32x4_t y = vdupq_n_f32 (c_cephes_exp_p0 );
151
- y = vmlaq_f32 (vdupq_n_f32 (c_cephes_exp_p1 ), y , x );
152
- y = vmlaq_f32 (vdupq_n_f32 (c_cephes_exp_p2 ), y , x );
153
- y = vmlaq_f32 (vdupq_n_f32 (c_cephes_exp_p3 ), y , x );
154
- y = vmlaq_f32 (vdupq_n_f32 (c_cephes_exp_p4 ), y , x );
155
- y = vmlaq_f32 (vdupq_n_f32 (c_cephes_exp_p5 ), y , x );
151
+ y = vfmaq_f32 (vdupq_n_f32 (c_cephes_exp_p1 ), y , x );
152
+ y = vfmaq_f32 (vdupq_n_f32 (c_cephes_exp_p2 ), y , x );
153
+ y = vfmaq_f32 (vdupq_n_f32 (c_cephes_exp_p3 ), y , x );
154
+ y = vfmaq_f32 (vdupq_n_f32 (c_cephes_exp_p4 ), y , x );
155
+ y = vfmaq_f32 (vdupq_n_f32 (c_cephes_exp_p5 ), y , x );
156
156
157
- y = vmlaq_f32 (x , y , z );
157
+ y = vfmaq_f32 (x , y , z );
158
158
y = vaddq_f32 (y , one );
159
159
160
160
/* build 2^n */
@@ -225,9 +225,9 @@ static inline void sincos_ps(float32x4_t x, float32x4_t* ysin, float32x4_t* ycos
225
225
226
226
/* The magic pass: "Extended precision modular arithmetic"
227
227
* x = ((x - y * DP1) - y * DP2) - y * DP3; */
228
- x = vmlaq_f32 (x , y , vdupq_n_f32 (c_minus_cephes_DP1 ));
229
- x = vmlaq_f32 (x , y , vdupq_n_f32 (c_minus_cephes_DP2 ));
230
- x = vmlaq_f32 (x , y , vdupq_n_f32 (c_minus_cephes_DP3 ));
228
+ x = vfmaq_f32 (x , y , vdupq_n_f32 (c_minus_cephes_DP1 ));
229
+ x = vfmaq_f32 (x , y , vdupq_n_f32 (c_minus_cephes_DP2 ));
230
+ x = vfmaq_f32 (x , y , vdupq_n_f32 (c_minus_cephes_DP3 ));
231
231
232
232
sign_mask_sin = veorq_u32 (sign_mask_sin , vtstq_u32 (emm2 , vdupq_n_u32 (4 )));
233
233
sign_mask_cos = vtstq_u32 (vsubq_u32 (emm2 , vdupq_n_u32 (2 )), vdupq_n_u32 (4 ));
@@ -237,15 +237,15 @@ static inline void sincos_ps(float32x4_t x, float32x4_t* ysin, float32x4_t* ycos
237
237
float32x4_t z = vmulq_f32 (x , x );
238
238
float32x4_t y1 , y2 ;
239
239
240
- y1 = vmlaq_f32 (vdupq_n_f32 (c_coscof_p1 ), z , vdupq_n_f32 (c_coscof_p0 ));
241
- y2 = vmlaq_f32 (vdupq_n_f32 (c_sincof_p1 ), z , vdupq_n_f32 (c_sincof_p0 ));
242
- y1 = vmlaq_f32 (vdupq_n_f32 (c_coscof_p2 ), y1 , z );
243
- y2 = vmlaq_f32 (vdupq_n_f32 (c_sincof_p2 ), y2 , z );
240
+ y1 = vfmaq_f32 (vdupq_n_f32 (c_coscof_p1 ), z , vdupq_n_f32 (c_coscof_p0 ));
241
+ y2 = vfmaq_f32 (vdupq_n_f32 (c_sincof_p1 ), z , vdupq_n_f32 (c_sincof_p0 ));
242
+ y1 = vfmaq_f32 (vdupq_n_f32 (c_coscof_p2 ), y1 , z );
243
+ y2 = vfmaq_f32 (vdupq_n_f32 (c_sincof_p2 ), y2 , z );
244
244
y1 = vmulq_f32 (y1 , z );
245
245
y2 = vmulq_f32 (y2 , z );
246
246
y1 = vmulq_f32 (y1 , z );
247
- y1 = vmlsq_f32 (y1 , z , vdupq_n_f32 (0.5f ));
248
- y2 = vmlaq_f32 (x , y2 , x );
247
+ y1 = vfmsq_f32 (y1 , z , vdupq_n_f32 (0.5f ));
248
+ y2 = vfmaq_f32 (x , y2 , x );
249
249
y1 = vaddq_f32 (y1 , vdupq_n_f32 (1 ));
250
250
251
251
/* select the correct result from the two polynomials */
0 commit comments