Skip to content

Commit 279edd8

Browse files
Replace vmlaq_f32 with vfmaq_f32 (fused version)
1 parent c276398 commit 279edd8

File tree

1 file changed

+27
-27
lines changed

1 file changed

+27
-27
lines changed

src/layer/arm/neon_mathfun.h

Lines changed: 27 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -83,24 +83,24 @@ static inline float32x4_t log_ps(float32x4_t x)
8383
float32x4_t z = vmulq_f32(x, x);
8484

8585
float32x4_t y = vdupq_n_f32(c_cephes_log_p0);
86-
y = vmlaq_f32(vdupq_n_f32(c_cephes_log_p1), y, x);
87-
y = vmlaq_f32(vdupq_n_f32(c_cephes_log_p2), y, x);
88-
y = vmlaq_f32(vdupq_n_f32(c_cephes_log_p3), y, x);
89-
y = vmlaq_f32(vdupq_n_f32(c_cephes_log_p4), y, x);
90-
y = vmlaq_f32(vdupq_n_f32(c_cephes_log_p5), y, x);
91-
y = vmlaq_f32(vdupq_n_f32(c_cephes_log_p6), y, x);
92-
y = vmlaq_f32(vdupq_n_f32(c_cephes_log_p7), y, x);
93-
y = vmlaq_f32(vdupq_n_f32(c_cephes_log_p8), y, x);
86+
y = vfmaq_f32(vdupq_n_f32(c_cephes_log_p1), y, x);
87+
y = vfmaq_f32(vdupq_n_f32(c_cephes_log_p2), y, x);
88+
y = vfmaq_f32(vdupq_n_f32(c_cephes_log_p3), y, x);
89+
y = vfmaq_f32(vdupq_n_f32(c_cephes_log_p4), y, x);
90+
y = vfmaq_f32(vdupq_n_f32(c_cephes_log_p5), y, x);
91+
y = vfmaq_f32(vdupq_n_f32(c_cephes_log_p6), y, x);
92+
y = vfmaq_f32(vdupq_n_f32(c_cephes_log_p7), y, x);
93+
y = vfmaq_f32(vdupq_n_f32(c_cephes_log_p8), y, x);
9494
y = vmulq_f32(y, x);
9595

9696
y = vmulq_f32(y, z);
9797

98-
y = vmlaq_f32(y, e, vdupq_n_f32(c_cephes_log_q1));
98+
y = vfmaq_f32(y, e, vdupq_n_f32(c_cephes_log_q1));
9999

100-
y = vmlsq_f32(y, z, vdupq_n_f32(0.5f));
100+
y = vfmsq_f32(y, z, vdupq_n_f32(0.5f));
101101

102102
x = vaddq_f32(x, y);
103-
x = vmlaq_f32(x, e, vdupq_n_f32(c_cephes_log_q2));
103+
x = vfmaq_f32(x, e, vdupq_n_f32(c_cephes_log_q2));
104104
x = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(x), invalid_mask)); // negative arg will be NAN
105105
return x;
106106
}
@@ -129,7 +129,7 @@ static inline float32x4_t exp_ps(float32x4_t x)
129129
x = vmaxq_f32(x, vdupq_n_f32(c_exp_lo));
130130

131131
/* express exp(x) as exp(g + n*log(2)) */
132-
fx = vmlaq_f32(vdupq_n_f32(0.5f), x, vdupq_n_f32(c_cephes_LOG2EF));
132+
fx = vfmaq_f32(vdupq_n_f32(0.5f), x, vdupq_n_f32(c_cephes_LOG2EF));
133133

134134
/* perform a floorf */
135135
tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx));
@@ -148,13 +148,13 @@ static inline float32x4_t exp_ps(float32x4_t x)
148148
z = vmulq_f32(x, x);
149149

150150
float32x4_t y = vdupq_n_f32(c_cephes_exp_p0);
151-
y = vmlaq_f32(vdupq_n_f32(c_cephes_exp_p1), y, x);
152-
y = vmlaq_f32(vdupq_n_f32(c_cephes_exp_p2), y, x);
153-
y = vmlaq_f32(vdupq_n_f32(c_cephes_exp_p3), y, x);
154-
y = vmlaq_f32(vdupq_n_f32(c_cephes_exp_p4), y, x);
155-
y = vmlaq_f32(vdupq_n_f32(c_cephes_exp_p5), y, x);
151+
y = vfmaq_f32(vdupq_n_f32(c_cephes_exp_p1), y, x);
152+
y = vfmaq_f32(vdupq_n_f32(c_cephes_exp_p2), y, x);
153+
y = vfmaq_f32(vdupq_n_f32(c_cephes_exp_p3), y, x);
154+
y = vfmaq_f32(vdupq_n_f32(c_cephes_exp_p4), y, x);
155+
y = vfmaq_f32(vdupq_n_f32(c_cephes_exp_p5), y, x);
156156

157-
y = vmlaq_f32(x, y, z);
157+
y = vfmaq_f32(x, y, z);
158158
y = vaddq_f32(y, one);
159159

160160
/* build 2^n */
@@ -225,9 +225,9 @@ static inline void sincos_ps(float32x4_t x, float32x4_t* ysin, float32x4_t* ycos
225225

226226
/* The magic pass: "Extended precision modular arithmetic"
227227
* x = ((x - y * DP1) - y * DP2) - y * DP3; */
228-
x = vmlaq_f32(x, y, vdupq_n_f32(c_minus_cephes_DP1));
229-
x = vmlaq_f32(x, y, vdupq_n_f32(c_minus_cephes_DP2));
230-
x = vmlaq_f32(x, y, vdupq_n_f32(c_minus_cephes_DP3));
228+
x = vfmaq_f32(x, y, vdupq_n_f32(c_minus_cephes_DP1));
229+
x = vfmaq_f32(x, y, vdupq_n_f32(c_minus_cephes_DP2));
230+
x = vfmaq_f32(x, y, vdupq_n_f32(c_minus_cephes_DP3));
231231

232232
sign_mask_sin = veorq_u32(sign_mask_sin, vtstq_u32(emm2, vdupq_n_u32(4)));
233233
sign_mask_cos = vtstq_u32(vsubq_u32(emm2, vdupq_n_u32(2)), vdupq_n_u32(4));
@@ -237,15 +237,15 @@ static inline void sincos_ps(float32x4_t x, float32x4_t* ysin, float32x4_t* ycos
237237
float32x4_t z = vmulq_f32(x, x);
238238
float32x4_t y1, y2;
239239

240-
y1 = vmlaq_f32(vdupq_n_f32(c_coscof_p1), z, vdupq_n_f32(c_coscof_p0));
241-
y2 = vmlaq_f32(vdupq_n_f32(c_sincof_p1), z, vdupq_n_f32(c_sincof_p0));
242-
y1 = vmlaq_f32(vdupq_n_f32(c_coscof_p2), y1, z);
243-
y2 = vmlaq_f32(vdupq_n_f32(c_sincof_p2), y2, z);
240+
y1 = vfmaq_f32(vdupq_n_f32(c_coscof_p1), z, vdupq_n_f32(c_coscof_p0));
241+
y2 = vfmaq_f32(vdupq_n_f32(c_sincof_p1), z, vdupq_n_f32(c_sincof_p0));
242+
y1 = vfmaq_f32(vdupq_n_f32(c_coscof_p2), y1, z);
243+
y2 = vfmaq_f32(vdupq_n_f32(c_sincof_p2), y2, z);
244244
y1 = vmulq_f32(y1, z);
245245
y2 = vmulq_f32(y2, z);
246246
y1 = vmulq_f32(y1, z);
247-
y1 = vmlsq_f32(y1, z, vdupq_n_f32(0.5f));
248-
y2 = vmlaq_f32(x, y2, x);
247+
y1 = vfmsq_f32(y1, z, vdupq_n_f32(0.5f));
248+
y2 = vfmaq_f32(x, y2, x);
249249
y1 = vaddq_f32(y1, vdupq_n_f32(1));
250250

251251
/* select the correct result from the two polynomials */

0 commit comments

Comments
 (0)