Skip to content

Commit 7fd167f

Browse files
authored
tanh avx512 mask optimization (#6096)
1 parent 5cd7653 commit 7fd167f

File tree

1 file changed

+12
-3
lines changed

1 file changed

+12
-3
lines changed

src/layer/x86/tanh_x86.cpp

Lines changed: 12 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -40,8 +40,6 @@ int TanH_x86::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
4040
float* ptr = bottom_top_blob.channel(q);
4141

4242
int i = 0;
43-
#if __SSE2__
44-
#if __AVX__
4543
#if __AVX512F__
4644
for (; i + 15 < size; i += 16)
4745
{
@@ -50,7 +48,17 @@ int TanH_x86::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
5048
_mm512_storeu_ps(ptr, _p);
5149
ptr += 16;
5250
}
53-
#endif
51+
if (i < size)
52+
{
53+
const unsigned int remain = size - i;
54+
__mmask16 _mask = (__mmask16)((1u << remain) - 1);
55+
__m512 _p = _mm512_maskz_loadu_ps(_mask, ptr);
56+
_p = tanh_avx512(_p);
57+
_mm512_mask_storeu_ps(ptr, _mask, _p);
58+
}
59+
#else // __AVX512F__
60+
#if __SSE2__
61+
#if __AVX__
5462
for (; i + 7 < size; i += 8)
5563
{
5664
__m256 _p = _mm256_loadu_ps(ptr);
@@ -72,6 +80,7 @@ int TanH_x86::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
7280
*ptr = tanhf(*ptr);
7381
ptr++;
7482
}
83+
#endif // __AVX512F__
7584
}
7685

7786
return 0;

0 commit comments

Comments (0)