@@ -60,12 +60,30 @@ where
6060 #[ cfg( all( target_feature = "avx2" , not( target_feature = "avx512vbmi" ) ) ) ]
6161 32 => transize ( avx2_pshufb, self , idxs) ,
6262 #[ cfg( all( target_feature = "avx512vl" , target_feature = "avx512vbmi" ) ) ]
63- 32 => transize ( x86:: _mm256_permutexvar_epi8, zeroing_idxs ( idxs) , self ) ,
64- // Notable absence: avx512bw shuffle
65- // If avx512bw is available, odds of avx512vbmi are good
66- // FIXME: initial AVX512VBMI variant didn't actually pass muster
67- // #[cfg(target_feature = "avx512vbmi")]
68- // 64 => transize(x86::_mm512_permutexvar_epi8, self, idxs),
63+ 32 => {
64+ // Unlike vpshufb, vpermb doesn't zero out values in the result based on the index high bit
65+ let swizzler = |bytes, idxs| {
66+ let mask = x86:: _mm256_cmp_epu8_mask :: < { x86:: _MM_CMPINT_LT } > (
67+ idxs,
68+ Simd :: < u8 , 32 > :: splat ( N as u8 ) . into ( ) ,
69+ ) ;
70+ x86:: _mm256_maskz_permutexvar_epi8 ( mask, idxs, bytes)
71+ } ;
72+ transize ( swizzler, self , idxs)
73+ }
74+ // Notable absence: avx512bw pshufb shuffle
75+ #[ cfg( all( target_feature = "avx512vl" , target_feature = "avx512vbmi" ) ) ]
76+ 64 => {
77+ // Unlike vpshufb, vpermb doesn't zero out values in the result based on the index high bit
78+ let swizzler = |bytes, idxs| {
79+ let mask = x86:: _mm512_cmp_epu8_mask :: < { x86:: _MM_CMPINT_LT } > (
80+ idxs,
81+ Simd :: < u8 , 64 > :: splat ( N as u8 ) . into ( ) ,
82+ ) ;
83+ x86:: _mm512_maskz_permutexvar_epi8 ( mask, idxs, bytes)
84+ } ;
85+ transize ( swizzler, self , idxs)
86+ }
6987 _ => {
7088 let mut array = [ 0 ; N ] ;
7189 for ( i, k) in idxs. to_array ( ) . into_iter ( ) . enumerate ( ) {
0 commit comments