 #include <x86intrin.h>
 #endif
 #endif
+#ifdef __ARM_FEATURE_SVE
+#include <arm_sve.h>
+#endif
+#ifdef __ARM_NEON
+#include <arm_neon.h>
+#endif
 #include "picohttpparser.h"
 
 #if __GNUC__ >= 3
 #define ADVANCE_TOKEN(tok, toklen) \
     do { \
         const char *tok_start = buf; \
-        static const char ALIGNED(16) ranges2[16] = "\000\040\177\177"; \
         int found2; \
-        buf = findchar_fast(buf, buf_end, ranges2, 4, &found2); \
+        buf = findchar_nonprintable_fast(buf, buf_end, &found2); \
         if (!found2) { \
             CHECK_EOF(); \
         } \
@@ -131,6 +136,69 @@ static const char *findchar_fast(const char *buf, const char *buf_end, const cha
     return buf;
 }
 
+static const char *findchar_nonprintable_fast(const char *buf, const char *buf_end, int *found)
+{
+#ifdef __ARM_FEATURE_SVE
+    *found = 0;
+
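+    // Scan one full SVE vector of bytes per iteration: svqincb() advances i by the
+    // vector length in bytes, and svwhilelt_b8() deactivates the lanes past the end
+    // of the buffer, so no separate tail loop is needed.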
+    for (uint64_t i = 0;; i = svqincb(i, 1)) {
+        const uint64_t len = buf_end - buf;
+        const svbool_t pg = svwhilelt_b8(i, len);
+
+        if (!svptest_first(svptrue_b8(), pg)) {
+            buf = buf_end;
+            break;
+        }
+
+        const svuint8_t v = svld1(pg, (const uint8_t *) buf + i);
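+        // Match bytes below '!' (control characters and space) as well as DEL.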
+        const svbool_t c = svorr_z(pg, svcmplt(pg, v, '\041'), svcmpeq(pg, v, '\177'));
+
+        if (svptest_any(pg, c)) {
+            *found = 1;
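+            // svbrkb_z() keeps only the lanes before the first match, so counting
+            // them gives the offset of that match within the current vector.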
+            buf += i + svcntp_b8(pg, svbrkb_z(pg, c));
+            break;
+        }
+    }
+
+    return buf;
+#elif defined(__ARM_NEON) && defined(__ARM_64BIT_STATE)
+    *found = 0;
+
+    const size_t block_size = sizeof(uint8x16_t) - 1;
+    const char *const end = (size_t) (buf_end - buf) >= block_size ? buf_end - block_size : buf;
+
+    for (; buf < end; buf += sizeof(uint8x16_t)) {
+        // This mask makes it possible to pack the comparison result into half a vector,
+        // which has the same size as uint64_t.
+        const uint16x8_t mask = vmovq_n_u16(0x0f00);
+        uint8x16_t v = vld1q_u8((const uint8_t *) buf);
+
+        v = vorrq_u8(vcltq_u8(v, vmovq_n_u8('\041')), vceqq_u8(v, vmovq_n_u8('\177')));
+        v = vreinterpretq_u8_u16(vbicq_u16(vreinterpretq_u16_u8(v), mask));
+        // Pack the comparison result into 64 bits.
+        v = vpmaxq_u8(v, v);
+
+        uint64_t offset = vgetq_lane_u64(vreinterpretq_u64_u8(v), 0);
+
+        if (offset) {
+            *found = 1;
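+            // rbit + clz counts the trailing zero bits, i.e. locates the lowest set bit.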
+            __asm__("rbit %x0, %x0" : "+r"(offset));
+            static_assert(sizeof(unsigned long long) == sizeof(uint64_t),
+                          "Need the number of leading 0-bits in uint64_t.");
+            // offset uses 4 bits per byte of input.
+            buf += __builtin_clzll(offset) / 4;
+            break;
+        }
+    }
+
+    return buf;
+#else
+    static const char ALIGNED(16) ranges2[16] = "\000\040\177\177";
+
+    return findchar_fast(buf, buf_end, ranges2, 4, found);
+#endif
+}
+
 static const char *get_token_to_eol(const char *buf, const char *buf_end, const char **token, size_t *token_len, int *ret)
 {
     const char *token_start = buf;
@@ -143,6 +211,76 @@ static const char *get_token_to_eol(const char *buf, const char *buf_end, const
     buf = findchar_fast(buf, buf_end, ranges1, 6, &found);
     if (found)
         goto FOUND_CTL;
+#elif defined(__ARM_FEATURE_SVE)
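+    // Vector-length-agnostic scan, as in findchar_nonprintable_fast(), but here the
+    // bytes to stop at are the control characters other than TAB, plus DEL.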
+    for (uint64_t i = 0;; i = svqincb(i, 1)) {
+        const uint64_t len = buf_end - buf;
+        const svbool_t pg = svwhilelt_b8(i, len);
+
+        if (!svptest_first(svptrue_b8(), pg)) {
+            buf = buf_end;
+            break;
+        }
+
+        const svuint8_t v = svld1(pg, (const uint8_t *) buf + i);
+        const uint8_t space = '\040';
+
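+        // Quick filter: subtracting 0x20 maps printable ASCII (0x20-0x7e) to 0x00-0x5e,
+        // so only vectors containing a byte that is >= 0137 after the subtraction (a
+        // control character, DEL, or a byte above 0x7e) need the precise test below.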
+        if (svptest_any(pg, svcmpge(pg, svsub_x(pg, v, space), 0137u))) {
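+            // Precise test: bytes below 0x20 other than TAB, or DEL; bytes above
+            // 0x7e do not stop the scan.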
+            svbool_t c = svcmpne(svcmplt(pg, v, space), v, '\011');
+
+            c = svorr_z(pg, c, svcmpeq(pg, v, '\177'));
+
+            if (svptest_any(pg, c)) {
+                buf += i + svcntp_b8(pg, svbrkb_z(pg, c));
+                goto FOUND_CTL;
+            }
+        }
+    }
+#elif defined(__ARM_NEON) && defined(__ARM_64BIT_STATE)
+    const size_t block_size = 2 * sizeof(uint8x16_t) - 1;
+    const char *const end = (size_t) (buf_end - buf) >= block_size ? buf_end - block_size : buf;
+
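+    // Process two 16-byte vectors per iteration; any tail shorter than 32 bytes is
+    // left to the byte-at-a-time loop that follows this block.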
+    for (; buf < end; buf += 2 * sizeof(uint8x16_t)) {
+        const uint8x16_t space = vmovq_n_u8('\040');
+        const uint8x16_t threshold = vmovq_n_u8(0137u);
+        const uint8x16_t v1 = vld1q_u8((const uint8_t *) buf);
+        const uint8x16_t v2 = vld1q_u8((const uint8_t *) buf + sizeof(v1));
+        uint8x16_t v3 = vcgeq_u8(vsubq_u8(v1, space), threshold);
+        uint8x16_t v4 = vcgeq_u8(vsubq_u8(v2, space), threshold);
+
+        v3 = vorrq_u8(v3, v4);
+        // Pack the comparison result into half a vector, i.e. 64 bits.
+        v3 = vpmaxq_u8(v3, v3);
+
+        if (vgetq_lane_u64(vreinterpretq_u64_u8(v3), 0)) {
+            const uint8x16_t del = vmovq_n_u8('\177');
+            // This mask makes it possible to pack the comparison results into half a vector,
+            // which has the same size as uint64_t.
+            const uint8x16_t mask = vreinterpretq_u8_u32(vmovq_n_u32(0x40100401));
+            const uint8x16_t tab = vmovq_n_u8('\011');
+
+            v3 = vbicq_u8(vcltq_u8(v1, space), vceqq_u8(v1, tab));
+            v4 = vbicq_u8(vcltq_u8(v2, space), vceqq_u8(v2, tab));
+            v3 = vorrq_u8(v3, vceqq_u8(v1, del));
+            v4 = vorrq_u8(v4, vceqq_u8(v2, del));
+            // After masking, four consecutive bytes in the results do not have the same bits set.
+            v3 = vandq_u8(v3, mask);
+            v4 = vandq_u8(v4, mask);
+            // Pack the comparison results into 128, and then 64 bits.
+            v3 = vpaddq_u8(v3, v4);
+            v3 = vpaddq_u8(v3, v3);
+
+            uint64_t offset = vgetq_lane_u64(vreinterpretq_u64_u8(v3), 0);
+
+            if (offset) {
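+                // As above, rbit + clz locates the lowest set bit of offset.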
+                __asm__("rbit %x0, %x0" : "+r"(offset));
+                static_assert(sizeof(unsigned long long) == sizeof(uint64_t),
+                              "Need the number of leading 0-bits in uint64_t.");
+                // offset uses 2 bits per byte of input.
+                buf += __builtin_clzll(offset) / 2;
+                goto FOUND_CTL;
+            }
+        }
+    }
 #else
     /* find non-printable char within the next 8 bytes, this is the hottest code; manually inlined */
     while (likely(buf_end - buf >= 8)) {
@@ -258,6 +396,31 @@ static const char *parse_token(const char *buf, const char *buf_end, const char
     const char *buf_start = buf;
     int found;
     buf = findchar_fast(buf, buf_end, ranges, sizeof(ranges) - 1, &found);
+
+#ifdef __ARM_FEATURE_SVE
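+    // findchar_fast() only does the scan when SSE4.2 is available; on SVE targets it
+    // returns with found == 0, and the real work happens here instead.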
+    if (!found) {
+        for (uint64_t i = 0;; i = svqincw(i, 1)) {
+            const uint64_t len = buf_end - buf;
+            const svbool_t pg = svwhilelt_b32(i, len);
+
+            if (!svptest_first(svptrue_b32(), pg)) {
+                buf = buf_end;
+                break;
+            }
+
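+            // Widen each input byte to a 32-bit lane and gather the corresponding
+            // token_char_map entries: a vectorized table lookup in which a zero entry
+            // marks the first character that cannot appear in a token.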
+            const svuint32_t offsets = svld1ub_u32(pg, (const uint8_t *) buf + i);
+            const svuint32_t v = svld1ub_gather_offset_u32(pg, (const uint8_t *) token_char_map, offsets);
+            const svbool_t c = svcmpeq(pg, v, 0);
+
+            if (svptest_any(pg, c)) {
+                found = 1;
+                buf += i + svcntp_b8(pg, svbrkb_z(pg, c));
+                break;
+            }
+        }
+    }
+#endif
+
     if (!found) {
         CHECK_EOF();
     }