Skip to content

Commit c383cbf (parent: ef425b1) — Optimizations for Armv8-A. These changes apply only to the AArch64 execution state; they also add arm64 testing with Travis CI.

File tree

File tree: 2 files changed (+168 −2 lines).

.travis.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
# Travis CI configuration: build with both GCC and Clang.
# arm64 runs alongside amd64 so the AArch64 SIMD code paths get CI coverage.
language: c
arch:
  - amd64
  - arm64
compiler:
  - gcc
  - clang

picohttpparser.c

Lines changed: 165 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,12 @@
3434
#include <x86intrin.h>
3535
#endif
3636
#endif
37+
#ifdef __ARM_FEATURE_SVE
38+
#include <arm_sve.h>
39+
#endif
40+
#ifdef __ARM_NEON
41+
#include <arm_neon.h>
42+
#endif
3743
#include "picohttpparser.h"
3844

3945
#if __GNUC__ >= 3
@@ -71,9 +77,8 @@
7177
#define ADVANCE_TOKEN(tok, toklen) \
7278
do { \
7379
const char *tok_start = buf; \
74-
static const char ALIGNED(16) ranges2[16] = "\000\040\177\177"; \
7580
int found2; \
76-
buf = findchar_fast(buf, buf_end, ranges2, 4, &found2); \
81+
buf = findchar_nonprintable_fast(buf, buf_end, &found2); \
7782
if (!found2) { \
7883
CHECK_EOF(); \
7984
} \
@@ -131,6 +136,69 @@ static const char *findchar_fast(const char *buf, const char *buf_end, const cha
131136
return buf;
132137
}
133138

139+
/* Find the first non-printable byte in [buf, buf_end): any octet less than
 * '\041' (0x21 -- the C0 controls and space, 0x00..0x20) or equal to '\177'
 * (DEL). When such a byte is inside the region the fast path covers, sets
 * *found to 1 and returns a pointer to it. Otherwise sets *found to 0 and
 * returns the position where a byte-wise scan should resume (the NEON path
 * only processes whole 16-byte blocks, so it may stop before buf_end and
 * leave the tail to the caller; the caller is expected to handle *found == 0).
 * Equivalent to findchar_fast(buf, buf_end, "\000\040\177\177", 4, found). */
static const char *findchar_nonprintable_fast(const char *buf, const char *buf_end, int *found)
{
#ifdef __ARM_FEATURE_SVE
    *found = 0;

    /* Scan one scalable vector of bytes per iteration; svqincb advances i by
     * the number of 8-bit lanes in an SVE vector (vector-length agnostic). */
    for (uint64_t i = 0;; i = svqincb(i, 1)) {
        const uint64_t len = buf_end - buf;
        /* Predicate enabling only the lanes that fall inside the buffer. */
        const svbool_t pg = svwhilelt_b8(i, len);

        /* No active lane left: the whole buffer was scanned without a match. */
        if (!svptest_first(svptrue_b8(), pg)) {
            buf = buf_end;
            break;
        }

        const svuint8_t v = svld1(pg, (const uint8_t *) buf + i);
        /* Active in every lane holding a non-printable byte: < 0x21 or == 0x7f. */
        const svbool_t c = svorr_z(pg, svcmplt(pg, v, '\041'), svcmpeq(pg, v, '\177'));

        if (svptest_any(pg, c)) {
            *found = 1;
            /* svbrkb_z keeps only the lanes before the first match, so counting
             * them yields the byte offset of the match within this vector. */
            buf += i + svcntp_b8(pg, svbrkb_z(pg, c));
            break;
        }
    }

    return buf;
#elif defined(__ARM_NEON) && defined(__ARM_64BIT_STATE)
    *found = 0;

    /* Only whole 16-byte blocks are vectorized; a tail shorter than one vector
     * is left for the caller's scalar loop (hence *found may stay 0 with
     * buf < buf_end). The subtraction guard prevents reading past buf_end. */
    const size_t block_size = sizeof(uint8x16_t) - 1;
    const char * const end = (size_t) (buf_end - buf) >= block_size ? buf_end - block_size : buf;

    for (; buf < end; buf += sizeof(uint8x16_t)) {
        // This mask makes it possible to pack the comparison result into half a vector,
        // which has the same size as uint64_t.
        const uint16x8_t mask = vmovq_n_u16(0x0f00);
        uint8x16_t v = vld1q_u8((const uint8_t *) buf);

        /* 0xff in every lane holding a non-printable byte (< 0x21 or == 0x7f). */
        v = vorrq_u8(vcltq_u8(v, vmovq_n_u8('\041')), vceqq_u8(v, vmovq_n_u8('\177')));
        v = vreinterpretq_u8_u16(vbicq_u16(vreinterpretq_u16_u8(v), mask));
        // Pack the comparison result into 64 bits.
        v = vpmaxq_u8(v, v);

        uint64_t offset = vgetq_lane_u64(vreinterpretq_u64_u8(v), 0);

        if (offset) {
            *found = 1;
            /* Reverse the bit order so that counting leading zeros locates the
             * first (lowest-addressed) matching byte instead of the last. */
            __asm__ ("rbit %x0, %x0" : "+r" (offset));
            static_assert(sizeof(unsigned long long) == sizeof(uint64_t),
                          "Need the number of leading 0-bits in uint64_t.");
            // offset uses 4 bits per byte of input.
            buf += __builtin_clzll(offset) / 4;
            break;
        }
    }

    return buf;
#else
    /* No AArch64 SIMD available: fall back to the generic range-based search
     * (same 0x00-0x20 plus 0x7f ranges, expressed as pairs). */
    static const char ALIGNED(16) ranges2[16] = "\000\040\177\177";

    return findchar_fast(buf, buf_end, ranges2, 4, found);
#endif
}
201+
134202
static const char *get_token_to_eol(const char *buf, const char *buf_end, const char **token, size_t *token_len, int *ret)
135203
{
136204
const char *token_start = buf;
@@ -143,6 +211,76 @@ static const char *get_token_to_eol(const char *buf, const char *buf_end, const
143211
buf = findchar_fast(buf, buf_end, ranges1, 6, &found);
144212
if (found)
145213
goto FOUND_CTL;
214+
#elif defined(__ARM_FEATURE_SVE)
215+
for (uint64_t i = 0;; i = svqincb(i, 1)) {
216+
const uint64_t len = buf_end - buf;
217+
const svbool_t pg = svwhilelt_b8(i, len);
218+
219+
if (!svptest_first(svptrue_b8(), pg)) {
220+
buf = buf_end;
221+
break;
222+
}
223+
224+
const svuint8_t v = svld1(pg, (const uint8_t *) buf + i);
225+
const uint8_t space = '\040';
226+
227+
if (svptest_any(pg, svcmpge(pg, svsub_x(pg, v, space), 0137u))) {
228+
svbool_t c = svcmpne(svcmplt(pg, v, space), v, '\011');
229+
230+
c = svorr_z(pg, c, svcmpeq(pg, v, '\177'));
231+
232+
if (svptest_any(pg, c)) {
233+
buf += i + svcntp_b8(pg, svbrkb_z(pg, c));
234+
goto FOUND_CTL;
235+
}
236+
}
237+
}
238+
#elif defined(__ARM_NEON) && defined(__ARM_64BIT_STATE)
239+
const size_t block_size = 2 * sizeof(uint8x16_t) - 1;
240+
const char * const end = (size_t) (buf_end - buf) >= block_size ? buf_end - block_size : buf;
241+
242+
for (; buf < end; buf += 2 * sizeof(uint8x16_t)) {
243+
const uint8x16_t space = vmovq_n_u8('\040');
244+
const uint8x16_t threshold = vmovq_n_u8(0137u);
245+
const uint8x16_t v1 = vld1q_u8((const uint8_t *) buf);
246+
const uint8x16_t v2 = vld1q_u8((const uint8_t *) buf + sizeof(v1));
247+
uint8x16_t v3 = vcgeq_u8(vsubq_u8(v1, space), threshold);
248+
uint8x16_t v4 = vcgeq_u8(vsubq_u8(v2, space), threshold);
249+
250+
v3 = vorrq_u8(v3, v4);
251+
// Pack the comparison result into half a vector, i.e. 64 bits.
252+
v3 = vpmaxq_u8(v3, v3);
253+
254+
if (vgetq_lane_u64(vreinterpretq_u64_u8(v3), 0)) {
255+
const uint8x16_t del = vmovq_n_u8('\177');
256+
// This mask makes it possible to pack the comparison results into half a vector,
257+
// which has the same size as uint64_t.
258+
const uint8x16_t mask = vreinterpretq_u8_u32(vmovq_n_u32(0x40100401));
259+
const uint8x16_t tab = vmovq_n_u8('\011');
260+
261+
v3 = vbicq_u8(vcltq_u8(v1, space), vceqq_u8(v1, tab));
262+
v4 = vbicq_u8(vcltq_u8(v2, space), vceqq_u8(v2, tab));
263+
v3 = vorrq_u8(v3, vceqq_u8(v1, del));
264+
v4 = vorrq_u8(v4, vceqq_u8(v2, del));
265+
// After masking, four consecutive bytes in the results do not have the same bits set.
266+
v3 = vandq_u8(v3, mask);
267+
v4 = vandq_u8(v4, mask);
268+
// Pack the comparison results into 128, and then 64 bits.
269+
v3 = vpaddq_u8(v3, v4);
270+
v3 = vpaddq_u8(v3, v3);
271+
272+
uint64_t offset = vgetq_lane_u64(vreinterpretq_u64_u8(v3), 0);
273+
274+
if (offset) {
275+
__asm__ ("rbit %x0, %x0" : "+r" (offset));
276+
static_assert(sizeof(unsigned long long) == sizeof(uint64_t),
277+
"Need the number of leading 0-bits in uint64_t.");
278+
// offset uses 2 bits per byte of input.
279+
buf += __builtin_clzll(offset) / 2;
280+
goto FOUND_CTL;
281+
}
282+
}
283+
}
146284
#else
147285
/* find non-printable char within the next 8 bytes, this is the hottest code; manually inlined */
148286
while (likely(buf_end - buf >= 8)) {
@@ -258,6 +396,31 @@ static const char *parse_token(const char *buf, const char *buf_end, const char
258396
const char *buf_start = buf;
259397
int found;
260398
buf = findchar_fast(buf, buf_end, ranges, sizeof(ranges) - 1, &found);
399+
400+
#ifdef __ARM_FEATURE_SVE
401+
if (!found) {
402+
for (uint64_t i = 0;; i = svqincw(i, 1)) {
403+
const uint64_t len = buf_end - buf;
404+
const svbool_t pg = svwhilelt_b32(i, len);
405+
406+
if (!svptest_first(svptrue_b32(), pg)) {
407+
buf = buf_end;
408+
break;
409+
}
410+
411+
const svuint32_t offsets = svld1ub_u32(pg, (const uint8_t *) buf + i);
412+
const svuint32_t v = svld1ub_gather_offset_u32(pg, (const uint8_t *) token_char_map, offsets);
413+
const svbool_t c = svcmpeq(pg, v, 0);
414+
415+
if (svptest_any(pg, c)) {
416+
found = 1;
417+
buf += i + svcntp_b8(pg, svbrkb_z(pg, c));
418+
break;
419+
}
420+
}
421+
}
422+
#endif
423+
261424
if (!found) {
262425
CHECK_EOF();
263426
}

Comments (0)