-
Notifications
You must be signed in to change notification settings - Fork 15k
Open
Labels
Description
// Copies `len` bytes from `bsrc` to `bdst` using unaligned 64-byte AVX-512
// loads/stores, then one masked load/store for the final (< 64) bytes.
// Returns the destination pointer advanced past the last byte copied, i.e.
// the original `bdst` plus the original `len`.
char* small_memcpy(char* bdst, const char* bsrc, size_t len) {
// clang unrolls this loop by 8 at -O2
//#pragma nounroll // -Os also disables loop unrolling
// __builtin_expect(..., 0) marks the loop as unlikely to be entered: the
// expected input is shorter than 64 bytes and goes straight to the tail.
while (__builtin_expect(len >= 64, 0)) {
_mm512_storeu_epi8(bdst, _mm512_loadu_epi8(bsrc));
len -= 64;
bsrc += 64;
bdst += 64;
}
// Here len < 64, so the mask has exactly its low `len` bits set
// (all zero when len == 0).
auto mask = _bzhi_u64(-1, len);
// Zero-masking load: only byte lanes selected by `mask` are read.
auto tail = _mm512_maskz_loadu_epi8(mask, bsrc);
// Masked store: only byte lanes selected by `mask` are written.
_mm512_mask_storeu_epi8(bdst, mask, tail);
// bdst was advanced by the bytes copied in the loop; add the tail length.
return bdst + len;
}
Loop unrolling is not valuable in this case, but with -O2 clang unrolls the loop by 8 (see godbolt):
# clang -O2 output for small_memcpy (Intel syntax).
# Register roles, as established by the instructions below:
#   rdi = bdst on entry (copied to rax, then reused as scratch / loop counter)
#   rsi = bsrc, advanced as 64-byte chunks are copied
#   rdx = len, reduced as 64-byte chunks are copied
#   rax = running destination pointer; holds the return value at `ret`
#   rcx = len - 64 on the bulk path, or the tail byte mask on the tail path
small_memcpy(char*, char const*, unsigned long):
mov rax, rdi
# len >= 64 takes the bulk path; otherwise fall through into the tail.
cmp rdx, 64
jae .LBB0_1
# Tail: copy the remaining rdx (< 64) bytes with one masked load/store.
.LBB0_6:
# rcx = all-ones with everything from bit index rdx upward cleared,
# i.e. the low rdx bits set; moved into mask register k1.
mov rcx, -1
bzhi rcx, rcx, rdx
kmovq k1, rcx
# Zero-masked byte load from [rsi], then masked byte store to [rax].
vmovdqu8 zmm0 {k1} {z}, zmmword ptr [rsi]
vmovdqu8 zmmword ptr [rax] {k1}, zmm0
# Return value: destination pointer advanced past the tail bytes.
add rax, rdx
vzeroupper
ret
# Bulk path entry (len >= 64).
.LBB0_1:
# rcx = len - 64. The number of whole 64-byte chunks is ((len - 64) >> 6) + 1,
# which is a multiple of 8 exactly when bits 6..8 of rcx are all ones
# (448 = 7 << 6). The value is inverted first so that "all ones" tests as
# zero; jne is therefore taken when 1..7 leftover chunks must be copied
# before the 8x-unrolled loop.
lea rcx, [rdx - 64]
mov edi, ecx
not edi
test edi, 448
jne .LBB0_2
# Dispatch after the optional remainder loop.
.LBB0_4:
# rcx still holds the original len - 64 (the .LBB0_2/.LBB0_3 path does not
# write it). Below 448 means fewer than 8 whole chunks in total, so the
# unrolled loop has nothing to do and control goes to the tail.
cmp rcx, 448
jb .LBB0_6
# Unrolled main loop: eight 64-byte load/store pairs (512 bytes) per iteration.
.LBB0_5:
vmovups zmm0, zmmword ptr [rsi]
vmovups zmmword ptr [rax], zmm0
vmovups zmm0, zmmword ptr [rsi + 64]
vmovups zmmword ptr [rax + 64], zmm0
vmovups zmm0, zmmword ptr [rsi + 128]
vmovups zmmword ptr [rax + 128], zmm0
vmovups zmm0, zmmword ptr [rsi + 192]
vmovups zmmword ptr [rax + 192], zmm0
vmovups zmm0, zmmword ptr [rsi + 256]
vmovups zmmword ptr [rax + 256], zmm0
vmovups zmm0, zmmword ptr [rsi + 320]
vmovups zmmword ptr [rax + 320], zmm0
vmovups zmm0, zmmword ptr [rsi + 384]
vmovups zmmword ptr [rax + 384], zmm0
vmovups zmm0, zmmword ptr [rsi + 448]
vmovups zmmword ptr [rax + 448], zmm0
# Account for the 512 bytes just copied and advance both pointers.
add rdx, -512
add rsi, 512
add rax, 512
# Keep looping while at least 64 bytes remain; otherwise finish in the tail.
cmp rdx, 63
ja .LBB0_5
jmp .LBB0_6
# Remainder setup: copy (chunk count mod 8) single chunks first.
.LBB0_2:
# edi = (((len - 64) >> 6) + 1) & 7 = whole-chunk count modulo 8, which is
# nonzero on this path. The 32-bit write zero-extends, so rdi can serve as
# the 64-bit loop counter below.
mov edi, ecx
shr edi, 6
inc edi
and edi, 7
# r8 = remainder chunks * 64 bytes; subtract them from the length up front.
mov r8d, edi
shl r8d, 6
sub rdx, r8
# Remainder loop: one 64-byte chunk per iteration, rdi iterations.
.LBB0_3:
vmovups zmm0, zmmword ptr [rsi]
vmovups zmmword ptr [rax], zmm0
add rsi, 64
add rax, 64
dec rdi
jne .LBB0_3
# Rejoin the dispatch that picks the unrolled loop or the tail.
jmp .LBB0_4