Skip to content

clang loop unroll is too aggressive #165072

@rockeet

Description

@rockeet
// Copies `len` bytes from `bsrc` to `bdst` with unaligned AVX-512 byte
// loads/stores and returns a pointer one past the last byte written.
// Reproducer for the report: at -O2 clang unrolls the 64-byte loop below by 8.
char* small_memcpy(char* bdst, const char* bsrc, size_t len) {
    // The author notes that enabling the pragma below, or building with -Os,
    // disables the loop unrolling.
    //#pragma nounroll
    // __builtin_expect(..., 0) tells the compiler that `len >= 64` is the
    // unlikely case, i.e. copies shorter than 64 bytes are expected.
    while (__builtin_expect(len >= 64, 0)) {
        // Move one full 64-byte block and advance both cursors.
        _mm512_storeu_epi8(bdst, _mm512_loadu_epi8(bsrc));
        len  -= 64;
        bsrc += 64;
        bdst += 64;
    }
    // Here len < 64. _bzhi_u64 clears the bits of -1 from position `len`
    // upward, leaving a mask whose low `len` bits are set (one bit per byte).
    auto mask = _bzhi_u64(-1, len);
    // Masked load/store transfers only the bytes selected by `mask`; the
    // zero-masking load fills the unselected lanes of `tail` with zero.
    auto tail = _mm512_maskz_loadu_epi8(mask, bsrc);
    _mm512_mask_storeu_epi8(bdst, mask, tail);
    // bdst was advanced past every full block, so adding the remainder gives
    // the end of the copied region.
    return bdst + len;
}

Loop unrolling is not valuable in this case, but with -O2 clang unrolls the loop by a factor of 8 (see godbolt):

small_memcpy(char*, char const*, unsigned long):
        # Register roles in this listing: rax = destination cursor (copied from rdi),
        # rsi = source cursor, rdx = remaining length.
        mov     rax, rdi
        # Lengths of 64 bytes or more take the block-copy path at .LBB0_1.
        cmp     rdx, 64
        jae     .LBB0_1
.LBB0_6:
        # Tail: build a mask with the low rdx bits set, perform one masked
        # load/store of the remaining bytes, and return rax + rdx.
        mov     rcx, -1
        bzhi    rcx, rcx, rdx
        kmovq   k1, rcx
        vmovdqu8        zmm0 {k1} {z}, zmmword ptr [rsi]
        vmovdqu8        zmmword ptr [rax] {k1}, zmm0
        add     rax, rdx
        vzeroupper
        ret
.LBB0_1:
        # rcx = len - 64. 448 = 0x1C0 selects bits 6..8; if all three are set in
        # rcx, the count of 64-byte blocks is a multiple of 8 and the remainder
        # loop (.LBB0_2/.LBB0_3) is skipped.
        lea     rcx, [rdx - 64]
        mov     edi, ecx
        not     edi
        test    edi, 448
        jne     .LBB0_2
.LBB0_4:
        # rcx still holds the original len - 64: below 448 means fewer than
        # eight blocks in total, so nothing is left for the unrolled loop.
        cmp     rcx, 448
        jb      .LBB0_6
.LBB0_5:
        # Unrolled main loop: eight 64-byte load/store pairs (512 bytes) per
        # iteration, repeated while more than 63 bytes remain.
        vmovups zmm0, zmmword ptr [rsi]
        vmovups zmmword ptr [rax], zmm0
        vmovups zmm0, zmmword ptr [rsi + 64]
        vmovups zmmword ptr [rax + 64], zmm0
        vmovups zmm0, zmmword ptr [rsi + 128]
        vmovups zmmword ptr [rax + 128], zmm0
        vmovups zmm0, zmmword ptr [rsi + 192]
        vmovups zmmword ptr [rax + 192], zmm0
        vmovups zmm0, zmmword ptr [rsi + 256]
        vmovups zmmword ptr [rax + 256], zmm0
        vmovups zmm0, zmmword ptr [rsi + 320]
        vmovups zmmword ptr [rax + 320], zmm0
        vmovups zmm0, zmmword ptr [rsi + 384]
        vmovups zmmword ptr [rax + 384], zmm0
        vmovups zmm0, zmmword ptr [rsi + 448]
        vmovups zmmword ptr [rax + 448], zmm0
        add     rdx, -512
        add     rsi, 512
        add     rax, 512
        cmp     rdx, 63
        ja      .LBB0_5
        jmp     .LBB0_6
.LBB0_2:
        # Remainder setup: edi = ((len - 64) / 64 + 1) & 7, the block count
        # modulo 8; the matching byte count (edi * 64) is subtracted from rdx
        # up front.
        mov     edi, ecx
        shr     edi, 6
        inc     edi
        and     edi, 7
        mov     r8d, edi
        shl     r8d, 6
        sub     rdx, r8
.LBB0_3:
        # Remainder loop: one 64-byte block per iteration, edi iterations,
        # then continue at .LBB0_4.
        vmovups zmm0, zmmword ptr [rsi]
        vmovups zmmword ptr [rax], zmm0
        add     rsi, 64
        add     rax, 64
        dec     rdi
        jne     .LBB0_3
        jmp     .LBB0_4

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions