Skip to content
This repository was archived by the owner on Dec 1, 2021. It is now read-only.
This repository was archived by the owner on Dec 1, 2021. It is now read-only.

Took forever to generate the goasm #30

@ii64

Description

@ii64

Hi, I am testing c2goasm. It works for a simple function, but for a bigger one it seems to take forever. Is there a problem in my configuration, or is this a bug?

image
image

build command: clang -S -O3 -masm=intel -mno-red-zone -mstackrealign -mllvm -inline-threshold=1000 -fno-asynchronous-unwind-tables -fno-exceptions -fno-rtti $1

Generated CLang ASM
	.text
	.intel_syntax noprefix
	.file	"encoder.c"
	.globl	qoi_write_32            # -- Begin function qoi_write_32
	.p2align	4, 0x90
	.type	qoi_write_32,@function
# qoi_write_32: store the 32-bit value in edx into a byte buffer, most
# significant byte first, advancing a memory-resident index as it goes.
# Registers on entry, as used by the code below:
#   rdi = base address of the destination bytes
#   rsi = address of a signed 32-bit write index
#   edx = value to write
# Every step reloads the index from [rsi], stores index + 1 back to [rsi],
# and writes one byte at [rdi + old index].
qoi_write_32:                           # @qoi_write_32
# %bb.0:
	# Frame setup; rsp is rounded down to a multiple of 8.
	push	rbp
	mov	rbp, rsp
	and	rsp, -8
	# Byte 0: bits 31..24 of the value.
	mov	eax, edx
	shr	eax, 24
	movsxd	r8, dword ptr [rsi]
	lea	ecx, [r8 + 1]
	mov	dword ptr [rsi], ecx
	mov	byte ptr [rdi + r8], al
	# Byte 1: low byte of (value >> 16), i.e. bits 23..16.
	mov	eax, edx
	shr	eax, 16
	movsxd	r8, dword ptr [rsi]
	lea	ecx, [r8 + 1]
	mov	dword ptr [rsi], ecx
	mov	byte ptr [rdi + r8], al
	# Byte 2: bits 15..8, taken straight from dh.
	movsxd	rax, dword ptr [rsi]
	lea	ecx, [rax + 1]
	mov	dword ptr [rsi], ecx
	mov	byte ptr [rdi + rax], dh
	# Byte 3: bits 7..0, taken straight from dl.
	movsxd	rax, dword ptr [rsi]
	lea	ecx, [rax + 1]
	mov	dword ptr [rsi], ecx
	mov	byte ptr [rdi + rax], dl
	# Restore rsp and rbp, then return.
	mov	rsp, rbp
	pop	rbp
	ret
.Lfunc_end0:
	.size	qoi_write_32, .Lfunc_end0-qoi_write_32
                                        # -- End function
	.globl	qoi_read_32             # -- Begin function qoi_read_32
	.p2align	4, 0x90
	.type	qoi_read_32,@function
# qoi_read_32: read four consecutive bytes from a buffer and combine them,
# first byte most significant, into a 32-bit result in eax.
# Registers on entry, as used by the code below:
#   rdi = base address of the source bytes
#   rsi = address of a signed 32-bit read index
# The index is loaded once into rcx; [rsi] is then rewritten with index + 1,
# + 2, + 3 and finally + 4 as each byte is fetched.
qoi_read_32:                            # @qoi_read_32
# %bb.0:
	# Frame setup; rsp is rounded down to a multiple of 8.
	push	rbp
	mov	rbp, rsp
	and	rsp, -8
	# rcx = starting index (sign-extended).
	movsxd	rcx, dword ptr [rsi]
	# r8d = byte at index; stored index becomes index + 1.
	lea	rax, [rcx + 1]
	mov	dword ptr [rsi], eax
	movzx	r8d, byte ptr [rdi + rcx]
	# r9d = byte at index + 1; stored index becomes index + 2.
	lea	rax, [rcx + 2]
	mov	dword ptr [rsi], eax
	movzx	r9d, byte ptr [rdi + rcx + 1]
	# eax = byte at index + 2; stored index becomes index + 3.
	lea	rax, [rcx + 3]
	mov	dword ptr [rsi], eax
	movzx	eax, byte ptr [rdi + rcx + 2]
	# ecx = byte at index + 3; stored index becomes index + 4.
	lea	edx, [rcx + 4]
	mov	dword ptr [rsi], edx
	movzx	ecx, byte ptr [rdi + rcx + 3]
	# eax = (b0 << 24) | (b1 << 16) | (b2 << 8) | b3.
	shl	r8d, 24
	shl	r9d, 16
	or	r9d, r8d
	shl	eax, 8
	or	eax, r9d
	or	eax, ecx
	# Restore rsp and rbp, then return with the result in eax.
	mov	rsp, rbp
	pop	rbp
	ret
.Lfunc_end1:
	.size	qoi_read_32, .Lfunc_end1-qoi_read_32
                                        # -- End function
	.globl	pixel_cpy               # -- Begin function pixel_cpy
	.p2align	4, 0x90
	.type	pixel_cpy,@function
# pixel_cpy: copy rdx bytes from [rsi] to [rdi]; eax is set to 1 on return.
# Registers on entry, as used by the code below:
#   rdi = destination, rsi = source, rdx = byte count
# Strategy:
#   * count == 0                      -> return immediately
#   * count >= 32 and the two ranges
#     do not overlap                  -> 32-byte SSE chunks, then a byte tail
#   * otherwise                       -> byte loops only (count & 7 single
#                                        bytes, then groups of 8)
pixel_cpy:                              # @pixel_cpy
# %bb.0:
	# Frame setup; rsp is rounded down to a multiple of 8.
	push	rbp
	mov	rbp, rsp
	and	rsp, -8
	# Nothing to copy when the count is zero.
	test	rdx, rdx
	je	.LBB2_20
# %bb.1:
	# Fewer than 32 bytes: go straight to the byte loops.
	cmp	rdx, 32
	jb	.LBB2_13
# %bb.2:
	# Overlap check, part 1: source end <= destination start -> vector path.
	lea	rax, [rsi + rdx]
	cmp	rax, rdi
	jbe	.LBB2_4
# %bb.3:
	# Overlap check, part 2: destination end <= source start -> vector path.
	# If neither holds the ranges overlap and the byte loops are used.
	lea	rax, [rdi + rdx]
	cmp	rax, rsi
	jbe	.LBB2_4
.LBB2_13:
	# Byte path entry: r8 = count - 1, r9 = count & 7 (bytes to peel off
	# before the 8-at-a-time loop). Skip the peel loop when r9 is zero.
	lea	r8, [rdx - 1]
	mov	r9, rdx
	and	r9, 7
	je	.LBB2_17
.LBB2_14:
	xor	ecx, ecx
	.p2align	4, 0x90
.LBB2_15:                               # =>This Inner Loop Header: Depth=1
	# Peel loop: copy one byte per iteration until rcx == r9.
	movzx	eax, byte ptr [rsi + rcx]
	mov	byte ptr [rdi + rcx], al
	add	rcx, 1
	cmp	r9, rcx
	jne	.LBB2_15
# %bb.16:
	# Account for the peeled bytes: shrink the count, advance both pointers.
	sub	rdx, rcx
	add	rsi, rcx
	add	rdi, rcx
.LBB2_17:
	# r8 still holds (count - 1) from before the peel; below 7 means the
	# count was under 8, so the peel loop already copied everything.
	cmp	r8, 7
	jb	.LBB2_20
# %bb.18:
	xor	eax, eax
	.p2align	4, 0x90
.LBB2_19:                               # =>This Inner Loop Header: Depth=1
	# Unrolled loop: copy 8 bytes per iteration until rax == rdx.
	movzx	ecx, byte ptr [rsi + rax]
	mov	byte ptr [rdi + rax], cl
	movzx	ecx, byte ptr [rsi + rax + 1]
	mov	byte ptr [rdi + rax + 1], cl
	movzx	ecx, byte ptr [rsi + rax + 2]
	mov	byte ptr [rdi + rax + 2], cl
	movzx	ecx, byte ptr [rsi + rax + 3]
	mov	byte ptr [rdi + rax + 3], cl
	movzx	ecx, byte ptr [rsi + rax + 4]
	mov	byte ptr [rdi + rax + 4], cl
	movzx	ecx, byte ptr [rsi + rax + 5]
	mov	byte ptr [rdi + rax + 5], cl
	movzx	ecx, byte ptr [rsi + rax + 6]
	mov	byte ptr [rdi + rax + 6], cl
	movzx	ecx, byte ptr [rsi + rax + 7]
	mov	byte ptr [rdi + rax + 7], cl
	add	rax, 8
	cmp	rdx, rax
	jne	.LBB2_19
	jmp	.LBB2_20
.LBB2_4:
	# Vector path entry.
	#   r8  = count rounded down to a multiple of 32 (bytes copied here)
	#   rcx = number of 32-byte chunks = (r8 - 32) / 32 + 1
	#   r9d = chunks & 3 (chunks left over after the 4-chunk main loop)
	mov	r8, rdx
	and	r8, -32
	lea	rax, [r8 - 32]
	mov	rcx, rax
	shr	rcx, 5
	add	rcx, 1
	mov	r9d, ecx
	and	r9d, 3
	# r8 - 32 >= 96 means at least four chunks, so the main loop runs.
	cmp	rax, 96
	jae	.LBB2_6
# %bb.5:
	# Fewer than four chunks: start the leftover loop at offset 0.
	xor	eax, eax
	jmp	.LBB2_8
.LBB2_6:
	# rcx = chunk count rounded down to a multiple of 4; rax = byte offset.
	sub	rcx, r9
	xor	eax, eax
	.p2align	4, 0x90
.LBB2_7:                                # =>This Inner Loop Header: Depth=1
	# Main loop: four 32-byte chunks (128 bytes) per iteration, using
	# unaligned 16-byte loads and stores.
	movups	xmm0, xmmword ptr [rsi + rax]
	movups	xmm1, xmmword ptr [rsi + rax + 16]
	movups	xmmword ptr [rdi + rax], xmm0
	movups	xmmword ptr [rdi + rax + 16], xmm1
	movups	xmm0, xmmword ptr [rsi + rax + 32]
	movups	xmm1, xmmword ptr [rsi + rax + 48]
	movups	xmmword ptr [rdi + rax + 32], xmm0
	movups	xmmword ptr [rdi + rax + 48], xmm1
	movups	xmm0, xmmword ptr [rsi + rax + 64]
	movups	xmm1, xmmword ptr [rsi + rax + 80]
	movups	xmmword ptr [rdi + rax + 64], xmm0
	movups	xmmword ptr [rdi + rax + 80], xmm1
	movups	xmm0, xmmword ptr [rsi + rax + 96]
	movups	xmm1, xmmword ptr [rsi + rax + 112]
	movups	xmmword ptr [rdi + rax + 96], xmm0
	movups	xmmword ptr [rdi + rax + 112], xmm1
	# rax += 128 (written as a subtraction of -128); rcx -= 4 chunks.
	sub	rax, -128
	add	rcx, -4
	jne	.LBB2_7
.LBB2_8:
	# Leftover chunks (0..3): skip when there are none.
	test	r9, r9
	je	.LBB2_11
# %bb.9:
	# Bias the offset by 16 so the loop addresses [rax - 16] and [rax];
	# negate r9 so it can count up to zero.
	add	rax, 16
	neg	r9
	.p2align	4, 0x90
.LBB2_10:                               # =>This Inner Loop Header: Depth=1
	# Leftover loop: one 32-byte chunk per iteration.
	movups	xmm0, xmmword ptr [rsi + rax - 16]
	movups	xmm1, xmmword ptr [rsi + rax]
	movups	xmmword ptr [rdi + rax - 16], xmm0
	movups	xmmword ptr [rdi + rax], xmm1
	add	rax, 32
	inc	r9
	jne	.LBB2_10
.LBB2_11:
	# If the count was not a multiple of 32 a byte tail remains.
	cmp	r8, rdx
	jne	.LBB2_12
.LBB2_20:
	# Common exit: eax = 1, restore rsp and rbp, return.
	mov	eax, 1
	mov	rsp, rbp
	pop	rbp
	ret
.LBB2_12:
	# Tail after the vector copy: rdx = count & 31, both pointers move
	# past the r8 bytes already copied, then r8/r9 are recomputed exactly
	# as at .LBB2_13 and control re-enters the byte loops.
	and	edx, 31
	add	rsi, r8
	add	rdi, r8
	lea	r8, [rdx - 1]
	mov	r9, rdx
	and	r9, 7
	jne	.LBB2_14
	jmp	.LBB2_17
.Lfunc_end2:
	.size	pixel_cpy, .Lfunc_end2-pixel_cpy
                                        # -- End function
	.globl	qoi_pixel_encoder       # -- Begin function qoi_pixel_encoder
	.p2align	4, 0x90
	.type	qoi_pixel_encoder,@function
# qoi_pixel_encoder: compiled body of the C function of the same name.
# Arguments as read by the code below (SysV AMD64 order assumed, which lines
# up with the ten-parameter C prototype):
#   rdi = data      rsi = cur       rdx = run
#   ecx = x         r8d = y         r9d = maxX
#   [rbp + 16] = maxY    [rbp + 24] = px_prev
#   [rbp + 32] = px      [rbp + 40] = index
# Every path returns 1 in eax.
# NOTE(review): nothing below ever stores to [rsi], and the only stores
# through rdx are an optional "run - 33" and a final 0. The increments that
# the C source spells `*cur++` / `*run++` therefore only moved the local
# pointers (rsi is stepped by 4 or 8; run is indexed by 0 or 1).
# The tag constants named in comments (0x40, 0x60, 0x80, 0xC0, 0xE0, 0xF0)
# are the immediates seen here; presumably they are the QOI_* macros from
# qoi.h -- confirm against that header.
qoi_pixel_encoder:                      # @qoi_pixel_encoder
# %bb.0:
	# Frame setup: save rbp plus the five callee-saved registers that are
	# used as scratch, then round rsp down to a multiple of 8.
	push	rbp
	mov	rbp, rsp
	push	r15
	push	r14
	push	r13
	push	r12
	push	rbx
	and	rsp, -8
	# Stack arguments: ebx = maxY, r10 = px_prev, r11 = px.
	mov	ebx, dword ptr [rbp + 16]
	mov	r10, qword ptr [rbp + 24]
	mov	r11, qword ptr [rbp + 32]
	# eax = 1 if px == px_prev (pointer compare), else 0.
	xor	eax, eax
	cmp	r11, r10
	sete	al
	# cl = 1 when x == maxX - 1 and y == maxY - 1 (the "last pixel" test):
	# both XOR results must be zero.
	add	r9d, -1
	xor	r9d, ecx
	add	ebx, -1
	xor	ebx, r8d
	or	ebx, r9d
	sete	cl
	# Load the four bytes of px: r9b, r13b, r14b, r8b = px[0..3].
	mov	r9b, byte ptr [r11]
	mov	r13b, byte ptr [r11 + 1]
	mov	r14b, byte ptr [r11 + 2]
	mov	r8b, byte ptr [r11 + 3]
	# ebx = run[rax]: run[0] when px != px_prev, run[1] when they are equal.
	mov	ebx, dword ptr [rdx + 4*rax]
	# Run flush is taken if px != px_prev ...
	cmp	r11, r10
	jne	.LBB3_3
# %bb.1:
	# ... or if this is the last pixel ...
	test	cl, cl
	jne	.LBB3_3
# %bb.2:
	# ... or if the loaded run value is 8224 (0x2020); otherwise skip it.
	cmp	ebx, 8224
	jne	.LBB3_7
.LBB3_3:
	# Flush the run. rax becomes the byte offset of the run element.
	shl	rax, 2
	# Signed compare: values above 32 use the two-byte form.
	cmp	ebx, 32
	jg	.LBB3_5
# %bb.4:
	# One-byte form: bl = (run - 1) | 0x40. r12d = 1 pointer step for cur,
	# r15 = address of the index used for the store at .LBB3_6.
	add	bl, -1
	or	bl, 64
	mov	r12d, 1
	mov	r15, rsi
	jmp	.LBB3_6
.LBB3_5:
	# Two-byte form: store run - 33 back, emit ((run - 33) >> 8) | 0x60 at
	# data[cur[0]], then leave the low byte of the stored value in bl to be
	# written through cur[1] (r15 = rsi + 4). r12d = 2 pointer steps.
	add	ebx, -33
	mov	dword ptr [rdx + rax], ebx
	shr	ebx, 8
	or	bl, 96
	lea	r15, [rsi + 4]
	movsxd	rcx, dword ptr [rsi]
	mov	byte ptr [rdi + rcx], bl
	mov	bl, byte ptr [rdx + rax]
	mov	r12d, 2
.LBB3_6:
	# Advance the local cur pointer by r12 ints, write bl at data[*r15],
	# and reset the run element to 0.
	lea	rsi, [rsi + 4*r12]
	movsxd	rcx, dword ptr [r15]
	mov	byte ptr [rdi + rcx], bl
	mov	dword ptr [rdx + rax], 0
.LBB3_7:
	# Nothing more to do when px == px_prev.
	cmp	r11, r10
	je	.LBB3_25
# %bb.8:
	# r13b = px[0] ^ px[1] ^ px[2] ^ px[3] (the hash as compiled here).
	# rax = index[hash * 4]: the shift by 5 is hash * 4 entries * 8 bytes.
	mov	rax, qword ptr [rbp + 40]
	xor	r13b, r9b
	xor	r13b, r14b
	xor	r13b, r8b
	movzx	ecx, r13b
	shl	rcx, 5
	mov	rax, qword ptr [rax + rcx]
	# Table hit (pointer equals px): emit the index byte at .LBB3_26.
	cmp	rax, r11
	je	.LBB3_26
# %bb.9:
	# Inlined 4-byte copy of px into the table entry that rax points to.
	mov	cl, byte ptr [r11]
	mov	byte ptr [rax], cl
	mov	cl, byte ptr [r11 + 1]
	mov	byte ptr [rax + 1], cl
	mov	cl, byte ptr [r11 + 2]
	mov	byte ptr [rax + 2], cl
	mov	cl, byte ptr [r11 + 3]
	mov	byte ptr [rax + 3], cl
	# Channel differences from sign-extended bytes:
	#   ecx = vr, r9d = vg, edx = vb, r8d = va  (px[i] - px_prev[i]).
	movsx	ecx, byte ptr [r11]
	movsx	eax, byte ptr [r10]
	sub	ecx, eax
	movsx	r9d, byte ptr [r11 + 1]
	movsx	eax, byte ptr [r10 + 1]
	sub	r9d, eax
	movsx	edx, byte ptr [r11 + 2]
	movsx	eax, byte ptr [r10 + 2]
	sub	edx, eax
	movsx	r8d, byte ptr [r11 + 3]
	movsx	eax, byte ptr [r10 + 3]
	sub	r8d, eax
	# Range test for all four at once: r15d = vr + 16, r14d = vb + 16,
	# r10d = va + 16; OR-ing the biased values and comparing unsigned with
	# 32 checks that each lies in -16..15. Out of range -> .LBB3_16.
	lea	r15d, [rcx + 16]
	lea	eax, [r9 + 16]
	or	eax, r15d
	lea	r14d, [rdx + 16]
	lea	r10d, [r8 + 16]
	mov	ebx, r14d
	or	ebx, r10d
	or	ebx, eax
	cmp	ebx, 32
	jae	.LBB3_16
# %bb.10:
	# Smallest form needs vb in -2..1 (r11d = vb + 2 must be 0..3).
	# r11 no longer holds px from here on in this path.
	lea	r11d, [rdx + 2]
	cmp	r11d, 3
	ja	.LBB3_13
# %bb.11:
	# ... and vr + 2, vg + 2 in 0..3 with va == 0.
	lea	eax, [rcx + 2]
	lea	ebx, [r9 + 2]
	or	ebx, eax
	and	ebx, -4
	or	ebx, r8d
	jne	.LBB3_13
# %bb.12:
	# One byte: 0x80 | (vr + 2) << 4 | (vg + 2) << 2 | (vb + 2),
	# stored at data[cur[0]].
	shl	ecx, 4
	add	ecx, 32
	lea	eax, [4*r9 + 8]
	or	eax, ecx
	or	eax, r11d
	or	al, -128
	movsxd	rcx, dword ptr [rsi]
	mov	byte ptr [rdi + rcx], al
	jmp	.LBB3_25
.LBB3_26:
	# Table hit: store the hash byte itself at data[cur[0]].
	movsxd	rax, dword ptr [rsi]
	mov	byte ptr [rdi + rax], r13b
	jmp	.LBB3_25
.LBB3_16:
	# Full-colour form. Header byte =
	#   0xF0 | (vr != 0) << 3 | (vg != 0) << 2 | (vb != 0) << 1 | (va != 0),
	# stored at data[cur[0]].
	test	ecx, ecx
	setne	al
	shl	al, 3
	test	r9d, r9d
	setne	bl
	shl	bl, 2
	or	bl, al
	test	edx, edx
	setne	al
	add	al, al
	or	al, bl
	test	r8d, r8d
	setne	bl
	or	bl, al
	or	bl, -16
	movsxd	rax, dword ptr [rsi]
	mov	byte ptr [rdi + rax], bl
	# The changed channels follow; each one is written through the next
	# int after the local cur pointer, which is then stepped by 4 bytes.
	test	ecx, ecx
	je	.LBB3_17
# %bb.18:
	# vr != 0: px[0] goes to data[cur[1]]; rsi skips two ints (header + r).
	mov	al, byte ptr [r11]
	movsxd	rcx, dword ptr [rsi + 4]
	add	rsi, 8
	mov	byte ptr [rdi + rcx], al
	test	r9d, r9d
	je	.LBB3_21
.LBB3_20:
	# vg != 0: emit px[1].
	mov	al, byte ptr [r11 + 1]
	movsxd	rcx, dword ptr [rsi]
	add	rsi, 4
	mov	byte ptr [rdi + rcx], al
.LBB3_21:
	test	edx, edx
	je	.LBB3_23
# %bb.22:
	# vb != 0: emit px[2].
	mov	al, byte ptr [r11 + 2]
	movsxd	rcx, dword ptr [rsi]
	add	rsi, 4
	mov	byte ptr [rdi + rcx], al
.LBB3_23:
	test	r8d, r8d
	je	.LBB3_25
# %bb.24:
	# va != 0: emit px[3].
	mov	al, byte ptr [r11 + 3]
	movsxd	rcx, dword ptr [rsi]
	mov	byte ptr [rdi + rcx], al
	jmp	.LBB3_25
.LBB3_13:
	# Two-byte form needs vg + 8 and vb + 8 in 0..15 with va == 0.
	# edx becomes vb + 8 here.
	lea	eax, [r9 + 8]
	add	edx, 8
	or	eax, edx
	and	eax, -16
	or	eax, r8d
	je	.LBB3_14
# %bb.15:
	# Three-byte form.
	#   byte 0 -> data[cur[0]]: 0xE0 | (vr + 16) >> 1  (shift done on al)
	mov	eax, r15d
	shr	al
	or	al, -32
	movsxd	rcx, dword ptr [rsi]
	mov	byte ptr [rdi + rcx], al
	#   byte 1 -> data[cur[1]]:
	#     (vr + 16) << 7 | (vg + 16) << 2 | (vb + 16) >> 3
	shl	r15d, 7
	lea	eax, [4*r9 + 64]
	or	eax, r15d
	mov	ecx, r14d
	shr	ecx, 3
	or	ecx, eax
	movsxd	rax, dword ptr [rsi + 4]
	mov	byte ptr [rdi + rax], cl
	#   byte 2 -> data[cur[2]]: (vb + 16) << 7 | (va + 16)
	shl	r14d, 7
	or	r10d, r14d
	movsxd	rax, dword ptr [rsi + 8]
	mov	byte ptr [rdi + rax], r10b
	jmp	.LBB3_25
.LBB3_17:
	# vr == 0 in the full-colour form: step the local cur pointer past the
	# header only, then rejoin the per-channel chain at the vg test.
	add	rsi, 4
	test	r9d, r9d
	jne	.LBB3_20
	jmp	.LBB3_21
.LBB3_14:
	# Two-byte form.
	#   byte 0 -> data[cur[0]]: 0xC0 | (vr + 16)
	or	r15b, -64
	movsxd	rax, dword ptr [rsi]
	mov	byte ptr [rdi + rax], r15b
	#   byte 1 -> data[cur[1]]: (vg + 8) << 4 | (vb + 8)
	#   (vg << 4 plus 128, written as a subtraction of -128)
	shl	r9d, 4
	sub	r9d, -128
	or	edx, r9d
	movsxd	rax, dword ptr [rsi + 4]
	mov	byte ptr [rdi + rax], dl
.LBB3_25:
	# Common exit: eax = 1; rsp is pointed at the five saved registers
	# (5 * 8 = 40 bytes below rbp) so they can be popped in reverse order.
	mov	eax, 1
	lea	rsp, [rbp - 40]
	pop	rbx
	pop	r12
	pop	r13
	pop	r14
	pop	r15
	pop	rbp
	ret
.Lfunc_end3:
	.size	qoi_pixel_encoder, .Lfunc_end3-qoi_pixel_encoder
                                        # -- End function
	.ident	"clang version 10.0.0-4ubuntu1 "
	.section	".note.GNU-stack","",@progbits
	.addrsig
C code
#ifndef QOI_ENCODER_
#define QOI_ENCODER_

#include <stddef.h>
#include "qoi.h"

/*
 * Copy `sz` bytes from `src` to `dst`, one byte at a time in ascending
 * address order. A size of 0 copies nothing.
 */
void pixel_cpy(char *dst, char *src, size_t sz)
{
    for (size_t offset = 0; offset < sz; ++offset)
    {
        dst[offset] = src[offset];
    }
}

/*
 * Encode the pixel at `px` into the QOI byte stream held in `data`.
 *
 * data     output bytes; every emitted byte is stored at data[*cur]
 * cur      in/out write position; incremented once per emitted byte
 * run      in/out length of the pending run of repeated pixels
 * x, y     position of this pixel
 * maxX,
 * maxY     the pixel at (maxX - 1, maxY - 1) is treated as the last one
 * px_prev  previous pixel (4 bytes are read)
 * px       current pixel (4 bytes are read)
 * index    colour table, read and updated at index[hash * 4]
 *
 * Returns 1 unconditionally.
 *
 * px_prev is passed by value, so this function cannot update it for the
 * caller: the caller has to pass the current `px` as `px_prev` next time.
 *
 * NOTE(review): "same pixel" and "table hit" are both decided by comparing
 * POINTERS (px == px_prev, index[...] == px), not the four channel bytes.
 * That is kept as-is because it depends on how the caller lays out its
 * buffers -- confirm it is intended.
 * NOTE(review): index_pos is used unreduced both as a table subscript
 * (times 4) and OR-ed into the QOI_INDEX byte; whether QOI_COLOR_HASH
 * already limits it to the table size is defined in qoi.h -- confirm.
 */
int qoi_pixel_encoder(
    char *data, int *cur, int *run,
    const int x, const int y,
    const int maxX, const int maxY,
    const char *px_prev, char *px,
    char **index) // [64][4]
{
    qoi_rgba_t px_ = {.rgba = {
                          .r = px[0],
                          .g = px[1],
                          .b = px[2],
                          .a = px[3],
                      }};

    if (px == px_prev)
    {
        /* Parenthesised on purpose: `*run++` would advance the pointer
         * and leave the caller's counter untouched. */
        (*run)++;
    }

    int last_pixel = x == maxX - 1 && y == (maxY - 1);

    /* Flush only when a run is actually pending. Without the inner
     * parentheses `||` escapes the `*run > 0` guard and a run of 0 would be
     * emitted as QOI_RUN_8 | -1 on every pixel change. */
    if (*run > 0 && (*run == 0x2020 || px != px_prev || last_pixel))
    {
        if (*run < 33)
        {
            data[(*cur)++] = QOI_RUN_8 | (*run - 1);
        }
        else
        {
            *run -= 33;
            data[(*cur)++] = QOI_RUN_16 | (*run >> 8);
            data[(*cur)++] = *run & 0xFF;
        }
        *run = 0;
    }

    if (px != px_prev)
    {
        int index_pos = QOI_COLOR_HASH(px_);
        if (index[index_pos * 4] == px)
        {
            data[(*cur)++] = QOI_INDEX | index_pos;
        }
        else
        {
            pixel_cpy(index[index_pos * 4], px, 4);
            int vr = px[0] - px_prev[0];
            int vg = px[1] - px_prev[1];
            int vb = px[2] - px_prev[2];
            int va = px[3] - px_prev[3];

            if (
                vr > -17 && vr < 16 &&
                vg > -17 && vg < 16 &&
                vb > -17 && vb < 16 &&
                va > -17 && va < 16)
            {
                if (
                    va == 0 &&
                    vr > -3 && vr < 2 &&
                    vg > -3 && vg < 2 &&
                    vb > -3 && vb < 2)
                {
                    /* 2 bits per colour channel, biased by 2. */
                    data[(*cur)++] = QOI_DIFF_8 | ((vr + 2) << 4) | ((vg + 2) << 2) | (vb + 2);
                }
                else if (
                    va == 0 &&
                    vr > -17 && vr < 16 &&
                    vg > -9 && vg < 8 &&
                    vb > -9 && vb < 8)
                {
                    /* 5 bits red (bias 16), 4 bits green and blue (bias 8). */
                    data[(*cur)++] = QOI_DIFF_16 | (vr + 16);
                    data[(*cur)++] = ((vg + 8) << 4) | (vb + 8);
                }
                else
                {
                    /* 5 bits per channel (bias 16), packed across 3 bytes.
                     * Byte 2 carries the top two bits of blue, so byte 3
                     * must carry its low three bits above the 5-bit alpha:
                     * shift by 5. The previous shift by 7 kept only one of
                     * those three bits. */
                    data[(*cur)++] = QOI_DIFF_24 | ((vr + 16) >> 1);
                    data[(*cur)++] = ((vr + 16) << 7) | ((vg + 16) << 2) | ((vb + 16) >> 3);
                    data[(*cur)++] = ((vb + 16) << 5) | (va + 16);
                }
            }
            else
            {
                /* Flag byte says which channels follow verbatim. */
                data[(*cur)++] = QOI_COLOR | (vr ? 8 : 0) | (vg ? 4 : 0) | (vb ? 2 : 0) | (va ? 1 : 0);
                if (vr)
                {
                    data[(*cur)++] = px[0];
                }
                if (vg)
                {
                    data[(*cur)++] = px[1];
                }
                if (vb)
                {
                    data[(*cur)++] = px[2];
                }
                if (va)
                {
                    data[(*cur)++] = px[3];
                }
            }
        }
        /* The former `px_prev = px;` here only changed the local copy of
         * the parameter and had no effect; see the header comment. */
    }
    return 1;
}
#endif

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions