This repository was archived by the owner on Dec 1, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 115
This repository was archived by the owner on Dec 1, 2021. It is now read-only.
Took forever to generate the goasm #30
Copy link
Copy link
Open
Description
Hi, I am testing c2goasm. It works for a simple function, but for a bigger one it seems to take forever. I wonder whether there is a problem in my config or whether this is a bug?
build command: clang -S -O3 -masm=intel -mno-red-zone -mstackrealign -mllvm -inline-threshold=1000 -fno-asynchronous-unwind-tables -fno-exceptions -fno-rtti $1
Generated CLang ASM
.text
.intel_syntax noprefix
.file "encoder.c"
# qoi_write_32
# Stores the 32-bit value held in edx into a byte buffer, most-significant
# byte first, advancing a 32-bit position counter that lives in memory.
#   rdi = base address of the destination byte buffer
#   rsi = address of the 32-bit position counter
#   edx = value to store
# The counter is re-read from [rsi] before every byte and written back
# incremented by one (four increments in total).
# Scratch registers written: rax, rcx, r8.
.globl qoi_write_32 # -- Begin function qoi_write_32
.p2align 4, 0x90
.type qoi_write_32,@function
qoi_write_32: # @qoi_write_32
# %bb.0:
# Frame setup; "and rsp, -8" rounds rsp down to a multiple of 8.
push rbp
mov rbp, rsp
and rsp, -8
# First byte: bits 31..24 of the value, stored at buffer[counter].
mov eax, edx
shr eax, 24
movsxd r8, dword ptr [rsi]
lea ecx, [r8 + 1]
mov dword ptr [rsi], ecx
mov byte ptr [rdi + r8], al
# Second byte: bits 23..16 (low byte of edx >> 16).
mov eax, edx
shr eax, 16
movsxd r8, dword ptr [rsi]
lea ecx, [r8 + 1]
mov dword ptr [rsi], ecx
mov byte ptr [rdi + r8], al
# Third byte: bits 15..8, taken directly from dh.
movsxd rax, dword ptr [rsi]
lea ecx, [rax + 1]
mov dword ptr [rsi], ecx
mov byte ptr [rdi + rax], dh
# Fourth byte: bits 7..0, taken directly from dl.
movsxd rax, dword ptr [rsi]
lea ecx, [rax + 1]
mov dword ptr [rsi], ecx
mov byte ptr [rdi + rax], dl
# Tear down the frame and return.
mov rsp, rbp
pop rbp
ret
.Lfunc_end0:
.size qoi_write_32, .Lfunc_end0-qoi_write_32
# -- End function
# qoi_read_32
# Reads four consecutive bytes from a byte buffer and returns them in eax
# combined most-significant byte first, advancing a 32-bit position counter
# that lives in memory.
#   rdi = base address of the source byte buffer
#   rsi = address of the 32-bit position counter
#   eax = (b0 << 24) | (b1 << 16) | (b2 << 8) | b3 on return
# The counter is loaded once into rcx; the values counter+1 .. counter+4 are
# stored back to [rsi] one after another.
# Scratch registers written: rax, rcx, rdx, r8, r9.
.globl qoi_read_32 # -- Begin function qoi_read_32
.p2align 4, 0x90
.type qoi_read_32,@function
qoi_read_32: # @qoi_read_32
# %bb.0:
# Frame setup; "and rsp, -8" rounds rsp down to a multiple of 8.
push rbp
mov rbp, rsp
and rsp, -8
# rcx = starting position (sign-extended from 32 bits).
movsxd rcx, dword ptr [rsi]
# r8d = b0 (zero-extended); counter = start + 1.
lea rax, [rcx + 1]
mov dword ptr [rsi], eax
movzx r8d, byte ptr [rdi + rcx]
# r9d = b1; counter = start + 2.
lea rax, [rcx + 2]
mov dword ptr [rsi], eax
movzx r9d, byte ptr [rdi + rcx + 1]
# eax = b2; counter = start + 3.
lea rax, [rcx + 3]
mov dword ptr [rsi], eax
movzx eax, byte ptr [rdi + rcx + 2]
# ecx = b3; counter = start + 4 (rcx is overwritten by the load itself).
lea edx, [rcx + 4]
mov dword ptr [rsi], edx
movzx ecx, byte ptr [rdi + rcx + 3]
# Assemble the result in eax.
shl r8d, 24
shl r9d, 16
or r9d, r8d
shl eax, 8
or eax, r9d
or eax, ecx
# Tear down the frame and return.
mov rsp, rbp
pop rbp
ret
.Lfunc_end1:
.size qoi_read_32, .Lfunc_end1-qoi_read_32
# -- End function
# pixel_cpy
# Copies rdx bytes from [rsi] to [rdi] in ascending address order.
#   rdi = destination, rsi = source, rdx = byte count
#   eax = 1 on return (every path leaves through .LBB2_20)
# Strategy visible in the code:
#   - count == 0: return immediately.
#   - count < 32, or the two ranges overlap: scalar byte copy
#     (count & 7 single bytes first, then 8 bytes per iteration).
#   - otherwise: unaligned 16-byte SSE moves (128 bytes per iteration, then
#     32 bytes per iteration), followed by the scalar path for count & 31.
# Scratch registers written: rax, rcx, rdx, rsi, rdi, r8, r9, xmm0, xmm1.
.globl pixel_cpy # -- Begin function pixel_cpy
.p2align 4, 0x90
.type pixel_cpy,@function
pixel_cpy: # @pixel_cpy
# %bb.0:
# Frame setup; "and rsp, -8" rounds rsp down to a multiple of 8.
push rbp
mov rbp, rsp
and rsp, -8
# Nothing to copy when the count is zero.
test rdx, rdx
je .LBB2_20
# %bb.1:
# Fewer than 32 bytes: go straight to the scalar copy.
cmp rdx, 32
jb .LBB2_13
# %bb.2:
# Overlap test, part 1: source end <= destination start means no overlap.
lea rax, [rsi + rdx]
cmp rax, rdi
jbe .LBB2_4
# %bb.3:
# Overlap test, part 2: destination end <= source start means no overlap.
# If neither holds the ranges overlap and the scalar copy below is used.
lea rax, [rdi + rdx]
cmp rax, rsi
jbe .LBB2_4
.LBB2_13:
# Scalar copy setup: r8 = count - 1, r9 = count & 7 (leading single bytes).
lea r8, [rdx - 1]
mov r9, rdx
and r9, 7
je .LBB2_17
.LBB2_14:
# Copy r9 bytes one at a time; rcx is the running offset.
xor ecx, ecx
.p2align 4, 0x90
.LBB2_15: # =>This Inner Loop Header: Depth=1
movzx eax, byte ptr [rsi + rcx]
mov byte ptr [rdi + rcx], al
add rcx, 1
cmp r9, rcx
jne .LBB2_15
# %bb.16:
# Account for the bytes just copied: shrink the count, advance both pointers.
sub rdx, rcx
add rsi, rcx
add rdi, rcx
.LBB2_17:
# r8 (count - 1 before the single-byte loop) below 7 means the whole copy
# was at most 7 bytes, so nothing is left for the 8-byte loop.
cmp r8, 7
jb .LBB2_20
# %bb.18:
# Remaining count in rdx is a multiple of 8: copy 8 bytes per iteration.
xor eax, eax
.p2align 4, 0x90
.LBB2_19: # =>This Inner Loop Header: Depth=1
movzx ecx, byte ptr [rsi + rax]
mov byte ptr [rdi + rax], cl
movzx ecx, byte ptr [rsi + rax + 1]
mov byte ptr [rdi + rax + 1], cl
movzx ecx, byte ptr [rsi + rax + 2]
mov byte ptr [rdi + rax + 2], cl
movzx ecx, byte ptr [rsi + rax + 3]
mov byte ptr [rdi + rax + 3], cl
movzx ecx, byte ptr [rsi + rax + 4]
mov byte ptr [rdi + rax + 4], cl
movzx ecx, byte ptr [rsi + rax + 5]
mov byte ptr [rdi + rax + 5], cl
movzx ecx, byte ptr [rsi + rax + 6]
mov byte ptr [rdi + rax + 6], cl
movzx ecx, byte ptr [rsi + rax + 7]
mov byte ptr [rdi + rax + 7], cl
add rax, 8
cmp rdx, rax
jne .LBB2_19
jmp .LBB2_20
.LBB2_4:
# Vector copy setup (ranges do not overlap, count >= 32):
#   r8  = count rounded down to a multiple of 32
#   rcx = number of 32-byte chunks = ((r8 - 32) >> 5) + 1
#   r9  = rcx & 3, the chunks left over after the 4-chunk loop
mov r8, rdx
and r8, -32
lea rax, [r8 - 32]
mov rcx, rax
shr rcx, 5
add rcx, 1
mov r9d, ecx
and r9d, 3
# r8 - 32 >= 96 means at least four chunks, so the 128-byte loop can run.
cmp rax, 96
jae .LBB2_6
# %bb.5:
# Fewer than four chunks: start the 32-byte loop at offset 0.
xor eax, eax
jmp .LBB2_8
.LBB2_6:
# rcx = chunk count rounded down to a multiple of 4; rax = running offset.
sub rcx, r9
xor eax, eax
.p2align 4, 0x90
.LBB2_7: # =>This Inner Loop Header: Depth=1
# Copy 128 bytes per iteration with unaligned 16-byte loads/stores.
movups xmm0, xmmword ptr [rsi + rax]
movups xmm1, xmmword ptr [rsi + rax + 16]
movups xmmword ptr [rdi + rax], xmm0
movups xmmword ptr [rdi + rax + 16], xmm1
movups xmm0, xmmword ptr [rsi + rax + 32]
movups xmm1, xmmword ptr [rsi + rax + 48]
movups xmmword ptr [rdi + rax + 32], xmm0
movups xmmword ptr [rdi + rax + 48], xmm1
movups xmm0, xmmword ptr [rsi + rax + 64]
movups xmm1, xmmword ptr [rsi + rax + 80]
movups xmmword ptr [rdi + rax + 64], xmm0
movups xmmword ptr [rdi + rax + 80], xmm1
movups xmm0, xmmword ptr [rsi + rax + 96]
movups xmm1, xmmword ptr [rsi + rax + 112]
movups xmmword ptr [rdi + rax + 96], xmm0
movups xmmword ptr [rdi + rax + 112], xmm1
# "sub rax, -128" adds 128 to the offset; rcx counts down four chunks.
sub rax, -128
add rcx, -4
jne .LBB2_7
.LBB2_8:
# Skip the 32-byte loop when no leftover chunks remain.
test r9, r9
je .LBB2_11
# %bb.9:
# Bias the offset by 16 and negate r9 so the loop can count up to zero.
add rax, 16
neg r9
.p2align 4, 0x90
.LBB2_10: # =>This Inner Loop Header: Depth=1
# Copy one 32-byte chunk per iteration.
movups xmm0, xmmword ptr [rsi + rax - 16]
movups xmm1, xmmword ptr [rsi + rax]
movups xmmword ptr [rdi + rax - 16], xmm0
movups xmmword ptr [rdi + rax], xmm1
add rax, 32
inc r9
jne .LBB2_10
.LBB2_11:
# Done if the count was an exact multiple of 32; otherwise copy the tail.
cmp r8, rdx
jne .LBB2_12
.LBB2_20:
# Common exit: return 1.
mov eax, 1
mov rsp, rbp
pop rbp
ret
.LBB2_12:
# Tail after the vector copy: rdx = count & 31, pointers advanced past the
# vector-copied bytes, then re-enter the scalar path with fresh r8/r9.
and edx, 31
add rsi, r8
add rdi, r8
lea r8, [rdx - 1]
mov r9, rdx
and r9, 7
jne .LBB2_14
jmp .LBB2_17
.Lfunc_end2:
.size pixel_cpy, .Lfunc_end2-pixel_cpy
# -- End function
# qoi_pixel_encoder
# Emits the encoded bytes for one pixel into a byte buffer and returns 1 in
# eax on every path. rbx and r12-r15 are saved on entry and restored on exit.
#
# Register/stack roles below assume the System V AMD64 calling convention
# applied to the 10-parameter C prototype of qoi_pixel_encoder
# (data, cur, run, x, y, maxX, maxY, px_prev, px, index) -- TODO confirm:
#   rdi = data   rsi = cur   rdx = run   ecx = x   r8d = y   r9d = maxX
#   [rbp + 16] = maxY   [rbp + 24] = px_prev   [rbp + 32] = px
#   [rbp + 40] = index
#
# NOTE(review): this function never stores to [rsi]; output positions are
# read from successive dwords starting at rsi and rsi itself is stepped.
# Likewise the run counter is read at rdx + 4 * (px == px_prev) instead of
# being incremented in place. Confirm this matches the intended C semantics.
.globl qoi_pixel_encoder # -- Begin function qoi_pixel_encoder
.p2align 4, 0x90
.type qoi_pixel_encoder,@function
qoi_pixel_encoder: # @qoi_pixel_encoder
# %bb.0:
# Frame setup: save callee-saved registers, round rsp down to a multiple of 8.
push rbp
mov rbp, rsp
push r15
push r14
push r13
push r12
push rbx
and rsp, -8
# ebx = maxY, r10 = px_prev, r11 = px (stack arguments).
mov ebx, dword ptr [rbp + 16]
mov r10, qword ptr [rbp + 24]
mov r11, qword ptr [rbp + 32]
# eax = 1 when px and px_prev are the same address, else 0.
xor eax, eax
cmp r11, r10
sete al
# cl = 1 when x == maxX - 1 and y == maxY - 1 (both XOR results are zero).
add r9d, -1
xor r9d, ecx
add ebx, -1
xor ebx, r8d
or ebx, r9d
sete cl
# Load the four bytes at px into r9b, r13b, r14b, r8b.
mov r9b, byte ptr [r11]
mov r13b, byte ptr [r11 + 1]
mov r14b, byte ptr [r11 + 2]
mov r8b, byte ptr [r11 + 3]
# ebx = dword at run + 4 * eax, i.e. run[0] or run[1] depending on the
# address comparison above.
mov ebx, dword ptr [rdx + 4*rax]
# Take the flush path (.LBB3_3) when px != px_prev ...
cmp r11, r10
jne .LBB3_3
# %bb.1:
# ... or when this is the last pixel ...
test cl, cl
jne .LBB3_3
# %bb.2:
# ... or when the count read above equals 8224 (0x2020); otherwise skip it.
cmp ebx, 8224
jne .LBB3_7
.LBB3_3:
# rax becomes the byte offset (0 or 4) of the run element read above.
# Counts up to 32 use the one-byte form, larger counts the two-byte form.
shl rax, 2
cmp ebx, 32
jg .LBB3_5
# %bb.4:
# One-byte form: bl = (count - 1) | 64; one position dword is consumed
# (r12 = 1) and r15 points at the dword holding the output position.
add bl, -1
or bl, 64
mov r12d, 1
mov r15, rsi
jmp .LBB3_6
.LBB3_5:
# Two-byte form: count - 33 is stored back to the run element. The first
# byte, ((count - 33) >> 8) | 96, goes to data[cur[0]]; the second byte is
# the low byte of the stored value and is written by the shared tail at
# data[cur[1]]. Two position dwords are consumed (r12 = 2).
add ebx, -33
mov dword ptr [rdx + rax], ebx
shr ebx, 8
or bl, 96
lea r15, [rsi + 4]
movsxd rcx, dword ptr [rsi]
mov byte ptr [rdi + rcx], bl
mov bl, byte ptr [rdx + rax]
mov r12d, 2
.LBB3_6:
# Shared tail: step rsi past the consumed position dwords, store bl at
# data[*r15], then reset the run element to zero.
lea rsi, [rsi + 4*r12]
movsxd rcx, dword ptr [r15]
mov byte ptr [rdi + rcx], bl
mov dword ptr [rdx + rax], 0
.LBB3_7:
# Nothing more to emit when px and px_prev are the same address.
cmp r11, r10
je .LBB3_25
# %bb.8:
# r13b = XOR of the four pixel bytes. It is used unmasked (0..255) as the
# table slot: rax = qword at index + 32 * r13b.
mov rax, qword ptr [rbp + 40]
xor r13b, r9b
xor r13b, r14b
xor r13b, r8b
movzx ecx, r13b
shl rcx, 5
mov rax, qword ptr [rax + rcx]
# If that table pointer is px itself, emit the single byte r13b.
cmp rax, r11
je .LBB3_26
# %bb.9:
# Copy the four bytes at px into the memory the table pointer refers to.
mov cl, byte ptr [r11]
mov byte ptr [rax], cl
mov cl, byte ptr [r11 + 1]
mov byte ptr [rax + 1], cl
mov cl, byte ptr [r11 + 2]
mov byte ptr [rax + 2], cl
mov cl, byte ptr [r11 + 3]
mov byte ptr [rax + 3], cl
# Signed per-byte differences d[i] = px[i] - px_prev[i]:
#   ecx = d0, r9d = d1, edx = d2, r8d = d3.
# rdx no longer holds the run pointer from here on.
movsx ecx, byte ptr [r11]
movsx eax, byte ptr [r10]
sub ecx, eax
movsx r9d, byte ptr [r11 + 1]
movsx eax, byte ptr [r10 + 1]
sub r9d, eax
movsx edx, byte ptr [r11 + 2]
movsx eax, byte ptr [r10 + 2]
sub edx, eax
movsx r8d, byte ptr [r11 + 3]
movsx eax, byte ptr [r10 + 3]
sub r8d, eax
# r15d = d0 + 16, r14d = d2 + 16, r10d = d3 + 16. The OR of all four biased
# values is below 32 (unsigned) only when every d[i] lies in [-16, 15].
lea r15d, [rcx + 16]
lea eax, [r9 + 16]
or eax, r15d
lea r14d, [rdx + 16]
lea r10d, [r8 + 16]
mov ebx, r14d
or ebx, r10d
or ebx, eax
cmp ebx, 32
jae .LBB3_16
# %bb.10:
# r11d = d2 + 2 (r11 stops being the px pointer on this path); a value
# above 3 (unsigned) means d2 is outside [-2, 1].
lea r11d, [rdx + 2]
cmp r11d, 3
ja .LBB3_13
# %bb.11:
# The one-byte form also needs d0 and d1 in [-2, 1] and d3 == 0.
lea eax, [rcx + 2]
lea ebx, [r9 + 2]
or ebx, eax
and ebx, -4
or ebx, r8d
jne .LBB3_13
# %bb.12:
# One-byte form: 0x80 | ((d0 + 2) << 4) | ((d1 + 2) << 2) | (d2 + 2),
# stored at data[*cur].
shl ecx, 4
add ecx, 32
lea eax, [4*r9 + 8]
or eax, ecx
or eax, r11d
or al, -128
movsxd rcx, dword ptr [rsi]
mov byte ptr [rdi + rcx], al
jmp .LBB3_25
.LBB3_26:
# Table hit: store the hash byte r13b at data[*cur].
movsxd rax, dword ptr [rsi]
mov byte ptr [rdi + rax], r13b
jmp .LBB3_25
.LBB3_16:
# At least one difference is outside [-16, 15]. Header byte =
# 0xF0 | (d0 != 0) << 3 | (d1 != 0) << 2 | (d2 != 0) << 1 | (d3 != 0),
# stored at data[cur[0]]. Each non-zero difference is then followed by the
# raw px byte, written at the position read from the next dword of cur.
test ecx, ecx
setne al
shl al, 3
test r9d, r9d
setne bl
shl bl, 2
or bl, al
test edx, edx
setne al
add al, al
or al, bl
test r8d, r8d
setne bl
or bl, al
or bl, -16
movsxd rax, dword ptr [rsi]
mov byte ptr [rdi + rax], bl
test ecx, ecx
je .LBB3_17
# %bb.18:
# d0 != 0: store px[0] at data[cur[1]]; rsi then points at cur[2].
mov al, byte ptr [r11]
movsxd rcx, dword ptr [rsi + 4]
add rsi, 8
mov byte ptr [rdi + rcx], al
test r9d, r9d
je .LBB3_21
.LBB3_20:
# d1 != 0: store px[1] and step rsi by one dword.
mov al, byte ptr [r11 + 1]
movsxd rcx, dword ptr [rsi]
add rsi, 4
mov byte ptr [rdi + rcx], al
.LBB3_21:
test edx, edx
je .LBB3_23
# %bb.22:
# d2 != 0: store px[2] and step rsi by one dword.
mov al, byte ptr [r11 + 2]
movsxd rcx, dword ptr [rsi]
add rsi, 4
mov byte ptr [rdi + rcx], al
.LBB3_23:
test r8d, r8d
je .LBB3_25
# %bb.24:
# d3 != 0: store px[3].
mov al, byte ptr [r11 + 3]
movsxd rcx, dword ptr [rsi]
mov byte ptr [rdi + rcx], al
jmp .LBB3_25
.LBB3_13:
# edx becomes d2 + 8. The two-byte form needs d1 and d2 in [-8, 7] and
# d3 == 0; otherwise fall through to the three-byte form.
lea eax, [r9 + 8]
add edx, 8
or eax, edx
and eax, -16
or eax, r8d
je .LBB3_14
# %bb.15:
# Three-byte form, written at data[cur[0]], data[cur[1]], data[cur[2]]:
#   first byte  = 0xE0 | ((d0 + 16) >> 1)
#   second byte = low 8 bits of ((d0 + 16) << 7) | ((d1 + 16) << 2) | ((d2 + 16) >> 3)
#   third byte  = low 8 bits of ((d2 + 16) << 7) | (d3 + 16)
# NOTE(review): with a shift of 7 only bit 0 of (d2 + 16) lands in the third
# byte; bits 1-2 of that value are not stored in any of the three bytes.
mov eax, r15d
shr al
or al, -32
movsxd rcx, dword ptr [rsi]
mov byte ptr [rdi + rcx], al
shl r15d, 7
lea eax, [4*r9 + 64]
or eax, r15d
mov ecx, r14d
shr ecx, 3
or ecx, eax
movsxd rax, dword ptr [rsi + 4]
mov byte ptr [rdi + rax], cl
shl r14d, 7
or r10d, r14d
movsxd rax, dword ptr [rsi + 8]
mov byte ptr [rdi + rax], r10b
jmp .LBB3_25
.LBB3_17:
# d0 == 0: no px[0] byte; rsi steps to cur[1] and the d1 test is repeated.
add rsi, 4
test r9d, r9d
jne .LBB3_20
jmp .LBB3_21
.LBB3_14:
# Two-byte form, written at data[cur[0]] and data[cur[1]]:
#   first byte  = 0xC0 | (d0 + 16)
#   second byte = ((d1 + 8) << 4) | (d2 + 8)
# ("sub r9d, -128" adds 128, i.e. 8 << 4.)
or r15b, -64
movsxd rax, dword ptr [rsi]
mov byte ptr [rdi + rax], r15b
shl r9d, 4
sub r9d, -128
or edx, r9d
movsxd rax, dword ptr [rsi + 4]
mov byte ptr [rdi + rax], dl
.LBB3_25:
# Common exit: return 1. rsp is reset to just below the five saved registers
# (5 * 8 = 40 bytes under rbp) before they are popped.
mov eax, 1
lea rsp, [rbp - 40]
pop rbx
pop r12
pop r13
pop r14
pop r15
pop rbp
ret
.Lfunc_end3:
.size qoi_pixel_encoder, .Lfunc_end3-qoi_pixel_encoder
# -- End function
.ident "clang version 10.0.0-4ubuntu1 "
.section ".note.GNU-stack","",@progbits
.addrsig
C code
#ifndef QOI_ENCODER_
#define QOI_ENCODER_
#include <stddef.h>
#include "qoi.h"
// Copy `sz` bytes from `src` to `dst`, one byte at a time in ascending
// address order. A zero `sz` copies nothing.
void pixel_cpy(char *dst, char *src, size_t sz)
{
    size_t i;

    for (i = 0; i < sz; ++i)
    {
        dst[i] = src[i];
    }
}
// Encode one pixel into the output byte stream.
//
//   data      output byte buffer
//   cur       points to the write offset into `data`; the offset is advanced
//             by one for every byte emitted
//   run       points to the count of consecutive repeated pixels
//   x, y      coordinates of this pixel
//   maxX/maxY image dimensions; (maxX - 1, maxY - 1) is the last pixel
//   px_prev   previous pixel (4 bytes); compared with `px` by address
//   px        current pixel, 4 bytes read as r, g, b, a
//   index     table of previously seen pixels; slot i is index[i * 4]
//
// Always returns 1.
//
// Fixes relative to the previous revision:
//   - `*run++` / `*cur++` advanced the POINTERS (postfix ++ binds tighter
//     than unary *); the pointed-to counter/offset is now incremented.
//   - the run flush is only taken when a run is actually pending
//     (`*run > 0 && (...)`); before, `&&` bound tighter than `||` and a
//     zero-length run was flushed on every non-repeated pixel.
//   - the colour hash is reduced to 0..63 before it is used as a table slot
//     and as the payload of the index tag.
//   - the third DIFF_24 byte keeps the low three bits of (vb + 16) by
//     shifting by 5; shifting by 7 dropped two of them.
//   - the dead store `px_prev = px;` (assignment to a by-value parameter) is
//     removed; the caller has to track the previous pixel itself.
int qoi_pixel_encoder(
    char *data, int *cur, int *run,
    const int x, const int y,
    const int maxX, const int maxY,
    const char *px_prev, char *px,
    char **index) // [64][4]
{
    qoi_rgba_t px_ = {.rgba = {
                          .r = px[0],
                          .g = px[1],
                          .b = px[2],
                          .a = px[3],
                      }};
    // NOTE(review): pixels are compared by address, not by content - confirm
    // that callers pass the same pointer for a repeated pixel.
    const int same_as_prev = (px == px_prev);
    const int last_pixel = (x == maxX - 1) && (y == maxY - 1);

    if (same_as_prev)
    {
        (*run)++;
    }

    // Flush a pending run when it reaches its maximum length (0x2020), when
    // the pixel changes, or at the end of the image.
    if (*run > 0 && (*run == 0x2020 || !same_as_prev || last_pixel))
    {
        if (*run < 33)
        {
            data[(*cur)++] = QOI_RUN_8 | (*run - 1);
        }
        else
        {
            *run -= 33;
            data[(*cur)++] = QOI_RUN_16 | (*run >> 8);
            data[(*cur)++] = *run & 0xFF;
        }
        *run = 0;
    }

    if (!same_as_prev)
    {
        // Masked so the value fits the 64-slot table and the 6-bit payload.
        int index_pos = (QOI_COLOR_HASH(px_)) & 63;

        // NOTE(review): this compares the stored pointer with `px`, not the
        // four pixel bytes - confirm that is the intended cache-hit test.
        if (index[index_pos * 4] == px)
        {
            data[(*cur)++] = QOI_INDEX | index_pos;
        }
        else
        {
            pixel_cpy(index[index_pos * 4], px, 4);

            int vr = px[0] - px_prev[0];
            int vg = px[1] - px_prev[1];
            int vb = px[2] - px_prev[2];
            int va = px[3] - px_prev[3];

            if (
                vr > -17 && vr < 16 &&
                vg > -17 && vg < 16 &&
                vb > -17 && vb < 16 &&
                va > -17 && va < 16)
            {
                if (
                    va == 0 &&
                    vr > -3 && vr < 2 &&
                    vg > -3 && vg < 2 &&
                    vb > -3 && vb < 2)
                {
                    // 2 bits per colour channel.
                    data[(*cur)++] = QOI_DIFF_8 | ((vr + 2) << 4) | ((vg + 2) << 2) | (vb + 2);
                }
                else if (
                    va == 0 &&
                    vr > -17 && vr < 16 &&
                    vg > -9 && vg < 8 &&
                    vb > -9 && vb < 8)
                {
                    // 5 bits red, 4 bits green, 4 bits blue.
                    data[(*cur)++] = QOI_DIFF_16 | (vr + 16);
                    data[(*cur)++] = ((vg + 8) << 4) | (vb + 8);
                }
                else
                {
                    // 5 bits per channel packed into 20 bits after the tag:
                    // rrrr | r ggggg bb | bbb aaaaa
                    data[(*cur)++] = QOI_DIFF_24 | ((vr + 16) >> 1);
                    data[(*cur)++] = ((vr + 16) << 7) | ((vg + 16) << 2) | ((vb + 16) >> 3);
                    data[(*cur)++] = ((vb + 16) << 5) | (va + 16);
                }
            }
            else
            {
                // Flag nibble says which channels follow as raw bytes.
                data[(*cur)++] = QOI_COLOR | (vr ? 8 : 0) | (vg ? 4 : 0) | (vb ? 2 : 0) | (va ? 1 : 0);
                if (vr)
                {
                    data[(*cur)++] = px[0];
                }
                if (vg)
                {
                    data[(*cur)++] = px[1];
                }
                if (vb)
                {
                    data[(*cur)++] = px[2];
                }
                if (va)
                {
                    data[(*cur)++] = px[3];
                }
            }
        }
    }
    return 1;
}
#endif
Metadata
Metadata
Assignees
Labels
No labels