Skip to content

Commit 5f396f2

Browse files
authored
perf: adds (some) arm NEON assembly for koalabear and babybear (#588)
1 parent fd88e8b commit 5f396f2

File tree

19 files changed

+656
-32
lines changed

19 files changed

+656
-32
lines changed

field/asm/element_31b_arm64.s

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
// Code generated by gnark-crypto/generator. DO NOT EDIT.
2+
#include "textflag.h"
3+
#include "funcdata.h"
4+
#include "go_asm.h"
5+
6+
// addVec(res, a, b *Element, n uint64) computes res = a + b lane by lane, subtracting q from a lane when that yields the smaller unsigned value
7+
// n is the number of blocks of 4 uint32 to process; when n == 0 the routine returns without touching memory
8+
TEXT ·addVec(SB), NOFRAME|NOSPLIT, $0-32
9+
LDP res+0(FP), (R0, R1) // R0 = res, R1 = a
10+
LDP b+16(FP), (R2, R3) // R2 = b, R3 = n (remaining block count)
11+
VMOVS $const_q, V3 // load the constant q into V3
12+
VDUP V3.S[0], V3.S4 // broadcast q into all 4 lanes of V3
13+
14+
loop1:
15+
CBZ R3, done2 // stop once no block is left
16+
VLD1.P 16(R1), [V0.S4] // load 4 words of a, then advance a by 16 bytes
17+
VLD1.P 16(R2), [V1.S4] // load 4 words of b, then advance b by 16 bytes
18+
VADD V0.S4, V1.S4, V1.S4 // b = a + b
19+
VSUB V3.S4, V1.S4, V2.S4 // t = b - q (wraps around to a large unsigned value when b < q)
20+
VUMIN V2.S4, V1.S4, V1.S4 // b = min(t, b), unsigned: keeps b when b < q, otherwise b - q
21+
VST1.P [V1.S4], 16(R0) // res = b, then advance res by 16 bytes
22+
SUB $1, R3, R3 // one block done
23+
JMP loop1
24+
25+
done2:
26+
RET
27+
28+
// subVec(res, a, b *Element, n uint64) computes res = a - b lane by lane, adding q back to a lane when that yields the smaller unsigned value
29+
// n is the number of blocks of 4 uint32 to process; when n == 0 the routine returns without touching memory
30+
TEXT ·subVec(SB), NOFRAME|NOSPLIT, $0-32
31+
LDP res+0(FP), (R0, R1) // R0 = res, R1 = a
32+
LDP b+16(FP), (R2, R3) // R2 = b, R3 = n (remaining block count)
33+
VMOVS $const_q, V3 // load the constant q into V3
34+
VDUP V3.S[0], V3.S4 // broadcast q into all 4 lanes of V3
35+
36+
loop3:
37+
CBZ R3, done4 // stop once no block is left
38+
VLD1.P 16(R1), [V0.S4] // load 4 words of a, then advance a by 16 bytes
39+
VLD1.P 16(R2), [V1.S4] // load 4 words of b, then advance b by 16 bytes
40+
VSUB V1.S4, V0.S4, V1.S4 // b = a - b (wraps around to a large unsigned value when a < b)
41+
VADD V1.S4, V3.S4, V2.S4 // t = b + q (undoes the wrap-around when a < b)
42+
VUMIN V2.S4, V1.S4, V1.S4 // b = min(t, b), unsigned: picks the in-range candidate (assuming a, b < q)
43+
VST1.P [V1.S4], 16(R0) // res = b, then advance res by 16 bytes
44+
SUB $1, R3, R3 // one block done
45+
JMP loop3
46+
47+
done4:
48+
RET
49+
50+
// sumVec(t *uint64, a *[]uint32, n uint64) accumulates the words of a into 4 partial 64-bit sums written at t (32 bytes are stored)
51+
// n is the number of blocks of 16 uint32 to process; when n == 0 the 4 stored sums are zero
52+
TEXT ·sumVec(SB), NOFRAME|NOSPLIT, $0-24
53+
// zeroing accumulators (V4..V7, each used as 2 x 64-bit lanes)
54+
VMOVQ $0, $0, V4
55+
VMOVQ $0, $0, V5
56+
VMOVQ $0, $0, V6
57+
VMOVQ $0, $0, V7
58+
LDP t+0(FP), (R1, R0) // R1 = t (output), R0 = a (input)
59+
MOVD n+16(FP), R2 // R2 = n (remaining block count)
60+
61+
loop5:
62+
CBZ R2, done6 // stop once no block is left
63+
64+
// blockSize is 16 uint32; we load 4 vectors of 4 uint32 at a time
65+
// (4*4)*4 = 64 bytes ~= 1 cache line
66+
// since our values are 31 bits, we can add these vectors pairwise without overflowing 32 bits
67+
// we are left with 2 vectors of 4x32 bits values
68+
// that we widen to 64 bits and accumulate in 4 accumulators of 2x64 bits each (V4..V7)
69+
// the caller is expected to combine the accumulators and reduce them mod q.
70+
71+
VLD2.P 32(R0), [V0.S4, V1.S4] // load 8 words (de-interleaved into V0, V1), advance a by 32 bytes
72+
VADD V0.S4, V1.S4, V0.S4 // a1 += a2
73+
VLD2.P 32(R0), [V2.S4, V3.S4] // load the next 8 words (de-interleaved into V2, V3), advance a by 32 bytes
74+
VADD V2.S4, V3.S4, V2.S4 // a3 += a4
75+
VUSHLL $0, V0.S2, V1.D2 // widen the 2 low words of a1 to 64 bits (V1 is reused as scratch)
76+
VADD V1.D2, V5.D2, V5.D2 // acc2 += widened low half of a1
77+
VUSHLL2 $0, V0.S4, V0.D2 // widen the 2 high words of a1 to 64 bits
78+
VADD V0.D2, V4.D2, V4.D2 // acc1 += widened high half of a1
79+
VUSHLL $0, V2.S2, V3.D2 // widen the 2 low words of a3 to 64 bits (V3 is reused as scratch)
80+
VADD V3.D2, V7.D2, V7.D2 // acc4 += widened low half of a3
81+
VUSHLL2 $0, V2.S4, V2.D2 // widen the 2 high words of a3 to 64 bits
82+
VADD V2.D2, V6.D2, V6.D2 // acc3 += widened high half of a3
83+
SUB $1, R2, R2 // one block done
84+
JMP loop5
85+
86+
done6:
87+
VADD V4.D2, V6.D2, V4.D2 // acc1 += acc3
88+
VADD V5.D2, V7.D2, V5.D2 // acc2 += acc4
89+
VST2.P [V4.D2, V5.D2], 0(R1) // store acc1 and acc2 (4 x 64-bit partial sums, interleaved) at t
90+
RET

field/babybear/doc.go

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

field/babybear/element_arm64.s

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
//go:build !purego
2+
3+
// Copyright 2020-2024 Consensys Software Inc.
4+
// Licensed under the Apache License, Version 2.0. See the LICENSE file for details.
5+
6+
// Code generated by consensys/gnark-crypto DO NOT EDIT
7+
8+
// We include the hash to force the Go compiler to recompile: 8620676634583589757
9+
#include "../asm/element_31b_arm64.s"
10+

field/babybear/vector_amd64.go

Lines changed: 2 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

field/babybear/vector_arm64.go

Lines changed: 104 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

field/babybear/vector_purego.go

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

field/generator/asm/amd64/build.go

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -221,10 +221,8 @@ func GenerateCommonASM(w io.Writer, nbWords, nbBits int, hasVector bool) error {
221221
if nbBits == 31 {
222222
return GenerateF31ASM(f, hasVector)
223223
} else {
224-
fmt.Printf("nbWords: %d, nbBits: %d\n", nbWords, nbBits)
225224
panic("not implemented")
226225
}
227-
228226
}
229227

230228
f.GenerateReduceDefine()

field/generator/asm/arm64/build.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,14 @@ func GenerateCommonASM(w io.Writer, nbWords, nbBits int, hasVector bool) error {
5555
f.WriteLn("#include \"go_asm.h\"")
5656
f.WriteLn("")
5757

58+
if nbWords == 1 {
59+
if nbBits == 31 {
60+
return GenerateF31ASM(f, hasVector)
61+
} else {
62+
panic("not implemented")
63+
}
64+
}
65+
5866
if f.NbWords%2 != 0 {
5967
panic("NbWords must be even")
6068
}
@@ -216,3 +224,15 @@ func ElementASMFileName(nbWords, nbBits int) string {
216224
}
217225
return fmt.Sprintf(nameWN, nbWords)
218226
}
227+
228+
// GenerateF31ASM emits the arm64 vector routines for 31-bit fields through f.
// When hasVector is false nothing is generated. In every case visible here the
// returned error is nil.
func GenerateF31ASM(f *FFArm64, hasVector bool) error {
229+
if !hasVector {
230+
return nil // nothing for now.
231+
}
232+
233+
// Emit the add, sub and sum vector routines, in that order.
f.generateAddVecF31()
234+
f.generateSubVecF31()
235+
f.generateSumVecF31()
236+
237+
return nil
238+
}

0 commit comments

Comments
 (0)