Skip to content

Commit 97d4827

Browse files
authored
perf(levm): add an implementation of BLAKE2B that uses NEON (#3850)
**Motivation** On ARM machines, the implementation used always falls back to the default because AVX2 is x86-specific. **Description** Implement a NEON-specific version of BLAKE2 (equivalent to the one for AVX2, but for NEON). Closes #4315
1 parent e5a44ec commit 97d4827

File tree

3 files changed

+305
-1
lines changed

3 files changed

+305
-1
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@
6060

6161
- Make `JUMPDEST` blacklist lazily generated on-demand [#3812](https://github.com/lambdaclass/ethrex/pull/3812)
6262
- Rewrite Blake2 AVX2 implementation (avoid gather instructions and better loop handling).
63+
- Add Blake2 NEON implementation.
6364

6465
### 2025-07-30
6566

Lines changed: 296 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,296 @@
1+
use std::arch::aarch64::*;
2+
3+
const BLAKE2B_IV: [u64; 12] = [
4+
0x6A09E667F3BCC908,
5+
0xBB67AE8584CAA73B,
6+
0x3C6EF372FE94F82B,
7+
0xA54FF53A5F1D36F1,
8+
0x510E527FADE682D1,
9+
0x9B05688C2B3E6C1F,
10+
0x1F83D9ABFB41BD6B,
11+
0x5BE0CD19137E2179,
12+
// Second half of blake2b_iv with inverted bits (for final block).
13+
0x510E527FADE682D1,
14+
0x9B05688C2B3E6C1F,
15+
0xE07C265404BE4294,
16+
0x5BE0CD19137E2179,
17+
];
18+
19+
pub fn blake2b_f(r: usize, h: &mut [u64; 8], m: &[u64; 16], t: &[u64; 2], f: bool) {
20+
unsafe {
21+
// Initialize local work vector.
22+
let uint64x2x4_t(h0, h1, h2, h3) = vld1q_u64_x4(h.as_ptr().cast::<u64>().add(0));
23+
let mut a = uint64x2x2_t(h0, h1);
24+
let mut b = uint64x2x2_t(h2, h3);
25+
let mut c = vld1q_u64_x2(BLAKE2B_IV.as_ptr());
26+
let mut d = vld1q_u64_x2(BLAKE2B_IV.as_ptr().add(4 + ((f as usize) << 2)));
27+
28+
// Apply block number to local work vector.
29+
d.0 = veorq_u64(d.0, vld1q_u64(t.as_ptr()));
30+
31+
if let Some(mut r) = r.checked_sub(1) {
32+
let uint64x2x4_t(m0, m1, m2, m3) = vld1q_u64_x4(m.as_ptr().add(0));
33+
let uint64x2x4_t(m4, m5, m6, m7) = vld1q_u64_x4(m.as_ptr().add(8));
34+
35+
'process: {
36+
// Round #0:
37+
// From: [0 1 2 3 4 5 6 7 8 9 A B C D E F]
38+
// Into: [0 2 4 6 1 3 5 7 E 8 A C F 9 B D]
39+
let r0a = uint64x2x2_t(vtrn1q_u64(m0, m1), vtrn1q_u64(m2, m3));
40+
let r0b = uint64x2x2_t(vtrn2q_u64(m0, m1), vtrn2q_u64(m2, m3));
41+
let r0c = uint64x2x2_t(vtrn1q_u64(m7, m4), vtrn1q_u64(m5, m6));
42+
let r0d = uint64x2x2_t(vtrn2q_u64(m7, m4), vtrn2q_u64(m5, m6));
43+
inner(&mut a, &mut b, &mut c, &mut d, r0a, r0b, r0c, r0d);
44+
r = match r.checked_sub(1) {
45+
Some(x) => x,
46+
None => break 'process,
47+
};
48+
49+
// Round #1:
50+
// From: [0 1 2 3 4 5 6 7 8 9 A B C D E F]
51+
// Into: [E 4 9 D A 8 F 6 5 1 0 B 3 C 2 7]
52+
let r1a = uint64x2x2_t(vtrn1q_u64(m7, m2), vtrn2q_u64(m4, m6));
53+
let r1b = uint64x2x2_t(vtrn1q_u64(m5, m4), vextq_u64::<1>(m7, m3));
54+
let r1c = uint64x2x2_t(vtrn2q_u64(m2, m0), vcopyq_laneq_u64::<1, 1>(m0, m5));
55+
let r1d = uint64x2x2_t(vextq_u64::<1>(m1, m6), vcopyq_laneq_u64::<1, 1>(m1, m3));
56+
inner(&mut a, &mut b, &mut c, &mut d, r1a, r1b, r1c, r1d);
57+
r = match r.checked_sub(1) {
58+
Some(x) => x,
59+
None => break 'process,
60+
};
61+
62+
// Round #2:
63+
// From: [0 1 2 3 4 5 6 7 8 9 A B C D E F]
64+
// Into: [B C 5 F 8 0 2 D 9 A 3 7 4 E 6 1]
65+
let r2a = uint64x2x2_t(vextq_u64::<1>(m5, m6), vtrn2q_u64(m2, m7));
66+
let r2b = uint64x2x2_t(vtrn1q_u64(m4, m0), vcopyq_laneq_u64::<1, 1>(m1, m6));
67+
let r2c = uint64x2x2_t(vextq_u64::<1>(m4, m5), vtrn2q_u64(m1, m3));
68+
let r2d = uint64x2x2_t(vtrn1q_u64(m2, m7), vcopyq_laneq_u64::<1, 1>(m3, m0));
69+
inner(&mut a, &mut b, &mut c, &mut d, r2a, r2b, r2c, r2d);
70+
r = match r.checked_sub(1) {
71+
Some(x) => x,
72+
None => break 'process,
73+
};
74+
75+
// Round #3:
76+
// From: [0 1 2 3 4 5 6 7 8 9 A B C D E F]
77+
// Into: [7 3 D B 9 1 C E F 2 5 4 8 6 A 0]
78+
let r3a = uint64x2x2_t(vtrn2q_u64(m3, m1), vtrn2q_u64(m6, m5));
79+
let r3b = uint64x2x2_t(vtrn2q_u64(m4, m0), vtrn1q_u64(m6, m7));
80+
let r3c = uint64x2x2_t(vextq_u64::<1>(m7, m1), vextq_u64::<1>(m2, m2));
81+
let r3d = uint64x2x2_t(vtrn1q_u64(m4, m3), vtrn1q_u64(m5, m0));
82+
inner(&mut a, &mut b, &mut c, &mut d, r3a, r3b, r3c, r3d);
83+
r = match r.checked_sub(1) {
84+
Some(x) => x,
85+
None => break 'process,
86+
};
87+
88+
// Round #4:
89+
// From: [0 1 2 3 4 5 6 7 8 9 A B C D E F]
90+
// Into: [9 5 2 A 0 7 4 F 3 E B 6 D 1 C 8]
91+
let r4a = uint64x2x2_t(vtrn2q_u64(m4, m2), vtrn1q_u64(m1, m5));
92+
let r4b = uint64x2x2_t(
93+
vcopyq_laneq_u64::<1, 1>(m0, m3),
94+
vcopyq_laneq_u64::<1, 1>(m2, m7),
95+
);
96+
let r4c = uint64x2x2_t(vextq_u64::<1>(m1, m7), vextq_u64::<1>(m5, m3));
97+
let r4d = uint64x2x2_t(vtrn2q_u64(m6, m0), vtrn1q_u64(m6, m4));
98+
inner(&mut a, &mut b, &mut c, &mut d, r4a, r4b, r4c, r4d);
99+
r = match r.checked_sub(1) {
100+
Some(x) => x,
101+
None => break 'process,
102+
};
103+
104+
// Round #5:
105+
// From: [0 1 2 3 4 5 6 7 8 9 A B C D E F]
106+
// Into: [2 6 0 8 C A B 3 1 4 7 F 9 D 5 E]
107+
let r5a = uint64x2x2_t(vtrn1q_u64(m1, m3), vtrn1q_u64(m0, m4));
108+
let r5b = uint64x2x2_t(vtrn1q_u64(m6, m5), vtrn2q_u64(m5, m1));
109+
let r5c = uint64x2x2_t(vextq_u64::<1>(m0, m2), vtrn2q_u64(m3, m7));
110+
let r5d = uint64x2x2_t(vtrn2q_u64(m4, m6), vextq_u64::<1>(m2, m7));
111+
inner(&mut a, &mut b, &mut c, &mut d, r5a, r5b, r5c, r5d);
112+
r = match r.checked_sub(1) {
113+
Some(x) => x,
114+
None => break 'process,
115+
};
116+
117+
// Round #6:
118+
// From: [0 1 2 3 4 5 6 7 8 9 A B C D E F]
119+
// Into: [C 1 E 4 5 F D A 8 0 6 9 B 7 3 2]
120+
let r6a = uint64x2x2_t(vcopyq_laneq_u64::<1, 1>(m6, m0), vtrn1q_u64(m7, m2));
121+
let r6b = uint64x2x2_t(vtrn2q_u64(m2, m7), vextq_u64::<1>(m6, m5));
122+
let r6c = uint64x2x2_t(vtrn1q_u64(m4, m0), vcopyq_laneq_u64::<1, 1>(m3, m4));
123+
let r6d = uint64x2x2_t(vtrn2q_u64(m5, m3), vextq_u64::<1>(m1, m1));
124+
inner(&mut a, &mut b, &mut c, &mut d, r6a, r6b, r6c, r6d);
125+
r = match r.checked_sub(1) {
126+
Some(x) => x,
127+
None => break 'process,
128+
};
129+
130+
// Round #7:
131+
// From: [0 1 2 3 4 5 6 7 8 9 A B C D E F]
132+
// Into: [D 7 C 3 B E 1 9 2 5 F 8 A 0 4 6]
133+
let r7a = uint64x2x2_t(vtrn2q_u64(m6, m3), vcopyq_laneq_u64::<1, 1>(m6, m1));
134+
let r7b = uint64x2x2_t(vextq_u64::<1>(m5, m7), vtrn2q_u64(m0, m4));
135+
let r7c = uint64x2x2_t(vcopyq_laneq_u64::<1, 1>(m1, m2), vextq_u64::<1>(m7, m4));
136+
let r7d = uint64x2x2_t(vtrn1q_u64(m5, m0), vtrn1q_u64(m2, m3));
137+
inner(&mut a, &mut b, &mut c, &mut d, r7a, r7b, r7c, r7d);
138+
r = match r.checked_sub(1) {
139+
Some(x) => x,
140+
None => break 'process,
141+
};
142+
143+
// Round #8:
144+
// From: [0 1 2 3 4 5 6 7 8 9 A B C D E F]
145+
// Into: [6 E B 0 F 9 3 8 A C D 1 5 2 7 4]
146+
let r8a = uint64x2x2_t(vtrn1q_u64(m3, m7), vextq_u64::<1>(m5, m0));
147+
let r8b = uint64x2x2_t(vtrn2q_u64(m7, m4), vextq_u64::<1>(m1, m4));
148+
let r8c = uint64x2x2_t(vtrn1q_u64(m5, m6), vtrn2q_u64(m6, m0));
149+
let r8d = uint64x2x2_t(vextq_u64::<1>(m2, m1), vextq_u64::<1>(m3, m2));
150+
inner(&mut a, &mut b, &mut c, &mut d, r8a, r8b, r8c, r8d);
151+
r = match r.checked_sub(1) {
152+
Some(x) => x,
153+
None => break 'process,
154+
};
155+
156+
// Round #9:
157+
// From: [0 1 2 3 4 5 6 7 8 9 A B C D E F]
158+
// Into: [A 8 7 1 2 4 6 5 D F 9 3 0 B E C]
159+
let r9a = uint64x2x2_t(vtrn1q_u64(m5, m4), vtrn2q_u64(m3, m0));
160+
let r9b = uint64x2x2_t(vtrn1q_u64(m1, m2), vcopyq_laneq_u64::<1, 1>(m3, m2));
161+
let r9c = uint64x2x2_t(vtrn2q_u64(m6, m7), vtrn2q_u64(m4, m1));
162+
let r9d = uint64x2x2_t(vcopyq_laneq_u64::<1, 1>(m0, m5), vtrn1q_u64(m7, m6));
163+
inner(&mut a, &mut b, &mut c, &mut d, r9a, r9b, r9c, r9d);
164+
r = match r.checked_sub(1) {
165+
Some(x) => x,
166+
None => break 'process,
167+
};
168+
169+
loop {
170+
inner(&mut a, &mut b, &mut c, &mut d, r0a, r0b, r0c, r0d);
171+
r = match r.checked_sub(1) {
172+
Some(x) => x,
173+
None => break 'process,
174+
};
175+
176+
inner(&mut a, &mut b, &mut c, &mut d, r1a, r1b, r1c, r1d);
177+
r = match r.checked_sub(1) {
178+
Some(x) => x,
179+
None => break 'process,
180+
};
181+
182+
inner(&mut a, &mut b, &mut c, &mut d, r2a, r2b, r2c, r2d);
183+
r = match r.checked_sub(1) {
184+
Some(x) => x,
185+
None => break 'process,
186+
};
187+
188+
inner(&mut a, &mut b, &mut c, &mut d, r3a, r3b, r3c, r3d);
189+
r = match r.checked_sub(1) {
190+
Some(x) => x,
191+
None => break 'process,
192+
};
193+
194+
inner(&mut a, &mut b, &mut c, &mut d, r4a, r4b, r4c, r4d);
195+
r = match r.checked_sub(1) {
196+
Some(x) => x,
197+
None => break 'process,
198+
};
199+
200+
inner(&mut a, &mut b, &mut c, &mut d, r5a, r5b, r5c, r5d);
201+
r = match r.checked_sub(1) {
202+
Some(x) => x,
203+
None => break 'process,
204+
};
205+
206+
inner(&mut a, &mut b, &mut c, &mut d, r6a, r6b, r6c, r6d);
207+
r = match r.checked_sub(1) {
208+
Some(x) => x,
209+
None => break 'process,
210+
};
211+
212+
inner(&mut a, &mut b, &mut c, &mut d, r7a, r7b, r7c, r7d);
213+
r = match r.checked_sub(1) {
214+
Some(x) => x,
215+
None => break 'process,
216+
};
217+
218+
inner(&mut a, &mut b, &mut c, &mut d, r8a, r8b, r8c, r8d);
219+
r = match r.checked_sub(1) {
220+
Some(x) => x,
221+
None => break 'process,
222+
};
223+
224+
inner(&mut a, &mut b, &mut c, &mut d, r9a, r9b, r9c, r9d);
225+
r = match r.checked_sub(1) {
226+
Some(x) => x,
227+
None => break 'process,
228+
};
229+
}
230+
}
231+
}
232+
233+
// Merge local work vector.
234+
vst1q_u64_x2(
235+
h.as_mut_ptr().add(0),
236+
uint64x2x2_t(veor3q_u64(h0, a.0, c.0), veor3q_u64(h1, a.1, c.1)),
237+
);
238+
vst1q_u64_x2(
239+
h.as_mut_ptr().add(4),
240+
uint64x2x2_t(veor3q_u64(h2, b.0, d.0), veor3q_u64(h3, b.1, d.1)),
241+
);
242+
}
243+
}
244+
245+
#[allow(clippy::too_many_arguments)]
246+
#[inline(always)]
247+
fn inner(
248+
a: &mut uint64x2x2_t,
249+
b: &mut uint64x2x2_t,
250+
c: &mut uint64x2x2_t,
251+
d: &mut uint64x2x2_t,
252+
d0: uint64x2x2_t,
253+
d1: uint64x2x2_t,
254+
d2: uint64x2x2_t,
255+
d3: uint64x2x2_t,
256+
) {
257+
unsafe {
258+
// G(d0)
259+
*a = uint64x2x2_t(vaddq_u64(a.0, b.0), vaddq_u64(a.1, b.1));
260+
*a = uint64x2x2_t(vaddq_u64(a.0, d0.0), vaddq_u64(a.1, d0.1));
261+
*d = uint64x2x2_t(vxarq_u64::<32>(d.0, a.0), vxarq_u64::<32>(d.1, a.1));
262+
*c = uint64x2x2_t(vaddq_u64(c.0, d.0), vaddq_u64(c.0, d.0));
263+
*b = uint64x2x2_t(vxarq_u64::<24>(b.0, c.0), vxarq_u64::<24>(b.1, c.1));
264+
265+
// G(d1)
266+
*a = uint64x2x2_t(vaddq_u64(a.0, b.0), vaddq_u64(a.1, b.1));
267+
*a = uint64x2x2_t(vaddq_u64(a.0, d1.0), vaddq_u64(a.1, d1.1));
268+
*d = uint64x2x2_t(vxarq_u64::<16>(d.0, a.0), vxarq_u64::<32>(d.1, a.1));
269+
*c = uint64x2x2_t(vaddq_u64(c.0, d.0), vaddq_u64(c.0, d.0));
270+
*b = uint64x2x2_t(vxarq_u64::<63>(b.0, c.0), vxarq_u64::<24>(b.1, c.1));
271+
272+
// Apply diagonalization.
273+
*b = uint64x2x2_t(vextq_u64::<1>(a.0, a.1), vextq_u64::<1>(a.1, a.0));
274+
*c = uint64x2x2_t(c.1, c.0);
275+
*d = uint64x2x2_t(vextq_u64::<1>(d.1, d.0), vextq_u64::<1>(d.0, d.1));
276+
277+
// G(d2)
278+
*a = uint64x2x2_t(vaddq_u64(a.0, b.0), vaddq_u64(a.1, b.1));
279+
*a = uint64x2x2_t(vaddq_u64(a.0, d2.0), vaddq_u64(a.1, d2.1));
280+
*d = uint64x2x2_t(vxarq_u64::<32>(d.0, a.0), vxarq_u64::<32>(d.1, a.1));
281+
*c = uint64x2x2_t(vaddq_u64(c.0, d.0), vaddq_u64(c.0, d.0));
282+
*b = uint64x2x2_t(vxarq_u64::<24>(b.0, c.0), vxarq_u64::<24>(b.1, c.1));
283+
284+
// G(d3)
285+
*a = uint64x2x2_t(vaddq_u64(a.0, b.0), vaddq_u64(a.1, b.1));
286+
*a = uint64x2x2_t(vaddq_u64(a.0, d3.0), vaddq_u64(a.1, d3.1));
287+
*d = uint64x2x2_t(vxarq_u64::<16>(d.0, a.0), vxarq_u64::<32>(d.1, a.1));
288+
*c = uint64x2x2_t(vaddq_u64(c.0, d.0), vaddq_u64(c.0, d.0));
289+
*b = uint64x2x2_t(vxarq_u64::<63>(b.0, c.0), vxarq_u64::<24>(b.1, c.1));
290+
291+
// Revert diagonalization.
292+
*b = uint64x2x2_t(vextq_u64::<1>(a.1, a.0), vextq_u64::<1>(a.0, a.1));
293+
*c = uint64x2x2_t(c.1, c.0);
294+
*d = uint64x2x2_t(vextq_u64::<1>(d.0, d.1), vextq_u64::<1>(d.1, d.0));
295+
}
296+
}

crates/common/crypto/blake2f/mod.rs

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,21 @@
11
use std::sync::LazyLock;
22

3+
#[cfg(target_arch = "aarch64")]
4+
mod aarch64;
35
mod portable;
46
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
57
mod x86_64;
68

79
type Blake2Func = fn(usize, &mut [u64; 8], &[u64; 16], &[u64; 2], bool);
810

911
static BLAKE2_FUNC: LazyLock<Blake2Func> = LazyLock::new(|| {
12+
#[cfg(target_arch = "aarch64")]
13+
if std::arch::is_aarch64_feature_detected!("neon") {
14+
return self::aarch64::blake2b_f;
15+
}
16+
1017
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
11-
if is_x86_feature_detected!("avx2") {
18+
if std::arch::is_x86_feature_detected!("avx2") {
1219
return self::x86_64::blake2b_f;
1320
}
1421

0 commit comments

Comments
 (0)