use std::arch::aarch64::*;

const BLAKE2B_IV: [u64; 12] = [
    0x6A09E667F3BCC908,
    0xBB67AE8584CAA73B,
    0x3C6EF372FE94F82B,
    0xA54FF53A5F1D36F1,
    0x510E527FADE682D1,
    0x9B05688C2B3E6C1F,
    0x1F83D9ABFB41BD6B,
    0x5BE0CD19137E2179,
    // Second half of the IV again, but with IV[6] bit-inverted. This copy is
    // selected when `f` is set, folding the final-block flag into the work
    // vector without a branch.
    0x510E527FADE682D1,
    0x9B05688C2B3E6C1F,
    0xE07C265404BE4294,
    0x5BE0CD19137E2179,
];

/// BLAKE2b compression function `F`, generalized to a caller-supplied round
/// count `r`, over the chaining value `h`, message block `m`, offset counter
/// `t` and final-block flag `f`.
///
/// The caller must ensure the CPU supports the NEON `sha3` extension, which
/// provides the `vxarq_u64` and `veor3q_u64` intrinsics used below.
pub fn blake2b_f(r: usize, h: &mut [u64; 8], m: &[u64; 16], t: &[u64; 2], f: bool) {
    unsafe {
        // Initialize the local work vector from the chaining value and the IV.
        let uint64x2x4_t(h0, h1, h2, h3) = vld1q_u64_x4(h.as_ptr());
        let mut a = uint64x2x2_t(h0, h1);
        let mut b = uint64x2x2_t(h2, h3);
        let mut c = vld1q_u64_x2(BLAKE2B_IV.as_ptr());
        let mut d = vld1q_u64_x2(BLAKE2B_IV.as_ptr().add(4 + ((f as usize) << 2)));

        // Fold the message offset counter into the work vector.
        d.0 = veorq_u64(d.0, vld1q_u64(t.as_ptr()));

        if let Some(mut r) = r.checked_sub(1) {
            let uint64x2x4_t(m0, m1, m2, m3) = vld1q_u64_x4(m.as_ptr());
            let uint64x2x4_t(m4, m5, m6, m7) = vld1q_u64_x4(m.as_ptr().add(8));
            'process: {
                // Round #0:
                // From: [0 1 2 3 4 5 6 7 8 9 A B C D E F]
                // Into: [0 2 4 6 1 3 5 7 E 8 A C F 9 B D]
                let r0a = uint64x2x2_t(vtrn1q_u64(m0, m1), vtrn1q_u64(m2, m3));
                let r0b = uint64x2x2_t(vtrn2q_u64(m0, m1), vtrn2q_u64(m2, m3));
                let r0c = uint64x2x2_t(vtrn1q_u64(m7, m4), vtrn1q_u64(m5, m6));
                let r0d = uint64x2x2_t(vtrn2q_u64(m7, m4), vtrn2q_u64(m5, m6));
                inner(&mut a, &mut b, &mut c, &mut d, r0a, r0b, r0c, r0d);
                r = match r.checked_sub(1) {
                    Some(x) => x,
                    None => break 'process,
                };

                // Round #1:
                // From: [0 1 2 3 4 5 6 7 8 9 A B C D E F]
                // Into: [E 4 9 D A 8 F 6 5 1 0 B 3 C 2 7]
                let r1a = uint64x2x2_t(vtrn1q_u64(m7, m2), vtrn2q_u64(m4, m6));
                let r1b = uint64x2x2_t(vtrn1q_u64(m5, m4), vextq_u64::<1>(m7, m3));
                let r1c = uint64x2x2_t(vtrn2q_u64(m2, m0), vcopyq_laneq_u64::<1, 1>(m0, m5));
                let r1d = uint64x2x2_t(vextq_u64::<1>(m1, m6), vcopyq_laneq_u64::<1, 1>(m1, m3));
                inner(&mut a, &mut b, &mut c, &mut d, r1a, r1b, r1c, r1d);
                r = match r.checked_sub(1) {
                    Some(x) => x,
                    None => break 'process,
                };

                // Round #2:
                // From: [0 1 2 3 4 5 6 7 8 9 A B C D E F]
                // Into: [B C 5 F 8 0 2 D 9 A 3 7 4 E 6 1]
                let r2a = uint64x2x2_t(vextq_u64::<1>(m5, m6), vtrn2q_u64(m2, m7));
                let r2b = uint64x2x2_t(vtrn1q_u64(m4, m0), vcopyq_laneq_u64::<1, 1>(m1, m6));
                let r2c = uint64x2x2_t(vextq_u64::<1>(m4, m5), vtrn2q_u64(m1, m3));
                let r2d = uint64x2x2_t(vtrn1q_u64(m2, m7), vcopyq_laneq_u64::<1, 1>(m3, m0));
                inner(&mut a, &mut b, &mut c, &mut d, r2a, r2b, r2c, r2d);
                r = match r.checked_sub(1) {
                    Some(x) => x,
                    None => break 'process,
                };

                // Round #3:
                // From: [0 1 2 3 4 5 6 7 8 9 A B C D E F]
                // Into: [7 3 D B 9 1 C E F 2 5 4 8 6 A 0]
                let r3a = uint64x2x2_t(vtrn2q_u64(m3, m1), vtrn2q_u64(m6, m5));
                let r3b = uint64x2x2_t(vtrn2q_u64(m4, m0), vtrn1q_u64(m6, m7));
                let r3c = uint64x2x2_t(vextq_u64::<1>(m7, m1), vextq_u64::<1>(m2, m2));
                let r3d = uint64x2x2_t(vtrn1q_u64(m4, m3), vtrn1q_u64(m5, m0));
                inner(&mut a, &mut b, &mut c, &mut d, r3a, r3b, r3c, r3d);
                r = match r.checked_sub(1) {
                    Some(x) => x,
                    None => break 'process,
                };

                // Round #4:
                // From: [0 1 2 3 4 5 6 7 8 9 A B C D E F]
                // Into: [9 5 2 A 0 7 4 F 3 E B 6 D 1 C 8]
                let r4a = uint64x2x2_t(vtrn2q_u64(m4, m2), vtrn1q_u64(m1, m5));
                let r4b = uint64x2x2_t(
                    vcopyq_laneq_u64::<1, 1>(m0, m3),
                    vcopyq_laneq_u64::<1, 1>(m2, m7),
                );
                let r4c = uint64x2x2_t(vextq_u64::<1>(m1, m7), vextq_u64::<1>(m5, m3));
                let r4d = uint64x2x2_t(vtrn2q_u64(m6, m0), vtrn1q_u64(m6, m4));
                inner(&mut a, &mut b, &mut c, &mut d, r4a, r4b, r4c, r4d);
                r = match r.checked_sub(1) {
                    Some(x) => x,
                    None => break 'process,
                };

                // Round #5:
                // From: [0 1 2 3 4 5 6 7 8 9 A B C D E F]
                // Into: [2 6 0 8 C A B 3 1 4 7 F 9 D 5 E]
                let r5a = uint64x2x2_t(vtrn1q_u64(m1, m3), vtrn1q_u64(m0, m4));
                let r5b = uint64x2x2_t(vtrn1q_u64(m6, m5), vtrn2q_u64(m5, m1));
                let r5c = uint64x2x2_t(vextq_u64::<1>(m0, m2), vtrn2q_u64(m3, m7));
                let r5d = uint64x2x2_t(vtrn2q_u64(m4, m6), vextq_u64::<1>(m2, m7));
                inner(&mut a, &mut b, &mut c, &mut d, r5a, r5b, r5c, r5d);
                r = match r.checked_sub(1) {
                    Some(x) => x,
                    None => break 'process,
                };

                // Round #6:
                // From: [0 1 2 3 4 5 6 7 8 9 A B C D E F]
                // Into: [C 1 E 4 5 F D A 8 0 6 9 B 7 3 2]
                let r6a = uint64x2x2_t(vcopyq_laneq_u64::<1, 1>(m6, m0), vtrn1q_u64(m7, m2));
                let r6b = uint64x2x2_t(vtrn2q_u64(m2, m7), vextq_u64::<1>(m6, m5));
                let r6c = uint64x2x2_t(vtrn1q_u64(m4, m0), vcopyq_laneq_u64::<1, 1>(m3, m4));
                let r6d = uint64x2x2_t(vtrn2q_u64(m5, m3), vextq_u64::<1>(m1, m1));
                inner(&mut a, &mut b, &mut c, &mut d, r6a, r6b, r6c, r6d);
                r = match r.checked_sub(1) {
                    Some(x) => x,
                    None => break 'process,
                };

                // Round #7:
                // From: [0 1 2 3 4 5 6 7 8 9 A B C D E F]
                // Into: [D 7 C 3 B E 1 9 2 5 F 8 A 0 4 6]
                let r7a = uint64x2x2_t(vtrn2q_u64(m6, m3), vcopyq_laneq_u64::<1, 1>(m6, m1));
                let r7b = uint64x2x2_t(vextq_u64::<1>(m5, m7), vtrn2q_u64(m0, m4));
                let r7c = uint64x2x2_t(vcopyq_laneq_u64::<1, 1>(m1, m2), vextq_u64::<1>(m7, m4));
                let r7d = uint64x2x2_t(vtrn1q_u64(m5, m0), vtrn1q_u64(m2, m3));
                inner(&mut a, &mut b, &mut c, &mut d, r7a, r7b, r7c, r7d);
                r = match r.checked_sub(1) {
                    Some(x) => x,
                    None => break 'process,
                };

                // Round #8:
                // From: [0 1 2 3 4 5 6 7 8 9 A B C D E F]
                // Into: [6 E B 0 F 9 3 8 A C D 1 5 2 7 4]
                let r8a = uint64x2x2_t(vtrn1q_u64(m3, m7), vextq_u64::<1>(m5, m0));
                let r8b = uint64x2x2_t(vtrn2q_u64(m7, m4), vextq_u64::<1>(m1, m4));
                let r8c = uint64x2x2_t(vtrn1q_u64(m5, m6), vtrn2q_u64(m6, m0));
                let r8d = uint64x2x2_t(vextq_u64::<1>(m2, m1), vextq_u64::<1>(m3, m2));
                inner(&mut a, &mut b, &mut c, &mut d, r8a, r8b, r8c, r8d);
                r = match r.checked_sub(1) {
                    Some(x) => x,
                    None => break 'process,
                };

                // Round #9:
                // From: [0 1 2 3 4 5 6 7 8 9 A B C D E F]
                // Into: [A 8 7 1 2 4 6 5 D F 9 3 0 B E C]
                let r9a = uint64x2x2_t(vtrn1q_u64(m5, m4), vtrn2q_u64(m3, m0));
                let r9b = uint64x2x2_t(vtrn1q_u64(m1, m2), vcopyq_laneq_u64::<1, 1>(m3, m2));
                let r9c = uint64x2x2_t(vtrn2q_u64(m6, m7), vtrn2q_u64(m4, m1));
                let r9d = uint64x2x2_t(vcopyq_laneq_u64::<1, 1>(m0, m5), vtrn1q_u64(m7, m6));
                inner(&mut a, &mut b, &mut c, &mut d, r9a, r9b, r9c, r9d);
                r = match r.checked_sub(1) {
                    Some(x) => x,
                    None => break 'process,
                };

                loop {
                    inner(&mut a, &mut b, &mut c, &mut d, r0a, r0b, r0c, r0d);
                    r = match r.checked_sub(1) {
                        Some(x) => x,
                        None => break 'process,
                    };

                    inner(&mut a, &mut b, &mut c, &mut d, r1a, r1b, r1c, r1d);
                    r = match r.checked_sub(1) {
                        Some(x) => x,
                        None => break 'process,
                    };

                    inner(&mut a, &mut b, &mut c, &mut d, r2a, r2b, r2c, r2d);
                    r = match r.checked_sub(1) {
                        Some(x) => x,
                        None => break 'process,
                    };

                    inner(&mut a, &mut b, &mut c, &mut d, r3a, r3b, r3c, r3d);
                    r = match r.checked_sub(1) {
                        Some(x) => x,
                        None => break 'process,
                    };

                    inner(&mut a, &mut b, &mut c, &mut d, r4a, r4b, r4c, r4d);
                    r = match r.checked_sub(1) {
                        Some(x) => x,
                        None => break 'process,
                    };

                    inner(&mut a, &mut b, &mut c, &mut d, r5a, r5b, r5c, r5d);
                    r = match r.checked_sub(1) {
                        Some(x) => x,
                        None => break 'process,
                    };

                    inner(&mut a, &mut b, &mut c, &mut d, r6a, r6b, r6c, r6d);
                    r = match r.checked_sub(1) {
                        Some(x) => x,
                        None => break 'process,
                    };

                    inner(&mut a, &mut b, &mut c, &mut d, r7a, r7b, r7c, r7d);
                    r = match r.checked_sub(1) {
                        Some(x) => x,
                        None => break 'process,
                    };

                    inner(&mut a, &mut b, &mut c, &mut d, r8a, r8b, r8c, r8d);
                    r = match r.checked_sub(1) {
                        Some(x) => x,
                        None => break 'process,
                    };

                    inner(&mut a, &mut b, &mut c, &mut d, r9a, r9b, r9c, r9d);
                    r = match r.checked_sub(1) {
                        Some(x) => x,
                        None => break 'process,
                    };
                }
            }
        }

        // Merge local work vector.
        vst1q_u64_x2(
            h.as_mut_ptr(),
            uint64x2x2_t(veor3q_u64(h0, a.0, c.0), veor3q_u64(h1, a.1, c.1)),
        );
        vst1q_u64_x2(
            h.as_mut_ptr().add(4),
            uint64x2x2_t(veor3q_u64(h2, b.0, d.0), veor3q_u64(h3, b.1, d.1)),
        );
    }
}
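
// A minimal sanity-check sketch (hypothetical `tests` module and test name, not
// part of the original change): with `r == 0` no mixing rounds run, so the
// feed-forward collapses to the IV and the offset/finalization words regardless
// of `h` and `m`. Assumes an aarch64 target with the NEON `sha3` feature.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn zero_rounds_collapse_to_iv() {
        let mut h = [0x0123_4567_89AB_CDEF_u64; 8];
        let m = [0xDEAD_BEEF_u64; 16];
        let t = [0x11, 0x22];

        blake2b_f(0, &mut h, &m, &t, false);

        // First half: h ^ a ^ c with a == h collapses to the IV.
        assert_eq!(&h[..4], &BLAKE2B_IV[..4]);
        // Second half: h ^ b ^ d with b == h collapses to IV ^ t (IV[7] untouched).
        assert_eq!(h[4], BLAKE2B_IV[4] ^ t[0]);
        assert_eq!(h[5], BLAKE2B_IV[5] ^ t[1]);
        assert_eq!(h[6], BLAKE2B_IV[6]);
        assert_eq!(h[7], BLAKE2B_IV[7]);
    }
}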

#[allow(clippy::too_many_arguments)]
#[inline(always)]
fn inner(
    a: &mut uint64x2x2_t,
    b: &mut uint64x2x2_t,
    c: &mut uint64x2x2_t,
    d: &mut uint64x2x2_t,
    d0: uint64x2x2_t,
    d1: uint64x2x2_t,
    d2: uint64x2x2_t,
    d3: uint64x2x2_t,
) {
    unsafe {
        // G(d0): first half of G applied to all four columns.
        *a = uint64x2x2_t(vaddq_u64(a.0, b.0), vaddq_u64(a.1, b.1));
        *a = uint64x2x2_t(vaddq_u64(a.0, d0.0), vaddq_u64(a.1, d0.1));
        *d = uint64x2x2_t(vxarq_u64::<32>(d.0, a.0), vxarq_u64::<32>(d.1, a.1));
        *c = uint64x2x2_t(vaddq_u64(c.0, d.0), vaddq_u64(c.1, d.1));
        *b = uint64x2x2_t(vxarq_u64::<24>(b.0, c.0), vxarq_u64::<24>(b.1, c.1));

        // G(d1): second half of G applied to all four columns.
        *a = uint64x2x2_t(vaddq_u64(a.0, b.0), vaddq_u64(a.1, b.1));
        *a = uint64x2x2_t(vaddq_u64(a.0, d1.0), vaddq_u64(a.1, d1.1));
        *d = uint64x2x2_t(vxarq_u64::<16>(d.0, a.0), vxarq_u64::<16>(d.1, a.1));
        *c = uint64x2x2_t(vaddq_u64(c.0, d.0), vaddq_u64(c.1, d.1));
        *b = uint64x2x2_t(vxarq_u64::<63>(b.0, c.0), vxarq_u64::<63>(b.1, c.1));

        // Apply diagonalization: rotate `a` right by one lane, `c` left by one
        // lane and `d` by two lanes, keeping `b` in place, so the diagonals
        // line up with the d2/d3 message groups prepared per round.
        *a = uint64x2x2_t(vextq_u64::<1>(a.1, a.0), vextq_u64::<1>(a.0, a.1));
        *c = uint64x2x2_t(vextq_u64::<1>(c.0, c.1), vextq_u64::<1>(c.1, c.0));
        *d = uint64x2x2_t(d.1, d.0);

        // G(d2): first half of G applied to all four diagonals.
        *a = uint64x2x2_t(vaddq_u64(a.0, b.0), vaddq_u64(a.1, b.1));
        *a = uint64x2x2_t(vaddq_u64(a.0, d2.0), vaddq_u64(a.1, d2.1));
        *d = uint64x2x2_t(vxarq_u64::<32>(d.0, a.0), vxarq_u64::<32>(d.1, a.1));
        *c = uint64x2x2_t(vaddq_u64(c.0, d.0), vaddq_u64(c.1, d.1));
        *b = uint64x2x2_t(vxarq_u64::<24>(b.0, c.0), vxarq_u64::<24>(b.1, c.1));

        // G(d3): second half of G applied to all four diagonals.
        *a = uint64x2x2_t(vaddq_u64(a.0, b.0), vaddq_u64(a.1, b.1));
        *a = uint64x2x2_t(vaddq_u64(a.0, d3.0), vaddq_u64(a.1, d3.1));
        *d = uint64x2x2_t(vxarq_u64::<16>(d.0, a.0), vxarq_u64::<16>(d.1, a.1));
        *c = uint64x2x2_t(vaddq_u64(c.0, d.0), vaddq_u64(c.1, d.1));
        *b = uint64x2x2_t(vxarq_u64::<63>(b.0, c.0), vxarq_u64::<63>(b.1, c.1));

        // Revert diagonalization, restoring the column layout for the next round.
        *a = uint64x2x2_t(vextq_u64::<1>(a.0, a.1), vextq_u64::<1>(a.1, a.0));
        *c = uint64x2x2_t(vextq_u64::<1>(c.1, c.0), vextq_u64::<1>(c.0, c.1));
        *d = uint64x2x2_t(d.1, d.0);
    }
}
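
// For cross-checking the vectorized rounds, a scalar sketch of the BLAKE2b G
// function from RFC 7693 (rotation constants 32, 24, 16, 63). `g_scalar` is an
// illustrative helper added here for reference, not part of the optimized path;
// each SIMD lane of `inner` performs exactly this sequence.
#[allow(dead_code)]
fn g_scalar(v: &mut [u64; 16], a: usize, b: usize, c: usize, d: usize, x: u64, y: u64) {
    v[a] = v[a].wrapping_add(v[b]).wrapping_add(x);
    v[d] = (v[d] ^ v[a]).rotate_right(32);
    v[c] = v[c].wrapping_add(v[d]);
    v[b] = (v[b] ^ v[c]).rotate_right(24);
    v[a] = v[a].wrapping_add(v[b]).wrapping_add(y);
    v[d] = (v[d] ^ v[a]).rotate_right(16);
    v[c] = v[c].wrapping_add(v[d]);
    v[b] = (v[b] ^ v[c]).rotate_right(63);
}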