|
1 | | -// +build gc,!purego |
| 1 | +// +build gc,!purego,!noasm |
2 | 2 |
|
3 | 3 | #include "textflag.h" |
4 | 4 |
|
5 | 5 | // Register allocation. |
6 | 6 | #define digest R1 |
7 | | -#define h R2 // Return value. |
8 | | -#define p R3 // Input pointer. |
| 7 | +#define h R2 // Return value. |
| 8 | +#define p R3 // Input pointer. |
9 | 9 | #define len R4 |
10 | | -#define nblocks R5 // len / 32. |
| 10 | +#define nblocks R5 // len / 32. |
11 | 11 | #define prime1 R7 |
12 | 12 | #define prime2 R8 |
13 | 13 | #define prime3 R9 |
|
22 | 22 | #define x3 R22 |
23 | 23 | #define x4 R23 |
24 | 24 |
|
25 | | -#define round(acc, x) \ |
26 | | - MADD prime2, acc, x, acc \ |
27 | | - ROR $64-31, acc \ |
28 | | - MUL prime1, acc \ |
| 25 | +#define round(acc, x) \ |
| 26 | + MADD prime2, acc, x, acc \ |
| 27 | + ROR $64-31, acc \ |
| 28 | + MUL prime1, acc \ |
29 | 29 |
|
30 | 30 | // x = round(0, x). |
31 | | -#define round0(x) \ |
32 | | - MUL prime2, x \ |
33 | | - ROR $64-31, x \ |
34 | | - MUL prime1, x \ |
| 31 | +#define round0(x) \ |
| 32 | + MUL prime2, x \ |
| 33 | + ROR $64-31, x \ |
| 34 | + MUL prime1, x \ |
35 | 35 |
|
36 | | -#define mergeRound(x) \ |
37 | | - round0(x) \ |
38 | | - EOR x, h \ |
39 | | - MADD h, prime4, prime1, h \ |
| 36 | +#define mergeRound(x) \ |
| 37 | + round0(x) \ |
| 38 | + EOR x, h \ |
| 39 | + MADD h, prime4, prime1, h \ |
40 | 40 |
|
41 | 41 | // Update v[1-4] with 32-byte blocks. Assumes len >= 32. |
42 | | -#define blocksLoop() \ |
43 | | - LSR $5, len, nblocks \ |
44 | | - PCALIGN $16 \ |
45 | | -loop: \ |
46 | | - LDP.P 32(p), (x1, x2) \ |
47 | | - round(v1, x1) \ |
48 | | - LDP -16(p), (x3, x4) \ |
49 | | - round(v2, x2) \ |
50 | | - SUB $1, nblocks \ |
51 | | - round(v3, x3) \ |
52 | | - round(v4, x4) \ |
53 | | - CBNZ nblocks, loop \ |
54 | | - |
| 42 | +#define blocksLoop() \ |
| 43 | + LSR $5, len, nblocks \ |
| 44 | + PCALIGN $16 \ |
| 45 | + loop: \ |
| 46 | + LDP.P 32(p), (x1, x2) \ |
| 47 | + round(v1, x1) \ |
| 48 | + LDP -16(p), (x3, x4) \ |
| 49 | + round(v2, x2) \ |
| 50 | + SUB $1, nblocks \ |
| 51 | + round(v3, x3) \ |
| 52 | + round(v4, x4) \ |
| 53 | + CBNZ nblocks, loop \ |
55 | 54 |
|
56 | 55 | // The primes are repeated here to ensure that they're stored |
57 | 56 | // in a contiguous array, so we can load them with LDP. |
58 | | -DATA primes<> +0(SB)/8, $11400714785074694791 |
59 | | -DATA primes<> +8(SB)/8, $14029467366897019727 |
60 | | -DATA primes<>+16(SB)/8, $1609587929392839161 |
61 | | -DATA primes<>+24(SB)/8, $9650029242287828579 |
62 | | -DATA primes<>+32(SB)/8, $2870177450012600261 |
| 57 | +DATA primes<> +0(SB)/8, $11400714785074694791 |
| 58 | +DATA primes<> +8(SB)/8, $14029467366897019727 |
| 59 | +DATA primes<>+16(SB)/8, $1609587929392839161 |
| 60 | +DATA primes<>+24(SB)/8, $9650029242287828579 |
| 61 | +DATA primes<>+32(SB)/8, $2870177450012600261 |
63 | 62 | GLOBL primes<>(SB), NOPTR+RODATA, $40 |
64 | 63 |
|
65 | | - |
66 | 64 | // func Sum64(b []byte) uint64 |
67 | 65 | TEXT ·Sum64(SB), NOFRAME+NOSPLIT, $0-32 |
68 | | - LDP b_base+0(FP), (p, len) |
| 66 | + LDP b_base+0(FP), (p, len) |
69 | 67 |
|
70 | 68 | LDP primes<> +0(SB), (prime1, prime2) |
71 | 69 | LDP primes<>+16(SB), (prime3, prime4) |
@@ -156,32 +154,31 @@ try1: |
156 | 154 |
|
157 | 155 | end: |
158 | 156 | EOR h >> 33, h |
159 | | - MUL prime2, h |
| 157 | + MUL prime2, h |
160 | 158 | EOR h >> 29, h |
161 | | - MUL prime3, h |
| 159 | + MUL prime3, h |
162 | 160 | EOR h >> 32, h |
163 | 161 |
|
164 | 162 | MOVD h, ret+24(FP) |
165 | 163 | RET |
166 | 164 |
|
167 | | - |
168 | 165 | // func writeBlocks(d *Digest, b []byte) int |
169 | 166 | // |
170 | 167 | // Assumes len(b) >= 32. |
171 | 168 | TEXT ·writeBlocks(SB), NOFRAME+NOSPLIT, $0-40 |
172 | | - LDP primes<>(SB), (prime1, prime2) |
| 169 | + LDP primes<>(SB), (prime1, prime2) |
173 | 170 |
|
174 | 171 | // Load state. Assume v[1-4] are stored contiguously. |
175 | 172 | MOVD d+0(FP), digest |
176 | | - LDP 0(digest), (v1, v2) |
| 173 | + LDP 0(digest), (v1, v2) |
177 | 174 | LDP 16(digest), (v3, v4) |
178 | 175 |
|
179 | 176 | LDP b_base+8(FP), (p, len) |
180 | 177 |
|
181 | 178 | blocksLoop() |
182 | 179 |
|
183 | 180 | // Store updated state. |
184 | | - STP (v1, v2), 0(digest) |
| 181 | + STP (v1, v2), 0(digest) |
185 | 182 | STP (v3, v4), 16(digest) |
186 | 183 |
|
187 | 184 | BIC $31, len |
|
0 commit comments