Skip to content

Commit e529fa1

Browse files
harshavardhana authored and fwessels committed
Support SHA* intrinsics on Intel CPU (#37)
* Support SHA* intrinsics on Intel CPU
  - optimise: select block function at init
  - added dedicated padding function, optimised endian conversion
  - add assembly for Intel SHA extensions
  - update benchmarks
  - streamline checksum function
  - cleanup of sha assembly code
* Cleanup code to be idiomatic Go
1 parent 5197645 commit e529fa1

22 files changed

+2818
-2230
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1 @@
1+
*.test

.travis.yml

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -7,8 +7,7 @@ os:
77

88
go:
99
- tip
10-
- 1.11
11-
- 1.10
10+
- 1.11.x
1211

1312
env:
1413
- ARCH=x86_64

cpuid.go

Lines changed: 86 additions & 60 deletions
Original file line number | Diff line number | Diff line change
@@ -16,78 +16,104 @@
1616
package sha256
1717

1818
// True when SIMD instructions are available.
19-
var avx512 = haveAVX512()
20-
var avx2 = haveAVX2()
21-
var avx = haveAVX()
22-
var ssse3 = haveSSSE3()
19+
var avx512 bool
20+
var avx2 bool
21+
var avx bool
22+
var sse bool
23+
var sse2 bool
24+
var sse3 bool
25+
var ssse3 bool
26+
var sse41 bool
27+
var sse42 bool
28+
var popcnt bool
29+
var sha bool
2330
var armSha = haveArmSha()
2431

25-
// haveAVX returns true when there is AVX support
26-
func haveAVX() bool {
27-
_, _, c, _ := cpuid(1)
32+
func init() {
33+
var _xsave bool
34+
var _osxsave bool
35+
var _avx bool
36+
var _avx2 bool
37+
var _avx512f bool
38+
var _avx512dq bool
39+
// var _avx512pf bool
40+
// var _avx512er bool
41+
// var _avx512cd bool
42+
var _avx512bw bool
43+
var _avx512vl bool
44+
var _sseState bool
45+
var _avxState bool
46+
var _opmaskState bool
47+
var _zmmHI256State bool
48+
var _hi16ZmmState bool
2849

29-
// Check XGETBV, OXSAVE and AVX bits
30-
if c&(1<<26) != 0 && c&(1<<27) != 0 && c&(1<<28) != 0 {
31-
// Check for OS support
32-
eax, _ := xgetbv(0)
33-
return (eax & 0x6) == 0x6
34-
}
35-
return false
36-
}
37-
38-
// haveAVX2 returns true when there is AVX2 support
39-
func haveAVX2() bool {
4050
mfi, _, _, _ := cpuid(0)
4151

42-
// Check AVX2, AVX2 requires OS support, but BMI1/2 don't.
43-
if mfi >= 7 && haveAVX() {
44-
_, ebx, _, _ := cpuidex(7, 0)
45-
return (ebx & 0x00000020) != 0
46-
}
47-
return false
48-
}
52+
if mfi >= 1 {
53+
_, _, c, d := cpuid(1)
4954

50-
// haveAVX512 returns true when there is AVX512 support
51-
func haveAVX512() bool {
52-
mfi, _, _, _ := cpuid(0)
55+
sse = (d & (1 << 25)) != 0
56+
sse2 = (d & (1 << 26)) != 0
57+
sse3 = (c & (1 << 0)) != 0
58+
ssse3 = (c & (1 << 9)) != 0
59+
sse41 = (c & (1 << 19)) != 0
60+
sse42 = (c & (1 << 20)) != 0
61+
popcnt = (c & (1 << 23)) != 0
62+
_xsave = (c & (1 << 26)) != 0
63+
_osxsave = (c & (1 << 27)) != 0
64+
_avx = (c & (1 << 28)) != 0
65+
}
5366

54-
// Check AVX2, AVX2 requires OS support, but BMI1/2 don't.
5567
if mfi >= 7 {
56-
_, _, c, _ := cpuid(1)
68+
_, b, _, _ := cpuid(7)
5769

58-
// Only detect AVX-512 features if XGETBV is supported
59-
if c&((1<<26)|(1<<27)) == (1<<26)|(1<<27) {
60-
// Check for OS support
61-
eax, _ := xgetbv(0)
62-
_, ebx, _, _ := cpuidex(7, 0)
70+
_avx2 = (b & (1 << 5)) != 0
71+
_avx512f = (b & (1 << 16)) != 0
72+
_avx512dq = (b & (1 << 17)) != 0
73+
// _avx512pf = (b & (1 << 26)) != 0
74+
// _avx512er = (b & (1 << 27)) != 0
75+
// _avx512cd = (b & (1 << 28)) != 0
76+
_avx512bw = (b & (1 << 30)) != 0
77+
_avx512vl = (b & (1 << 31)) != 0
78+
sha = (b & (1 << 29)) != 0
79+
}
6380

64-
// Verify that XCR0[7:5] = ‘111b’ (OPMASK state, upper 256-bit of ZMM0-ZMM15 and
65-
// ZMM16-ZMM31 state are enabled by OS)
66-
/// and that XCR0[2:1] = ‘11b’ (XMM state and YMM state are enabled by OS).
67-
if (eax>>5)&7 == 7 && (eax>>1)&3 == 3 {
68-
if ebx&(1<<16) == 0 {
69-
return false // no AVX512F
70-
}
71-
if ebx&(1<<17) == 0 {
72-
return false // no AVX512DQ
73-
}
74-
if ebx&(1<<30) == 0 {
75-
return false // no AVX512BW
76-
}
77-
if ebx&(1<<31) == 0 {
78-
return false // no AVX512VL
79-
}
80-
return true
81-
}
82-
}
81+
// Stop here if XSAVE unsupported or not enabled
82+
if !_xsave || !_osxsave {
83+
return
8384
}
84-
return false
85-
}
8685

87-
// haveSSSE3 returns true when there is SSSE3 support
88-
func haveSSSE3() bool {
86+
if _xsave && _osxsave {
87+
a, _ := xgetbv(0)
88+
89+
_sseState = (a & (1 << 1)) != 0
90+
_avxState = (a & (1 << 2)) != 0
91+
_opmaskState = (a & (1 << 5)) != 0
92+
_zmmHI256State = (a & (1 << 6)) != 0
93+
_hi16ZmmState = (a & (1 << 7)) != 0
94+
} else {
95+
_sseState = true
96+
}
8997

90-
_, _, c, _ := cpuid(1)
98+
// Very unlikely that OS would enable XSAVE and then disable SSE
99+
if !_sseState {
100+
sse = false
101+
sse2 = false
102+
sse3 = false
103+
ssse3 = false
104+
sse41 = false
105+
sse42 = false
106+
}
91107

92-
return (c & 0x00000200) != 0
108+
if _avxState {
109+
avx = _avx
110+
avx2 = _avx2
111+
}
112+
113+
if _opmaskState && _zmmHI256State && _hi16ZmmState {
114+
avx512 = (_avx512f &&
115+
_avx512dq &&
116+
_avx512bw &&
117+
_avx512vl)
118+
}
93119
}

cpuid_386.s

Lines changed: 21 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -24,30 +24,30 @@
2424

2525
// func cpuid(op uint32) (eax, ebx, ecx, edx uint32)
2626
TEXT ·cpuid(SB), 7, $0
27-
XORL CX, CX
28-
MOVL op+0(FP), AX
29-
CPUID
30-
MOVL AX, eax+4(FP)
31-
MOVL BX, ebx+8(FP)
32-
MOVL CX, ecx+12(FP)
33-
MOVL DX, edx+16(FP)
34-
RET
27+
XORL CX, CX
28+
MOVL op+0(FP), AX
29+
CPUID
30+
MOVL AX, eax+4(FP)
31+
MOVL BX, ebx+8(FP)
32+
MOVL CX, ecx+12(FP)
33+
MOVL DX, edx+16(FP)
34+
RET
3535

3636
// func cpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32)
3737
TEXT ·cpuidex(SB), 7, $0
38-
MOVL op+0(FP), AX
39-
MOVL op2+4(FP), CX
40-
CPUID
41-
MOVL AX, eax+8(FP)
42-
MOVL BX, ebx+12(FP)
43-
MOVL CX, ecx+16(FP)
44-
MOVL DX, edx+20(FP)
45-
RET
38+
MOVL op+0(FP), AX
39+
MOVL op2+4(FP), CX
40+
CPUID
41+
MOVL AX, eax+8(FP)
42+
MOVL BX, ebx+12(FP)
43+
MOVL CX, ecx+16(FP)
44+
MOVL DX, edx+20(FP)
45+
RET
4646

4747
// func xgetbv(index uint32) (eax, edx uint32)
4848
TEXT ·xgetbv(SB), 7, $0
49-
MOVL index+0(FP), CX
50-
BYTE $0x0f; BYTE $0x01; BYTE $0xd0 // XGETBV
51-
MOVL AX, eax+4(FP)
52-
MOVL DX, edx+8(FP)
53-
RET
49+
MOVL index+0(FP), CX
50+
BYTE $0x0f; BYTE $0x01; BYTE $0xd0 // XGETBV
51+
MOVL AX, eax+4(FP)
52+
MOVL DX, edx+8(FP)
53+
RET

cpuid_amd64.s

Lines changed: 21 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -24,31 +24,30 @@
2424

2525
// func cpuid(op uint32) (eax, ebx, ecx, edx uint32)
2626
TEXT ·cpuid(SB), 7, $0
27-
XORQ CX, CX
28-
MOVL op+0(FP), AX
29-
CPUID
30-
MOVL AX, eax+8(FP)
31-
MOVL BX, ebx+12(FP)
32-
MOVL CX, ecx+16(FP)
33-
MOVL DX, edx+20(FP)
34-
RET
35-
27+
XORQ CX, CX
28+
MOVL op+0(FP), AX
29+
CPUID
30+
MOVL AX, eax+8(FP)
31+
MOVL BX, ebx+12(FP)
32+
MOVL CX, ecx+16(FP)
33+
MOVL DX, edx+20(FP)
34+
RET
3635

3736
// func cpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32)
3837
TEXT ·cpuidex(SB), 7, $0
39-
MOVL op+0(FP), AX
40-
MOVL op2+4(FP), CX
41-
CPUID
42-
MOVL AX, eax+8(FP)
43-
MOVL BX, ebx+12(FP)
44-
MOVL CX, ecx+16(FP)
45-
MOVL DX, edx+20(FP)
46-
RET
38+
MOVL op+0(FP), AX
39+
MOVL op2+4(FP), CX
40+
CPUID
41+
MOVL AX, eax+8(FP)
42+
MOVL BX, ebx+12(FP)
43+
MOVL CX, ecx+16(FP)
44+
MOVL DX, edx+20(FP)
45+
RET
4746

4847
// func xgetbv(index uint32) (eax, edx uint32)
4948
TEXT ·xgetbv(SB), 7, $0
50-
MOVL index+0(FP), CX
51-
BYTE $0x0f; BYTE $0x01; BYTE $0xd0 // XGETBV
52-
MOVL AX, eax+8(FP)
53-
MOVL DX, edx+12(FP)
54-
RET
49+
MOVL index+0(FP), CX
50+
BYTE $0x0f; BYTE $0x01; BYTE $0xd0 // XGETBV
51+
MOVL AX, eax+8(FP)
52+
MOVL DX, edx+12(FP)
53+
RET

0 commit comments

Comments
 (0)