Skip to content

Commit 90df613

Browse files
Allow multiple kmask registers to be allocated and cleanup some codegen around them (#89059)
* Allow multiple kmask registers to be allocated and cleanup some codegen around them
* Apply formatting patch
* Fix an assert to include TYP_STRUCT
* Ensure kmask registers aren't in the default killset
* Apply formatting patch
* Move the kmask optimizations up to morph
* Ensure unique VN for ConvertMaskToVector
* Ensure some basic other handling for kmask testing is handled
* Improve the implementation for some managed Vector512 code paths
* Apply formatting patch
* Ensure that the knot intrinsic is inserted into the IR
* Apply formatting patch
* Ensure the conversion of CompareEqualMask(x, zero) to Test(x, x) doesn't happen for floating-point
* Have callee/callerSaveRegs() use an array based lookup
* Respond to PR feedback and try to reduce TP regression more
* Ensure PTEST doesn't try to handle something utilizing embedded broadcast
1 parent 764f774 commit 90df613

30 files changed

+1472
-584
lines changed

src/coreclr/jit/codegencommon.cpp

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,14 +68,19 @@ CodeGenInterface::CodeGenInterface(Compiler* theCompiler)
6868
{
6969
}
7070

71-
#if defined(TARGET_AMD64)
71+
#if defined(TARGET_XARCH)
7272
void CodeGenInterface::CopyRegisterInfo()
7373
{
74+
#if defined(TARGET_AMD64)
7475
rbmAllFloat = compiler->rbmAllFloat;
7576
rbmFltCalleeTrash = compiler->rbmFltCalleeTrash;
76-
}
7777
#endif // TARGET_AMD64
7878

79+
rbmAllMask = compiler->rbmAllMask;
80+
rbmMskCalleeTrash = compiler->rbmMskCalleeTrash;
81+
}
82+
#endif // TARGET_XARCH
83+
7984
/*****************************************************************************/
8085

8186
CodeGen::CodeGen(Compiler* theCompiler) : CodeGenInterface(theCompiler)

src/coreclr/jit/codegeninterface.h

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -63,19 +63,33 @@ class CodeGenInterface
6363
regMaskTP rbmAllFloat;
6464
regMaskTP rbmFltCalleeTrash;
6565

66-
// Call this function after the equivalent fields in Compiler have been initialized.
67-
void CopyRegisterInfo();
68-
69-
regMaskTP get_RBM_ALLFLOAT() const
66+
FORCEINLINE regMaskTP get_RBM_ALLFLOAT() const
7067
{
7168
return this->rbmAllFloat;
7269
}
73-
regMaskTP get_RBM_FLT_CALLEE_TRASH() const
70+
FORCEINLINE regMaskTP get_RBM_FLT_CALLEE_TRASH() const
7471
{
7572
return this->rbmFltCalleeTrash;
7673
}
7774
#endif // TARGET_AMD64
7875

76+
#if defined(TARGET_XARCH)
77+
regMaskTP rbmAllMask;
78+
regMaskTP rbmMskCalleeTrash;
79+
80+
// Call this function after the equivalent fields in Compiler have been initialized.
81+
void CopyRegisterInfo();
82+
83+
FORCEINLINE regMaskTP get_RBM_ALLMASK() const
84+
{
85+
return this->rbmAllMask;
86+
}
87+
FORCEINLINE regMaskTP get_RBM_MSK_CALLEE_TRASH() const
88+
{
89+
return this->rbmMskCalleeTrash;
90+
}
91+
#endif // TARGET_XARCH
92+
7993
// genSpillVar is called by compUpdateLifeVar.
8094
// TODO-Cleanup: We should handle the spill directly in CodeGen, rather than
8195
// calling it from compUpdateLifeVar. Then this can be non-virtual.

src/coreclr/jit/compiler.cpp

Lines changed: 28 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -106,25 +106,25 @@ inline bool _our_GetThreadCycles(unsigned __int64* cycleOut)
106106
#endif // which host OS
107107

108108
const BYTE genTypeSizes[] = {
109-
#define DEF_TP(tn, nm, jitType, sz, sze, asze, st, al, regTyp, regFld, tf) sz,
109+
#define DEF_TP(tn, nm, jitType, sz, sze, asze, st, al, regTyp, regFld, csr, ctr, tf) sz,
110110
#include "typelist.h"
111111
#undef DEF_TP
112112
};
113113

114114
const BYTE genTypeAlignments[] = {
115-
#define DEF_TP(tn, nm, jitType, sz, sze, asze, st, al, regTyp, regFld, tf) al,
115+
#define DEF_TP(tn, nm, jitType, sz, sze, asze, st, al, regTyp, regFld, csr, ctr, tf) al,
116116
#include "typelist.h"
117117
#undef DEF_TP
118118
};
119119

120120
const BYTE genTypeStSzs[] = {
121-
#define DEF_TP(tn, nm, jitType, sz, sze, asze, st, al, regTyp, regFld, tf) st,
121+
#define DEF_TP(tn, nm, jitType, sz, sze, asze, st, al, regTyp, regFld, csr, ctr, tf) st,
122122
#include "typelist.h"
123123
#undef DEF_TP
124124
};
125125

126126
const BYTE genActualTypes[] = {
127-
#define DEF_TP(tn, nm, jitType, sz, sze, asze, st, al, regTyp, regFld, tf) jitType,
127+
#define DEF_TP(tn, nm, jitType, sz, sze, asze, st, al, regTyp, regFld, csr, ctr, tf) jitType,
128128
#include "typelist.h"
129129
#undef DEF_TP
130130
};
@@ -3379,9 +3379,32 @@ void Compiler::compInitOptions(JitFlags* jitFlags)
33793379
rbmFltCalleeTrash |= RBM_HIGHFLOAT;
33803380
cntCalleeTrashFloat += CNT_CALLEE_TRASH_HIGHFLOAT;
33813381
}
3382+
#endif // TARGET_AMD64
3383+
3384+
#if defined(TARGET_XARCH)
3385+
rbmAllMask = RBM_ALLMASK_INIT;
3386+
rbmMskCalleeTrash = RBM_MSK_CALLEE_TRASH_INIT;
3387+
cntCalleeTrashMask = CNT_CALLEE_TRASH_MASK_INIT;
3388+
3389+
if (canUseEvexEncoding())
3390+
{
3391+
rbmAllMask |= RBM_ALLMASK_EVEX;
3392+
rbmMskCalleeTrash |= RBM_MSK_CALLEE_TRASH_EVEX;
3393+
cntCalleeTrashMask += CNT_CALLEE_TRASH_MASK_EVEX;
3394+
}
3395+
3396+
// Make sure we copy the register info and initialize the
3397+
// trash regs after the underlying fields are initialized
3398+
3399+
const regMaskTP vtCalleeTrashRegs[TYP_COUNT]{
3400+
#define DEF_TP(tn, nm, jitType, sz, sze, asze, st, al, regTyp, regFld, csr, ctr, tf) ctr,
3401+
#include "typelist.h"
3402+
#undef DEF_TP
3403+
};
3404+
memcpy(varTypeCalleeTrashRegs, vtCalleeTrashRegs, sizeof(regMaskTP) * TYP_COUNT);
33823405

33833406
codeGen->CopyRegisterInfo();
3384-
#endif // TARGET_AMD64
3407+
#endif // TARGET_XARCH
33853408
}
33863409

33873410
#ifdef DEBUG

src/coreclr/jit/compiler.h

Lines changed: 41 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10953,21 +10953,59 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
1095310953
unsigned cntCalleeTrashFloat;
1095410954

1095510955
public:
10956-
regMaskTP get_RBM_ALLFLOAT() const
10956+
FORCEINLINE regMaskTP get_RBM_ALLFLOAT() const
1095710957
{
1095810958
return this->rbmAllFloat;
1095910959
}
10960-
regMaskTP get_RBM_FLT_CALLEE_TRASH() const
10960+
FORCEINLINE regMaskTP get_RBM_FLT_CALLEE_TRASH() const
1096110961
{
1096210962
return this->rbmFltCalleeTrash;
1096310963
}
10964-
unsigned get_CNT_CALLEE_TRASH_FLOAT() const
10964+
FORCEINLINE unsigned get_CNT_CALLEE_TRASH_FLOAT() const
1096510965
{
1096610966
return this->cntCalleeTrashFloat;
1096710967
}
1096810968

1096910969
#endif // TARGET_AMD64
1097010970

10971+
#if defined(TARGET_XARCH)
10972+
private:
10973+
// The following are for initializing register allocator "constants" defined in targetamd64.h
10974+
// that now depend upon runtime ISA information, e.g., the presence of AVX512F/VL, which adds
10975+
// 8 mask registers for use.
10976+
//
10977+
// Users of these values need to define four accessor functions:
10978+
//
10979+
// regMaskTP get_RBM_ALLMASK();
10980+
// regMaskTP get_RBM_MSK_CALLEE_TRASH();
10981+
// unsigned get_CNT_CALLEE_TRASH_MASK();
10982+
// unsigned get_AVAILABLE_REG_COUNT();
10983+
//
10984+
// which return the values of these variables.
10985+
//
10986+
// This was done to avoid polluting all `targetXXX.h` macro definitions with a compiler parameter, where only
10987+
// TARGET_XARCH requires one.
10988+
//
10989+
regMaskTP rbmAllMask;
10990+
regMaskTP rbmMskCalleeTrash;
10991+
unsigned cntCalleeTrashMask;
10992+
regMaskTP varTypeCalleeTrashRegs[TYP_COUNT];
10993+
10994+
public:
10995+
FORCEINLINE regMaskTP get_RBM_ALLMASK() const
10996+
{
10997+
return this->rbmAllMask;
10998+
}
10999+
FORCEINLINE regMaskTP get_RBM_MSK_CALLEE_TRASH() const
11000+
{
11001+
return this->rbmMskCalleeTrash;
11002+
}
11003+
FORCEINLINE unsigned get_CNT_CALLEE_TRASH_MASK() const
11004+
{
11005+
return this->cntCalleeTrashMask;
11006+
}
11007+
#endif // TARGET_XARCH
11008+
1097111009
}; // end of class Compiler
1097211010

1097311011
//---------------------------------------------------------------------------------------------------------------------

src/coreclr/jit/emit.cpp

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -590,13 +590,13 @@ void emitterStats(FILE* fout)
590590
/*****************************************************************************/
591591

592592
const unsigned short emitTypeSizes[] = {
593-
#define DEF_TP(tn, nm, jitType, sz, sze, asze, st, al, regTyp, regFld, tf) sze,
593+
#define DEF_TP(tn, nm, jitType, sz, sze, asze, st, al, regTyp, regFld, csr, ctr, tf) sze,
594594
#include "typelist.h"
595595
#undef DEF_TP
596596
};
597597

598598
const unsigned short emitTypeActSz[] = {
599-
#define DEF_TP(tn, nm, jitType, sz, sze, asze, st, al, regTyp, regFld, tf) asze,
599+
#define DEF_TP(tn, nm, jitType, sz, sze, asze, st, al, regTyp, regFld, csr, ctr, tf) asze,
600600
#include "typelist.h"
601601
#undef DEF_TP
602602
};
@@ -747,6 +747,10 @@ void emitter::emitBegCG(Compiler* comp, COMP_HANDLE cmpHandle)
747747
#if defined(TARGET_AMD64)
748748
rbmFltCalleeTrash = emitComp->rbmFltCalleeTrash;
749749
#endif // TARGET_AMD64
750+
751+
#if defined(TARGET_XARCH)
752+
rbmMskCalleeTrash = emitComp->rbmMskCalleeTrash;
753+
#endif // TARGET_XARCH
750754
}
751755

752756
void emitter::emitEndCG()

src/coreclr/jit/emit.h

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2305,12 +2305,21 @@ class emitter
23052305
#if defined(TARGET_AMD64)
23062306
regMaskTP rbmFltCalleeTrash;
23072307

2308-
regMaskTP get_RBM_FLT_CALLEE_TRASH() const
2308+
FORCEINLINE regMaskTP get_RBM_FLT_CALLEE_TRASH() const
23092309
{
23102310
return this->rbmFltCalleeTrash;
23112311
}
23122312
#endif // TARGET_AMD64
23132313

2314+
#if defined(TARGET_XARCH)
2315+
regMaskTP rbmMskCalleeTrash;
2316+
2317+
FORCEINLINE regMaskTP get_RBM_MSK_CALLEE_TRASH() const
2318+
{
2319+
return this->rbmMskCalleeTrash;
2320+
}
2321+
#endif // TARGET_XARCH
2322+
23142323
CORINFO_FIELD_HANDLE emitFltOrDblConst(double constValue, emitAttr attr);
23152324
#if defined(FEATURE_SIMD)
23162325
CORINFO_FIELD_HANDLE emitSimd8Const(simd8_t constValue);

src/coreclr/jit/emitxarch.cpp

Lines changed: 30 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6246,12 +6246,25 @@ bool emitter::HasSideEffect(instruction ins, emitAttr size)
62466246
case INS_kmovb_msk:
62476247
case INS_kmovw_msk:
62486248
case INS_kmovd_msk:
6249+
{
6250+
// Zero-extends the source
6251+
hasSideEffect = true;
6252+
break;
6253+
}
6254+
62496255
case INS_kmovq_msk:
6256+
{
6257+
// No side effect, register is 64-bits
6258+
hasSideEffect = false;
6259+
break;
6260+
}
6261+
62506262
case INS_kmovb_gpr:
62516263
case INS_kmovw_gpr:
62526264
case INS_kmovd_gpr:
62536265
case INS_kmovq_gpr:
62546266
{
6267+
// Zero-extends the source
62556268
hasSideEffect = true;
62566269
break;
62576270
}
@@ -6977,7 +6990,7 @@ void emitter::emitIns_R_R_C(instruction ins,
69776990
void emitter::emitIns_R_R_R(instruction ins, emitAttr attr, regNumber targetReg, regNumber reg1, regNumber reg2)
69786991
{
69796992
assert(IsAvx512OrPriorInstruction(ins));
6980-
assert(IsThreeOperandAVXInstruction(ins));
6993+
assert(IsThreeOperandAVXInstruction(ins) || IsKInstruction(ins));
69816994

69826995
instrDesc* id = emitNewInstr(attr);
69836996
id->idIns(ins);
@@ -11557,7 +11570,7 @@ void emitter::emitDispIns(
1155711570
case IF_RWR_RWR_RRD:
1155811571
{
1155911572
assert(IsVexOrEvexEncodableInstruction(ins));
11560-
assert(IsThreeOperandAVXInstruction(ins));
11573+
assert(IsThreeOperandAVXInstruction(ins) || IsKInstruction(ins));
1156111574
regNumber reg2 = id->idReg2();
1156211575
regNumber reg3 = id->idReg3();
1156311576
if (ins == INS_bextr || ins == INS_bzhi
@@ -14956,7 +14969,7 @@ BYTE* emitter::emitOutputRRR(BYTE* dst, instrDesc* id)
1495614969

1495714970
instruction ins = id->idIns();
1495814971
assert(IsVexOrEvexEncodableInstruction(ins));
14959-
assert(IsThreeOperandAVXInstruction(ins) || isAvxBlendv(ins) || isAvx512Blendv(ins));
14972+
assert(IsThreeOperandAVXInstruction(ins) || isAvxBlendv(ins) || isAvx512Blendv(ins) || IsKInstruction(ins));
1496014973
regNumber targetReg = id->idReg1();
1496114974
regNumber src1 = id->idReg2();
1496214975
regNumber src2 = id->idReg3();
@@ -19172,6 +19185,20 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
1917219185
result.insLatency += opSize == EA_32BYTE ? PERFSCORE_LATENCY_6C : PERFSCORE_LATENCY_4C;
1917319186
break;
1917419187

19188+
case INS_vptestmb:
19189+
case INS_vptestmd:
19190+
case INS_vptestmq:
19191+
case INS_vptestmw:
19192+
case INS_vptestnmb:
19193+
case INS_vptestnmd:
19194+
case INS_vptestnmq:
19195+
case INS_vptestnmw:
19196+
{
19197+
result.insThroughput = PERFSCORE_THROUGHPUT_1C;
19198+
result.insLatency += PERFSCORE_LATENCY_4C;
19199+
break;
19200+
}
19201+
1917519202
case INS_mpsadbw:
1917619203
result.insThroughput = PERFSCORE_THROUGHPUT_2C;
1917719204
result.insLatency += PERFSCORE_LATENCY_4C;

0 commit comments

Comments
 (0)