JIT: Accelerate more casts on x86 #116805
Changes from 2 commits
@@ -138,8 +138,11 @@ GenTree* DecomposeLongs::DecomposeNode(GenTree* tree)
     }

 #if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_X86)
-    if (!tree->TypeIs(TYP_LONG) &&
-        !(tree->OperIs(GT_CAST) && varTypeIsLong(tree->AsCast()->CastOp()) && varTypeIsFloating(tree)))
+    // On x86, long->floating casts are implemented in DecomposeCast.
+    bool isLongToFloatingCast =
+        (tree->OperIs(GT_CAST) && varTypeIsLong(tree->AsCast()->CastOp()) && varTypeIsFloating(tree));
+
+    if (!tree->TypeIs(TYP_LONG) && !isLongToFloatingCast)
 #else
     if (!tree->TypeIs(TYP_LONG))
 #endif // FEATURE_HW_INTRINSICS && TARGET_X86
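For illustration (not part of the diff): the shape of cast this check recognizes is an ordinary long-to-floating conversion, as in the hypothetical C# method below. Its 64-bit operand can skip decomposition because DecomposeCast lowers the whole cast to a SIMD sequence when AVX-512 is available.

```csharp
// Hypothetical example, not from the PR: the long operand of this cast does not
// need to be split into lo/hi halves, because the cast itself is implemented in
// DecomposeCast using HWIntrinsics (x86 with AVX-512).
static double LongToDouble(long value) => (double)value;
```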
@@ -159,6 +162,9 @@ GenTree* DecomposeLongs::DecomposeNode(GenTree* tree)
         // HWIntrinsics can consume/produce a long directly, provided its source/target is memory.
         // Here we do a conservative check for specific cases where it is certain the load/store
         // can be contained. In those cases, we can skip decomposition.
+        //
+        // We also look for longs consumed directly by a long->floating cast. These can skip
+        // decomposition because the cast is implemented using HWIntrinsics.

         GenTree* user = use.User();
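As a further aside (also not part of the diff), the HWIntrinsics sequence that a cast like `LongToDouble` above is lowered to corresponds roughly to the C# sketch below. It is based on the sequence quoted in the code comments further down (`AVX512DQ.VL.ConvertToVector128Single(Vector128.CreateScalarUnsafe(LONG)).ToScalar()`) and assumes the Avx512DQ.VL intrinsic surface plus AVX-512 hardware support.

```csharp
using System;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

// Sketch only: requires Avx512DQ.VL.IsSupported; otherwise the JIT keeps using a helper call.
Console.WriteLine(LongToDouble(123456789012345L));

static double LongToDouble(long value)
{
    // Put the long in the low element of a vector register, convert with the
    // vector form of the instruction (the scalar form is not usable on 32-bit,
    // since it needs a 64-bit general-purpose register), then read the low element back.
    Vector128<long> src = Vector128.CreateScalarUnsafe(value);
    Vector128<double> converted = Avx512DQ.VL.ConvertToVector128Double(src);
    return converted.ToScalar();
}
```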
@@ -582,44 +588,213 @@ GenTree* DecomposeLongs::DecomposeCast(LIR::Use& use)
     }

 #if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_X86)
-    if (varTypeIsFloating(dstType))
+    if (varTypeIsFloating(srcType) || varTypeIsFloating(dstType))
     {
         // We will reach this path only if morph did not convert the cast to a helper call,
         // meaning we can perform the cast using SIMD instructions.
-        // The sequence this creates is simply:
-        //    AVX512DQ.VL.ConvertToVector128Single(Vector128.CreateScalarUnsafe(LONG)).ToScalar()
-
-        NamedIntrinsic intrinsicId = NI_Illegal;
-        GenTree* srcOp = cast->CastOp();
-        var_types dstType = cast->CastToType();
-        CorInfoType baseFloatingType = (dstType == TYP_FLOAT) ? CORINFO_TYPE_FLOAT : CORINFO_TYPE_DOUBLE;
-        CorInfoType baseIntegralType = cast->IsUnsigned() ? CORINFO_TYPE_ULONG : CORINFO_TYPE_LONG;

         assert(!cast->gtOverflow());
         assert(m_compiler->compIsaSupportedDebugOnly(InstructionSet_AVX512));

-        intrinsicId = (dstType == TYP_FLOAT) ? NI_AVX512_ConvertToVector128Single : NI_AVX512_ConvertToVector128Double;
+        GenTree* srcOp = cast->CastOp();
+        GenTree* castResult = nullptr;
+        LIR::Range castRange = LIR::EmptyRange();
+        CorInfoType srcBaseType = CORINFO_TYPE_UNDEF;
+        CorInfoType dstBaseType = CORINFO_TYPE_UNDEF;

-        GenTree* createScalar = m_compiler->gtNewSimdCreateScalarUnsafeNode(TYP_SIMD16, srcOp, baseIntegralType, 16);
-        GenTree* convert =
-            m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, createScalar, intrinsicId, baseIntegralType, 16);
-        GenTree* toScalar = m_compiler->gtNewSimdToScalarNode(dstType, convert, baseFloatingType, 16);
+        if (varTypeIsFloating(srcType))
Review thread:

Comment: Do we know if the compiler is CSEing this check with the above check as expected? Asking since manually caching might be a way to win some throughput back.

Reply: Not for sure, but I generally assume C++ compilers will handle 'obvious' ones like this. It should be noted that the throughput hit to x86 directly correlates with the number of casts that are now inlined; i.e. the only significant throughput hit is on the coreclr_tests collection, which is also the one that had the most casts in it.

Comment: 👍. The biggest concern is the TP hit to minopts. It may be desirable to keep using the helper there so that floating-point heavy code doesn't start up slower.

Reply: I think #117512 will reduce the hit a bit. This is interesting, because the same argument could apply to the complicated saturating logic that we have for x64 as well. #97529 introduced a similar throughput regression, and although it was done for correctness instead of perf, the throughput hit could have been avoided by using the helper in minopts there too.

Comment: AFAIR, the JIT throughput hit there ended up being very minimal (and often an improvement). It was the perf score and code output size that regressed, which was expected. If there was a significant perf score hit to minopts, then yes the same would apply here and it would likely be beneficial to ensure that is doing the "better" thing as well.
+        {
+            srcBaseType = (srcType == TYP_FLOAT) ? CORINFO_TYPE_FLOAT : CORINFO_TYPE_DOUBLE;
+            dstBaseType = (dstType == TYP_ULONG) ? CORINFO_TYPE_ULONG : CORINFO_TYPE_LONG;
+        }
+        else
+        {
+            srcBaseType = (srcType == TYP_ULONG) ? CORINFO_TYPE_ULONG : CORINFO_TYPE_LONG;
+            dstBaseType = (dstType == TYP_FLOAT) ? CORINFO_TYPE_FLOAT : CORINFO_TYPE_DOUBLE;
+        }

-        Range().InsertAfter(cast, createScalar, convert, toScalar);
-        Range().Remove(cast);
+        // This creates the equivalent of the following C# code:
+        //   var srcVec = Vector128.CreateScalarUnsafe(castOp);

-        if (createScalar->IsCnsVec())
+        GenTree* srcVector = m_compiler->gtNewSimdCreateScalarUnsafeNode(TYP_SIMD16, srcOp, srcBaseType, 16);
+        castRange.InsertAtEnd(srcVector);

+        if (srcVector->IsCnsVec())
         {
             Range().Remove(srcOp);
         }

+        if (varTypeIsFloating(dstType))
+        {
+            // long->floating casts don't require any kind of fixup. We simply use the vector
+            // form of the instructions, because the scalar form is not supported on 32-bit.

+            NamedIntrinsic intrinsicId =
+                (dstType == TYP_FLOAT) ? NI_AVX512_ConvertToVector128Single : NI_AVX512_ConvertToVector128Double;

+            castResult = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcVector, intrinsicId, srcBaseType, 16);
+        }
+        else if (m_compiler->compOpportunisticallyDependsOn(InstructionSet_AVX10v2))
+        {
+            // Likewise, the AVX10.2 saturating floating->long instructions give the correct result,
+            // but we have to use the vector form.

+            NamedIntrinsic intrinsicId = (dstType == TYP_ULONG)
+                                             ? NI_AVX10v2_ConvertToVectorUInt64WithTruncationSaturation
+                                             : NI_AVX10v2_ConvertToVectorInt64WithTruncationSaturation;

+            castResult = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcVector, intrinsicId, srcBaseType, 16);
+        }
+        else if (dstType == TYP_ULONG)
+        {
+            // AVX-512 unsigned conversion instructions correctly saturate for positive overflow, so
+            // we only need to fix up negative or NaN values before conversion.
+            //
+            // maxs[sd] will take the value from the second operand if the first operand's value is
+            // NaN, which allows us to fix up both negative and NaN values with a single instruction.
+            //
+            // This creates the equivalent of the following C# code:
+            //   var fixupVal = Sse.MaxScalar(srcVec, Vector128<T>.Zero);
+            //   castResult = Avx512DQ.VL.ConvertToVector128UInt64WithTruncation(fixupVal);

+            GenTree* zero = m_compiler->gtNewZeroConNode(TYP_SIMD16);
+            GenTree* fixupVal = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcVector, zero, NI_X86Base_MaxScalar,
+                                                                     srcBaseType, 16);

+            castRange.InsertAtEnd(zero);
+            castRange.InsertAtEnd(fixupVal);

+            castResult =
+                m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, fixupVal,
+                                                     NI_AVX512_ConvertToVector128UInt64WithTruncation, srcBaseType, 16);
+        }
+        else
+        {
+            assert(dstType == TYP_LONG);

+            // The logic for floating->signed long casts is similar to the AVX-512 implementation
+            // in LowerCast, except that all operations must be done in SIMD registers.

+            if (srcType == TYP_FLOAT)
+            {
+                // For float->long, the result will be twice as wide as the input. Broadcasting the
+                // input allows us to use two adjacent elements when creating the fixup mask later.

+                srcVector = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcVector,
+                                                                 NI_AVX2_BroadcastScalarToVector128, srcBaseType, 16);
+                castRange.InsertAtEnd(srcVector);
+            }

+            // We will use the input value multiple times, so we replace it with a lclVar.
+            LIR::Use srcUse;
+            LIR::Use::MakeDummyUse(castRange, srcVector, &srcUse);
+            srcUse.ReplaceWithLclVar(m_compiler);
+            srcVector = srcUse.Def();

+            // Fix up NaN values before conversion. Saturation is handled after conversion,
+            // because MaxValue is not precisely representable in the floating format.
+            //
+            // This creates the equivalent of the following C# code:
+            //   var nanMask = Sse.CompareScalarOrdered(srcVec, srcVec);
+            //   var fixupVal = Sse.And(srcVec, nanMask);
+            //   convertResult = Avx512DQ.VL.ConvertToVector128Int64WithTruncation(fixupVal);

+            GenTree* srcClone = m_compiler->gtClone(srcVector);
+            GenTree* nanMask = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcVector, srcClone,
+                                                                    NI_X86Base_CompareScalarOrdered, srcBaseType, 16);

+            castRange.InsertAtEnd(srcClone);
+            castRange.InsertAtEnd(nanMask);

+            srcClone = m_compiler->gtClone(srcVector);
+            GenTree* fixupVal = m_compiler->gtNewSimdBinOpNode(GT_AND, TYP_SIMD16, nanMask, srcClone, srcBaseType, 16);

+            castRange.InsertAtEnd(srcClone);
+            castRange.InsertAtEnd(fixupVal);

+            GenTree* convertResult =
+                m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, fixupVal,
+                                                     NI_AVX512_ConvertToVector128Int64WithTruncation, srcBaseType, 16);

+            castRange.InsertAtEnd(convertResult);

+            // Now we handle saturation of the result for positive overflow.
+            //
+            // This creates the equivalent of the following C# code:
+            //   var maxFloatingValue = Vector128.Create(9223372036854775808.0);
+            //   var compareMode = FloatComparisonMode.OrderedGreaterThanOrEqualNonSignaling;
+            //   var compareMax = Avx.CompareScalar(srcVec, maxFloatingValue, compareMode);

+            NamedIntrinsic compareIntrinsic = (srcType == TYP_FLOAT) ? NI_AVX_Compare : NI_AVX_CompareScalar;
+            GenTreeVecCon* maxFloatingValue = m_compiler->gtNewVconNode(TYP_SIMD16);

+            if (srcType == TYP_FLOAT)
+            {
+                // For float->long, we broadcast the comparison value, same as we broadcast the input.
+                for (uint32_t index = 0; index < 4; index++)
+                {
+                    maxFloatingValue->gtSimdVal.f32[index] = 9223372036854775808.0f;
+                }
+            }
+            else
+            {
+                maxFloatingValue->gtSimdVal.f64[0] = 9223372036854775808.0;
+            }
+            castRange.InsertAtEnd(maxFloatingValue);

+            srcClone = m_compiler->gtClone(srcVector);
+            GenTree* compareMode = m_compiler->gtNewIconNode(
+                static_cast<int32_t>(FloatComparisonMode::OrderedGreaterThanOrEqualNonSignaling));
+            GenTree* compareMax = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcClone, maxFloatingValue,
+                                                                       compareMode, compareIntrinsic, srcBaseType, 16);

+            castRange.InsertAtEnd(srcClone);
+            castRange.InsertAtEnd(compareMode);
+            castRange.InsertAtEnd(compareMax);

+            // We will use the compare mask multiple times, so we replace it with a lclVar.
+            LIR::Use cmpUse;
+            LIR::Use::MakeDummyUse(castRange, compareMax, &cmpUse);
+            cmpUse.ReplaceWithLclVar(m_compiler);
+            compareMax = cmpUse.Def();

+            // Mask in long.MaxValue for positive saturation. In the case of overflow, the compare
+            // mask will be all ones. We shift that value right by one to create the MaxValue vector.
+            // This is where we treat two adjacent elements from a float compare as one 64-bit mask.
+            //
+            // This creates the equivalent of the following C# code:
+            //   var maxLong = Sse2.ShiftRightLogical(compareMax, 1);
+            //   castResult = Vector128.ConditionalSelect(compareMax, maxLong, convertResult);

+            GenTree* cmpClone = m_compiler->gtClone(compareMax);
+            GenTree* one = m_compiler->gtNewIconNode(1);
+            GenTree* maxLong = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, compareMax, one,
+                                                                    NI_X86Base_ShiftRightLogical, dstBaseType, 16);

+            castRange.InsertAtEnd(one);
+            castRange.InsertAtEnd(maxLong);
+            castRange.InsertAtEnd(cmpClone);

+            castResult = m_compiler->gtNewSimdCndSelNode(TYP_SIMD16, cmpClone, maxLong, convertResult, dstBaseType, 16);
+        }

+        // Because the results are in a SIMD register, we need to ToScalar() them out.
+        GenTree* toScalar = m_compiler->gtNewSimdToScalarNode(genActualType(dstType), castResult, dstBaseType, 16);

+        castRange.InsertAtEnd(castResult);
+        castRange.InsertAtEnd(toScalar);

+        Range().InsertAfter(cast, std::move(castRange));
+        Range().Remove(cast);

         if (use.IsDummyUse())
         {
             toScalar->SetUnusedValue();
         }
         use.ReplaceWith(toScalar);

-        return toScalar->gtNext;
+        return toScalar;
     }
 #endif // FEATURE_HW_INTRINSICS && TARGET_X86
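As an aside (not part of the diff): the two AVX-512 fixup sequences above can be hard to follow in GenTree form. Below is a rough C# sketch of the equivalent scalar conversions, written against the public intrinsics the code comments reference. The helper names are invented for illustration, and the sketch assumes AVX-512 (Avx512DQ.VL) is available while AVX10.2 is not, so the manual fixups are required.

```csharp
using System;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

// Sketch only: requires Avx512DQ.VL.IsSupported at runtime.
Console.WriteLine(DoubleToUInt64(-1.5));       // 0: negative input is clamped by the MaxScalar fixup
Console.WriteLine(DoubleToInt64(double.NaN));  // 0: NaN is zeroed before conversion
Console.WriteLine(DoubleToInt64(1e30));        // long.MaxValue: positive overflow saturates

// double -> ulong. The AVX-512 unsigned conversion already saturates positive overflow,
// so only negative and NaN inputs need fixing up; maxsd returns the second operand when
// the first is NaN, so a single MaxScalar against zero handles both cases.
static ulong DoubleToUInt64(double value)
{
    Vector128<double> srcVec = Vector128.CreateScalarUnsafe(value);
    Vector128<double> fixupVal = Sse2.MaxScalar(srcVec, Vector128<double>.Zero);
    return Avx512DQ.VL.ConvertToVector128UInt64WithTruncation(fixupVal).ToScalar();
}

// double -> long. NaN is fixed up before conversion; positive overflow is saturated
// afterwards, because long.MaxValue is not exactly representable as a double.
static long DoubleToInt64(double value)
{
    Vector128<double> srcVec = Vector128.CreateScalarUnsafe(value);

    // An ordered self-compare produces an all-zero mask for NaN, so ANDing the mask
    // with the source zeroes out NaN inputs.
    Vector128<double> nanMask = Sse2.CompareScalarOrdered(srcVec, srcVec);
    Vector128<double> fixupVal = Sse2.And(srcVec, nanMask);
    Vector128<long> convertResult = Avx512DQ.VL.ConvertToVector128Int64WithTruncation(fixupVal);

    // Detect positive overflow by comparing against 2^63, the smallest value that
    // overflows a signed long.
    Vector128<double> maxFloatingValue = Vector128.Create(9223372036854775808.0);
    Vector128<double> compareMax = Avx.CompareScalar(
        srcVec, maxFloatingValue, FloatComparisonMode.OrderedGreaterThanOrEqualNonSignaling);

    // On overflow the mask is all ones; shifting it right by one yields long.MaxValue,
    // which is selected in place of the converted value.
    Vector128<long> maxLong = Sse2.ShiftRightLogical(compareMax.AsUInt64(), 1).AsInt64();
    return Vector128.ConditionalSelect(compareMax.AsInt64(), maxLong, convertResult).ToScalar();
}
```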