From 682e28f78c0de5a0e29d94e5b7758ffec740188c Mon Sep 17 00:00:00 2001 From: "Canino, Anthony" Date: Wed, 5 Oct 2022 10:41:04 -0700 Subject: [PATCH 1/5] Accelerates ConvertToDouble for ulong/long for Vector. --- src/coreclr/jit/emitxarch.cpp | 18 +++++++---- src/coreclr/jit/emitxarch.h | 5 +++ src/coreclr/jit/hwintrinsiclistxarch.h | 22 ++++++++++++-- src/coreclr/jit/hwintrinsicxarch.cpp | 13 ++++++++ src/coreclr/jit/instrsxarch.h | 8 +++++ src/coreclr/jit/lowerxarch.cpp | 4 ++- src/coreclr/jit/morph.cpp | 42 ++++++++++++++++++++++++++ src/coreclr/jit/simdashwintrinsic.cpp | 13 ++++++-- 8 files changed, 114 insertions(+), 11 deletions(-) diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index 4272ae718e935b..af1dd224594eed 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -37,7 +37,7 @@ bool emitter::IsSSEOrAVXInstruction(instruction ins) bool emitter::IsSSEOrAVXorAVX512Instruction(instruction ins) { // TODO-XArch-AVX512: Fix check once AVX512 instructions are added. - return (ins >= INS_FIRST_SSE_INSTRUCTION) && (ins <= INS_LAST_AVX_INSTRUCTION); + return (ins >= INS_FIRST_SSE_INSTRUCTION) && (ins <= INS_LAST_AVX512_INSTRUCTION); } bool emitter::IsAVXOnlyInstruction(instruction ins) @@ -47,7 +47,7 @@ bool emitter::IsAVXOnlyInstruction(instruction ins) bool emitter::IsAVX512OnlyInstruction(instruction ins) { - return false; // TODO-XArch-AVX512: Fix check once AVX512 instructions are added. 
+ return (ins >= INS_FIRST_AVX512_INSTRUCTION) && (ins <= INS_LAST_AVX512_INSTRUCTION); } bool emitter::IsFMAInstruction(instruction ins) @@ -13073,10 +13073,6 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) regNumber reg1 = id->idReg1(); regNumber reg2 = id->idReg2(); emitAttr size = id->idOpSize(); - if ((ins == INS_movq) || (ins == INS_movd)) - { - emitDispIns(id, false, false, false); - } if (IsSSEOrAVXorAVX512Instruction(ins)) { @@ -17185,6 +17181,16 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins break; } #endif + + case INS_vcvtqq2pd: + case INS_vcvtuqq2pd: + { + // TODO-XARCH-AVX512: fill these proper + result.insLatency += PERFSCORE_LATENCY_1C; + result.insThroughput = PERFSCORE_THROUGHPUT_2X; + break; + } + default: // unhandled instruction insFmt combination perfScoreUnhandledInstruction(id, &result); diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h index 15073480240212..663703b9e4a00c 100644 --- a/src/coreclr/jit/emitxarch.h +++ b/src/coreclr/jit/emitxarch.h @@ -265,9 +265,14 @@ bool IsWEvexOpcodeExtension(instruction ins) case INS_vfnmsub231sd: case INS_unpcklpd: case INS_vpermilpdvar: + + // New AVX512 ins + case INS_vcvtqq2pd: + case INS_vcvtuqq2pd: { return true; // W1 } + case INS_movd: case INS_punpckldq: case INS_movntdq: diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h index 0fff3a98d3bec3..d6bb08fc340066 100644 --- a/src/coreclr/jit/hwintrinsiclistxarch.h +++ b/src/coreclr/jit/hwintrinsiclistxarch.h @@ -813,12 +813,30 @@ HARDWARE_INTRINSIC(POPCNT, PopCount, // POPCNT Intrinsics HARDWARE_INTRINSIC(POPCNT_X64, PopCount, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_popcnt, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_MultiIns) +// 
*************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** +// X86Serialize Intrinsics +HARDWARE_INTRINSIC(X86Serialize, Serialize, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) + // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // ISA Function name SIMD size NumArg Instructions Category Flags // {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** -// X86Serialize Intrinsics -HARDWARE_INTRINSIC(X86Serialize, Serialize, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, 
HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) +// AVX512F Intrinsics + +// *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** +// ISA Function name SIMD size NumArg Instructions Category Flags +// {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} +// *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** +// AVX512F_VL Intrinsics + +// *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** +// ISA Function name SIMD size NumArg Instructions Category Flags +// {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} +// 
*************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** +// AVX512DQ VL Intrinsics +HARDWARE_INTRINSIC(AVX512DQ_VL, ConvertToVector128Double, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtqq2pd, INS_vcvtuqq2pd, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(AVX512DQ_VL, ConvertToVector256Double, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtqq2pd, INS_vcvtuqq2pd, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) + // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp index e7a0ab3cda1c63..2700f858ad81e6 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -848,6 +848,19 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic, case NI_Vector128_ConvertToDouble: case NI_Vector256_ConvertToDouble: + { + assert(sig->numArgs == 1); + assert(simdBaseType == TYP_LONG || simdBaseType == TYP_ULONG); + + intrinsic = (simdSize == 32) ? 
NI_AVX512DQ_VL_ConvertToVector128Double + : NI_AVX512DQ_VL_ConvertToVector128Double; + + op1 = impSIMDPopStack(retType); + retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, simdBaseJitType, simdSize); + + break; + } + case NI_Vector128_ConvertToInt64: case NI_Vector256_ConvertToInt64: case NI_Vector128_ConvertToUInt32: diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h index e766df67304d8b..250acb7e570865 100644 --- a/src/coreclr/jit/instrsxarch.h +++ b/src/coreclr/jit/instrsxarch.h @@ -615,6 +615,14 @@ INST3(LAST_BMI_INSTRUCTION, "LAST_BMI_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, INST3(LAST_AVX_INSTRUCTION, "LAST_AVX_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None) +INST3(FIRST_AVX512_INSTRUCTION, "FIRST_AVX512_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None) + +INST3(vcvtqq2pd, "cvtqq2pd", IUM_WR, BAD_CODE, BAD_CODE, PACK3(0xF3, 0x0F, 0xE6), INS_FLAGS_None) // cvt packed quad word to double +INST3(vcvtuqq2pd, "cvtuqq2pd", IUM_WR, BAD_CODE, BAD_CODE, PACK3(0xF3, 0x0F, 0x7A), INS_FLAGS_None) // cvt packed unsigned quad word to double + + +INST3(LAST_AVX512_INSTRUCTION, "LAST_AVX512_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None) + // Scalar instructions in SSE4.2 INST3(crc32, "crc32", IUM_WR, BAD_CODE, BAD_CODE, PACK4(0xF2, 0x0F, 0x38, 0xF0), INS_FLAGS_None) diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 3ef7660e6fa115..ac072323cc10b6 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -6370,7 +6370,9 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) (intrinsicId == NI_AVX2_BroadcastScalarToVector256) || (intrinsicId == NI_AVX2_ConvertToVector256Int16) || (intrinsicId == NI_AVX2_ConvertToVector256Int32) || - (intrinsicId == NI_AVX2_ConvertToVector256Int64)); + (intrinsicId == NI_AVX2_ConvertToVector256Int64) || + (intrinsicId == NI_AVX512DQ_VL_ConvertToVector128Double) || + (intrinsicId == 
NI_AVX512DQ_VL_ConvertToVector256Double)); ContainCheckHWIntrinsicAddr(node, op1); } diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp index 3faccdbfa9f9cb..687736a4cb2001 100644 --- a/src/coreclr/jit/morph.cpp +++ b/src/coreclr/jit/morph.cpp @@ -280,6 +280,38 @@ GenTree* Compiler::fgMorphExpandCast(GenTreeCast* tree) var_types dstType = tree->CastToType(); unsigned dstSize = genTypeSize(dstType); +/* + // See if the cast can be contracted into a single optimized cast +#if defined(TARGET_AMD64) + if (compOpportunisticallyDependsOn(InstructionSet_AVX512F)) + { + if (oper->OperIs(GT_CAST)) + { + GenTreeCast *innerCast = static_cast(oper); + GenTree* innerOper = innerCast->CastOp(); + var_types innerSrcType = genActualType(innerOper); + var_types innerDstType = innerCast->CastToType(); + unsigned innerDstSize = genTypeSize(innerDstType); + + if (innerCast->IsUnsigned()) + { + innerSrcType = varTypeToUnsigned(innerSrcType); + + if (innerSrcType == TYP_UINT) + { + if (dstType == TYP_FLOAT && innerDstType == TYP_DOUBLE) + { + // One optimized cast here + tree = gtNewCastNode(TYP_UINT, innerOper, true, TYP_FLOAT); + return fgMorphTree(tree); + } + } + } + } + } +#endif +*/ + // See if the cast has to be done in two steps. 
R -> I if (varTypeIsFloating(srcType) && varTypeIsIntegral(dstType)) { @@ -454,10 +486,20 @@ GenTree* Compiler::fgMorphExpandCast(GenTreeCast* tree) } else if (srcType == TYP_UINT) { +#if defined(TARGET_AMD64) + if (!compOpportunisticallyDependsOn(InstructionSet_AVX512F)) + { +#endif oper = gtNewCastNode(TYP_LONG, oper, true, TYP_LONG); oper->gtFlags |= (tree->gtFlags & (GTF_OVERFLOW | GTF_EXCEPT)); tree->ClearUnsigned(); tree->CastOp() = oper; +#if defined(TARGET_AMD64) + } +#endif + + + } } #endif // TARGET_AMD64 diff --git a/src/coreclr/jit/simdashwintrinsic.cpp b/src/coreclr/jit/simdashwintrinsic.cpp index ac19a94ade999a..ebc72fc7f1a8ef 100644 --- a/src/coreclr/jit/simdashwintrinsic.cpp +++ b/src/coreclr/jit/simdashwintrinsic.cpp @@ -408,8 +408,6 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic intrinsic, switch (intrinsic) { #if defined(TARGET_XARCH) - case NI_VectorT128_ConvertToDouble: - case NI_VectorT256_ConvertToDouble: case NI_VectorT128_ConvertToInt64: case NI_VectorT256_ConvertToInt64: case NI_VectorT128_ConvertToUInt32: @@ -802,6 +800,17 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic intrinsic, return gtNewSimdAbsNode(retType, op1, simdBaseJitType, simdSize, /* isSimdAsHWIntrinsic */ true); } + case NI_VectorT128_ConvertToDouble: + case NI_VectorT256_ConvertToDouble: + { + assert(sig->numArgs == 1); + assert(simdBaseType == TYP_LONG || simdBaseType == TYP_ULONG); + NamedIntrinsic convert = (simdSize == 32) ? NI_AVX512DQ_VL_ConvertToVector128Double + : NI_AVX512DQ_VL_ConvertToVector128Double; + return gtNewSimdHWIntrinsicNode(retType, op1, convert, simdBaseJitType, simdSize, + /* isSimdAsHWIntrinsic */ true); + } + case NI_VectorT128_ConvertToInt32: case NI_VectorT256_ConvertToInt32: { From 929229dd8900a54c97f7de4c6ec83b5f18a042d5 Mon Sep 17 00:00:00 2001 From: "Canino, Anthony" Date: Wed, 5 Oct 2022 11:12:55 -0700 Subject: [PATCH 2/5] Accelerates ConvertToInt64 for double for Vector. 
--- src/coreclr/jit/emitxarch.cpp | 1 + src/coreclr/jit/emitxarch.h | 1 + src/coreclr/jit/hwintrinsiclistxarch.h | 3 +++ src/coreclr/jit/hwintrinsicxarch.cpp | 15 ++++++++++++++- src/coreclr/jit/instrsxarch.h | 1 + src/coreclr/jit/simdashwintrinsic.cpp | 15 ++++++++++++--- 6 files changed, 32 insertions(+), 4 deletions(-) diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index af1dd224594eed..c19d48ceb4f015 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -17184,6 +17184,7 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins case INS_vcvtqq2pd: case INS_vcvtuqq2pd: + case INS_vcvtpd2qq: { // TODO-XARCH-AVX512: fill these proper result.insLatency += PERFSCORE_LATENCY_1C; diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h index 663703b9e4a00c..287015b6c59f37 100644 --- a/src/coreclr/jit/emitxarch.h +++ b/src/coreclr/jit/emitxarch.h @@ -269,6 +269,7 @@ bool IsWEvexOpcodeExtension(instruction ins) // New AVX512 ins case INS_vcvtqq2pd: case INS_vcvtuqq2pd: + case INS_vcvtpd2qq: { return true; // W1 } diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h index d6bb08fc340066..2ec3c5d95a7571 100644 --- a/src/coreclr/jit/hwintrinsiclistxarch.h +++ b/src/coreclr/jit/hwintrinsiclistxarch.h @@ -837,6 +837,9 @@ HARDWARE_INTRINSIC(X86Serialize, Serialize, HARDWARE_INTRINSIC(AVX512DQ_VL, ConvertToVector128Double, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtqq2pd, INS_vcvtuqq2pd, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(AVX512DQ_VL, ConvertToVector256Double, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtqq2pd, INS_vcvtuqq2pd, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) 
+HARDWARE_INTRINSIC(AVX512DQ_VL, ConvertToVector128Int64, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtpd2qq}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(AVX512DQ_VL, ConvertToVector256Int64, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtpd2qq}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) + // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp index 2700f858ad81e6..310a638729c816 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -852,7 +852,7 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic, assert(sig->numArgs == 1); assert(simdBaseType == TYP_LONG || simdBaseType == TYP_ULONG); - intrinsic = (simdSize == 32) ? NI_AVX512DQ_VL_ConvertToVector128Double + intrinsic = (simdSize == 32) ? NI_AVX512DQ_VL_ConvertToVector256Double : NI_AVX512DQ_VL_ConvertToVector128Double; op1 = impSIMDPopStack(retType); @@ -863,6 +863,19 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic, case NI_Vector128_ConvertToInt64: case NI_Vector256_ConvertToInt64: + { + assert(sig->numArgs == 1); + assert(simdBaseType == TYP_DOUBLE); + + intrinsic = (simdSize == 32) ? 
NI_AVX512DQ_VL_ConvertToVector256Int64 + : NI_AVX512DQ_VL_ConvertToVector128Int64; + + op1 = impSIMDPopStack(retType); + retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, simdBaseJitType, simdSize); + + break; + } + case NI_Vector128_ConvertToUInt32: case NI_Vector256_ConvertToUInt32: case NI_Vector128_ConvertToUInt64: diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h index 250acb7e570865..0913be60b2ff1f 100644 --- a/src/coreclr/jit/instrsxarch.h +++ b/src/coreclr/jit/instrsxarch.h @@ -619,6 +619,7 @@ INST3(FIRST_AVX512_INSTRUCTION, "FIRST_AVX512_INSTRUCTION", IUM_WR, BAD_CODE, BA INST3(vcvtqq2pd, "cvtqq2pd", IUM_WR, BAD_CODE, BAD_CODE, PACK3(0xF3, 0x0F, 0xE6), INS_FLAGS_None) // cvt packed quad word to double INST3(vcvtuqq2pd, "cvtuqq2pd", IUM_WR, BAD_CODE, BAD_CODE, PACK3(0xF3, 0x0F, 0x7A), INS_FLAGS_None) // cvt packed unsigned quad word to double +INST3(vcvtpd2qq, "cvtpd2qq", IUM_WR, BAD_CODE, BAD_CODE, PACK3(0x66, 0x0F, 0x7B), INS_FLAGS_None) // cvt packed quad word to double INST3(LAST_AVX512_INSTRUCTION, "LAST_AVX512_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None) diff --git a/src/coreclr/jit/simdashwintrinsic.cpp b/src/coreclr/jit/simdashwintrinsic.cpp index ebc72fc7f1a8ef..396abe8aecdb11 100644 --- a/src/coreclr/jit/simdashwintrinsic.cpp +++ b/src/coreclr/jit/simdashwintrinsic.cpp @@ -408,8 +408,6 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic intrinsic, switch (intrinsic) { #if defined(TARGET_XARCH) - case NI_VectorT128_ConvertToInt64: - case NI_VectorT256_ConvertToInt64: case NI_VectorT128_ConvertToUInt32: case NI_VectorT256_ConvertToUInt32: case NI_VectorT128_ConvertToUInt64: @@ -805,12 +803,23 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic intrinsic, { assert(sig->numArgs == 1); assert(simdBaseType == TYP_LONG || simdBaseType == TYP_ULONG); - NamedIntrinsic convert = (simdSize == 32) ? 
NI_AVX512DQ_VL_ConvertToVector128Double + NamedIntrinsic convert = (simdSize == 32) ? NI_AVX512DQ_VL_ConvertToVector256Double : NI_AVX512DQ_VL_ConvertToVector128Double; return gtNewSimdHWIntrinsicNode(retType, op1, convert, simdBaseJitType, simdSize, /* isSimdAsHWIntrinsic */ true); } + case NI_VectorT128_ConvertToInt64: + case NI_VectorT256_ConvertToInt64: + { + assert(sig->numArgs == 1); + assert(simdBaseType == TYP_DOUBLE); + NamedIntrinsic convert = (simdSize == 32) ? NI_AVX512DQ_VL_ConvertToVector256Int64 + : NI_AVX512DQ_VL_ConvertToVector128Int64; + return gtNewSimdHWIntrinsicNode(retType, op1, convert, simdBaseJitType, simdSize, + /* isSimdAsHWIntrinsic */ true); + } + case NI_VectorT128_ConvertToInt32: case NI_VectorT256_ConvertToInt32: { From 3e4e985326f0f80e8e2c53a34f666f77a8ff329a Mon Sep 17 00:00:00 2001 From: "Canino, Anthony" Date: Wed, 5 Oct 2022 11:25:31 -0700 Subject: [PATCH 3/5] Accelerates ConvertToUInt64 for double for Vector. --- src/coreclr/jit/emitxarch.cpp | 1 + src/coreclr/jit/emitxarch.h | 1 + src/coreclr/jit/hwintrinsiclistxarch.h | 3 +++ src/coreclr/jit/hwintrinsicxarch.cpp | 17 +++++++++++++++-- src/coreclr/jit/instrsxarch.h | 1 + src/coreclr/jit/simdashwintrinsic.cpp | 13 +++++++++++-- 6 files changed, 32 insertions(+), 4 deletions(-) diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index c19d48ceb4f015..06203e4cd134bc 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -17185,6 +17185,7 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins case INS_vcvtqq2pd: case INS_vcvtuqq2pd: case INS_vcvtpd2qq: + case INS_vcvtpd2uqq: { // TODO-XARCH-AVX512: fill these proper result.insLatency += PERFSCORE_LATENCY_1C; diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h index 287015b6c59f37..871d2672d7b39c 100644 --- a/src/coreclr/jit/emitxarch.h +++ b/src/coreclr/jit/emitxarch.h @@ -270,6 +270,7 @@ bool IsWEvexOpcodeExtension(instruction ins) 
case INS_vcvtqq2pd: case INS_vcvtuqq2pd: case INS_vcvtpd2qq: + case INS_vcvtpd2uqq: { return true; // W1 } diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h index 2ec3c5d95a7571..4093c6b6a42ed0 100644 --- a/src/coreclr/jit/hwintrinsiclistxarch.h +++ b/src/coreclr/jit/hwintrinsiclistxarch.h @@ -840,6 +840,9 @@ HARDWARE_INTRINSIC(AVX512DQ_VL, ConvertToVector256Double, HARDWARE_INTRINSIC(AVX512DQ_VL, ConvertToVector128Int64, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtpd2qq}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(AVX512DQ_VL, ConvertToVector256Int64, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtpd2qq}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(AVX512DQ_VL, ConvertToVector128UInt64, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtpd2uqq}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(AVX512DQ_VL, ConvertToVector256UInt64, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtpd2uqq}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) + // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp 
b/src/coreclr/jit/hwintrinsicxarch.cpp index 310a638729c816..1f6c45129c407e 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -876,10 +876,23 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic, break; } - case NI_Vector128_ConvertToUInt32: - case NI_Vector256_ConvertToUInt32: case NI_Vector128_ConvertToUInt64: case NI_Vector256_ConvertToUInt64: + { + assert(sig->numArgs == 1); + assert(simdBaseType == TYP_DOUBLE); + + intrinsic = (simdSize == 32) ? NI_AVX512DQ_VL_ConvertToVector256UInt64 + : NI_AVX512DQ_VL_ConvertToVector128UInt64; + + op1 = impSIMDPopStack(retType); + retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, simdBaseJitType, simdSize); + + break; + } + + case NI_Vector128_ConvertToUInt32: + case NI_Vector256_ConvertToUInt32: { assert(sig->numArgs == 1); // TODO-XARCH-CQ: These intrinsics should be accelerated diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h index 0913be60b2ff1f..62c60d95d0cbe1 100644 --- a/src/coreclr/jit/instrsxarch.h +++ b/src/coreclr/jit/instrsxarch.h @@ -620,6 +620,7 @@ INST3(FIRST_AVX512_INSTRUCTION, "FIRST_AVX512_INSTRUCTION", IUM_WR, BAD_CODE, BA INST3(vcvtqq2pd, "cvtqq2pd", IUM_WR, BAD_CODE, BAD_CODE, PACK3(0xF3, 0x0F, 0xE6), INS_FLAGS_None) // cvt packed quad word to double INST3(vcvtuqq2pd, "cvtuqq2pd", IUM_WR, BAD_CODE, BAD_CODE, PACK3(0xF3, 0x0F, 0x7A), INS_FLAGS_None) // cvt packed unsigned quad word to double INST3(vcvtpd2qq, "cvtpd2qq", IUM_WR, BAD_CODE, BAD_CODE, PACK3(0x66, 0x0F, 0x7B), INS_FLAGS_None) // cvt packed quad word to double +INST3(vcvtpd2uqq, "cvtpd2uqq", IUM_WR, BAD_CODE, BAD_CODE, PACK3(0x66, 0x0F, 0x79), INS_FLAGS_None) // cvt packed quad word to double INST3(LAST_AVX512_INSTRUCTION, "LAST_AVX512_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None) diff --git a/src/coreclr/jit/simdashwintrinsic.cpp b/src/coreclr/jit/simdashwintrinsic.cpp index 396abe8aecdb11..8ac1a58ce2df4b 100644 --- 
a/src/coreclr/jit/simdashwintrinsic.cpp +++ b/src/coreclr/jit/simdashwintrinsic.cpp @@ -410,8 +410,6 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic intrinsic, #if defined(TARGET_XARCH) case NI_VectorT128_ConvertToUInt32: case NI_VectorT256_ConvertToUInt32: - case NI_VectorT128_ConvertToUInt64: - case NI_VectorT256_ConvertToUInt64: { // TODO-XARCH-CQ: These intrinsics should be accelerated return nullptr; @@ -830,6 +828,17 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic intrinsic, /* isSimdAsHWIntrinsic */ true); } + case NI_VectorT128_ConvertToUInt64: + case NI_VectorT256_ConvertToUInt64: + { + assert(sig->numArgs == 1); + assert(simdBaseType == TYP_DOUBLE); + NamedIntrinsic convert = (simdSize == 32) ? NI_AVX512DQ_VL_ConvertToVector256UInt64 + : NI_AVX512DQ_VL_ConvertToVector128UInt64; + return gtNewSimdHWIntrinsicNode(retType, op1, convert, simdBaseJitType, simdSize, + /* isSimdAsHWIntrinsic */ true); + } + case NI_VectorT128_ConvertToSingle: case NI_VectorT256_ConvertToSingle: { From 30f215153d6f4920ddaf6cc37af00476a84d2dac Mon Sep 17 00:00:00 2001 From: "Canino, Anthony" Date: Wed, 5 Oct 2022 11:41:09 -0700 Subject: [PATCH 4/5] Accelerates ConvertToUInt32 for float for Vector. 
--- src/coreclr/jit/emitxarch.cpp | 1 + src/coreclr/jit/emitxarch.h | 4 ++++ src/coreclr/jit/hwintrinsiclistxarch.h | 3 +++ src/coreclr/jit/hwintrinsicxarch.cpp | 9 ++++++++- src/coreclr/jit/instrsxarch.h | 1 + src/coreclr/jit/simdashwintrinsic.cpp | 18 +++++++++++------- 6 files changed, 28 insertions(+), 8 deletions(-) diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index 06203e4cd134bc..99fb51e08894b6 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -17186,6 +17186,7 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins case INS_vcvtuqq2pd: case INS_vcvtpd2qq: case INS_vcvtpd2uqq: + case INS_vcvtps2udq: { // TODO-XARCH-AVX512: fill these proper result.insLatency += PERFSCORE_LATENCY_1C; diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h index 871d2672d7b39c..94a45f1c960883 100644 --- a/src/coreclr/jit/emitxarch.h +++ b/src/coreclr/jit/emitxarch.h @@ -385,9 +385,13 @@ bool IsWEvexOpcodeExtension(instruction ins) case INS_vpdpbusds: case INS_vpdpwssds: case INS_vpermilpsvar: + + // New AVX512 ins + case INS_vcvtps2udq: { return false; // W0 } + default: { return false; // WIG diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h index 4093c6b6a42ed0..e9163170b26c52 100644 --- a/src/coreclr/jit/hwintrinsiclistxarch.h +++ b/src/coreclr/jit/hwintrinsiclistxarch.h @@ -829,6 +829,9 @@ HARDWARE_INTRINSIC(X86Serialize, Serialize, // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // AVX512F_VL Intrinsics +HARDWARE_INTRINSIC(AVX512F_VL, 
ConvertToVector128UInt32, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtps2udq, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(AVX512F_VL, ConvertToVector256UInt32, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtps2udq, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) + // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // ISA Function name SIMD size NumArg Instructions Category Flags // {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp index 1f6c45129c407e..d1da18d4c1b8b3 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -895,7 +895,14 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic, case NI_Vector256_ConvertToUInt32: { assert(sig->numArgs == 1); - // TODO-XARCH-CQ: These intrinsics should be accelerated + assert(simdBaseType == TYP_FLOAT); + + intrinsic = (simdSize == 32) ? 
NI_AVX512F_VL_ConvertToVector256UInt32 + : NI_AVX512F_VL_ConvertToVector128UInt32; + + op1 = impSIMDPopStack(retType); + retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, simdBaseJitType, simdSize); + break; } diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h index 62c60d95d0cbe1..b2695cf9290866 100644 --- a/src/coreclr/jit/instrsxarch.h +++ b/src/coreclr/jit/instrsxarch.h @@ -621,6 +621,7 @@ INST3(vcvtqq2pd, "cvtqq2pd", IUM_WR, BAD_CODE, BAD_CODE, INST3(vcvtuqq2pd, "cvtuqq2pd", IUM_WR, BAD_CODE, BAD_CODE, PACK3(0xF3, 0x0F, 0x7A), INS_FLAGS_None) // cvt packed unsigned quad word to double INST3(vcvtpd2qq, "cvtpd2qq", IUM_WR, BAD_CODE, BAD_CODE, PACK3(0x66, 0x0F, 0x7B), INS_FLAGS_None) // cvt packed quad word to double INST3(vcvtpd2uqq, "cvtpd2uqq", IUM_WR, BAD_CODE, BAD_CODE, PACK3(0x66, 0x0F, 0x79), INS_FLAGS_None) // cvt packed quad word to double +INST3(vcvtps2udq, "cvtps2udq", IUM_WR, BAD_CODE, BAD_CODE, PACK2(0x0F, 0x79), INS_FLAGS_None) // cvt packed quad word to double INST3(LAST_AVX512_INSTRUCTION, "LAST_AVX512_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None) diff --git a/src/coreclr/jit/simdashwintrinsic.cpp b/src/coreclr/jit/simdashwintrinsic.cpp index 8ac1a58ce2df4b..7278b002285f9d 100644 --- a/src/coreclr/jit/simdashwintrinsic.cpp +++ b/src/coreclr/jit/simdashwintrinsic.cpp @@ -408,13 +408,6 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic intrinsic, switch (intrinsic) { #if defined(TARGET_XARCH) - case NI_VectorT128_ConvertToUInt32: - case NI_VectorT256_ConvertToUInt32: - { - // TODO-XARCH-CQ: These intrinsics should be accelerated - return nullptr; - } - case NI_VectorT128_ConvertToSingle: case NI_VectorT256_ConvertToSingle: { @@ -839,6 +832,17 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic intrinsic, /* isSimdAsHWIntrinsic */ true); } + case NI_VectorT128_ConvertToUInt32: + case NI_VectorT256_ConvertToUInt32: + { + assert(sig->numArgs == 1); + assert(simdBaseType 
== TYP_FLOAT); + NamedIntrinsic convert = (simdSize == 32) ? NI_AVX512F_VL_ConvertToVector256UInt32 + : NI_AVX512F_VL_ConvertToVector128UInt32; + return gtNewSimdHWIntrinsicNode(retType, op1, convert, simdBaseJitType, simdSize, + /* isSimdAsHWIntrinsic */ true); + } + case NI_VectorT128_ConvertToSingle: case NI_VectorT256_ConvertToSingle: { From 0ff68621b9a80f6a1e83a680b8aa062f0820c903 Mon Sep 17 00:00:00 2001 From: "Canino, Anthony" Date: Wed, 5 Oct 2022 19:22:30 -0700 Subject: [PATCH 5/5] Accelerates ConvertToSingle for uint for Vector. --- src/coreclr/jit/emitxarch.cpp | 1 + src/coreclr/jit/emitxarch.h | 1 + src/coreclr/jit/hwintrinsiclistxarch.h | 3 +++ src/coreclr/jit/hwintrinsicxarch.cpp | 10 ++++++-- src/coreclr/jit/instrsxarch.h | 1 + src/coreclr/jit/simdashwintrinsic.cpp | 34 +++++++++++++------------- 6 files changed, 31 insertions(+), 19 deletions(-) diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index 99fb51e08894b6..63dd33fb865a9c 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -17187,6 +17187,7 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins case INS_vcvtpd2qq: case INS_vcvtpd2uqq: case INS_vcvtps2udq: + case INS_vcvtudq2ps: { // TODO-XARCH-AVX512: fill these proper result.insLatency += PERFSCORE_LATENCY_1C; diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h index 94a45f1c960883..8711f5723ab9fc 100644 --- a/src/coreclr/jit/emitxarch.h +++ b/src/coreclr/jit/emitxarch.h @@ -388,6 +388,7 @@ bool IsWEvexOpcodeExtension(instruction ins) // New AVX512 ins case INS_vcvtps2udq: + case INS_vcvtudq2ps: { return false; // W0 } diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h index e9163170b26c52..cc36afea96d6b0 100644 --- a/src/coreclr/jit/hwintrinsiclistxarch.h +++ b/src/coreclr/jit/hwintrinsiclistxarch.h @@ -832,6 +832,9 @@ HARDWARE_INTRINSIC(X86Serialize, Serialize, HARDWARE_INTRINSIC(AVX512F_VL, 
ConvertToVector128UInt32, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtps2udq, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(AVX512F_VL, ConvertToVector256UInt32, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtps2udq, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(AVX512F_VL, ConvertToVector128Single, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtudq2ps, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(AVX512F_VL, ConvertToVector256Single, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtudq2ps, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) + // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // ISA Function name SIMD size NumArg Instructions Category Flags // {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp index d1da18d4c1b8b3..b375ec974b15d5 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -932,10 +932,16 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic, op1 = 
impSIMDPopStack(retType); retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, simdBaseJitType, simdSize); } + else if (simdBaseType == TYP_UINT) + { + intrinsic = (simdSize == 32) ? NI_AVX512F_VL_ConvertToVector256Single : NI_AVX512F_VL_ConvertToVector128Single; + + op1 = impSIMDPopStack(retType); + retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, simdBaseJitType, simdSize); + } else { - // TODO-XARCH-CQ: These intrinsics should be accelerated - assert(simdBaseType == TYP_UINT); + unreached(); } break; } diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h index b2695cf9290866..ffe9b0637b8bee 100644 --- a/src/coreclr/jit/instrsxarch.h +++ b/src/coreclr/jit/instrsxarch.h @@ -619,6 +619,7 @@ INST3(FIRST_AVX512_INSTRUCTION, "FIRST_AVX512_INSTRUCTION", IUM_WR, BAD_CODE, BA INST3(vcvtqq2pd, "cvtqq2pd", IUM_WR, BAD_CODE, BAD_CODE, PACK3(0xF3, 0x0F, 0xE6), INS_FLAGS_None) // cvt packed quad word to double INST3(vcvtuqq2pd, "cvtuqq2pd", IUM_WR, BAD_CODE, BAD_CODE, PACK3(0xF3, 0x0F, 0x7A), INS_FLAGS_None) // cvt packed unsigned quad word to double +INST3(vcvtudq2ps, "cvtudq2ps", IUM_WR, BAD_CODE, BAD_CODE, PACK3(0xF2, 0x0F, 0x7A), INS_FLAGS_None) // cvt packed unsigned double word to single INST3(vcvtpd2qq, "cvtpd2qq", IUM_WR, BAD_CODE, BAD_CODE, PACK3(0x66, 0x0F, 0x7B), INS_FLAGS_None) // cvt packed quad word to double INST3(vcvtpd2uqq, "cvtpd2uqq", IUM_WR, BAD_CODE, BAD_CODE, PACK3(0x66, 0x0F, 0x79), INS_FLAGS_None) // cvt packed quad word to double INST3(vcvtps2udq, "cvtps2udq", IUM_WR, BAD_CODE, BAD_CODE, PACK2(0x0F, 0x79), INS_FLAGS_None) // cvt packed quad word to double diff --git a/src/coreclr/jit/simdashwintrinsic.cpp b/src/coreclr/jit/simdashwintrinsic.cpp index 7278b002285f9d..9d63c232acbd7b 100644 --- a/src/coreclr/jit/simdashwintrinsic.cpp +++ b/src/coreclr/jit/simdashwintrinsic.cpp @@ -407,19 +407,6 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic intrinsic, switch (intrinsic) { -#if defined(TARGET_XARCH)
- case NI_VectorT128_ConvertToSingle: - case NI_VectorT256_ConvertToSingle: - { - if (simdBaseType == TYP_UINT) - { - // TODO-XARCH-CQ: These intrinsics should be accelerated - return nullptr; - } - break; - } -#endif // TARGET_XARCH - #if defined(TARGET_X86) case NI_VectorT128_CreateBroadcast: case NI_VectorT256_CreateBroadcast: @@ -846,11 +833,24 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic intrinsic, case NI_VectorT128_ConvertToSingle: case NI_VectorT256_ConvertToSingle: { - assert(simdBaseType == TYP_INT); - NamedIntrinsic convert = - (simdSize == 32) ? NI_AVX_ConvertToVector256Single : NI_SSE2_ConvertToVector128Single; - return gtNewSimdHWIntrinsicNode(retType, op1, convert, simdBaseJitType, simdSize, + if (simdBaseType == TYP_INT) + { + NamedIntrinsic convert = + (simdSize == 32) ? NI_AVX_ConvertToVector256Single : NI_SSE2_ConvertToVector128Single; + return gtNewSimdHWIntrinsicNode(retType, op1, convert, simdBaseJitType, simdSize, + /* isSimdAsHWIntrinsic */ true); + } + else if (simdBaseType == TYP_UINT) + { + NamedIntrinsic convert = + (simdSize == 32) ? NI_AVX512F_VL_ConvertToVector256Single : NI_AVX512F_VL_ConvertToVector128Single; + return gtNewSimdHWIntrinsicNode(retType, op1, convert, simdBaseJitType, simdSize, /* isSimdAsHWIntrinsic */ true); + } + else + { + unreached(); + } } case NI_VectorT128_Sum: