From 9d0142f768aa54f7a00161beb289c74167b70ed1 Mon Sep 17 00:00:00 2001
From: Egor Bogatov <egorbo@gmail.com>
Date: Wed, 15 May 2024 12:41:16 +0200
Subject: [PATCH] Fix ulong->double conversion on x64 (without avx512f)
 (#102179)

---
 src/coreclr/jit/codegen.h                     | 16 ----
 src/coreclr/jit/codegencommon.cpp             |  9 --
 src/coreclr/jit/codegenxarch.cpp              | 96 +++++++++----------
 src/coreclr/jit/lsraxarch.cpp                 |  9 ++
 src/coreclr/jit/simdcodegenxarch.cpp          | 19 ++--
 src/coreclr/jit/utils.cpp                     | 30 +-----
 .../JitBlue/Runtime_102149/Runtime_102149.cs  | 34 +++++++
 .../Runtime_102149/Runtime_102149.csproj      |  8 ++
 8 files changed, 106 insertions(+), 115 deletions(-)
 create mode 100644 src/tests/JIT/Regression/JitBlue/Runtime_102149/Runtime_102149.cs
 create mode 100644 src/tests/JIT/Regression/JitBlue/Runtime_102149/Runtime_102149.csproj

diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h
index 077f6cad0e57cd..5441e899bac84b 100644
--- a/src/coreclr/jit/codegen.h
+++ b/src/coreclr/jit/codegen.h
@@ -47,22 +47,6 @@ class CodeGen final : public CodeGenInterface
 
 private:
 #if defined(TARGET_XARCH)
-    // Bit masks used in negating a float or double number.
-    // This is to avoid creating more than one data constant for these bitmasks when a
-    // method has more than one GT_NEG operation on floating point values.
-    CORINFO_FIELD_HANDLE negBitmaskFlt;
-    CORINFO_FIELD_HANDLE negBitmaskDbl;
-
-    // Bit masks used in computing Math.Abs() of a float or double number.
-    CORINFO_FIELD_HANDLE absBitmaskFlt;
-    CORINFO_FIELD_HANDLE absBitmaskDbl;
-
-    // Bit mask used in zeroing the 3rd element of a SIMD12
-    CORINFO_FIELD_HANDLE zroSimd12Elm3;
-
-    // Bit mask used in U8 -> double conversion to adjust the result.
-    CORINFO_FIELD_HANDLE u8ToDblBitmask;
-
     // Generates SSE2 code for the given tree as "Operand BitWiseOp BitMask"
     void genSSE2BitwiseOp(GenTree* treeNode);
 
diff --git a/src/coreclr/jit/codegencommon.cpp b/src/coreclr/jit/codegencommon.cpp
index 31e8f30a48aa6c..a4c4bb9b45e902 100644
--- a/src/coreclr/jit/codegencommon.cpp
+++ b/src/coreclr/jit/codegencommon.cpp
@@ -196,15 +196,6 @@ void CodeGenInterface::CopyRegisterInfo()
 CodeGen::CodeGen(Compiler* theCompiler)
     : CodeGenInterface(theCompiler)
 {
-#if defined(TARGET_XARCH)
-    negBitmaskFlt  = nullptr;
-    negBitmaskDbl  = nullptr;
-    absBitmaskFlt  = nullptr;
-    absBitmaskDbl  = nullptr;
-    zroSimd12Elm3  = nullptr;
-    u8ToDblBitmask = nullptr;
-#endif // defined(TARGET_XARCH)
-
 #if defined(FEATURE_PUT_STRUCT_ARG_STK) && !defined(TARGET_X86)
     m_stkArgVarNum = BAD_VAR_NUM;
 #endif
diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp
index 3c652d7de7fea6..f3d9415d592aad 100644
--- a/src/coreclr/jit/codegenxarch.cpp
+++ b/src/coreclr/jit/codegenxarch.cpp
@@ -7318,47 +7318,53 @@ void CodeGen::genIntToFloatCast(GenTree* treeNode)
     // integral to floating-point conversions all have RMW semantics if VEX support
     // is not available
 
-    bool isRMW = !compiler->canUseVexEncoding();
-    inst_RV_RV_TT(ins, emitTypeSize(srcType), targetReg, targetReg, op1, isRMW, INS_OPTS_NONE);
+    const bool isRMW = !compiler->canUseVexEncoding();
 
     // Handle the case of srcType = TYP_ULONG. SSE2 conversion instruction
     // will interpret ULONG value as LONG.  Hence we need to adjust the
     // result if sign-bit of srcType is set.
     if (srcType == TYP_ULONG)
     {
-        // The instruction sequence below is less accurate than what clang
-        // and gcc generate. However, we keep the current sequence for backward compatibility.
-        // If we change the instructions below, FloatingPointUtils::convertUInt64ToDobule
-        // should be also updated for consistent conversion result.
         assert(dstType == TYP_DOUBLE);
         assert(op1->isUsedFromReg());
 
-        // Set the flags without modifying op1.
-        // test op1Reg, op1Reg
-        inst_RV_RV(INS_test, op1->GetRegNum(), op1->GetRegNum(), srcType);
+        // The following LONG->DOUBLE cast machinery is based on clang's implementation
+        // with "-ffp-model=strict" flag:
+        //
+        //   mov      tmp1, arg (8 byte)
+        //   shr      tmp1
+        //   mov      tmp2, arg (4 byte)
+        //   and      tmp2, 1
+        //   or       tmp2, tmp1
+        //   test     arg,  arg
+        //   cmovns   tmp2, arg
+        //   cvtsi2sd xmm0, tmp2
+        //   jns     .LABEL
+        //   addsd    xmm0, xmm0
+        //.LABEL
+        //
+        regNumber argReg  = treeNode->gtGetOp1()->GetRegNum();
+        regNumber tmpReg1 = internalRegisters.Extract(treeNode);
+        regNumber tmpReg2 = internalRegisters.Extract(treeNode);
+
+        inst_Mov(TYP_LONG, tmpReg1, argReg, /* canSkip */ false, EA_8BYTE);
+        inst_RV_SH(INS_shr, EA_8BYTE, tmpReg1, 1);
+        inst_Mov(TYP_INT, tmpReg2, argReg, /* canSkip */ false, EA_4BYTE);
+        GetEmitter()->emitIns_R_I(INS_and, EA_4BYTE, tmpReg2, 1);
+        GetEmitter()->emitIns_R_R(INS_or, EA_8BYTE, tmpReg2, tmpReg1);
+        GetEmitter()->emitIns_R_R(INS_test, EA_8BYTE, argReg, argReg);
+        GetEmitter()->emitIns_R_R(INS_cmovns, EA_8BYTE, tmpReg2, argReg);
+        GetEmitter()->emitIns_R_R(ins, EA_8BYTE, targetReg, tmpReg2);
 
-        // No need to adjust result if op1 >= 0 i.e. positive
-        // Jge label
         BasicBlock* label = genCreateTempLabel();
-        inst_JMP(EJ_jge, label);
-
-        // Adjust the result
-        // result = result + 0x43f00000 00000000
-        // addsd resultReg,  0x43f00000 00000000
-        CORINFO_FIELD_HANDLE* cns = &u8ToDblBitmask;
-        if (*cns == nullptr)
-        {
-            double d;
-            static_assert_no_msg(sizeof(double) == sizeof(__int64));
-            *((__int64*)&d) = 0x43f0000000000000LL;
-
-            *cns = GetEmitter()->emitFltOrDblConst(d, EA_8BYTE);
-        }
-        GetEmitter()->emitIns_SIMD_R_R_C(INS_addsd, EA_8BYTE, targetReg, targetReg, *cns, 0, INS_OPTS_NONE);
-
+        inst_JMP(EJ_jns, label);
+        GetEmitter()->emitIns_R_R(INS_addsd, EA_8BYTE, targetReg, targetReg);
         genDefineTempLabel(label);
     }
-
+    else
+    {
+        inst_RV_RV_TT(ins, emitTypeSize(srcType), targetReg, targetReg, op1, isRMW, INS_OPTS_NONE);
+    }
     genProduceReg(treeNode);
 }
 
@@ -7685,23 +7691,21 @@ void CodeGen::genSSE2BitwiseOp(GenTree* treeNode)
 {
     regNumber targetReg  = treeNode->GetRegNum();
     regNumber operandReg = genConsumeReg(treeNode->gtGetOp1());
-    emitAttr  size       = emitTypeSize(treeNode);
 
     assert(varTypeIsFloating(treeNode->TypeGet()));
     assert(treeNode->gtGetOp1()->isUsedFromReg());
 
-    CORINFO_FIELD_HANDLE* maskFld = nullptr;
-    UINT64                mask    = 0;
-    instruction           ins     = INS_invalid;
+    CORINFO_FIELD_HANDLE maskFld = NO_FIELD_HANDLE;
+    UINT64               mask    = 0;
+    instruction          ins     = INS_invalid;
 
     if (treeNode->OperIs(GT_NEG))
     {
         // Neg(x) = flip the sign bit.
         // Neg(f) = f ^ 0x80000000 x4 (packed)
         // Neg(d) = d ^ 0x8000000000000000 x2 (packed)
-        ins     = INS_xorps;
-        mask    = treeNode->TypeIs(TYP_FLOAT) ? 0x8000000080000000UL : 0x8000000000000000UL;
-        maskFld = treeNode->TypeIs(TYP_FLOAT) ? &negBitmaskFlt : &negBitmaskDbl;
+        ins  = INS_xorps;
+        mask = treeNode->TypeIs(TYP_FLOAT) ? 0x8000000080000000UL : 0x8000000000000000UL;
     }
     else if (treeNode->OperIs(GT_INTRINSIC))
     {
@@ -7709,30 +7713,24 @@ void CodeGen::genSSE2BitwiseOp(GenTree* treeNode)
         // Abs(x) = set sign-bit to zero
         // Abs(f) = f & 0x7fffffff x4 (packed)
         // Abs(d) = d & 0x7fffffffffffffff x2 (packed)
-        ins     = INS_andps;
-        mask    = treeNode->TypeIs(TYP_FLOAT) ? 0x7FFFFFFF7FFFFFFFUL : 0x7FFFFFFFFFFFFFFFUL;
-        maskFld = treeNode->TypeIs(TYP_FLOAT) ? &absBitmaskFlt : &absBitmaskDbl;
+        ins  = INS_andps;
+        mask = treeNode->TypeIs(TYP_FLOAT) ? 0x7FFFFFFF7FFFFFFFUL : 0x7FFFFFFFFFFFFFFFUL;
     }
     else
     {
         assert(!"genSSE2BitwiseOp: unsupported oper");
     }
 
-    if (*maskFld == nullptr)
-    {
-        simd16_t constValue;
-
-        constValue.u64[0] = mask;
-        constValue.u64[1] = mask;
-
+    simd16_t constValue;
+    constValue.u64[0] = mask;
+    constValue.u64[1] = mask;
 #if defined(FEATURE_SIMD)
-        *maskFld = GetEmitter()->emitSimd16Const(constValue);
+    maskFld = GetEmitter()->emitSimd16Const(constValue);
 #else
-        *maskFld = GetEmitter()->emitBlkConst(&constValue, 16, 16, treeNode->TypeGet());
+    maskFld = GetEmitter()->emitBlkConst(&constValue, 16, 16, treeNode->TypeGet());
 #endif
-    }
 
-    GetEmitter()->emitIns_SIMD_R_R_C(ins, EA_16BYTE, targetReg, operandReg, *maskFld, 0, INS_OPTS_NONE);
+    GetEmitter()->emitIns_SIMD_R_R_C(ins, EA_16BYTE, targetReg, operandReg, maskFld, 0, INS_OPTS_NONE);
 }
 
 //-----------------------------------------------------------------------------------------
diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp
index 8d376cff4e725c..063dc02a04573d 100644
--- a/src/coreclr/jit/lsraxarch.cpp
+++ b/src/coreclr/jit/lsraxarch.cpp
@@ -2827,6 +2827,15 @@ int LinearScan::BuildCast(GenTreeCast* cast)
     const var_types srcType  = genActualType(src->TypeGet());
     const var_types castType = cast->gtCastType;
 
+    if ((srcType == TYP_LONG) && (castType == TYP_DOUBLE) &&
+        !compiler->compOpportunisticallyDependsOn(InstructionSet_AVX512F))
+    {
+        // We need two extra temp regs for LONG->DOUBLE cast
+        // if we don't have AVX512F available.
+        buildInternalIntRegisterDefForNode(cast);
+        buildInternalIntRegisterDefForNode(cast);
+    }
+
     regMaskTP candidates = RBM_NONE;
 #ifdef TARGET_X86
     if (varTypeIsByte(castType))
diff --git a/src/coreclr/jit/simdcodegenxarch.cpp b/src/coreclr/jit/simdcodegenxarch.cpp
index b344460a100abb..57fcb5d4688004 100644
--- a/src/coreclr/jit/simdcodegenxarch.cpp
+++ b/src/coreclr/jit/simdcodegenxarch.cpp
@@ -536,19 +536,12 @@ void CodeGen::genSimd12UpperClear(regNumber tgtReg)
     else
     {
         // Preserve element 0, 1, and 2; Zero element 3
-
-        if (zroSimd12Elm3 == NO_FIELD_HANDLE)
-        {
-            simd16_t constValue;
-
-            constValue.u32[0] = 0xFFFFFFFF;
-            constValue.u32[1] = 0xFFFFFFFF;
-            constValue.u32[2] = 0xFFFFFFFF;
-            constValue.u32[3] = 0x00000000;
-
-            zroSimd12Elm3 = GetEmitter()->emitSimd16Const(constValue);
-        }
-
+        simd16_t constValue;
+        constValue.u32[0]                  = 0xFFFFFFFF;
+        constValue.u32[1]                  = 0xFFFFFFFF;
+        constValue.u32[2]                  = 0xFFFFFFFF;
+        constValue.u32[3]                  = 0x00000000;
+        CORINFO_FIELD_HANDLE zroSimd12Elm3 = GetEmitter()->emitSimd16Const(constValue);
         GetEmitter()->emitIns_SIMD_R_R_C(INS_andps, EA_16BYTE, tgtReg, tgtReg, zroSimd12Elm3, 0, INS_OPTS_NONE);
     }
 }
diff --git a/src/coreclr/jit/utils.cpp b/src/coreclr/jit/utils.cpp
index cd5b97f0220b13..6a9196bc66d4a7 100644
--- a/src/coreclr/jit/utils.cpp
+++ b/src/coreclr/jit/utils.cpp
@@ -1,4 +1,4 @@
-// Licensed to the .NET Foundation under one or more agreements.
+﻿// Licensed to the .NET Foundation under one or more agreements.
 // The .NET Foundation licenses this file to you under the MIT license.
 
 // ===================================================================================================
@@ -2230,33 +2230,7 @@ unsigned CountDigits(double num, unsigned base /* = 10 */)
 
 double FloatingPointUtils::convertUInt64ToDouble(unsigned __int64 uIntVal)
 {
-    __int64 s64 = uIntVal;
-    double  d;
-    if (s64 < 0)
-    {
-#if defined(TARGET_XARCH)
-        // RyuJIT codegen and clang (or gcc) may produce different results for casting uint64 to
-        // double, and the clang result is more accurate. For example,
-        //    1) (double)0x84595161401484A0UL --> 43e08b2a2c280290  (RyuJIT codegen or VC++)
-        //    2) (double)0x84595161401484A0UL --> 43e08b2a2c280291  (clang or gcc)
-        // If the folding optimization below is implemented by simple casting of (double)uint64_val
-        // and it is compiled by clang, casting result can be inconsistent, depending on whether
-        // the folding optimization is triggered or the codegen generates instructions for casting. //
-        // The current solution is to force the same math as the codegen does, so that casting
-        // result is always consistent.
-
-        // d = (double)(int64_t)uint64 + 0x1p64
-        uint64_t adjHex = 0x43F0000000000000UL;
-        d               = (double)s64 + *(double*)&adjHex;
-#else
-        d = (double)uIntVal;
-#endif
-    }
-    else
-    {
-        d = (double)uIntVal;
-    }
-    return d;
+    return (double)uIntVal;
 }
 
 float FloatingPointUtils::convertUInt64ToFloat(unsigned __int64 u64)
diff --git a/src/tests/JIT/Regression/JitBlue/Runtime_102149/Runtime_102149.cs b/src/tests/JIT/Regression/JitBlue/Runtime_102149/Runtime_102149.cs
new file mode 100644
index 00000000000000..038f61d1b14e5d
--- /dev/null
+++ b/src/tests/JIT/Regression/JitBlue/Runtime_102149/Runtime_102149.cs
@@ -0,0 +1,34 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System;
+using System.Runtime.CompilerServices;
+using Xunit;
+
+public static class Runtime_102149
+{
+    [Fact]
+    public static void Test()
+    {
+        if (BitConverter.DoubleToUInt64Bits(U8_to_R8(Const(9303915604039947368UL))) != 4890948523238023291UL)
+            throw new InvalidOperationException("Case 1 failed");
+
+        if (BitConverter.DoubleToUInt64Bits((double)Const(9303915604039947368UL)) != 4890948523238023291UL)
+            throw new InvalidOperationException("Case 2 failed");
+
+        if (NonConst(10648738977740919977d) != NonConst(10648738977740919977UL))
+            throw new InvalidOperationException("Case 3 failed");
+
+        if (Const(10648738977740919977d) != Const(10648738977740919977UL))
+            throw new InvalidOperationException("Case 4 failed");
+    }
+
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    private static T Const<T>(T val) => val;
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    private static T NonConst<T>(T val) => val;
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    private static double U8_to_R8(ulong x) => x;
+}
diff --git a/src/tests/JIT/Regression/JitBlue/Runtime_102149/Runtime_102149.csproj b/src/tests/JIT/Regression/JitBlue/Runtime_102149/Runtime_102149.csproj
new file mode 100644
index 00000000000000..de6d5e08882e86
--- /dev/null
+++ b/src/tests/JIT/Regression/JitBlue/Runtime_102149/Runtime_102149.csproj
@@ -0,0 +1,8 @@
+<Project Sdk="Microsoft.NET.Sdk">
+  <PropertyGroup>
+    <Optimize>True</Optimize>
+  </PropertyGroup>
+  <ItemGroup>
+    <Compile Include="$(MSBuildProjectName).cs" />
+  </ItemGroup>
+</Project>