From 6fd5b82b2fb704346871579be15dc4ab2459de79 Mon Sep 17 00:00:00 2001 From: Yichao Yu Date: Wed, 1 Jul 2020 00:39:52 -0400 Subject: [PATCH 1/5] Fix CPU feature specification on virtualized x64. --- src/processor_x86.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/processor_x86.cpp b/src/processor_x86.cpp index 0b8cfe8dccdce..624d8452bf8b2 100644 --- a/src/processor_x86.cpp +++ b/src/processor_x86.cpp @@ -840,6 +840,12 @@ get_llvm_target_noext(const TargetData &data) features.push_back("+sse2"); features.push_back("+mmx"); features.push_back("+fxsr"); +#ifdef _CPU_X86_64_ + // This is required to make LLVM happy if LLVM's feature based CPU arch guess + // returns a value that may not have 64bit support. + // This can happen with virtualization. + features.push_back("+64bit"); +#endif return std::make_pair(std::move(name), std::move(features)); } From 8f6fe5dd360a696174cce15cab8179a26cd07f6c Mon Sep 17 00:00:00 2001 From: Yichao Yu Date: Wed, 1 Jul 2020 00:42:02 -0400 Subject: [PATCH 2/5] Clean up old LLVM version support on x86 --- src/features_x86.h | 40 ++++++++++++++++++---------------------- src/processor_x86.cpp | 5 ++--- 2 files changed, 20 insertions(+), 25 deletions(-) diff --git a/src/features_x86.h b/src/features_x86.h index f5c567cba4e7e..3e7d3d0258312 100644 --- a/src/features_x86.h +++ b/src/features_x86.h @@ -1,11 +1,8 @@ // This file is a part of Julia. 
License is MIT: https://julialang.org/license #ifdef _CPU_X86_ -// avx is unusable on 32bit before LLVM 5.0 due to LLVM bug (try to encode too many registers) -#define JL_X86_AVX_MIN_VER 50000 #define JL_X86_64ONLY_VER(x) UINT32_MAX #else -#define JL_X86_AVX_MIN_VER 0 #define JL_X86_64ONLY_VER(x) x #endif @@ -14,7 +11,7 @@ JL_FEATURE_DEF(sse3, 0, 0) JL_FEATURE_DEF(pclmul, 1, 0) JL_FEATURE_DEF(ssse3, 9, 0) -JL_FEATURE_DEF(fma, 12, JL_X86_AVX_MIN_VER) +JL_FEATURE_DEF(fma, 12, 0) JL_FEATURE_DEF(cx16, 13, JL_X86_64ONLY_VER(0)) // cx16 requires 64bit JL_FEATURE_DEF_NAME(sse41, 19, 0, "sse4.1") JL_FEATURE_DEF_NAME(sse42, 20, 0, "sse4.2") @@ -22,8 +19,8 @@ JL_FEATURE_DEF(movbe, 22, 0) JL_FEATURE_DEF(popcnt, 23, 0) JL_FEATURE_DEF(aes, 25, 0) JL_FEATURE_DEF(xsave, 26, 0) -JL_FEATURE_DEF(avx, 28, JL_X86_AVX_MIN_VER) -JL_FEATURE_DEF(f16c, 29, JL_X86_AVX_MIN_VER) +JL_FEATURE_DEF(avx, 28, 0) +JL_FEATURE_DEF(f16c, 29, 0) JL_FEATURE_DEF(rdrnd, 30, 0) // EAX=1: EDX @@ -34,33 +31,33 @@ JL_FEATURE_DEF(fsgsbase, 32 * 2 + 0, 0) // JL_FEATURE_DEF(sgx, 32 * 2 + 2, 0) // Disable for now since it's very hard to detect JL_FEATURE_DEF(bmi, 32 * 2 + 3, 0) // JL_FEATURE_DEF(hle, 32 * 2 + 4, 0) // Not used and gone in LLVM 5.0 -JL_FEATURE_DEF(avx2, 32 * 2 + 5, JL_X86_AVX_MIN_VER) +JL_FEATURE_DEF(avx2, 32 * 2 + 5, 0) JL_FEATURE_DEF(bmi2, 32 * 2 + 8, 0) // JL_FEATURE_DEF(invpcid, 32 * 2 + 10, 0) // Not used and gone in LLVM 5.0 JL_FEATURE_DEF(rtm, 32 * 2 + 11, 0) JL_FEATURE_DEF(mpx, 32 * 2 + 14, 0) // Disable avx512 pre-5.0 since it can't handle address space -JL_FEATURE_DEF(avx512f, 32 * 2 + 16, 50000) -JL_FEATURE_DEF(avx512dq, 32 * 2 + 17, 50000) +JL_FEATURE_DEF(avx512f, 32 * 2 + 16, 0) +JL_FEATURE_DEF(avx512dq, 32 * 2 + 17, 0) JL_FEATURE_DEF(rdseed, 32 * 2 + 18, 0) JL_FEATURE_DEF(adx, 32 * 2 + 19, 0) // JL_FEATURE_DEF(smap, 32 * 2 + 20, 0) // Not used and gone in LLVM 5.0 -JL_FEATURE_DEF(avx512ifma, 32 * 2 + 21, 50000) +JL_FEATURE_DEF(avx512ifma, 32 * 2 + 21, 0) // JL_FEATURE_DEF(pcommit, 32 
* 2 + 22, 0) // Deprecated JL_FEATURE_DEF(clflushopt, 32 * 2 + 23, 0) JL_FEATURE_DEF(clwb, 32 * 2 + 24, 0) -JL_FEATURE_DEF(avx512pf, 32 * 2 + 26, 50000) -JL_FEATURE_DEF(avx512er, 32 * 2 + 27, 50000) -JL_FEATURE_DEF(avx512cd, 32 * 2 + 28, 50000) +JL_FEATURE_DEF(avx512pf, 32 * 2 + 26, 0) +JL_FEATURE_DEF(avx512er, 32 * 2 + 27, 0) +JL_FEATURE_DEF(avx512cd, 32 * 2 + 28, 0) JL_FEATURE_DEF(sha, 32 * 2 + 29, 0) -JL_FEATURE_DEF(avx512bw, 32 * 2 + 30, 50000) -JL_FEATURE_DEF(avx512vl, 32 * 2 + 31, 50000) +JL_FEATURE_DEF(avx512bw, 32 * 2 + 30, 0) +JL_FEATURE_DEF(avx512vl, 32 * 2 + 31, 0) // EAX=7,ECX=0: ECX JL_FEATURE_DEF(prefetchwt1, 32 * 3 + 0, 0) -JL_FEATURE_DEF(avx512vbmi, 32 * 3 + 1, 50000) +JL_FEATURE_DEF(avx512vbmi, 32 * 3 + 1, 0) JL_FEATURE_DEF(pku, 32 * 3 + 4, 0) // ospke -JL_FEATURE_DEF(avx512vpopcntdq, 32 * 3 + 14, 50000) +JL_FEATURE_DEF(avx512vpopcntdq, 32 * 3 + 14, 0) // EAX=7,ECX=0: EDX // JL_FEATURE_DEF(avx512_4vnniw, 32 * 4 + 2, ?????) @@ -72,9 +69,9 @@ JL_FEATURE_DEF(sahf, 32 * 5 + 0, JL_X86_64ONLY_VER(0)) JL_FEATURE_DEF(lzcnt, 32 * 5 + 5, 0) JL_FEATURE_DEF(sse4a, 32 * 5 + 6, 0) JL_FEATURE_DEF(prfchw, 32 * 5 + 8, 0) -JL_FEATURE_DEF(xop, 32 * 5 + 11, JL_X86_AVX_MIN_VER) -JL_FEATURE_DEF(lwp, 32 * 5 + 15, 50000) -JL_FEATURE_DEF(fma4, 32 * 5 + 16, JL_X86_AVX_MIN_VER) +JL_FEATURE_DEF(xop, 32 * 5 + 11, 0) +JL_FEATURE_DEF(lwp, 32 * 5 + 15, 0) +JL_FEATURE_DEF(fma4, 32 * 5 + 16, 0) JL_FEATURE_DEF(tbm, 32 * 5 + 21, 0) JL_FEATURE_DEF(mwaitx, 32 * 5 + 29, 0) @@ -88,7 +85,6 @@ JL_FEATURE_DEF(xsavec, 32 * 7 + 1, 0) JL_FEATURE_DEF(xsaves, 32 * 7 + 3, 0) // EAX=0x80000008: EBX -JL_FEATURE_DEF(clzero, 32 * 8 + 0, 50000) +JL_FEATURE_DEF(clzero, 32 * 8 + 0, 0) -#undef JL_X86_AVX_MIN_VER #undef JL_X86_64ONLY_VER diff --git a/src/processor_x86.cpp b/src/processor_x86.cpp index 624d8452bf8b2..cb6669af6b878 100644 --- a/src/processor_x86.cpp +++ b/src/processor_x86.cpp @@ -195,7 +195,7 @@ static constexpr CPUSpec cpus[] = { {"generic", CPU::generic, CPU::generic, 0, 
Feature::generic}, {"bonnell", CPU::intel_atom_bonnell, CPU::generic, 0, Feature::bonnell}, {"silvermont", CPU::intel_atom_silvermont, CPU::generic, 0, Feature::silvermont}, - {"goldmont", CPU::intel_atom_goldmont, CPU::generic, 50000, Feature::goldmont}, + {"goldmont", CPU::intel_atom_goldmont, CPU::generic, 0, Feature::goldmont}, {"core2", CPU::intel_core2, CPU::generic, 0, Feature::core2}, {"yonah", CPU::intel_yonah, CPU::generic, 0, Feature::yonah}, {"prescott", CPU::intel_prescott, CPU::generic, 0, Feature::prescott}, @@ -210,8 +210,7 @@ static constexpr CPUSpec cpus[] = { {"skylake", CPU::intel_corei7_skylake, CPU::generic, 0, Feature::skylake}, {"knl", CPU::intel_knights_landing, CPU::generic, 0, Feature::knl}, {"skylake-avx512", CPU::intel_corei7_skylake_avx512, CPU::generic, 0, Feature::skx}, - {"cannonlake", CPU::intel_corei7_cannonlake, CPU::intel_corei7_skylake_avx512, 40000, - Feature::cannonlake}, + {"cannonlake", CPU::intel_corei7_cannonlake, CPU::generic, 0, Feature::cannonlake}, {"athlon64", CPU::amd_athlon_64, CPU::generic, 0, Feature::generic}, {"athlon-fx", CPU::amd_athlon_fx, CPU::generic, 0, Feature::generic}, From 9f83eaf5c61a17daf494b60bcfd82be7277fdf4b Mon Sep 17 00:00:00 2001 From: Yichao Yu Date: Wed, 1 Jul 2020 01:36:32 -0400 Subject: [PATCH 3/5] Feature clean up * Deprecated features * Comments * Update dependency and processor feature set --- src/features_x86.h | 9 ++++----- src/processor_x86.cpp | 11 +++++++---- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/src/features_x86.h b/src/features_x86.h index 3e7d3d0258312..75e81bf4a92de 100644 --- a/src/features_x86.h +++ b/src/features_x86.h @@ -33,10 +33,9 @@ JL_FEATURE_DEF(bmi, 32 * 2 + 3, 0) // JL_FEATURE_DEF(hle, 32 * 2 + 4, 0) // Not used and gone in LLVM 5.0 JL_FEATURE_DEF(avx2, 32 * 2 + 5, 0) JL_FEATURE_DEF(bmi2, 32 * 2 + 8, 0) -// JL_FEATURE_DEF(invpcid, 32 * 2 + 10, 0) // Not used and gone in LLVM 5.0 +// JL_FEATURE_DEF(invpcid, 32 * 2 + 10, 0) // Privileged
instruction JL_FEATURE_DEF(rtm, 32 * 2 + 11, 0) -JL_FEATURE_DEF(mpx, 32 * 2 + 14, 0) -// Disable avx512 pre-5.0 since it can't handle address space +// JL_FEATURE_DEF(mpx, 32 * 2 + 14, 0) // Deprecated in LLVM 10.0 JL_FEATURE_DEF(avx512f, 32 * 2 + 16, 0) JL_FEATURE_DEF(avx512dq, 32 * 2 + 17, 0) JL_FEATURE_DEF(rdseed, 32 * 2 + 18, 0) @@ -60,8 +59,8 @@ JL_FEATURE_DEF(pku, 32 * 3 + 4, 0) // ospke JL_FEATURE_DEF(avx512vpopcntdq, 32 * 3 + 14, 0) // EAX=7,ECX=0: EDX -// JL_FEATURE_DEF(avx512_4vnniw, 32 * 4 + 2, ?????) -// JL_FEATURE_DEF(avx512_4fmaps, 32 * 4 + 3, ?????) +// JL_FEATURE_DEF(avx5124vnniw, 32 * 4 + 2, ?????) +// JL_FEATURE_DEF(avx5124fmaps, 32 * 4 + 3, ?????) // EAX=0x80000001: ECX // ignore sahf on 32bit x86 since it is required diff --git a/src/processor_x86.cpp b/src/processor_x86.cpp index cb6669af6b878..9d3c83c3690a0 100644 --- a/src/processor_x86.cpp +++ b/src/processor_x86.cpp @@ -143,7 +143,10 @@ static constexpr FeatureDep deps[] = { {sse4a, sse3}, {xop, fma4}, {fma4, avx}, - {fma4, sse4a} + {fma4, sse4a}, + {xsaveopt, xsave}, + {xsavec, xsave}, + {xsaves, xsave}, }; // We require cx16 on 64bit by default. 
This can be overwritten with `-cx16` @@ -152,7 +155,7 @@ constexpr auto generic = get_feature_masks(cx16); constexpr auto bonnell = get_feature_masks(sse3, ssse3, cx16, movbe, sahf); constexpr auto silvermont = bonnell | get_feature_masks(sse41, sse42, popcnt, pclmul, aes, prfchw); -constexpr auto goldmont = silvermont | get_feature_masks(mpx, sha, rdrnd, rdseed, xsave, +constexpr auto goldmont = silvermont | get_feature_masks(sha, rdrnd, rdseed, xsave, xsaveopt, xsavec, xsaves, clflushopt); constexpr auto yonah = get_feature_masks(sse3); constexpr auto prescott = yonah; @@ -165,7 +168,7 @@ constexpr auto sandybridge = westmere | get_feature_masks(avx, xsave, xsaveopt); constexpr auto ivybridge = sandybridge | get_feature_masks(rdrnd, f16c, fsgsbase); constexpr auto haswell = ivybridge | get_feature_masks(avx2, bmi, bmi2, fma, lzcnt, movbe); constexpr auto broadwell = haswell | get_feature_masks(adx, rdseed, prfchw); -constexpr auto skylake = broadwell | get_feature_masks(mpx, rtm, xsavec, xsaves, +constexpr auto skylake = broadwell | get_feature_masks(rtm, xsavec, xsaves, clflushopt); // ignore sgx; hle constexpr auto knl = broadwell | get_feature_masks(avx512f, avx512er, avx512cd, avx512pf, prefetchwt1); @@ -184,7 +187,7 @@ constexpr auto bdver1 = amdfam10 | get_feature_masks(xop, fma4, avx, ssse3, sse4 prfchw, pclmul, xsave, lwp); constexpr auto bdver2 = bdver1 | get_feature_masks(f16c, bmi, tbm, fma); constexpr auto bdver3 = bdver2 | get_feature_masks(xsaveopt, fsgsbase); -constexpr auto bdver4 = bdver3 | get_feature_masks(avx2, bmi2, mwaitx); +constexpr auto bdver4 = bdver3 | get_feature_masks(avx2, bmi2, mwaitx, movbe, rdrnd); constexpr auto znver1 = haswell | get_feature_masks(adx, clflushopt, clzero, mwaitx, prfchw, rdseed, sha, sse4a, xsavec, xsaves); From 9ad9daa20af45aa5789e3418a2d8446fe33bc1ff Mon Sep 17 00:00:00 2001 From: Yichao Yu Date: Wed, 1 Jul 2020 01:50:55 -0400 Subject: [PATCH 4/5] New X86 features and detections --- src/features_x86.h | 27 
+++++++++++++++++++++ src/processor_x86.cpp | 56 ++++++++++++++++++++++++++++++++++++------- 2 files changed, 75 insertions(+), 8 deletions(-) diff --git a/src/features_x86.h b/src/features_x86.h index 75e81bf4a92de..6fc8fa0b303e6 100644 --- a/src/features_x86.h +++ b/src/features_x86.h @@ -56,11 +56,31 @@ JL_FEATURE_DEF(avx512vl, 32 * 2 + 31, 0) JL_FEATURE_DEF(prefetchwt1, 32 * 3 + 0, 0) JL_FEATURE_DEF(avx512vbmi, 32 * 3 + 1, 0) JL_FEATURE_DEF(pku, 32 * 3 + 4, 0) // ospke +JL_FEATURE_DEF(waitpkg, 32 * 3 + 5, 0) +JL_FEATURE_DEF(avx512vbmi2, 32 * 3 + 6, 0) +JL_FEATURE_DEF(shstk, 32 * 3 + 7, 0) +JL_FEATURE_DEF(gfni, 32 * 3 + 8, 0) +JL_FEATURE_DEF(vaes, 32 * 3 + 9, 0) +JL_FEATURE_DEF(vpclmulqdq, 32 * 3 + 10, 0) +JL_FEATURE_DEF(avx512vnni, 32 * 3 + 11, 0) +JL_FEATURE_DEF(avx512bitalg, 32 * 3 + 12, 0) JL_FEATURE_DEF(avx512vpopcntdq, 32 * 3 + 14, 0) +JL_FEATURE_DEF(rdpid, 32 * 3 + 22, 0) +JL_FEATURE_DEF(cldemote, 32 * 3 + 25, 0) +JL_FEATURE_DEF(movdiri, 32 * 3 + 27, 0) +JL_FEATURE_DEF(movdir64b, 32 * 3 + 28, 0) +JL_FEATURE_DEF(enqcmd, 32 * 3 + 29, 90000) // EAX=7,ECX=0: EDX // JL_FEATURE_DEF(avx5124vnniw, 32 * 4 + 2, ?????) // JL_FEATURE_DEF(avx5124fmaps, 32 * 4 + 3, ?????) 
+JL_FEATURE_DEF(avx512vp2intersect, 32 * 4 + 8, 90000) +JL_FEATURE_DEF(serialize, 32 * 4 + 14, 110000) +JL_FEATURE_DEF(tsxldtrk, 32 * 4 + 16, 110000) +JL_FEATURE_DEF(pconfig, 32 * 4 + 18, 0) +JL_FEATURE_DEF_NAME(amx_bf16, 32 * 4 + 22, 110000, "amx-bf16") +JL_FEATURE_DEF_NAME(amx_tile, 32 * 4 + 24, 110000, "amx-tile") +JL_FEATURE_DEF_NAME(amx_int8, 32 * 4 + 25, 110000, "amx-int8") // EAX=0x80000001: ECX // ignore sahf on 32bit x86 since it is required @@ -85,5 +105,12 @@ JL_FEATURE_DEF(xsaves, 32 * 7 + 3, 0) // EAX=0x80000008: EBX JL_FEATURE_DEF(clzero, 32 * 8 + 0, 0) +JL_FEATURE_DEF(wbnoinvd, 32 * 8 + 9, 0) + +// EAX=7,ECX=1: EAX +JL_FEATURE_DEF(avx512bf16, 32 * 9 + 5, 90000) + +// EAX=0x14,ECX=0: EBX +JL_FEATURE_DEF(ptwrite, 32 * 10 + 4, 0) #undef JL_X86_64ONLY_VER diff --git a/src/processor_x86.cpp b/src/processor_x86.cpp index 9d3c83c3690a0..1f400ac54d345 100644 --- a/src/processor_x86.cpp +++ b/src/processor_x86.cpp @@ -92,7 +92,7 @@ enum class CPU : uint32_t { amd_znver1, }; -static constexpr size_t feature_sz = 9; +static constexpr size_t feature_sz = 11; static constexpr FeatureName feature_names[] = { #define JL_FEATURE_DEF(name, bit, llvmver) {#name, bit, llvmver}, #define JL_FEATURE_DEF_NAME(name, bit, llvmver, str) {str, bit, llvmver}, @@ -130,6 +130,10 @@ static constexpr FeatureDep deps[] = { {avx, sse42}, {f16c, avx}, {avx2, avx}, + {vaes, avx}, + {vaes, aes}, + {vpclmulqdq, avx}, + {vpclmulqdq, pclmul}, {avx512f, avx2}, {avx512dq, avx512f}, {avx512ifma, avx512f}, @@ -137,9 +141,16 @@ static constexpr FeatureDep deps[] = { {avx512er, avx512f}, {avx512cd, avx512f}, {avx512bw, avx512f}, + {avx512bf16, avx512bw}, + {avx512bitalg, avx512bw}, {avx512vl, avx512f}, {avx512vbmi, avx512bw}, + {avx512vbmi2, avx512bw}, + {avx512vnni, avx512f}, + {avx512vp2intersect, avx512f}, {avx512vpopcntdq, avx512f}, + {amx_int8, amx_tile}, + {amx_bf16, amx_tile}, {sse4a, sse3}, {xop, fma4}, {fma4, avx}, @@ -470,7 +481,8 @@ static inline void features_disable_avx512(T 
&features) { using namespace Feature; unset_bits(features, avx512f, avx512dq, avx512ifma, avx512pf, avx512er, avx512cd, - avx512bw, avx512vl, avx512vbmi); + avx512bw, avx512vl, avx512vbmi, avx512vpopcntdq, avx512vbmi2, avx512vnni, + avx512bitalg, avx512vp2intersect, avx512bf16); } template @@ -478,7 +490,14 @@ static inline void features_disable_avx(T &features) { using namespace Feature; unset_bits(features, avx, Feature::fma, f16c, xsave, avx2, xop, fma4, - xsaveopt, xsavec, xsaves); + xsaveopt, xsavec, xsaves, vaes, vpclmulqdq); +} + +template +static inline void features_disable_amx(T &features) +{ + using namespace Feature; + unset_bits(features, amx_bf16, amx_tile, amx_int8); } static NOINLINE std::pair> _get_host_cpu(void) @@ -535,15 +554,25 @@ static NOINLINE std::pair> _get_host_cpu(void) jl_cpuidex(infoex8, 0x80000008, 0); features[8] = infoex8[1]; } + if (maxleaf >= 7) { + int32_t info7[4]; + jl_cpuidex(info7, 7, 1); + features[9] = info7[0]; + } + if (maxleaf >= 0x14) { + int32_t info14[4]; + jl_cpuidex(info14, 0x14, 0); + features[10] = info14[1]; + } // Fix up AVX bits to account for OS support and match LLVM model uint64_t xcr0 = 0; - const uint32_t avx_mask = (1 << 27) | (1 << 28); - bool hasavx = test_all_bits(features[0], avx_mask); - if (hasavx) { + bool hasxsave = test_all_bits(features[0], 1 << 27); + if (hasxsave) { xcr0 = get_xcr0(); - hasavx = test_all_bits(xcr0, 0x6); + hasxsave = test_all_bits(xcr0, 0x6); } + bool hasavx = hasxsave && test_all_bits(features[0], 1 << 28); unset_bits(features, 32 + 27); if (!hasavx) features_disable_avx(features); @@ -557,6 +586,10 @@ static NOINLINE std::pair> _get_host_cpu(void) #endif if (!hasavx512save) features_disable_avx512(features); + // AMX requires additional context to be saved by the OS. + bool hasamxsave = hasxsave && test_all_bits(xcr0, (1 << 17) | (1 << 18)); + if (!hasamxsave) + features_disable_amx(features); // Ignore feature bits that we are not interested in. 
mask_features(feature_masks, &features[0]); @@ -788,12 +821,16 @@ static void ensure_jit_target(bool imaging) static constexpr uint32_t clone_simd[] = {Feature::sse3, Feature::ssse3, Feature::sse41, Feature::sse42, Feature::avx, Feature::avx2, + Feature::vaes, Feature::vpclmulqdq, Feature::sse4a, Feature::avx512f, Feature::avx512dq, Feature::avx512ifma, Feature::avx512pf, Feature::avx512er, Feature::avx512cd, Feature::avx512bw, Feature::avx512vl, Feature::avx512vbmi, - Feature::avx512vpopcntdq}; + Feature::avx512vpopcntdq, + Feature::avx512vbmi2, Feature::avx512vnni, + Feature::avx512bitalg, Feature::avx512bf16, + Feature::avx512vp2intersect}; for (auto fe: clone_math) { if (!test_nbit(features0, fe) && test_nbit(t.en.features, fe)) { t.en.flags |= JL_TARGET_CLONE_MATH; @@ -847,6 +884,9 @@ get_llvm_target_noext(const TargetData &data) // returns a value that may not have 64bit support. // This can happen with virtualization. features.push_back("+64bit"); +#endif +#if JL_LLVM_VERSION >= 90000 + features.push_back("+cx8"); #endif return std::make_pair(std::move(name), std::move(features)); } From 87c609a8a92330b4cd4e0fdc81ee77c20e1a3c6d Mon Sep 17 00:00:00 2001 From: Yichao Yu Date: Wed, 1 Jul 2020 11:18:04 -0400 Subject: [PATCH 5/5] New X86 CPU types and detections --- src/processor_x86.cpp | 112 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 92 insertions(+), 20 deletions(-) diff --git a/src/processor_x86.cpp b/src/processor_x86.cpp index 1f400ac54d345..cbe9449ab159c 100644 --- a/src/processor_x86.cpp +++ b/src/processor_x86.cpp @@ -60,6 +60,8 @@ enum class CPU : uint32_t { intel_atom_bonnell, intel_atom_silvermont, intel_atom_goldmont, + intel_atom_goldmont_plus, + intel_atom_tremont, intel_core2, intel_core2_penryn, intel_yonah, @@ -71,8 +73,14 @@ enum class CPU : uint32_t { intel_corei7_broadwell, intel_corei7_skylake, intel_corei7_skylake_avx512, + intel_corei7_cascadelake, + intel_corei7_cooperlake, intel_corei7_cannonlake, + 
intel_corei7_icelake_client, + intel_corei7_icelake_server, + intel_corei7_tigerlake, intel_knights_landing, + intel_knights_mill, amd_fam10h, amd_athlon_fx, @@ -90,6 +98,7 @@ enum class CPU : uint32_t { amd_opteron_sse3, amd_barcelona, amd_znver1, + amd_znver2, }; static constexpr size_t feature_sz = 11; @@ -165,27 +174,41 @@ static constexpr FeatureDep deps[] = { constexpr auto generic = get_feature_masks(cx16); constexpr auto bonnell = get_feature_masks(sse3, ssse3, cx16, movbe, sahf); constexpr auto silvermont = bonnell | get_feature_masks(sse41, sse42, popcnt, - pclmul, aes, prfchw); -constexpr auto goldmont = silvermont | get_feature_masks(sha, rdrnd, rdseed, xsave, - xsaveopt, xsavec, xsaves, clflushopt); + pclmul, prfchw, rdrnd); +constexpr auto goldmont = silvermont | get_feature_masks(aes, sha, rdseed, xsave, xsaveopt, + xsavec, xsaves, clflushopt, fsgsbase); +constexpr auto goldmont_plus = goldmont | get_feature_masks(ptwrite, rdpid); // sgx +constexpr auto tremont = goldmont_plus | get_feature_masks(clwb, gfni); +constexpr auto knl = get_feature_masks(sse3, ssse3, sse41, sse42, cx16, sahf, popcnt, + aes, pclmul, avx, xsave, xsaveopt, rdrnd, f16c, fsgsbase, + avx2, bmi, bmi2, fma, lzcnt, movbe, adx, rdseed, prfchw, + avx512f, avx512er, avx512cd, avx512pf, prefetchwt1); +constexpr auto knm = knl | get_feature_masks(avx512vpopcntdq); constexpr auto yonah = get_feature_masks(sse3); constexpr auto prescott = yonah; constexpr auto core2 = get_feature_masks(sse3, ssse3, cx16, sahf); constexpr auto nocona = get_feature_masks(sse3, cx16); constexpr auto penryn = nocona | get_feature_masks(ssse3, sse41, sahf); constexpr auto nehalem = penryn | get_feature_masks(sse42, popcnt); -constexpr auto westmere = nehalem | get_feature_masks(aes, pclmul); +constexpr auto westmere = nehalem | get_feature_masks(pclmul); constexpr auto sandybridge = westmere | get_feature_masks(avx, xsave, xsaveopt); constexpr auto ivybridge = sandybridge | get_feature_masks(rdrnd, f16c, 
fsgsbase); constexpr auto haswell = ivybridge | get_feature_masks(avx2, bmi, bmi2, fma, lzcnt, movbe); constexpr auto broadwell = haswell | get_feature_masks(adx, rdseed, prfchw); -constexpr auto skylake = broadwell | get_feature_masks(rtm, xsavec, xsaves, - clflushopt); // ignore sgx; hle -constexpr auto knl = broadwell | get_feature_masks(avx512f, avx512er, avx512cd, avx512pf, - prefetchwt1); +constexpr auto skylake = broadwell | get_feature_masks(aes, xsavec, xsaves, clflushopt); // sgx constexpr auto skx = skylake | get_feature_masks(avx512f, avx512cd, avx512dq, avx512bw, avx512vl, pku, clwb); -constexpr auto cannonlake = skx | get_feature_masks(avx512vbmi, avx512ifma, sha); +constexpr auto cascadelake = skx | get_feature_masks(avx512vnni); +constexpr auto cooperlake = cascadelake | get_feature_masks(avx512bf16); +constexpr auto cannonlake = skylake | get_feature_masks(avx512f, avx512cd, avx512dq, avx512bw, + avx512vl, pku, avx512vbmi, avx512ifma, + sha); // sgx +constexpr auto icelake = cannonlake | get_feature_masks(avx512bitalg, vaes, avx512vbmi2, + vpclmulqdq, avx512vpopcntdq, + gfni, clwb, rdpid); +constexpr auto icelake_server = icelake | get_feature_masks(pconfig, wbnoinvd); +constexpr auto tigerlake = icelake | get_feature_masks(avx512vp2intersect, movdiri, + movdir64b, shstk); constexpr auto k8_sse3 = get_feature_masks(sse3, cx16); constexpr auto amdfam10 = k8_sse3 | get_feature_masks(sse4a, lzcnt, popcnt, sahf); @@ -200,8 +223,9 @@ constexpr auto bdver2 = bdver1 | get_feature_masks(f16c, bmi, tbm, fma); constexpr auto bdver3 = bdver2 | get_feature_masks(xsaveopt, fsgsbase); constexpr auto bdver4 = bdver3 | get_feature_masks(avx2, bmi2, mwaitx, movbe, rdrnd); -constexpr auto znver1 = haswell | get_feature_masks(adx, clflushopt, clzero, mwaitx, prfchw, +constexpr auto znver1 = haswell | get_feature_masks(adx, aes, clflushopt, clzero, mwaitx, prfchw, rdseed, sha, sse4a, xsavec, xsaves); +constexpr auto znver2 = znver1 | get_feature_masks(clwb, rdpid, 
wbnoinvd); } @@ -210,6 +234,8 @@ static constexpr CPUSpec cpus[] = { {"bonnell", CPU::intel_atom_bonnell, CPU::generic, 0, Feature::bonnell}, {"silvermont", CPU::intel_atom_silvermont, CPU::generic, 0, Feature::silvermont}, {"goldmont", CPU::intel_atom_goldmont, CPU::generic, 0, Feature::goldmont}, + {"goldmont-plus", CPU::intel_atom_goldmont_plus, CPU::generic, 0, Feature::goldmont_plus}, + {"tremont", CPU::intel_atom_tremont, CPU::generic, 0, Feature::tremont}, {"core2", CPU::intel_core2, CPU::generic, 0, Feature::core2}, {"yonah", CPU::intel_yonah, CPU::generic, 0, Feature::yonah}, {"prescott", CPU::intel_prescott, CPU::generic, 0, Feature::prescott}, @@ -223,8 +249,17 @@ static constexpr CPUSpec cpus[] = { {"broadwell", CPU::intel_corei7_broadwell, CPU::generic, 0, Feature::broadwell}, {"skylake", CPU::intel_corei7_skylake, CPU::generic, 0, Feature::skylake}, {"knl", CPU::intel_knights_landing, CPU::generic, 0, Feature::knl}, + {"knm", CPU::intel_knights_mill, CPU::generic, 0, Feature::knm}, {"skylake-avx512", CPU::intel_corei7_skylake_avx512, CPU::generic, 0, Feature::skx}, + {"cascadelake", CPU::intel_corei7_cascadelake, CPU::generic, 0, Feature::cascadelake}, + {"cooperlake", CPU::intel_corei7_cooperlake, CPU::intel_corei7_cascadelake, + 90000, Feature::cooperlake}, {"cannonlake", CPU::intel_corei7_cannonlake, CPU::generic, 0, Feature::cannonlake}, + {"icelake-client", CPU::intel_corei7_icelake_client, CPU::generic, 0, Feature::icelake}, + {"icelake-server", CPU::intel_corei7_icelake_server, CPU::generic, 0, + Feature::icelake_server}, + {"tigerlake", CPU::intel_corei7_tigerlake, CPU::intel_corei7_icelake_client, 100000, + Feature::tigerlake}, {"athlon64", CPU::amd_athlon_64, CPU::generic, 0, Feature::generic}, {"athlon-fx", CPU::amd_athlon_fx, CPU::generic, 0, Feature::generic}, @@ -247,6 +282,7 @@ static constexpr CPUSpec cpus[] = { {"bdver4", CPU::amd_bdver4, CPU::generic, 0, Feature::bdver4}, {"znver1", CPU::amd_znver1, CPU::generic, 0, Feature::znver1}, 
+ {"znver2", CPU::amd_znver2, CPU::amd_znver1, 90000, Feature::znver2}, }; static constexpr size_t ncpu_names = sizeof(cpus) / sizeof(cpus[0]); @@ -351,11 +387,37 @@ static CPU get_intel_processor_name(uint32_t family, uint32_t model, uint32_t br case 0x5e: // Skylake desktop case 0x8e: // Kaby Lake mobile case 0x9e: // Kaby Lake desktop + case 0xa5: // Comet Lake-H/S + case 0xa6: // Comet Lake-U return CPU::intel_corei7_skylake; // Skylake Xeon: case 0x55: - return CPU::intel_corei7_skylake; + if (test_nbit(features, Feature::avx512bf16)) + return CPU::intel_corei7_cooperlake; + if (test_nbit(features, Feature::avx512vnni)) + return CPU::intel_corei7_cascadelake; + return CPU::intel_corei7_skylake_avx512; + + // Cannonlake: + case 0x66: + return CPU::intel_corei7_cannonlake; + + // Icelake: + case 0x7d: + case 0x7e: + case 0x9d: + return CPU::intel_corei7_icelake_client; + + // Icelake Xeon: + case 0x6a: + case 0x6c: + return CPU::intel_corei7_icelake_server; + + // Tiger Lake + case 0x8c: + case 0x8d: + return CPU::intel_corei7_tigerlake; case 0x1c: // Most 45 nm Intel Atom processors case 0x26: // 45 nm Atom Lincroft @@ -368,19 +430,30 @@ static CPU get_intel_processor_name(uint32_t family, uint32_t model, uint32_t br case 0x37: case 0x4a: case 0x4d: - case 0x5a: case 0x5d: - case 0x4c: // really airmont + // Airmont + case 0x4c: + case 0x5a: + case 0x75: return CPU::intel_atom_silvermont; // Goldmont: case 0x5c: case 0x5f: return CPU::intel_atom_goldmont; + case 0x7a: + return CPU::intel_atom_goldmont_plus; + case 0x86: + case 0x96: + case 0x9c: + return CPU::intel_atom_tremont; case 0x57: return CPU::intel_knights_landing; + case 0x85: + return CPU::intel_knights_mill; + default: return CPU::generic; } @@ -454,8 +527,6 @@ static CPU get_amd_processor_name(uint32_t family, uint32_t model, const uint32_ case 20: return CPU::amd_btver1; case 21: - if (!test_nbit(features, Feature::avx)) - return CPU::amd_btver1; if (model >= 0x50 && model <= 0x6f) return 
CPU::amd_bdver4; if (model >= 0x30 && model <= 0x3f) @@ -466,11 +537,11 @@ static CPU get_amd_processor_name(uint32_t family, uint32_t model, const uint32_ return CPU::amd_bdver1; return CPU::amd_btver1; // fallback case 22: - if (!test_nbit(features, Feature::avx)) - return CPU::amd_btver1; return CPU::amd_btver2; case 23: - if (test_nbit(features, Feature::adx)) + if ((model >= 0x30 && model <= 0x3f) || model == 0x71) + return CPU::amd_znver2; + if (model <= 0x0f) return CPU::amd_znver1; return CPU::amd_btver1; } @@ -810,9 +881,10 @@ static void ensure_jit_target(bool imaging) // The most useful one in general... t.en.flags |= JL_TARGET_CLONE_LOOP; auto &features0 = jit_targets[t.base].en.features; - // Special case for KNL since it's so different + // Special case for KNL/KNM since they're so different if (!(t.dis.flags & JL_TARGET_CLONE_ALL)) { - if (t.name == "knl" && jit_targets[t.base].name != "knl") { + if ((t.name == "knl" || t.name == "knm") && + jit_targets[t.base].name != "knl" && jit_targets[t.base].name != "knm") { t.en.flags |= JL_TARGET_CLONE_ALL; break; }