diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2b532d7c245..90a3b50d202 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -140,10 +140,11 @@ endif()
 
 include(CheckCXXCompilerFlag)
 set(CMAKE_TRY_COMPILE_CONFIGURATION release)
+set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY) # build try-compile probes as static libraries: no link step, so a probe needs no main(). NOTE(review): stays set for later checks in this scope - confirm none of them depends on linking
 
 # gnu inline assembly in clang msvc does not work actually
 if(NOT (CMAKE_CXX_COMPILER_ID MATCHES "MSVC" OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_SIMULATE_ID MATCHES "MSVC" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES "MSVC")))
-    check_cxx_source_compiles("int main() { int a = 0; asm volatile(\"\" : \"=r\"(a) : \"0\"(a) : \"memory\"); return 0; }" NCNN_COMPILER_SUPPORT_GNU_INLINE_ASM)
+    check_cxx_source_compiles("int test(int a) { asm volatile(\"\" : \"=r\"(a) : \"0\"(a) : \"memory\"); return a; }" NCNN_COMPILER_SUPPORT_GNU_INLINE_ASM) # probe: empty asm statement with a tied in/out register operand and a memory clobber
     if(NCNN_COMPILER_SUPPORT_GNU_INLINE_ASM)
         option(NCNN_GNU_INLINE_ASM "optimize platform with gnu style inline assembly" ON)
     else()
@@ -163,21 +164,21 @@ if((IOS AND CMAKE_OSX_ARCHITECTURES MATCHES "arm")
     endif()
 
     if(CMAKE_SIZEOF_VOID_P EQUAL 4 AND NOT NCNN_TARGET_ILP32)
-        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _s, _a, _b; _s = vmlaq_f32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM_NEON)
+        check_cxx_source_compiles("#include <arm_neon.h>\nfloat32x4_t test(float32x4_t s, float32x4_t a, float32x4_t b) { return vmlaq_f32(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM_NEON) # NEON probe: operands arrive as parameters and the intrinsic result is returned, so no uninitialized locals are involved
 
         if(NCNN_COMPILER_SUPPORT_ARM_NEON)
             if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC" OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_SIMULATE_ID MATCHES "MSVC" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES "MSVC"))
                 set(CMAKE_REQUIRED_FLAGS "/arch:VFPv4")
-                check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _a; float16x4_t _s = vcvt_f16_f32(_a); return 0; }" NCNN_COMPILER_SUPPORT_ARM_VFPV4)
+                check_cxx_source_compiles("#include <arm_neon.h>\nfloat16x4_t test(float32x4_t a) { return vcvt_f16_f32(a); }" NCNN_COMPILER_SUPPORT_ARM_VFPV4) # float32x4_t -> float16x4_t conversion probe, compiled with /arch:VFPv4
 
                 unset(CMAKE_REQUIRED_FLAGS)
             else()
                 set(CMAKE_REQUIRED_FLAGS "-mfpu=neon-vfpv4")
-                check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _a; float16x4_t _s = vcvt_f16_f32(_a); return 0; }" NCNN_COMPILER_SUPPORT_ARM_VFPV4)
+                check_cxx_source_compiles("#include <arm_neon.h>\nfloat16x4_t test(float32x4_t a) { return vcvt_f16_f32(a); }" NCNN_COMPILER_SUPPORT_ARM_VFPV4) # same conversion probe, compiled with -mfpu=neon-vfpv4
 
                 if(NOT NCNN_COMPILER_SUPPORT_ARM_VFPV4)
                     set(CMAKE_REQUIRED_FLAGS "-mfpu=neon-vfpv4 -mfp16-format=ieee")
-                    check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _a; float16x4_t _s = vcvt_f16_f32(_a); return 0; }" NCNN_COMPILER_SUPPORT_ARM_VFPV4_FP16)
+                    check_cxx_source_compiles("#include <arm_neon.h>\nfloat16x4_t test(float32x4_t a) { return vcvt_f16_f32(a); }" NCNN_COMPILER_SUPPORT_ARM_VFPV4_FP16) # retry of the same probe with -mfp16-format=ieee added; result goes to a separate _FP16 variable
                 endif()
 
                 unset(CMAKE_REQUIRED_FLAGS)
@@ -194,107 +195,107 @@ if((IOS AND CMAKE_OSX_ARCHITECTURES MATCHES "arm")
     if(CMAKE_SIZEOF_VOID_P EQUAL 8 OR NCNN_TARGET_ILP32)
         if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
             set(CMAKE_REQUIRED_FLAGS "/arch:armv8.0")
-            check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _a; float16x4_t _s = vcvt_f16_f32(_a); return 0; }" NCNN_COMPILER_SUPPORT_ARM_VFPV4)
+            check_cxx_source_compiles("#include <arm_neon.h>\nfloat16x4_t test(float32x4_t a) { return vcvt_f16_f32(a); }" NCNN_COMPILER_SUPPORT_ARM_VFPV4) # MSVC branch: every probe is compiled with only an /arch:armv8.x switch
 
             set(CMAKE_REQUIRED_FLAGS "/arch:armv8.2")
-            check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16x8_t _s, _a, _b; _s = vfmaq_f16(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM82_FP16)
+            check_cxx_source_compiles("#include <arm_neon.h>\nfloat16x8_t test(float16x8_t s, float16x8_t a, float16x8_t b) { return vfmaq_f16(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM82_FP16) # /arch:armv8.2 is shared by the fp16, dotprod and fp16fml probes in this branch
 
             set(CMAKE_REQUIRED_FLAGS "/arch:armv8.2")
-            check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int32x4_t _s; int8x16_t _a, _b; _s = vdotq_s32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM82_DOTPROD)
+            check_cxx_source_compiles("#include <arm_neon.h>\nint32x4_t test(int32x4_t s, int8x16_t a, int8x16_t b) { return vdotq_s32(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM82_DOTPROD)
 
             set(CMAKE_REQUIRED_FLAGS "/arch:armv8.2")
-            check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _s; float16x8_t _a, _b; _s = vfmlalq_low_f16(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM82_FP16FML)
+            check_cxx_source_compiles("#include <arm_neon.h>\nfloat32x4_t test(float32x4_t s, float16x8_t a, float16x8_t b) { return vfmlalq_low_f16(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM82_FP16FML)
 
             set(CMAKE_REQUIRED_FLAGS "/arch:armv8.4")
-            check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _s; bfloat16x8_t _a, _b; _s = vcvt_f32_bf16(vcvt_bf16_f32(vbfmmlaq_f32(_s, _a, _b))); return 0; }" NCNN_COMPILER_SUPPORT_ARM84_BF16)
+            check_cxx_source_compiles("#include <arm_neon.h>\nfloat32x4_t test(float32x4_t s, bfloat16x8_t a, bfloat16x8_t b) { return vcvt_f32_bf16(vcvt_bf16_f32(vbfmmlaq_f32(s, a, b))); }" NCNN_COMPILER_SUPPORT_ARM84_BF16) # one expression exercises vbfmmlaq_f32, vcvt_bf16_f32 and vcvt_f32_bf16
 
             set(CMAKE_REQUIRED_FLAGS "/arch:armv8.4")
-            check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int32x4_t _s; int8x16_t _a, _b; _s = vmmlaq_s32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM84_I8MM)
+            check_cxx_source_compiles("#include <arm_neon.h>\nint32x4_t test(int32x4_t s, int8x16_t a, int8x16_t b) { return vmmlaq_s32(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM84_I8MM)
 
             set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6")
-            check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svfloat16_t _s, _a, _b; svbool_t bp; _s = svmla_f16_z(bp, _s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVE)
+            check_cxx_source_compiles("#include <arm_sve.h>\nsvfloat16_t test(svfloat16_t s, svfloat16_t a, svfloat16_t b, svbool_t bp) { return svmla_f16_z(bp, s, a, b); }" NCNN_COMPILER_SUPPORT_ARM86_SVE) # SVE probes include <arm_sve.h> instead of <arm_neon.h>; the svbool_t argument is taken as a parameter
 
             set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6")
-            check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svint16_t _s; svint8_t _a, _b; _s = svmlslb_s16(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVE2)
+            check_cxx_source_compiles("#include <arm_sve.h>\nsvint16_t test(svint16_t s, svint8_t a, svint8_t b) { return svmlslb_s16(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM86_SVE2)
 
             set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6")
-            check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svfloat32_t _s; svbfloat16_t _a, _b; _s = svbfmmla_f32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVEBF16)
+            check_cxx_source_compiles("#include <arm_sve.h>\nsvfloat32_t test(svfloat32_t s, svbfloat16_t a, svbfloat16_t b) { return svbfmmla_f32(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM86_SVEBF16)
 
             set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6")
-            check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svint32_t _s; svint8_t _a, _b; _s = svmmla_s32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVEI8MM)
+            check_cxx_source_compiles("#include <arm_sve.h>\nsvint32_t test(svint32_t s, svint8_t a, svint8_t b) { return svmmla_s32(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM86_SVEI8MM)
 
             set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6")
-            check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svfloat32_t _s, _a, _b; _s = svmmla_f32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVEF32MM)
+            check_cxx_source_compiles("#include <arm_sve.h>\nsvfloat32_t test(svfloat32_t s, svfloat32_t a, svfloat32_t b) { return svmmla_f32(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM86_SVEF32MM)
 
             unset(CMAKE_REQUIRED_FLAGS)
         elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_SIMULATE_ID MATCHES "MSVC" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES "MSVC")
             set(CMAKE_REQUIRED_FLAGS "/arch:armv8.0")
-            check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _a; float16x4_t _s = vcvt_f16_f32(_a); return 0; }" NCNN_COMPILER_SUPPORT_ARM_VFPV4)
+            check_cxx_source_compiles("#include <arm_neon.h>\nfloat16x4_t test(float32x4_t a) { return vcvt_f16_f32(a); }" NCNN_COMPILER_SUPPORT_ARM_VFPV4) # clang-cl branch: this first probe uses /arch alone; the ones below pair /arch with a -march=...+<feature> flag
 
             set(CMAKE_REQUIRED_FLAGS "/arch:armv8.2 -march=armv8.2-a+fp16")
-            check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16x8_t _s, _a, _b; _s = vfmaq_f16(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM82_FP16)
+            check_cxx_source_compiles("#include <arm_neon.h>\nfloat16x8_t test(float16x8_t s, float16x8_t a, float16x8_t b) { return vfmaq_f16(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM82_FP16)
 
             set(CMAKE_REQUIRED_FLAGS "/arch:armv8.2 -march=armv8.2-a+dotprod")
-            check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int32x4_t _s; int8x16_t _a, _b; _s = vdotq_s32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM82_DOTPROD)
+            check_cxx_source_compiles("#include <arm_neon.h>\nint32x4_t test(int32x4_t s, int8x16_t a, int8x16_t b) { return vdotq_s32(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM82_DOTPROD)
 
             set(CMAKE_REQUIRED_FLAGS "/arch:armv8.2 -march=armv8.2-a+fp16fml")
-            check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _s; float16x8_t _a, _b; _s = vfmlalq_low_f16(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM82_FP16FML)
+            check_cxx_source_compiles("#include <arm_neon.h>\nfloat32x4_t test(float32x4_t s, float16x8_t a, float16x8_t b) { return vfmlalq_low_f16(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM82_FP16FML)
 
             set(CMAKE_REQUIRED_FLAGS "/arch:armv8.4 -march=armv8.4-a+bf16")
-            check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _s; bfloat16x8_t _a, _b; _s = vcvt_f32_bf16(vcvt_bf16_f32(vbfmmlaq_f32(_s, _a, _b))); return 0; }" NCNN_COMPILER_SUPPORT_ARM84_BF16)
+            check_cxx_source_compiles("#include <arm_neon.h>\nfloat32x4_t test(float32x4_t s, bfloat16x8_t a, bfloat16x8_t b) { return vcvt_f32_bf16(vcvt_bf16_f32(vbfmmlaq_f32(s, a, b))); }" NCNN_COMPILER_SUPPORT_ARM84_BF16)
 
             set(CMAKE_REQUIRED_FLAGS "/arch:armv8.4 -march=armv8.4-a+i8mm")
-            check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int32x4_t _s; int8x16_t _a, _b; _s = vmmlaq_s32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM84_I8MM)
+            check_cxx_source_compiles("#include <arm_neon.h>\nint32x4_t test(int32x4_t s, int8x16_t a, int8x16_t b) { return vmmlaq_s32(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM84_I8MM)
 
             set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6 -march=armv8.6-a+sve")
-            check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svfloat16_t _s, _a, _b; svbool_t bp; _s = svmla_f16_z(bp, _s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVE)
+            check_cxx_source_compiles("#include <arm_sve.h>\nsvfloat16_t test(svfloat16_t s, svfloat16_t a, svfloat16_t b, svbool_t bp) { return svmla_f16_z(bp, s, a, b); }" NCNN_COMPILER_SUPPORT_ARM86_SVE)
 
             set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6 -march=armv8.6-a+sve2")
-            check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svint16_t _s; svint8_t _a, _b; _s = svmlslb_s16(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVE2)
+            check_cxx_source_compiles("#include <arm_sve.h>\nsvint16_t test(svint16_t s, svint8_t a, svint8_t b) { return svmlslb_s16(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM86_SVE2)
 
             set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6 -march=armv8.6-a+sve+bf16")
-            check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svfloat32_t _s; svbfloat16_t _a, _b; _s = svbfmmla_f32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVEBF16)
+            check_cxx_source_compiles("#include <arm_sve.h>\nsvfloat32_t test(svfloat32_t s, svbfloat16_t a, svbfloat16_t b) { return svbfmmla_f32(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM86_SVEBF16)
 
             set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6 -march=armv8.6-a+sve+i8mm")
-            check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svint32_t _s; svint8_t _a, _b; _s = svmmla_s32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVEI8MM)
+            check_cxx_source_compiles("#include <arm_sve.h>\nsvint32_t test(svint32_t s, svint8_t a, svint8_t b) { return svmmla_s32(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM86_SVEI8MM)
 
             set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6 -march=armv8.6-a+sve+f32mm")
-            check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svfloat32_t _s, _a, _b; _s = svmmla_f32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVEF32MM)
+            check_cxx_source_compiles("#include <arm_sve.h>\nsvfloat32_t test(svfloat32_t s, svfloat32_t a, svfloat32_t b) { return svmmla_f32(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM86_SVEF32MM)
 
             unset(CMAKE_REQUIRED_FLAGS)
         else()
             set(CMAKE_REQUIRED_FLAGS "-march=armv8-a")
-            check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _a; float16x4_t _s = vcvt_f16_f32(_a); return 0; }" NCNN_COMPILER_SUPPORT_ARM_VFPV4)
+            check_cxx_source_compiles("#include <arm_neon.h>\nfloat16x4_t test(float32x4_t a) { return vcvt_f16_f32(a); }" NCNN_COMPILER_SUPPORT_ARM_VFPV4) # remaining compilers: probes are driven by a -march flag alone
 
             set(CMAKE_REQUIRED_FLAGS "-march=armv8.2-a+fp16")
-            check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16x8_t _s, _a, _b; _s = vfmaq_f16(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM82_FP16)
+            check_cxx_source_compiles("#include <arm_neon.h>\nfloat16x8_t test(float16x8_t s, float16x8_t a, float16x8_t b) { return vfmaq_f16(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM82_FP16)
 
             set(CMAKE_REQUIRED_FLAGS "-march=armv8.2-a+dotprod")
-            check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int32x4_t _s; int8x16_t _a, _b; _s = vdotq_s32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM82_DOTPROD)
+            check_cxx_source_compiles("#include <arm_neon.h>\nint32x4_t test(int32x4_t s, int8x16_t a, int8x16_t b) { return vdotq_s32(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM82_DOTPROD)
 
             set(CMAKE_REQUIRED_FLAGS "-march=armv8.2-a+fp16fml")
-            check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _s; float16x8_t _a, _b; _s = vfmlalq_low_f16(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM82_FP16FML)
+            check_cxx_source_compiles("#include <arm_neon.h>\nfloat32x4_t test(float32x4_t s, float16x8_t a, float16x8_t b) { return vfmlalq_low_f16(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM82_FP16FML)
 
             set(CMAKE_REQUIRED_FLAGS "-march=armv8.4-a+bf16")
-            check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _s; bfloat16x8_t _a, _b; _s = vcvt_f32_bf16(vcvt_bf16_f32(vbfmmlaq_f32(_s, _a, _b))); return 0; }" NCNN_COMPILER_SUPPORT_ARM84_BF16)
+            check_cxx_source_compiles("#include <arm_neon.h>\nfloat32x4_t test(float32x4_t s, bfloat16x8_t a, bfloat16x8_t b) { return vcvt_f32_bf16(vcvt_bf16_f32(vbfmmlaq_f32(s, a, b))); }" NCNN_COMPILER_SUPPORT_ARM84_BF16)
 
             set(CMAKE_REQUIRED_FLAGS "-march=armv8.4-a+i8mm")
-            check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int32x4_t _s; int8x16_t _a, _b; _s = vmmlaq_s32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM84_I8MM)
+            check_cxx_source_compiles("#include <arm_neon.h>\nint32x4_t test(int32x4_t s, int8x16_t a, int8x16_t b) { return vmmlaq_s32(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM84_I8MM)
 
             set(CMAKE_REQUIRED_FLAGS "-march=armv8.6-a+sve")
-            check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svfloat16_t _s, _a, _b; svbool_t bp; _s = svmla_f16_z(bp, _s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVE)
+            check_cxx_source_compiles("#include <arm_sve.h>\nsvfloat16_t test(svfloat16_t s, svfloat16_t a, svfloat16_t b, svbool_t bp) { return svmla_f16_z(bp, s, a, b); }" NCNN_COMPILER_SUPPORT_ARM86_SVE)
 
             set(CMAKE_REQUIRED_FLAGS "-march=armv8.6-a+sve2")
-            check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svint16_t _s; svint8_t _a, _b; _s = svmlslb_s16(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVE2)
+            check_cxx_source_compiles("#include <arm_sve.h>\nsvint16_t test(svint16_t s, svint8_t a, svint8_t b) { return svmlslb_s16(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM86_SVE2)
 
             set(CMAKE_REQUIRED_FLAGS "-march=armv8.6-a+sve+bf16")
-            check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svfloat32_t _s; svbfloat16_t _a, _b; _s = svbfmmla_f32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVEBF16)
+            check_cxx_source_compiles("#include <arm_sve.h>\nsvfloat32_t test(svfloat32_t s, svbfloat16_t a, svbfloat16_t b) { return svbfmmla_f32(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM86_SVEBF16)
 
             set(CMAKE_REQUIRED_FLAGS "-march=armv8.6-a+sve+i8mm")
-            check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svint32_t _s; svint8_t _a, _b; _s = svmmla_s32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVEI8MM)
+            check_cxx_source_compiles("#include <arm_sve.h>\nsvint32_t test(svint32_t s, svint8_t a, svint8_t b) { return svmmla_s32(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM86_SVEI8MM)
 
             set(CMAKE_REQUIRED_FLAGS "-march=armv8.6-a+sve+f32mm")
-            check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svfloat32_t _s, _a, _b; _s = svmmla_f32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVEF32MM)
+            check_cxx_source_compiles("#include <arm_sve.h>\nsvfloat32_t test(svfloat32_t s, svfloat32_t a, svfloat32_t b) { return svmmla_f32(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM86_SVEF32MM)
 
             unset(CMAKE_REQUIRED_FLAGS)
         endif()
@@ -380,7 +381,7 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(mips)")
     check_cxx_compiler_flag("-mmsa" NCNN_COMPILER_SUPPORT_MIPS_MSA)
 
     set(CMAKE_REQUIRED_FLAGS "-mloongson-mmi -I${CMAKE_CURRENT_SOURCE_DIR}/src/layer/mips")
-    check_cxx_source_compiles("#include \"loongson_mmi.h\"\nint main() { int16x4_t _a, _b; int32x2_t _s = __mmi_pmaddhw(_a, _b); return 0; }" NCNN_COMPILER_SUPPORT_LOONGSON_MMI)
+    check_cxx_source_compiles("#include \"loongson_mmi.h\"\nint32x2_t test(int16x4_t a, int16x4_t b) { return __mmi_pmaddhw(a, b); }" NCNN_COMPILER_SUPPORT_LOONGSON_MMI) # loongson_mmi.h is resolved through the -I.../src/layer/mips entry in CMAKE_REQUIRED_FLAGS
 
     unset(CMAKE_REQUIRED_FLAGS)
 
@@ -398,10 +399,10 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(loongarch64|loongarch32)")
     set(NCNN_TARGET_ARCH loongarch)
 
     set(CMAKE_REQUIRED_FLAGS "-mlsx")
-    check_cxx_source_compiles("#include <lsxintrin.h>\nint main() { __m128 _s, _a, _b, _c; _s = __lsx_vfmadd_s(_a, _b, _c); return 0; }" NCNN_COMPILER_SUPPORT_LOONGARCH_LSX)
+    check_cxx_source_compiles("#include <lsxintrin.h>\n__m128 test(__m128 a, __m128 b, __m128 c) { return __lsx_vfmadd_s(a, b, c); }" NCNN_COMPILER_SUPPORT_LOONGARCH_LSX) # LSX probe: three-operand __lsx_vfmadd_s on __m128, compiled with -mlsx
 
     set(CMAKE_REQUIRED_FLAGS "-mlasx")
-    check_cxx_source_compiles("#include <lasxintrin.h>\nint main() { __m256 _s, _a, _b, _c; _s = __lasx_xvfmadd_s(_a, _b, _c); return 0; }" NCNN_COMPILER_SUPPORT_LOONGARCH_LASX)
+    check_cxx_source_compiles("#include <lasxintrin.h>\n__m256 test(__m256 a, __m256 b, __m256 c) { return __lasx_xvfmadd_s(a, b, c); }" NCNN_COMPILER_SUPPORT_LOONGARCH_LASX) # LASX probe: same shape on __m256 with __lasx_xvfmadd_s, compiled with -mlasx
 
     unset(CMAKE_REQUIRED_FLAGS)
 
@@ -421,16 +422,16 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv)")
 
     if(CMAKE_SIZEOF_VOID_P EQUAL 8)
         set(CMAKE_REQUIRED_FLAGS "-march=rv64gcv")
-        check_cxx_source_compiles("#include <riscv_vector.h>\nint main() { vfloat32m8_t _s, _w; float _v; size_t vl; _s = __riscv_vfmacc_vf_f32m8(_s, _v, _w, vl); vfloat32m1_t _x; vfloat32m1x2_t _xx = __riscv_vcreate_v_f32m1x2(_x, _x); return 0; }" NCNN_COMPILER_SUPPORT_RISCV_V)
+        check_cxx_source_compiles("#include <riscv_vector.h>\nvfloat32m8_t test(vfloat32m8_t s, vfloat32m8_t w, float v, size_t vl) { return __riscv_vfmacc_vf_f32m8(s, v, w, vl); }\nvfloat32m1x2_t test2(vfloat32m1_t x) { return __riscv_vcreate_v_f32m1x2(x, x); }" NCNN_COMPILER_SUPPORT_RISCV_V) # two functions: test probes __riscv_vfmacc_vf_f32m8, test2 probes the __riscv_vcreate_v_f32m1x2 tuple constructor
 
         set(CMAKE_REQUIRED_FLAGS "-march=rv64gc_zfh -D__fp16=_Float16")
-        check_cxx_source_compiles("int main() { __fp16 s, v; s = v * v; return 0; }" NCNN_COMPILER_SUPPORT_RISCV_ZFH)
+        check_cxx_source_compiles("__fp16 test(__fp16 a) { return a * a; }" NCNN_COMPILER_SUPPORT_RISCV_ZFH) # __fp16 is mapped to _Float16 by the -D in CMAKE_REQUIRED_FLAGS; no header is included
 
         set(CMAKE_REQUIRED_FLAGS "-march=rv64gcv_zfh_zvfh -D__fp16=_Float16")
-        check_cxx_source_compiles("#include <riscv_vector.h>\nint main() { vfloat16m8_t _s, _w; __fp16 _v; size_t vl; _s = __riscv_vfmacc_vf_f16m8(_s, _v, _w, vl); return 0; }" NCNN_COMPILER_SUPPORT_RISCV_ZVFH)
+        check_cxx_source_compiles("#include <riscv_vector.h>\nvfloat16m8_t test(vfloat16m8_t s, vfloat16m8_t w, __fp16 v, size_t vl) { return __riscv_vfmacc_vf_f16m8(s, v, w, vl); }\nvfloat16m1x2_t test2(vfloat16m1_t x) { return __riscv_vcreate_v_f16m1x2(x, x); }" NCNN_COMPILER_SUPPORT_RISCV_ZVFH) # fp16 counterpart of the rv64gcv probe: vfmacc on vfloat16m8_t plus the f16m1x2 tuple constructor
 
         set(CMAKE_REQUIRED_FLAGS "-march=rv64gc_zfh_xtheadvector -D__fp16=_Float16")
-        check_cxx_source_compiles("#include <riscv_vector.h>\nint main() { vfloat16m8_t _s, _w; __fp16 _v; size_t vl; _s = __riscv_vfmacc_vf_f16m8(_s, _v, _w, vl); vfloat32m1_t _x; vfloat32m1x2_t _xx = __riscv_vcreate_v_f32m1x2(_x, _x); return 0; }" NCNN_COMPILER_SUPPORT_RISCV_XTHEADVECTOR)
+        check_cxx_source_compiles("#include <riscv_vector.h>\nvfloat16m8_t test(vfloat16m8_t s, vfloat16m8_t w, __fp16 v, size_t vl) { return __riscv_vfmacc_vf_f16m8(s, v, w, vl); }\nvfloat16m1x2_t test2(vfloat16m1_t x) { return __riscv_vcreate_v_f16m1x2(x, x); }" NCNN_COMPILER_SUPPORT_RISCV_XTHEADVECTOR) # same probe source as the zvfh check, compiled with the xtheadvector -march string
 
         unset(CMAKE_REQUIRED_FLAGS)
 
@@ -467,11 +468,11 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)")
         set(NCNN_TARGET_ARCH x86)
 
         set(CMAKE_REQUIRED_FLAGS "-DNO_WARN_X86_INTRINSICS -D__SSE2__")
-        check_cxx_source_compiles("#include <emmintrin.h>\nint main() { return 0; }" NCNN_COMPILER_SUPPORT_PPC64LE_SSE2)
+        check_cxx_source_compiles("#include <emmintrin.h>\n__m128i test(__m128i a, __m128i b) { return _mm_madd_epi16(a, b); }" NCNN_COMPILER_SUPPORT_PPC64LE_SSE2) # calls _mm_madd_epi16 so the check covers an actual intrinsic, not just the presence of <emmintrin.h>
         unset(CMAKE_REQUIRED_FLAGS)
 
         set(CMAKE_REQUIRED_FLAGS "-DNO_WARN_X86_INTRINSICS -D__SSE4_1__")
-        check_cxx_source_compiles("#include <smmintrin.h>\nint main() { __m128i _v, _a, _b; _v = _mm_packus_epi32(_a, _b); return 0; }" NCNN_COMPILER_SUPPORT_PPC64LE_SSE41)
+        check_cxx_source_compiles("#include <smmintrin.h>\n__m128i test(__m128i a, __m128i b) { return _mm_packus_epi32(a, b); }" NCNN_COMPILER_SUPPORT_PPC64LE_SSE41) # _mm_packus_epi32 on __m128i operands, declared via <smmintrin.h>
         unset(CMAKE_REQUIRED_FLAGS)
 
         if(NCNN_COMPILER_SUPPORT_PPC64LE_SSE2)
@@ -501,105 +502,130 @@ else()
     option(NCNN_SSE2 "optimize x86 platform with sse2 extension" ON)
 
     if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
-        check_cxx_compiler_flag("/arch:AVX" NCNN_COMPILER_SUPPORT_X86_AVX)
-        check_cxx_compiler_flag("/arch:AVX" NCNN_COMPILER_SUPPORT_X86_FMA)
-        check_cxx_compiler_flag("/arch:AVX" NCNN_COMPILER_SUPPORT_X86_XOP)
-        check_cxx_compiler_flag("/arch:AVX" NCNN_COMPILER_SUPPORT_X86_F16C)
-        check_cxx_compiler_flag("/arch:AVX2" NCNN_COMPILER_SUPPORT_X86_AVX2)
-        check_cxx_compiler_flag("/arch:AVX512" NCNN_COMPILER_SUPPORT_X86_AVX512)
+        set(CMAKE_REQUIRED_FLAGS "/arch:AVX")
+        check_cxx_source_compiles("#include <immintrin.h>\n__m256 test(__m256 a, __m256 b) { return _mm256_mul_ps(a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX)
+
+        set(CMAKE_REQUIRED_FLAGS "/arch:AVX")
+        check_cxx_source_compiles("#include <immintrin.h>\n__m256 test(__m256 s, __m256 a, __m256 b) { return _mm256_fmadd_ps(a, b, s); }" NCNN_COMPILER_SUPPORT_X86_FMA)
+
+        set(CMAKE_REQUIRED_FLAGS "/arch:AVX")
+        check_cxx_source_compiles("#include <immintrin.h>\n#include <ammintrin.h>\n__m128i test(__m128i s, __m128i a, __m128i b) { return _mm_maddd_epi16(a, b, s); }" NCNN_COMPILER_SUPPORT_X86_XOP)
+
+        set(CMAKE_REQUIRED_FLAGS "/arch:AVX")
+        check_cxx_source_compiles("#include <immintrin.h>\n__m256 test(__m128i a) { return _mm256_cvtph_ps(a); }" NCNN_COMPILER_SUPPORT_X86_F16C)
+
+        set(CMAKE_REQUIRED_FLAGS "/arch:AVX2")
+        check_cxx_source_compiles("#include <immintrin.h>\n__m256i test(__m256i a, __m256i b) { return _mm256_madd_epi16(a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX2)
+
+        set(CMAKE_REQUIRED_FLAGS "/arch:AVX512")
+        check_cxx_source_compiles("#include <immintrin.h>\n__m512i test(__m512i a, __m512i b) { return _mm512_madd_epi16(a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX512)
 
         set(CMAKE_REQUIRED_FLAGS "/arch:AVX2")
-        check_cxx_source_compiles("#include <immintrin.h>\nint main() { __m256i _s, _a, _b; _s = _mm256_dpwssd_avx_epi32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_X86_AVX_VNNI)
+        check_cxx_source_compiles("#include <immintrin.h>\n__m256i test(__m256i s, __m256i a, __m256i b) { return _mm256_dpwssd_avx_epi32(s, a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX_VNNI)
 
         set(CMAKE_REQUIRED_FLAGS "/arch:AVX2")
-        check_cxx_source_compiles("#include <immintrin.h>\nint main() { __m256i _s, _a, _b; _s = _mm256_dpbssd_epi32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_X86_AVX_VNNI_INT8)
+        check_cxx_source_compiles("#include <immintrin.h>\n__m256i test(__m256i s, __m256i a, __m256i b) { return _mm256_dpbssd_epi32(s, a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX_VNNI_INT8)
 
         set(CMAKE_REQUIRED_FLAGS "/arch:AVX2")
-        check_cxx_source_compiles("#include <immintrin.h>\nint main() { __m256i _s, _a, _b; _s = _mm256_dpwsud_avx_epi32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_X86_AVX_VNNI_INT16)
+        check_cxx_source_compiles("#include <immintrin.h>\n__m256i test(__m256i s, __m256i a, __m256i b) { return _mm256_dpwsud_avx_epi32(s, a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX_VNNI_INT16)
 
         set(CMAKE_REQUIRED_FLAGS "/arch:AVX2")
-        check_cxx_source_compiles("#include <immintrin.h>\nint main() { __m256 _a; __m128bh _s = _mm256_cvtneps_avx_pbh(_a); return 0; }" NCNN_COMPILER_SUPPORT_X86_AVX_NE_CONVERT)
+        check_cxx_source_compiles("#include <immintrin.h>\n__m128bh test(__m256 a) { return _mm256_cvtneps_avx_pbh(a); }" NCNN_COMPILER_SUPPORT_X86_AVX_NE_CONVERT)
 
         set(CMAKE_REQUIRED_FLAGS "/arch:AVX512")
-        check_cxx_source_compiles("#include <immintrin.h>\nint main() { __m512i _s, _a, _b; _s = _mm512_dpwssd_epi32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_X86_AVX512_VNNI)
+        check_cxx_source_compiles("#include <immintrin.h>\n__m512i test(__m512i s, __m512i a, __m512i b) { return _mm512_dpwssd_epi32(s, a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX512_VNNI)
 
         set(CMAKE_REQUIRED_FLAGS "/arch:AVX512")
-        check_cxx_source_compiles("#include <immintrin.h>\nint main() { __m256bh _s; __m512bh _a, _b; _s = _mm512_cvtneps_pbh(_mm512_dpbf16_ps(_mm512_cvtpbh_ps(_s), _a, _b)); return 0; }\n__m512i t(__m512 a) { __m256i _a = (__m256i)_mm512_cvtneps_pbh(a); return _mm512_inserti32x8(_mm512_castsi256_si512(_a), _a, 1); }" NCNN_COMPILER_SUPPORT_X86_AVX512_BF16)
+        check_cxx_source_compiles("#include <immintrin.h>\n__m256bh test(__m256bh s, __m512bh a, __m512bh b) { return _mm512_cvtneps_pbh(_mm512_dpbf16_ps(_mm512_cvtpbh_ps(s), a, b)); }\n__m512i test2(__m512 a) { __m256i _a = (__m256i)_mm512_cvtneps_pbh(a); return _mm512_inserti32x8(_mm512_castsi256_si512(_a), _a, 1); }" NCNN_COMPILER_SUPPORT_X86_AVX512_BF16)
 
         set(CMAKE_REQUIRED_FLAGS "/arch:AVX512")
-        check_cxx_source_compiles("#include <immintrin.h>\nint main() { __m512h _s, _a, _b; _s = _mm512_fmadd_ph(_s, _a, _b); __m512 _s2; _s2 = _mm512_cvtxph_ps(_mm512_cvtxps_ph(_s2)); return 0; }" NCNN_COMPILER_SUPPORT_X86_AVX512_FP16)
+        check_cxx_source_compiles("#include <immintrin.h>\n__m512h test(__m512h s, __m512h a, __m512h b) { return _mm512_fmadd_ph(s, a, b); }\n__m512 test2(__m512 a) { return _mm512_cvtxph_ps(_mm512_cvtxps_ph(a)); }" NCNN_COMPILER_SUPPORT_X86_AVX512_FP16)
 
         unset(CMAKE_REQUIRED_FLAGS)
     elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_SIMULATE_ID MATCHES "MSVC" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES "MSVC")
         check_cxx_compiler_flag("-mrecip=none" NCNN_COMPILER_SUPPORT_X86_RECIP_NONE)
 
-        check_cxx_compiler_flag("/arch:AVX" NCNN_COMPILER_SUPPORT_X86_AVX)
+        set(CMAKE_REQUIRED_FLAGS "/arch:AVX")
+        check_cxx_source_compiles("#include <immintrin.h>\n__m256 test(__m256 a, __m256 b) { return _mm256_mul_ps(a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX)
 
         set(CMAKE_REQUIRED_FLAGS "/arch:AVX -mfma -mf16c")
-        check_cxx_source_compiles("#include <immintrin.h>\nint main() { __m256 _s, _a, _b; _s = _mm256_fmadd_ps(_a, _b, _s); return 0; }" NCNN_COMPILER_SUPPORT_X86_FMA)
+        check_cxx_source_compiles("#include <immintrin.h>\n__m256 test(__m256 s, __m256 a, __m256 b) { return _mm256_fmadd_ps(a, b, s); }" NCNN_COMPILER_SUPPORT_X86_FMA)
 
         set(CMAKE_REQUIRED_FLAGS "/arch:AVX -mxop")
-        check_cxx_source_compiles("#include <x86intrin.h>\nint main() { __m128 _s, _a, _b; _s = _mm_maddd_epi16(_a, _b, _s); return 0; }" NCNN_COMPILER_SUPPORT_X86_XOP)
+        check_cxx_source_compiles("#include <x86intrin.h>\n__m128i test(__m128i s, __m128i a, __m128i b) { return _mm_maddd_epi16(a, b, s); }" NCNN_COMPILER_SUPPORT_X86_XOP)
 
-        check_cxx_compiler_flag("/arch:AVX -mf16c" NCNN_COMPILER_SUPPORT_X86_F16C)
-        check_cxx_compiler_flag("/arch:AVX2 -mfma -mf16c" NCNN_COMPILER_SUPPORT_X86_AVX2)
-        check_cxx_compiler_flag("/arch:AVX512 -mfma -mf16c -mavx512cd -mavx512bw -mavx512dq -mavx512vl" NCNN_COMPILER_SUPPORT_X86_AVX512)
+        set(CMAKE_REQUIRED_FLAGS "/arch:AVX -mf16c")
+        check_cxx_source_compiles("#include <immintrin.h>\n__m256 test(__m128i a) { return _mm256_cvtph_ps(a); }" NCNN_COMPILER_SUPPORT_X86_F16C)
+
+        set(CMAKE_REQUIRED_FLAGS "/arch:AVX2 -mfma -mf16c")
+        check_cxx_source_compiles("#include <immintrin.h>\n__m256i test(__m256i a, __m256i b) { return _mm256_madd_epi16(a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX2)
+
+        set(CMAKE_REQUIRED_FLAGS "/arch:AVX512 -mfma -mf16c -mavx512cd -mavx512bw -mavx512dq -mavx512vl")
+        check_cxx_source_compiles("#include <immintrin.h>\n__m512i test(__m512i a, __m512i b) { return _mm512_madd_epi16(a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX512)
 
         set(CMAKE_REQUIRED_FLAGS "/arch:AVX2 -mfma -mf16c -mavxvnni")
-        check_cxx_source_compiles("#include <immintrin.h>\nint main() { __m256i _s, _a, _b; _s = _mm256_dpwssd_avx_epi32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_X86_AVX_VNNI)
+        check_cxx_source_compiles("#include <immintrin.h>\n__m256i test(__m256i s, __m256i a, __m256i b) { return _mm256_dpwssd_avx_epi32(s, a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX_VNNI)
 
         set(CMAKE_REQUIRED_FLAGS "/arch:AVX2 -mfma -mf16c -mavxvnni -mavxvnniint8")
-        check_cxx_source_compiles("#include <immintrin.h>\nint main() { __m256i _s, _a, _b; _s = _mm256_dpbssd_epi32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_X86_AVX_VNNI_INT8)
+        check_cxx_source_compiles("#include <immintrin.h>\n__m256i test(__m256i s, __m256i a, __m256i b) { return _mm256_dpbssd_epi32(s, a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX_VNNI_INT8)
 
         set(CMAKE_REQUIRED_FLAGS "/arch:AVX2 -mfma -mf16c -mavxvnni -mavxvnniint16")
-        check_cxx_source_compiles("#include <immintrin.h>\nint main() { __m256i _s, _a, _b; _s = _mm256_dpwsud_avx_epi32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_X86_AVX_VNNI_INT16)
+        check_cxx_source_compiles("#include <immintrin.h>\n__m256i test(__m256i s, __m256i a, __m256i b) { return _mm256_dpwsud_avx_epi32(s, a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX_VNNI_INT16)
 
         set(CMAKE_REQUIRED_FLAGS "/arch:AVX2 -mfma -mf16c -mavxneconvert")
-        check_cxx_source_compiles("#include <immintrin.h>\nint main() { __m256 _a; __m128bh _s = _mm256_cvtneps_avx_pbh(_a); return 0; }" NCNN_COMPILER_SUPPORT_X86_AVX_NE_CONVERT)
+        check_cxx_source_compiles("#include <immintrin.h>\n__m128bh test(__m256 a) { return _mm256_cvtneps_avx_pbh(a); }" NCNN_COMPILER_SUPPORT_X86_AVX_NE_CONVERT)
 
         set(CMAKE_REQUIRED_FLAGS "/arch:AVX512 -mfma -mf16c -mavx512cd -mavx512bw -mavx512dq -mavx512vl -mavx512vnni")
-        check_cxx_source_compiles("#include <immintrin.h>\nint main() { __m512i _s, _a, _b; _s = _mm512_dpwssd_epi32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_X86_AVX512_VNNI)
+        check_cxx_source_compiles("#include <immintrin.h>\n__m512i test(__m512i s, __m512i a, __m512i b) { return _mm512_dpwssd_epi32(s, a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX512_VNNI)
 
         set(CMAKE_REQUIRED_FLAGS "/arch:AVX512 -mfma -mf16c -mavx512cd -mavx512bw -mavx512dq -mavx512vl -mavx512bf16")
-        check_cxx_source_compiles("#include <immintrin.h>\nint main() { __m256bh _s; __m512bh _a, _b; _s = _mm512_cvtneps_pbh(_mm512_dpbf16_ps(_mm512_cvtpbh_ps(_s), _a, _b)); return 0; }\n__m512i t(__m512 a) { __m256i _a = (__m256i)_mm512_cvtneps_pbh(a); return _mm512_inserti32x8(_mm512_castsi256_si512(_a), _a, 1); }" NCNN_COMPILER_SUPPORT_X86_AVX512_BF16)
+        check_cxx_source_compiles("#include <immintrin.h>\n__m256bh test(__m256bh s, __m512bh a, __m512bh b) { return _mm512_cvtneps_pbh(_mm512_dpbf16_ps(_mm512_cvtpbh_ps(s), a, b)); }\n__m512i test2(__m512 a) { __m256i _a = (__m256i)_mm512_cvtneps_pbh(a); return _mm512_inserti32x8(_mm512_castsi256_si512(_a), _a, 1); }" NCNN_COMPILER_SUPPORT_X86_AVX512_BF16)
 
         set(CMAKE_REQUIRED_FLAGS "/arch:AVX512 -mfma -mf16c -mavx512cd -mavx512bw -mavx512dq -mavx512vl -mavx512fp16")
-        check_cxx_source_compiles("#include <immintrin.h>\nint main() { __m512h _s, _a, _b; _s = _mm512_fmadd_ph(_s, _a, _b); __m512 _s2; _s2 = _mm512_cvtxph_ps(_mm512_cvtxps_ph(_s2)); return 0; }" NCNN_COMPILER_SUPPORT_X86_AVX512_FP16)
+        check_cxx_source_compiles("#include <immintrin.h>\n__m512h test(__m512h s, __m512h a, __m512h b) { return _mm512_fmadd_ph(s, a, b); }\n__m512 test2(__m512 a) { return _mm512_cvtxph_ps(_mm512_cvtxps_ph(a)); }" NCNN_COMPILER_SUPPORT_X86_AVX512_FP16)
 
         unset(CMAKE_REQUIRED_FLAGS)
     else()
         check_cxx_compiler_flag("-mrecip=none" NCNN_COMPILER_SUPPORT_X86_RECIP_NONE)
 
-        check_cxx_compiler_flag("-mavx" NCNN_COMPILER_SUPPORT_X86_AVX)
+        set(CMAKE_REQUIRED_FLAGS "-mavx")  # AVX: require that an AVX intrinsic compiles, not merely that -mavx is accepted
+        check_cxx_source_compiles("#include <immintrin.h>\n__m256 test(__m256 a, __m256 b) { return _mm256_mul_ps(a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX)
 
         set(CMAKE_REQUIRED_FLAGS "-mfma -mf16c")
-        check_cxx_source_compiles("#include <immintrin.h>\nint main() { __m256 _s, _a, _b; _s = _mm256_fmadd_ps(_a, _b, _s); return 0; }" NCNN_COMPILER_SUPPORT_X86_FMA)
+        check_cxx_source_compiles("#include <immintrin.h>\n__m256 test(__m256 s, __m256 a, __m256 b) { return _mm256_fmadd_ps(a, b, s); }" NCNN_COMPILER_SUPPORT_X86_FMA)
+
+        set(CMAKE_REQUIRED_FLAGS "-mfma -mxop")  # XOP: this probe includes <x86intrin.h>, unlike the other probes here which include <immintrin.h>
+        check_cxx_source_compiles("#include <x86intrin.h>\n__m128i test(__m128i s, __m128i a, __m128i b) { return _mm_maddd_epi16(a, b, s); }" NCNN_COMPILER_SUPPORT_X86_XOP)
 
-        check_cxx_compiler_flag("-mxop" NCNN_COMPILER_SUPPORT_X86_XOP)
-        check_cxx_compiler_flag("-mf16c" NCNN_COMPILER_SUPPORT_X86_F16C)
-        check_cxx_compiler_flag("-mfma -mf16c -mavx2" NCNN_COMPILER_SUPPORT_X86_AVX2)
-        check_cxx_compiler_flag("-mfma -mf16c -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl" NCNN_COMPILER_SUPPORT_X86_AVX512)
+        set(CMAKE_REQUIRED_FLAGS "-mf16c")  # F16C: compile a real intrinsic call instead of only checking that the flag is accepted
+        check_cxx_source_compiles("#include <immintrin.h>\n__m256 test(__m128i a) { return _mm256_cvtph_ps(a); }" NCNN_COMPILER_SUPPORT_X86_F16C)
+
+        set(CMAKE_REQUIRED_FLAGS "-mfma -mf16c -mavx2")  # AVX2: probed with _mm256_madd_epi16
+        check_cxx_source_compiles("#include <immintrin.h>\n__m256i test(__m256i a, __m256i b) { return _mm256_madd_epi16(a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX2)
+
+        set(CMAKE_REQUIRED_FLAGS "-mfma -mf16c -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl")  # AVX512: probed with _mm512_madd_epi16
+        check_cxx_source_compiles("#include <immintrin.h>\n__m512i test(__m512i a, __m512i b) { return _mm512_madd_epi16(a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX512)
 
         set(CMAKE_REQUIRED_FLAGS "-mfma -mf16c -mavx2 -mavxvnni")
-        check_cxx_source_compiles("#include <immintrin.h>\nint main() { __m256i _s, _a, _b; _s = _mm256_dpwssd_epi32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_X86_AVX_VNNI)
+        check_cxx_source_compiles("#include <immintrin.h>\n__m256i test(__m256i s, __m256i a, __m256i b) { return _mm256_dpwssd_avx_epi32(s, a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX_VNNI)
 
         set(CMAKE_REQUIRED_FLAGS "-mfma -mf16c -mavx2 -mavxvnni -mavxvnniint8")
-        check_cxx_source_compiles("#include <immintrin.h>\nint main() { __m256i _s, _a, _b; _s = _mm256_dpbssd_epi32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_X86_AVX_VNNI_INT8)
+        check_cxx_source_compiles("#include <immintrin.h>\n__m256i test(__m256i s, __m256i a, __m256i b) { return _mm256_dpbssd_epi32(s, a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX_VNNI_INT8)
 
         set(CMAKE_REQUIRED_FLAGS "-mfma -mf16c -mavx2 -mavxvnni -mavxvnniint16")
-        check_cxx_source_compiles("#include <immintrin.h>\nint main() { __m256i _s, _a, _b; _s = _mm256_dpwsud_avx_epi32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_X86_AVX_VNNI_INT16)
+        check_cxx_source_compiles("#include <immintrin.h>\n__m256i test(__m256i s, __m256i a, __m256i b) { return _mm256_dpwsud_avx_epi32(s, a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX_VNNI_INT16)
 
         set(CMAKE_REQUIRED_FLAGS "-mfma -mf16c -mavx2 -mavxneconvert")
-        check_cxx_source_compiles("#include <immintrin.h>\nint main() { __m256 _a; __m128bh _s = _mm256_cvtneps_avx_pbh(_a); return 0; }" NCNN_COMPILER_SUPPORT_X86_AVX_NE_CONVERT)
+        check_cxx_source_compiles("#include <immintrin.h>\n__m128bh test(__m256 a) { return _mm256_cvtneps_avx_pbh(a); }" NCNN_COMPILER_SUPPORT_X86_AVX_NE_CONVERT)
 
         set(CMAKE_REQUIRED_FLAGS "-mfma -mf16c -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl -mavx512vnni")
-        check_cxx_source_compiles("#include <immintrin.h>\nint main() { __m512i _s, _a, _b; _s = _mm512_dpwssd_epi32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_X86_AVX512_VNNI)
+        check_cxx_source_compiles("#include <immintrin.h>\n__m512i test(__m512i s, __m512i a, __m512i b) { return _mm512_dpwssd_epi32(s, a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX512_VNNI)
 
         set(CMAKE_REQUIRED_FLAGS "-mfma -mf16c -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl -mavx512bf16")
-        check_cxx_source_compiles("#include <immintrin.h>\nint main() { __m256bh _s; __m512bh _a, _b; _s = _mm512_cvtneps_pbh(_mm512_dpbf16_ps(_mm512_cvtpbh_ps(_s), _a, _b)); return 0; }\n__m512i t(__m512 a) { __m256i _a = (__m256i)_mm512_cvtneps_pbh(a); return _mm512_inserti32x8(_mm512_castsi256_si512(_a), _a, 1); }" NCNN_COMPILER_SUPPORT_X86_AVX512_BF16)
+        check_cxx_source_compiles("#include <immintrin.h>\n__m256bh test(__m256bh s, __m512bh a, __m512bh b) { return _mm512_cvtneps_pbh(_mm512_dpbf16_ps(_mm512_cvtpbh_ps(s), a, b)); }\n__m512i test2(__m512 a) { __m256i _a = (__m256i)_mm512_cvtneps_pbh(a); return _mm512_inserti32x8(_mm512_castsi256_si512(_a), _a, 1); }" NCNN_COMPILER_SUPPORT_X86_AVX512_BF16)
 
         set(CMAKE_REQUIRED_FLAGS "-mfma -mf16c -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl -mavx512fp16")
-        check_cxx_source_compiles("#include <immintrin.h>\nint main() { __m512h _s, _a, _b; _s = _mm512_fmadd_ph(_s, _a, _b); __m512 _s2; _s2 = _mm512_cvtxph_ps(_mm512_cvtxps_ph(_s2)); return 0; }" NCNN_COMPILER_SUPPORT_X86_AVX512_FP16)
+        check_cxx_source_compiles("#include <immintrin.h>\n__m512h test(__m512h s, __m512h a, __m512h b) { return _mm512_fmadd_ph(s, a, b); }\n__m512 test2(__m512 a) { return _mm512_cvtxph_ps(_mm512_cvtxps_ph(a)); }" NCNN_COMPILER_SUPPORT_X86_AVX512_FP16)
 
         unset(CMAKE_REQUIRED_FLAGS)
     endif()
@@ -695,6 +721,9 @@ else()
     endif()
 endif()
 
+unset(CMAKE_TRY_COMPILE_CONFIGURATION)  # drop the try-compile configuration override that was set right after include(CheckCXXCompilerFlag)
+unset(CMAKE_TRY_COMPILE_TARGET_TYPE)  # drop the STATIC_LIBRARY override too; NOTE(review): unset does not restore a value that existed before that set() (e.g. from a toolchain file) - confirm this is intended
+
 if(NCNN_TARGET_ILP32)
     message(STATUS "Target arch: ${NCNN_TARGET_ARCH} 64bit ilp32")
 elseif(CMAKE_SIZEOF_VOID_P EQUAL 8)