From 5329d32e74c31dfa250378ea180c40cb74e830ea Mon Sep 17 00:00:00 2001 From: nihui Date: Wed, 17 Jan 2024 15:53:34 +0800 Subject: [PATCH] check vulkan fp16 uniform support and implement lfp conversion without fp16u (#5287) --- docs/developer-guide/glsl-extension.md | 8 +-- docs/developer-guide/glsl-extension.zh.md | 8 +-- src/gpu.cpp | 71 +++++++++++++++-------- src/gpu.h | 2 + src/net.cpp | 12 ++++ src/option.cpp | 3 + src/option.h | 6 +- tests/testutil.cpp | 10 ++++ 8 files changed, 87 insertions(+), 33 deletions(-) diff --git a/docs/developer-guide/glsl-extension.md b/docs/developer-guide/glsl-extension.md index 185ca0e49cb..f870030a7c7 100644 --- a/docs/developer-guide/glsl-extension.md +++ b/docs/developer-guide/glsl-extension.md @@ -170,10 +170,10 @@ declare variable in shared local memory shared lfp tmp_a[8][4][2]; ``` -|local type|fp32|fp16p / fp16s|fp16s + fp16a| -|---|---|---|---| -|lfp|float|float|float16_t| -|lfpvec4|vec4|uvec2|f16vec4| +|local type|fp32|fp16p / fp16s only|fp16s+fp16a|fp16s+fp16u| +|---|---|---|---|---| +|lfp|float|float|float|float16_t| +|lfpvec4|vec4|uvec2|uint64_t|f16vec4| ## image format and precision hint type diff --git a/docs/developer-guide/glsl-extension.zh.md b/docs/developer-guide/glsl-extension.zh.md index 9b0718adec5..1e856929ac3 100644 --- a/docs/developer-guide/glsl-extension.zh.md +++ b/docs/developer-guide/glsl-extension.zh.md @@ -170,10 +170,10 @@ void main() shared lfp tmp_a[8][4][2]; ``` -|local type|fp32|fp16p / fp16s|fp16s + fp16a| -|---|---|---|---| -|lfp|float|float|float16_t| -|lfpvec4|vec4|uvec2|f16vec4| +|local type|fp32|fp16p / fp16s only|fp16s+fp16a|fp16s+fp16u| +|---|---|---|---|---| +|lfp|float|float|float|float16_t| +|lfpvec4|vec4|uvec2|uint64_t|f16vec4| ## 图像格式类型(image format type)和精度类型(precision hint type) diff --git a/src/gpu.cpp b/src/gpu.cpp index adba869e1e9..9de89ee4806 100644 --- a/src/gpu.cpp +++ b/src/gpu.cpp @@ -321,9 +321,11 @@ class GpuInfoPrivate // fp16 and int8 feature bool support_fp16_packed; bool support_fp16_storage; + bool support_fp16_uniform; bool support_fp16_arithmetic; bool support_int8_packed; bool support_int8_storage; + bool support_int8_uniform; bool support_int8_arithmetic; // ycbcr conversion feature @@ -604,6 +606,11 @@ bool GpuInfo::support_fp16_storage() const return d->support_fp16_storage; } +bool GpuInfo::support_fp16_uniform() const +{ + return d->support_fp16_uniform; +} + bool GpuInfo::support_fp16_arithmetic() const { return d->support_fp16_arithmetic; @@ -619,6 +626,11 @@ bool GpuInfo::support_int8_storage() const return d->support_int8_storage; } +bool GpuInfo::support_int8_uniform() const +{ + return d->support_int8_uniform; +} + bool GpuInfo::support_int8_arithmetic() const { return d->support_int8_arithmetic; @@ -1763,9 +1775,11 @@ int create_gpu_instance(const char* driver_path) // check features gpu_info.support_fp16_packed = true; gpu_info.support_fp16_storage = false; + gpu_info.support_fp16_uniform = false; gpu_info.support_fp16_arithmetic = false; gpu_info.support_int8_packed = true; gpu_info.support_int8_storage = false; + gpu_info.support_int8_uniform = false; gpu_info.support_int8_arithmetic = false; gpu_info.support_ycbcr_conversion = false; gpu_info.support_cooperative_matrix = false; @@ -1843,30 +1857,18 @@ int create_gpu_instance(const char* driver_path) if (gpu_info.support_VK_KHR_8bit_storage) { gpu_info.support_int8_storage = query8BitStorageFeatures.storageBuffer8BitAccess; + gpu_info.support_int8_uniform = query8BitStorageFeatures.uniformAndStorageBuffer8BitAccess; 
} if (gpu_info.support_VK_KHR_16bit_storage && queryFeatures.features.shaderStorageImageExtendedFormats) { // shaderStorageImageExtendedFormats enables r16f format in storage image gpu_info.support_fp16_storage = query16BitStorageFeatures.storageBuffer16BitAccess; + gpu_info.support_fp16_uniform = query16BitStorageFeatures.uniformAndStorageBuffer16BitAccess; } if (gpu_info.support_VK_KHR_shader_float16_int8) { - if (gpu_info.support_fp16_storage) - { - gpu_info.support_fp16_arithmetic = queryFloat16Int8Features.shaderFloat16 && query16BitStorageFeatures.uniformAndStorageBuffer16BitAccess; - } - else - { - gpu_info.support_fp16_arithmetic = queryFloat16Int8Features.shaderFloat16; - } - if (gpu_info.support_int8_storage) - { - gpu_info.support_int8_arithmetic = queryFloat16Int8Features.shaderInt8 && query8BitStorageFeatures.uniformAndStorageBuffer8BitAccess; - } - else - { - gpu_info.support_int8_arithmetic = queryFloat16Int8Features.shaderInt8; - } + gpu_info.support_fp16_arithmetic = queryFloat16Int8Features.shaderFloat16; + gpu_info.support_int8_arithmetic = queryFloat16Int8Features.shaderInt8; } if (gpu_info.support_VK_KHR_sampler_ycbcr_conversion) { @@ -2018,9 +2020,9 @@ int create_gpu_instance(const char* driver_path) NCNN_LOGE("[%u %s] bugsbn1=%d bugbilz=%d bugcopc=%d bugihfa=%d", i, physicalDeviceProperties.deviceName, gpu_info.bug_storage_buffer_no_l1, gpu_info.bug_buffer_image_load_zero, gpu_info.bug_corrupted_online_pipeline_cache, gpu_info.bug_implicit_fp16_arithmetic); - NCNN_LOGE("[%u %s] fp16-p/s/a=%d/%d/%d int8-p/s/a=%d/%d/%d", i, physicalDeviceProperties.deviceName, - gpu_info.support_fp16_packed, gpu_info.support_fp16_storage, gpu_info.support_fp16_arithmetic, - gpu_info.support_int8_packed, gpu_info.support_int8_storage, gpu_info.support_int8_arithmetic); + NCNN_LOGE("[%u %s] fp16-p/s/u/a=%d/%d/%d/%d int8-p/s/u/a=%d/%d/%d/%d", i, physicalDeviceProperties.deviceName, + gpu_info.support_fp16_packed, gpu_info.support_fp16_storage, gpu_info.support_fp16_uniform, gpu_info.support_fp16_arithmetic, + gpu_info.support_int8_packed, gpu_info.support_int8_storage, gpu_info.support_int8_uniform, gpu_info.support_int8_arithmetic); NCNN_LOGE("[%u %s] subgroup=%u basic/vote/ballot/shuffle=%d/%d/%d/%d", i, physicalDeviceProperties.deviceName, gpu_info.subgroup_size, gpu_info.support_subgroup_basic, gpu_info.support_subgroup_vote, @@ -2470,7 +2472,7 @@ VulkanDevice::VulkanDevice(int device_index) enabled8BitStorageFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES_KHR; enabled8BitStorageFeatures.pNext = 0; enabled8BitStorageFeatures.storageBuffer8BitAccess = info.support_int8_storage(); - enabled8BitStorageFeatures.uniformAndStorageBuffer8BitAccess = info.support_int8_storage() && info.support_int8_arithmetic(); + enabled8BitStorageFeatures.uniformAndStorageBuffer8BitAccess = info.support_int8_uniform(); enabled8BitStorageFeatures.storagePushConstant8 = VK_FALSE; if (support_VK_KHR_get_physical_device_properties2 && info.support_VK_KHR_8bit_storage()) { @@ -2483,7 +2485,7 @@ VulkanDevice::VulkanDevice(int device_index) enabled16BitStorageFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES_KHR; enabled16BitStorageFeatures.pNext = 0; enabled16BitStorageFeatures.storageBuffer16BitAccess = info.support_fp16_storage(); - enabled16BitStorageFeatures.uniformAndStorageBuffer16BitAccess = info.support_fp16_storage() && info.support_fp16_arithmetic(); + enabled16BitStorageFeatures.uniformAndStorageBuffer16BitAccess = info.support_fp16_uniform(); 
enabled16BitStorageFeatures.storagePushConstant16 = VK_FALSE; enabled16BitStorageFeatures.storageInputOutput16 = VK_FALSE; if (support_VK_KHR_get_physical_device_properties2 && info.support_VK_KHR_16bit_storage()) @@ -3857,11 +3859,16 @@ int compile_spirv_module(const char* comp_data, int comp_data_size, const Option custom_defines.push_back(std::make_pair("afpmat4", "mat4")); } - if (opt.use_fp16_storage && opt.use_fp16_arithmetic) + if (opt.use_fp16_storage && opt.use_fp16_uniform && opt.use_fp16_arithmetic) { custom_defines.push_back(std::make_pair("lfp", "float16_t")); custom_defines.push_back(std::make_pair("lfpvec4", "f16vec4")); } + else if (opt.use_fp16_storage && opt.use_fp16_arithmetic) + { + custom_defines.push_back(std::make_pair("lfp", "float")); + custom_defines.push_back(std::make_pair("lfpvec4", "uint64_t")); + } else if (opt.use_fp16_storage || opt.use_fp16_packed) { custom_defines.push_back(std::make_pair("lfp", "float")); @@ -3873,7 +3880,7 @@ int compile_spirv_module(const char* comp_data, int comp_data_size, const Option custom_defines.push_back(std::make_pair("lfpvec4", "vec4")); } - if (opt.use_fp16_storage && opt.use_fp16_arithmetic) + if (opt.use_fp16_storage && opt.use_fp16_uniform && opt.use_fp16_arithmetic) { custom_defines.push_back(std::make_pair("sfp2lfp(v)", "v")); custom_defines.push_back(std::make_pair("sfp2lfpvec4(v)", "v")); @@ -3881,6 +3888,14 @@ int compile_spirv_module(const char* comp_data, int comp_data_size, const Option custom_defines.push_back(std::make_pair("lfp2afp(v)", "v")); custom_defines.push_back(std::make_pair("lfp2afpvec4(v)", "v")); } + else if (opt.use_fp16_storage && opt.use_fp16_arithmetic) + { + custom_defines.push_back(std::make_pair("sfp2lfp(v)", "float(v)")); + custom_defines.push_back(std::make_pair("sfp2lfpvec4(v)", "pack64(halfBitsToUInt16(v))")); + + custom_defines.push_back(std::make_pair("lfp2afp(v)", "float16_t(v)")); + custom_defines.push_back(std::make_pair("lfp2afpvec4(v)", "int16BitsToHalf(unpack16(v))")); + } else if (opt.use_fp16_packed && opt.use_fp16_arithmetic) { custom_defines.push_back(std::make_pair("sfp2lfp(v)", "v")); @@ -4208,6 +4223,11 @@ int compile_spirv_module(const char* comp_data, int comp_data_size, const Option custom_defines.push_back(std::make_pair("NCNN_fp16_packed", "1")); } + if (opt.use_fp16_uniform) + { + custom_defines.push_back(std::make_pair("NCNN_fp16_uniform", "1")); + } + if (opt.use_fp16_arithmetic) { custom_defines.push_back(std::make_pair("NCNN_fp16_arithmetic", "1")); @@ -4222,6 +4242,11 @@ int compile_spirv_module(const char* comp_data, int comp_data_size, const Option custom_defines.push_back(std::make_pair("NCNN_int8_packed", "1")); } + if (opt.use_int8_uniform) + { + custom_defines.push_back(std::make_pair("NCNN_int8_uniform", "1")); + } + if (opt.use_int8_arithmetic) { custom_defines.push_back(std::make_pair("NCNN_int8_arithmetic", "1")); diff --git a/src/gpu.h b/src/gpu.h index 2040be544c8..41e0bd0ed01 100644 --- a/src/gpu.h +++ b/src/gpu.h @@ -260,9 +260,11 @@ class NCNN_EXPORT GpuInfo // fp16 and int8 feature bool support_fp16_packed() const; bool support_fp16_storage() const; + bool support_fp16_uniform() const; bool support_fp16_arithmetic() const; bool support_int8_packed() const; bool support_int8_storage() const; + bool support_int8_uniform() const; bool support_int8_arithmetic() const; // ycbcr conversion feature diff --git a/src/net.cpp b/src/net.cpp index d9283c69383..971a1b4276e 100644 --- a/src/net.cpp +++ b/src/net.cpp @@ -1347,8 +1347,11 @@ int 
Net::load_param(const DataReader& dr) // sanitize use options if (!d->vkdev->info.support_fp16_packed()) opt.use_fp16_packed = false; if (!d->vkdev->info.support_fp16_storage()) opt.use_fp16_storage = false; + if (!d->vkdev->info.support_fp16_uniform()) opt.use_fp16_uniform = false; if (!d->vkdev->info.support_fp16_arithmetic()) opt.use_fp16_arithmetic = false; + if (!d->vkdev->info.support_int8_packed()) opt.use_int8_packed = false; if (!d->vkdev->info.support_int8_storage()) opt.use_int8_storage = false; + if (!d->vkdev->info.support_int8_uniform()) opt.use_int8_uniform = false; if (!d->vkdev->info.support_int8_arithmetic()) opt.use_int8_arithmetic = false; if (!d->vkdev->info.support_cooperative_matrix()) opt.use_cooperative_matrix = false; @@ -1359,6 +1362,9 @@ int Net::load_param(const DataReader& dr) // fp16a makes no sense when fp16 storage disabled if (!opt.use_fp16_packed && !opt.use_fp16_storage) opt.use_fp16_arithmetic = false; + + // fp16 uniform makes no sense when fp16 arithmetic disabled + if (!opt.use_fp16_arithmetic) opt.use_fp16_uniform = false; } else { @@ -1637,8 +1643,11 @@ int Net::load_param_bin(const DataReader& dr) // sanitize use options if (!d->vkdev->info.support_fp16_packed()) opt.use_fp16_packed = false; if (!d->vkdev->info.support_fp16_storage()) opt.use_fp16_storage = false; + if (!d->vkdev->info.support_fp16_uniform()) opt.use_fp16_uniform = false; if (!d->vkdev->info.support_fp16_arithmetic()) opt.use_fp16_arithmetic = false; + if (!d->vkdev->info.support_int8_packed()) opt.use_int8_packed = false; if (!d->vkdev->info.support_int8_storage()) opt.use_int8_storage = false; + if (!d->vkdev->info.support_int8_uniform()) opt.use_int8_uniform = false; if (!d->vkdev->info.support_int8_arithmetic()) opt.use_int8_arithmetic = false; if (!d->vkdev->info.support_cooperative_matrix()) opt.use_cooperative_matrix = false; @@ -1649,6 +1658,9 @@ int Net::load_param_bin(const DataReader& dr) // fp16a makes no sense when fp16 storage disabled if (!opt.use_fp16_packed && !opt.use_fp16_storage) opt.use_fp16_arithmetic = false; + + // fp16 uniform makes no sense when fp16 arithmetic disabled + if (!opt.use_fp16_arithmetic) opt.use_fp16_uniform = false; } else { diff --git a/src/option.cpp b/src/option.cpp index ea2dd6d25c8..a30dabe55f8 100644 --- a/src/option.cpp +++ b/src/option.cpp @@ -74,6 +74,9 @@ Option::Option() use_winograd63_convolution = true; use_a53_a55_optimized_kernel = is_current_thread_running_on_a53_a55(); + + use_fp16_uniform = true; + use_int8_uniform = true; } } // namespace ncnn diff --git a/src/option.h b/src/option.h index 7d0cc60ba7d..eb2a5a7d342 100644 --- a/src/option.h +++ b/src/option.h @@ -144,8 +144,10 @@ class NCNN_EXPORT Option // but you can force this on/off if you wish bool use_a53_a55_optimized_kernel; - bool use_reserved_7; - bool use_reserved_8; + // enable options for shared variables in gpu shader + bool use_fp16_uniform; + bool use_int8_uniform; + bool use_reserved_9; bool use_reserved_10; bool use_reserved_11; diff --git a/tests/testutil.cpp b/tests/testutil.cpp index 1bcc6ff5237..b453d1f61b4 100644 --- a/tests/testutil.cpp +++ b/tests/testutil.cpp @@ -684,7 +684,12 @@ int test_layer_gpu(int typeindex, const ncnn::ParamDict& pd, const std::vectorinfo.support_fp16_packed()) opt.use_fp16_packed = false; if (!vkdev->info.support_fp16_storage()) opt.use_fp16_storage = false; + if (!vkdev->info.support_fp16_uniform()) opt.use_fp16_uniform = false; if (!vkdev->info.support_fp16_arithmetic()) opt.use_fp16_arithmetic = false; + if 
(!vkdev->info.support_int8_packed()) opt.use_int8_packed = false; + if (!vkdev->info.support_int8_storage()) opt.use_int8_storage = false; + if (!vkdev->info.support_int8_uniform()) opt.use_int8_uniform = false; + if (!vkdev->info.support_int8_arithmetic()) opt.use_int8_arithmetic = false; if (!vkdev->info.support_cooperative_matrix()) opt.use_cooperative_matrix = false; // FIXME fp16a may produce large error @@ -1179,7 +1184,12 @@ int test_layer_gpu(int typeindex, const ncnn::ParamDict& pd, const std::vectorinfo.support_fp16_packed()) opt.use_fp16_packed = false; if (!vkdev->info.support_fp16_storage()) opt.use_fp16_storage = false; + if (!vkdev->info.support_fp16_uniform()) opt.use_fp16_uniform = false; if (!vkdev->info.support_fp16_arithmetic()) opt.use_fp16_arithmetic = false; + if (!vkdev->info.support_int8_packed()) opt.use_int8_packed = false; + if (!vkdev->info.support_int8_storage()) opt.use_int8_storage = false; + if (!vkdev->info.support_int8_uniform()) opt.use_int8_uniform = false; + if (!vkdev->info.support_int8_arithmetic()) opt.use_int8_arithmetic = false; if (!vkdev->info.support_cooperative_matrix()) opt.use_cooperative_matrix = false; // FIXME fp16a may produce large error
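
The new `lfp` / `lfpvec4` types and the `sfp2lfp*` / `lfp2afp*` conversion macros are intended for shaders that stage data in shared local memory. Below is a minimal, hypothetical sketch of that pattern in ncnn-flavoured GLSL (not part of this patch), assuming the usual `sfpvec4` buffer declarations, the `afpvec4` arithmetic type and the `buffer_st4` store helper from the same GLSL extension; buffer names, bindings and indexing are illustrative only. The shader source stays the same no matter which concrete types the table above resolves `lfp` / `lfpvec4` to (`vec4`, `uvec2`, `uint64_t` or `f16vec4`).

```
// hypothetical ncnn-style compute shader fragment, for illustration only
layout (local_size_x = 8, local_size_y = 8) in;

layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };

// tile staged in shared local memory via the portable local type;
// the compiler defines pick vec4 / uvec2 / uint64_t / f16vec4
// according to fp16p / fp16s / fp16a / fp16u support
shared lfpvec4 tmp_a[8][8];

void main()
{
    const uint lx = gl_LocalInvocationID.x;
    const uint ly = gl_LocalInvocationID.y;
    const uint gi = gl_GlobalInvocationID.y * 64 + gl_GlobalInvocationID.x; // illustrative index

    // storage -> local: pack the storage value into the local type
    tmp_a[ly][lx] = sfp2lfpvec4(bottom_blob_data[gi]);

    barrier();

    // local -> arithmetic: unpack into the arithmetic type, e.g. a transposed read
    afpvec4 v = lfp2afpvec4(tmp_a[lx][ly]);

    // ... compute on v in afp precision ...

    buffer_st4(top_blob_data, gi, v);
}
```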