diff --git a/src/layer/arm/requantize_arm.cpp b/src/layer/arm/requantize_arm.cpp index 32fdd961433..d5fe92428b3 100644 --- a/src/layer/arm/requantize_arm.cpp +++ b/src/layer/arm/requantize_arm.cpp @@ -24,13 +24,6 @@ namespace ncnn { -#if __ARM_NEON -#include "requantize_leakyrelu_pack4.h" -#include "requantize_leakyrelu_pack8.h" -#include "requantize_relu_pack4.h" -#include "requantize_relu_pack8.h" -#endif // __ARM_NEON - Requantize_arm::Requantize_arm() { #if __ARM_NEON @@ -38,1328 +31,588 @@ Requantize_arm::Requantize_arm() #endif // __ARM_NEON } -int Requantize_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +static void requantize_relu(const int* intptr, signed char* ptr, const Mat& scale_in_data, const Mat& bias_data, const Mat& scale_out_data, int elemcount, int elempack) { - int dims = bottom_blob.dims; - int elempack = bottom_blob.elempack; - -#if __ARM_NEON - if (elempack == 8) - { - if (dims == 1) - { - int w = bottom_blob.w; - - top_blob.create(w, (size_t)8u, 8, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (scale_in_data_size == 1 && scale_out_data_size == 1) - { - float32x4_t _scale_in = vdupq_n_f32(scale_in_data[0]); - float32x4_t _scale_out = vdupq_n_f32(scale_out_data[0]); - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - signed char* ptr = (signed char*)top_blob + i * 8; - - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32((intptr + 4))); - _v0 = vmulq_f32(_v0, _scale_in); - _v1 = vmulq_f32(_v1, _scale_in); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = vmulq_f32(_v0, _scale_out); - _v1 = vmulq_f32(_v1, _scale_out); - vst1_s8(ptr, float2int8(_v0, _v1)); - } - } - else if (bias_data_size == 1) - { - float32x4_t _bias = vdupq_n_f32(bias_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - signed char* ptr = (signed char*)top_blob + i * 8; - - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32((intptr + 4))); - _v0 = vmlaq_f32(_bias, _v0, _scale_in); - _v1 = vmlaq_f32(_bias, _v1, _scale_in); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = vmulq_f32(_v0, _scale_out); - _v1 = vmulq_f32(_v1, _scale_out); - vst1_s8(ptr, float2int8(_v0, _v1)); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - signed char* ptr = (signed char*)top_blob + i * 8; - - float32x4_t _bias0 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + i * 8); - float32x4_t _bias1 = bias_data_size == 1 ? 
vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + i * 8 + 4); - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32((intptr + 4))); - _v0 = vmlaq_f32(_bias0, _v0, _scale_in); - _v1 = vmlaq_f32(_bias1, _v1, _scale_in); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = vmulq_f32(_v0, _scale_out); - _v1 = vmulq_f32(_v1, _scale_out); - vst1_s8(ptr, float2int8(_v0, _v1)); - } - } - } - else if (scale_in_data_size == 1 && scale_out_data_size > 1) - { - float32x4_t _scale_in = vdupq_n_f32(scale_in_data[0]); + const int scale_in_data_size = scale_in_data.w; + const int bias_data_size = bias_data.w; + const int scale_out_data_size = scale_out_data.w; + const int size = elemcount * elempack; - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - signed char* ptr = (signed char*)top_blob + i * 8; + // NCNN_LOGE("requantize_relu %d %d %d %d %d", scale_in_data_size, bias_data_size, scale_out_data_size, elemcount, elempack); - float32x4_t _scale_out0 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8); - float32x4_t _scale_out1 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8 + 4); - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32((intptr + 4))); - _v0 = vmulq_f32(_v0, _scale_in); - _v1 = vmulq_f32(_v1, _scale_in); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = vmulq_f32(_v0, _scale_out0); - _v1 = vmulq_f32(_v1, _scale_out1); - vst1_s8(ptr, float2int8(_v0, _v1)); - } - } - else if (bias_data_size == 1) - { - float32x4_t _bias = vdupq_n_f32(bias_data[0]); + // int8(relu(v * scale_in) * scale_out) + // int8_relu(v * (scale_in * scale_out)) - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - signed char* ptr = (signed char*)top_blob + i * 8; + // int8(relu(v * scale_in + bias) * scale_out) + // int8_relu(v * (scale_in * scale_out) + (bias * scale_out)) - float32x4_t _scale_out0 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8); - float32x4_t _scale_out1 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8 + 4); - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32((intptr + 4))); - _v0 = vmlaq_f32(_bias, _v0, _scale_in); - _v1 = vmlaq_f32(_bias, _v1, _scale_in); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = vmulq_f32(_v0, _scale_out0); - _v1 = vmulq_f32(_v1, _scale_out1); - vst1_s8(ptr, float2int8(_v0, _v1)); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - signed char* ptr = (signed char*)top_blob + i * 8; - - float32x4_t _scale_out0 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8); - float32x4_t _scale_out1 = scale_out_data_size == 1 ? 
vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8 + 4); - float32x4_t _bias0 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + i * 8); - float32x4_t _bias1 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + i * 8 + 4); - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32((intptr + 4))); - _v0 = vmlaq_f32(_bias0, _v0, _scale_in); - _v1 = vmlaq_f32(_bias1, _v1, _scale_in); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = vmulq_f32(_v0, _scale_out0); - _v1 = vmulq_f32(_v1, _scale_out1); - vst1_s8(ptr, float2int8(_v0, _v1)); - } - } - } - else if (scale_in_data_size > 1 && scale_out_data_size == 1) - { - float32x4_t _scale_out = vdupq_n_f32(scale_out_data[0]); - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - signed char* ptr = (signed char*)top_blob + i * 8; - - float32x4_t _scale_in0 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8); - float32x4_t _scale_in1 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8 + 4); - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32((intptr + 4))); - _v0 = vmulq_f32(_v0, _scale_in0); - _v1 = vmulq_f32(_v1, _scale_in1); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = vmulq_f32(_v0, _scale_out); - _v1 = vmulq_f32(_v1, _scale_out); - vst1_s8(ptr, float2int8(_v0, _v1)); - } - } - else if (bias_data_size == 1) - { - float32x4_t _bias = vdupq_n_f32(bias_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - signed char* ptr = (signed char*)top_blob + i * 8; - - float32x4_t _scale_in0 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8); - float32x4_t _scale_in1 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8 + 4); - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32((intptr + 4))); - _v0 = vmlaq_f32(_bias, _v0, _scale_in0); - _v1 = vmlaq_f32(_bias, _v1, _scale_in1); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = vmulq_f32(_v0, _scale_out); - _v1 = vmulq_f32(_v1, _scale_out); - vst1_s8(ptr, float2int8(_v0, _v1)); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - signed char* ptr = (signed char*)top_blob + i * 8; - - float32x4_t _scale_in0 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8); - float32x4_t _scale_in1 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8 + 4); - float32x4_t _bias0 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + i * 8); - float32x4_t _bias1 = bias_data_size == 1 ? 
vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + i * 8 + 4); - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32((intptr + 4))); - _v0 = vmlaq_f32(_bias0, _v0, _scale_in0); - _v1 = vmlaq_f32(_bias1, _v1, _scale_in1); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = vmulq_f32(_v0, _scale_out); - _v1 = vmulq_f32(_v1, _scale_out); - vst1_s8(ptr, float2int8(_v0, _v1)); - } - } - } - else // if (scale_in_data_size > 1 && scale_out_data_size > 1) - { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - signed char* ptr = (signed char*)top_blob + i * 8; - - float32x4_t _scale_in0 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8); - float32x4_t _scale_in1 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8 + 4); - float32x4_t _scale_out0 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8); - float32x4_t _scale_out1 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8 + 4); - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32((intptr + 4))); - _v0 = vmulq_f32(_v0, _scale_in0); - _v1 = vmulq_f32(_v1, _scale_in1); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = vmulq_f32(_v0, _scale_out0); - _v1 = vmulq_f32(_v1, _scale_out1); - vst1_s8(ptr, float2int8(_v0, _v1)); - } - } - else if (bias_data_size == 1) - { - float32x4_t _bias = vdupq_n_f32(bias_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - signed char* ptr = (signed char*)top_blob + i * 8; - - float32x4_t _scale_in0 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8); - float32x4_t _scale_in1 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8 + 4); - float32x4_t _scale_out0 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8); - float32x4_t _scale_out1 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8 + 4); - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32((intptr + 4))); - _v0 = vmlaq_f32(_bias, _v0, _scale_in0); - _v1 = vmlaq_f32(_bias, _v1, _scale_in1); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = vmulq_f32(_v0, _scale_out0); - _v1 = vmulq_f32(_v1, _scale_out1); - vst1_s8(ptr, float2int8(_v0, _v1)); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - signed char* ptr = (signed char*)top_blob + i * 8; - - float32x4_t _scale_in0 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8); - float32x4_t _scale_in1 = scale_in_data_size == 1 ? 
vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8 + 4); - float32x4_t _scale_out0 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8); - float32x4_t _scale_out1 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8 + 4); - float32x4_t _bias0 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + i * 8); - float32x4_t _bias1 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + i * 8 + 4); - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32((intptr + 4))); - _v0 = vmlaq_f32(_bias0, _v0, _scale_in0); - _v1 = vmlaq_f32(_bias1, _v1, _scale_in1); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = vmulq_f32(_v0, _scale_out0); - _v1 = vmulq_f32(_v1, _scale_out1); - vst1_s8(ptr, float2int8(_v0, _v1)); - } - } - } + float scale_in = scale_in_data[0]; +#if __ARM_NEON + float32x4_t _scale_in0 = vdupq_n_f32(scale_in); + float32x4_t _scale_in1 = _scale_in0; + if (scale_in_data_size > 1) + { + if (elempack == 8) + { + _scale_in0 = vld1q_f32((const float*)scale_in_data); + _scale_in1 = vld1q_f32((const float*)scale_in_data + 4); } - - if (dims == 2) + if (elempack == 4) { - int w = bottom_blob.w; - int h = bottom_blob.h; - - top_blob.create(w, h, (size_t)8u, 8, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - signed char* ptr = top_blob.row(i); - - float32x4_t _scale_in0 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8); - float32x4_t _scale_in1 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8 + 4); - float32x4_t _scale_out0 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8); - float32x4_t _scale_out1 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8 + 4); - - for (int j = 0; j < w; j++) - { - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32((intptr + 4))); - _v0 = vmulq_f32(_v0, _scale_in0); - _v1 = vmulq_f32(_v1, _scale_in1); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = vmulq_f32(_v0, _scale_out0); - _v1 = vmulq_f32(_v1, _scale_out1); - vst1_s8(ptr, float2int8(_v0, _v1)); - - intptr += 8; - ptr += 8; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - signed char* ptr = top_blob.row(i); - - float32x4_t _scale_in0 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8); - float32x4_t _scale_in1 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8 + 4); - float32x4_t _scale_out0 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8); - float32x4_t _scale_out1 = scale_out_data_size == 1 ? 
vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8 + 4); - float32x4_t _bias0 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + i * 8); - float32x4_t _bias1 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + i * 8 + 4); - - for (int j = 0; j < w; j++) - { - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32((intptr + 4))); - _v0 = vmlaq_f32(_bias0, _v0, _scale_in0); - _v1 = vmlaq_f32(_bias1, _v1, _scale_in1); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = vmulq_f32(_v0, _scale_out0); - _v1 = vmulq_f32(_v1, _scale_out1); - vst1_s8(ptr, float2int8(_v0, _v1)); - - intptr += 8; - ptr += 8; - } - } - } + _scale_in0 = vld1q_f32((const float*)scale_in_data); + _scale_in1 = _scale_in0; } + } +#endif // __ARM_NEON - if (dims == 3) + float scale_out = scale_out_data[0]; +#if __ARM_NEON + float32x4_t _scale_out0 = vdupq_n_f32(scale_out); + float32x4_t _scale_out1 = _scale_out0; + if (scale_out_data_size > 1) + { + if (elempack == 8) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; - - top_blob.create(w, h, channels, (size_t)8u, 8, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (activation_type == 1) - { - requantize_relu_pack8_neon(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, opt); - return 0; - } - - if (activation_type == 2 && activation_params[0] > 0.f) - { - requantize_leakyrelu_pack8_neon(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, activation_params[0], opt); - return 0; - } - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - signed char* ptr = top_blob.channel(q); - - float32x4_t _scale_in0 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + q * 8); - float32x4_t _scale_in1 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + q * 8 + 4); - float32x4_t _scale_out0 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + q * 8); - float32x4_t _scale_out1 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + q * 8 + 4); - - for (int i = 0; i < size; i++) - { - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32((intptr + 4))); - _v0 = vmulq_f32(_v0, _scale_in0); - _v1 = vmulq_f32(_v1, _scale_in1); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = vmulq_f32(_v0, _scale_out0); - _v1 = vmulq_f32(_v1, _scale_out1); - vst1_s8(ptr, float2int8(_v0, _v1)); - - intptr += 8; - ptr += 8; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - signed char* ptr = top_blob.channel(q); - - float32x4_t _scale_in0 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + q * 8); - float32x4_t _scale_in1 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + q * 8 + 4); - float32x4_t _scale_out0 = scale_out_data_size == 1 ? 
vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + q * 8); - float32x4_t _scale_out1 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + q * 8 + 4); - float32x4_t _bias0 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + q * 8); - float32x4_t _bias1 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + q * 8 + 4); + _scale_out0 = vld1q_f32((const float*)scale_out_data); + _scale_out1 = vld1q_f32((const float*)scale_out_data + 4); + } + if (elempack == 4) + { + _scale_out0 = vld1q_f32((const float*)scale_out_data); + _scale_out1 = _scale_out0; + } + } +#endif // __ARM_NEON - for (int i = 0; i < size; i++) - { - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32((intptr + 4))); - _v0 = vmlaq_f32(_bias0, _v0, _scale_in0); - _v1 = vmlaq_f32(_bias1, _v1, _scale_in1); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = vmulq_f32(_v0, _scale_out0); - _v1 = vmulq_f32(_v1, _scale_out1); - vst1_s8(ptr, float2int8(_v0, _v1)); + float scale = scale_in * scale_out; +#if __ARM_NEON + float32x4_t _scale0 = vmulq_f32(_scale_in0, _scale_out0); + float32x4_t _scale1 = vmulq_f32(_scale_in1, _scale_out1); +#endif // __ARM_NEON - intptr += 8; - ptr += 8; - } - } - } + if (bias_data_size == 0) + { + int i = 0; +#if __ARM_NEON + for (; i + 7 < size; i += 8) + { + float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); + float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4)); + _v0 = vmulq_f32(_v0, _scale0); + _v1 = vmulq_f32(_v1, _scale1); + vst1_s8(ptr, float2int8relu(_v0, _v1)); + intptr += 8; + ptr += 8; + } + for (; i + 3 < size; i += 4) + { + float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); + _v = vmulq_f32(_v, _scale0); + int8x8_t v = float2int8relu(_v, _v); + ptr[0] = vget_lane_s8(v, 0); + ptr[1] = vget_lane_s8(v, 1); + ptr[2] = vget_lane_s8(v, 2); + ptr[3] = vget_lane_s8(v, 3); + intptr += 4; + ptr += 4; + } +#endif // __ARM_NEON + for (; i < size; i++) + { + float v = *intptr * scale; + *ptr = float2int8(v); + if (*ptr < 0) *ptr = 0; + intptr++; + ptr++; } - - return 0; } - - if (elempack == 4) + else { - if (dims == 1) + float bias = bias_data[0]; +#if __ARM_NEON + float32x4_t _bias0 = vdupq_n_f32(bias); + float32x4_t _bias1 = _bias0; + if (bias_data_size > 1) { - int w = bottom_blob.w; - int out_elempack = opt.use_packing_layout && w * elempack % 8 == 0 ? 
8 : 1; - int outw = w * elempack / out_elempack; - - top_blob.create(outw, (size_t)out_elempack, out_elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (scale_in_data_size == 1 && scale_out_data_size == 1) - { - float32x4_t _scale_in = vdupq_n_f32(scale_in_data[0]); - float32x4_t _scale_out = vdupq_n_f32(scale_out_data[0]); - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - signed char* ptr = (signed char*)top_blob + i * 4; - - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vmulq_f32(_v, _scale_in); - _v = activation_ps(_v, activation_type, activation_params); - _v = vmulq_f32(_v, _scale_out); - int8x8_t v = float2int8(_v, _v); - ptr[0] = vget_lane_s8(v, 0); - ptr[1] = vget_lane_s8(v, 1); - ptr[2] = vget_lane_s8(v, 2); - ptr[3] = vget_lane_s8(v, 3); - } - } - else if (bias_data_size == 1) - { - float32x4_t _bias = vdupq_n_f32(bias_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - signed char* ptr = (signed char*)top_blob + i * 4; - - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vmlaq_f32(_bias, _v, _scale_in); - _v = activation_ps(_v, activation_type, activation_params); - _v = vmulq_f32(_v, _scale_out); - int8x8_t v = float2int8(_v, _v); - ptr[0] = vget_lane_s8(v, 0); - ptr[1] = vget_lane_s8(v, 1); - ptr[2] = vget_lane_s8(v, 2); - ptr[3] = vget_lane_s8(v, 3); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - signed char* ptr = (signed char*)top_blob + i * 4; - - float32x4_t _bias = vld1q_f32((const float*)bias_data + i * 4); - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vmlaq_f32(_bias, _v, _scale_in); - _v = activation_ps(_v, activation_type, activation_params); - _v = vmulq_f32(_v, _scale_out); - int8x8_t v = float2int8(_v, _v); - ptr[0] = vget_lane_s8(v, 0); - ptr[1] = vget_lane_s8(v, 1); - ptr[2] = vget_lane_s8(v, 2); - ptr[3] = vget_lane_s8(v, 3); - } - } - } - else if (scale_in_data_size == 1 && scale_out_data_size > 1) + if (elempack == 8) { - float32x4_t _scale_in = vdupq_n_f32(scale_in_data[0]); - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - signed char* ptr = (signed char*)top_blob + i * 4; - - float32x4_t _scale_out = vld1q_f32((const float*)scale_out_data + i * 4); - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vmulq_f32(_v, _scale_in); - _v = activation_ps(_v, activation_type, activation_params); - _v = vmulq_f32(_v, _scale_out); - int8x8_t v = float2int8(_v, _v); - ptr[0] = vget_lane_s8(v, 0); - ptr[1] = vget_lane_s8(v, 1); - ptr[2] = vget_lane_s8(v, 2); - ptr[3] = vget_lane_s8(v, 3); - } - } - else if (bias_data_size == 1) - { - float32x4_t _bias = vdupq_n_f32(bias_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - signed char* ptr = (signed char*)top_blob + i * 4; - - float32x4_t _scale_out = vld1q_f32((const float*)scale_out_data + i * 4); - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vmlaq_f32(_bias, _v, _scale_in); - _v = activation_ps(_v, activation_type, activation_params); - _v = vmulq_f32(_v, 
_scale_out); - int8x8_t v = float2int8(_v, _v); - ptr[0] = vget_lane_s8(v, 0); - ptr[1] = vget_lane_s8(v, 1); - ptr[2] = vget_lane_s8(v, 2); - ptr[3] = vget_lane_s8(v, 3); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - signed char* ptr = (signed char*)top_blob + i * 4; - - float32x4_t _scale_out = vld1q_f32((const float*)scale_out_data + i * 4); - float32x4_t _bias = vld1q_f32((const float*)bias_data + i * 4); - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vmlaq_f32(_bias, _v, _scale_in); - _v = activation_ps(_v, activation_type, activation_params); - _v = vmulq_f32(_v, _scale_out); - int8x8_t v = float2int8(_v, _v); - ptr[0] = vget_lane_s8(v, 0); - ptr[1] = vget_lane_s8(v, 1); - ptr[2] = vget_lane_s8(v, 2); - ptr[3] = vget_lane_s8(v, 3); - } - } + _bias0 = vld1q_f32((const float*)bias_data); + _bias1 = vld1q_f32((const float*)bias_data + 4); } - else if (scale_in_data_size > 1 && scale_out_data_size == 1) + if (elempack == 4) { - float32x4_t _scale_out = vdupq_n_f32(scale_out_data[0]); - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - signed char* ptr = (signed char*)top_blob + i * 4; - - float32x4_t _scale_in = vld1q_f32((const float*)scale_in_data + i * 4); - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vmulq_f32(_v, _scale_in); - _v = activation_ps(_v, activation_type, activation_params); - _v = vmulq_f32(_v, _scale_out); - int8x8_t v = float2int8(_v, _v); - ptr[0] = vget_lane_s8(v, 0); - ptr[1] = vget_lane_s8(v, 1); - ptr[2] = vget_lane_s8(v, 2); - ptr[3] = vget_lane_s8(v, 3); - } - } - else if (bias_data_size == 1) - { - float32x4_t _bias = vdupq_n_f32(bias_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - signed char* ptr = (signed char*)top_blob + i * 4; - - float32x4_t _scale_in = vld1q_f32((const float*)scale_in_data + i * 4); - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vmlaq_f32(_bias, _v, _scale_in); - _v = activation_ps(_v, activation_type, activation_params); - _v = vmulq_f32(_v, _scale_out); - int8x8_t v = float2int8(_v, _v); - ptr[0] = vget_lane_s8(v, 0); - ptr[1] = vget_lane_s8(v, 1); - ptr[2] = vget_lane_s8(v, 2); - ptr[3] = vget_lane_s8(v, 3); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - signed char* ptr = (signed char*)top_blob + i * 4; - - float32x4_t _scale_in = vld1q_f32((const float*)scale_in_data + i * 4); - float32x4_t _bias = vld1q_f32((const float*)bias_data + i * 4); - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vmlaq_f32(_bias, _v, _scale_in); - _v = activation_ps(_v, activation_type, activation_params); - _v = vmulq_f32(_v, _scale_out); - int8x8_t v = float2int8(_v, _v); - ptr[0] = vget_lane_s8(v, 0); - ptr[1] = vget_lane_s8(v, 1); - ptr[2] = vget_lane_s8(v, 2); - ptr[3] = vget_lane_s8(v, 3); - } - } + _bias0 = vld1q_f32((const float*)bias_data); + _bias1 = _bias0; } - else // if (scale_in_data_size > 1 && scale_out_data_size > 1) - { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - signed char* ptr = 
(signed char*)top_blob + i * 4; - - float32x4_t _scale_in = vld1q_f32((const float*)scale_in_data + i * 4); - float32x4_t _scale_out = vld1q_f32((const float*)scale_out_data + i * 4); - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vmulq_f32(_v, _scale_in); - _v = activation_ps(_v, activation_type, activation_params); - _v = vmulq_f32(_v, _scale_out); - int8x8_t v = float2int8(_v, _v); - ptr[0] = vget_lane_s8(v, 0); - ptr[1] = vget_lane_s8(v, 1); - ptr[2] = vget_lane_s8(v, 2); - ptr[3] = vget_lane_s8(v, 3); - } - } - else if (bias_data_size == 1) - { - float32x4_t _bias = vdupq_n_f32(bias_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - signed char* ptr = (signed char*)top_blob + i * 4; + } +#endif // __ARM_NEON - float32x4_t _scale_in = vld1q_f32((const float*)scale_in_data + i * 4); - float32x4_t _scale_out = vld1q_f32((const float*)scale_out_data + i * 4); - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vmlaq_f32(_bias, _v, _scale_in); - _v = activation_ps(_v, activation_type, activation_params); - _v = vmulq_f32(_v, _scale_out); - int8x8_t v = float2int8(_v, _v); - ptr[0] = vget_lane_s8(v, 0); - ptr[1] = vget_lane_s8(v, 1); - ptr[2] = vget_lane_s8(v, 2); - ptr[3] = vget_lane_s8(v, 3); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - signed char* ptr = (signed char*)top_blob + i * 4; + bias = bias * scale_out; +#if __ARM_NEON + _bias0 = vmulq_f32(_bias0, _scale_out0); + _bias1 = vmulq_f32(_bias1, _scale_out1); +#endif // __ARM_NEON - float32x4_t _scale_in = vld1q_f32((const float*)scale_in_data + i * 4); - float32x4_t _scale_out = vld1q_f32((const float*)scale_out_data + i * 4); - float32x4_t _bias = vld1q_f32((const float*)bias_data + i * 4); - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vmlaq_f32(_bias, _v, _scale_in); - _v = activation_ps(_v, activation_type, activation_params); - _v = vmulq_f32(_v, _scale_out); - int8x8_t v = float2int8(_v, _v); - ptr[0] = vget_lane_s8(v, 0); - ptr[1] = vget_lane_s8(v, 1); - ptr[2] = vget_lane_s8(v, 2); - ptr[3] = vget_lane_s8(v, 3); - } - } - } + int i = 0; +#if __ARM_NEON + for (; i + 7 < size; i += 8) + { + float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); + float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4)); +#if __aarch64__ + _v0 = vfmaq_f32(_bias0, _v0, _scale0); + _v1 = vfmaq_f32(_bias1, _v1, _scale1); +#else // __aarch64__ + _v0 = vmlaq_f32(_bias0, _v0, _scale0); + _v1 = vmlaq_f32(_bias1, _v1, _scale1); +#endif // __aarch64__ + vst1_s8(ptr, float2int8relu(_v0, _v1)); + intptr += 8; + ptr += 8; } - - if (dims == 2) + for (; i + 3 < size; i += 4) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int out_elempack = opt.use_packing_layout && h * elempack % 8 == 0 ? 8 : 1; - int outh = h * elempack / out_elempack; - - top_blob.create(w, outh, (size_t)out_elempack, out_elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (out_elempack == 8) - { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) - { - const int* intptr0 = bottom_blob.row(i * 2); - const int* intptr1 = bottom_blob.row(i * 2 + 1); - signed char* ptr = top_blob.row(i); - - float32x4_t _scale_in0 = scale_in_data_size == 1 ? 
vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8); - float32x4_t _scale_in1 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8 + 4); - float32x4_t _scale_out0 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8); - float32x4_t _scale_out1 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8 + 4); - - for (int j = 0; j < w; j++) - { - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr0)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr1)); - _v0 = vmulq_f32(_v0, _scale_in0); - _v1 = vmulq_f32(_v1, _scale_in1); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = vmulq_f32(_v0, _scale_out0); - _v1 = vmulq_f32(_v1, _scale_out1); - vst1_s8(ptr, float2int8(_v0, _v1)); - - intptr0 += 4; - intptr1 += 4; - ptr += 8; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) - { - const int* intptr0 = bottom_blob.row(i * 2); - const int* intptr1 = bottom_blob.row(i * 2 + 1); - signed char* ptr = top_blob.row(i); - - float32x4_t _scale_in0 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8); - float32x4_t _scale_in1 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8 + 4); - float32x4_t _scale_out0 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8); - float32x4_t _scale_out1 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8 + 4); - float32x4_t _bias0 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + i * 8); - float32x4_t _bias1 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + i * 8 + 4); - - for (int j = 0; j < w; j++) - { - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr0)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr1)); - _v0 = vmlaq_f32(_bias0, _v0, _scale_in0); - _v1 = vmlaq_f32(_bias1, _v1, _scale_in1); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = vmulq_f32(_v0, _scale_out0); - _v1 = vmulq_f32(_v1, _scale_out1); - vst1_s8(ptr, float2int8(_v0, _v1)); - - intptr0 += 4; - intptr1 += 4; - ptr += 8; - } - } - } - } - if (out_elempack == 1) - { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - signed char* ptr0 = top_blob.row(i * 4); - signed char* ptr1 = top_blob.row(i * 4 + 1); - signed char* ptr2 = top_blob.row(i * 4 + 2); - signed char* ptr3 = top_blob.row(i * 4 + 3); - - float32x4_t _scale_in = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 4); - float32x4_t _scale_out = scale_out_data_size == 1 ? 
vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 4); + float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); +#if __aarch64__ + _v = vfmaq_f32(_bias0, _v, _scale0); +#else // __aarch64__ + _v = vmlaq_f32(_bias0, _v, _scale0); +#endif // __aarch64__ + int8x8_t v = float2int8relu(_v, _v); + ptr[0] = vget_lane_s8(v, 0); + ptr[1] = vget_lane_s8(v, 1); + ptr[2] = vget_lane_s8(v, 2); + ptr[3] = vget_lane_s8(v, 3); + intptr += 4; + ptr += 4; + } +#endif // __ARM_NEON + for (; i < size; i++) + { + float v = *intptr * scale + bias; + *ptr = float2int8(v); + if (*ptr < 0) *ptr = 0; + intptr++; + ptr++; + } + } +} - for (int j = 0; j < w; j++) - { - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vmulq_f32(_v, _scale_in); - _v = activation_ps(_v, activation_type, activation_params); - _v = vmulq_f32(_v, _scale_out); - int8x8_t v = float2int8(_v, _v); - ptr0[0] = vget_lane_s8(v, 0); - ptr1[0] = vget_lane_s8(v, 1); - ptr2[0] = vget_lane_s8(v, 2); - ptr3[0] = vget_lane_s8(v, 3); +static void requantize_leakyrelu(const int* intptr, signed char* ptr, const Mat& scale_in_data, const Mat& bias_data, const Mat& scale_out_data, float slope, int elemcount, int elempack) +{ + const int scale_in_data_size = scale_in_data.w; + const int bias_data_size = bias_data.w; + const int scale_out_data_size = scale_out_data.w; + const int size = elemcount * elempack; - intptr += 4; - ptr0 += 1; - ptr1 += 1; - ptr2 += 1; - ptr3 += 1; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - signed char* ptr0 = top_blob.row(i * 4); - signed char* ptr1 = top_blob.row(i * 4 + 1); - signed char* ptr2 = top_blob.row(i * 4 + 2); - signed char* ptr3 = top_blob.row(i * 4 + 3); + // NCNN_LOGE("requantize_leakyrelu %d %d %d %d %d", scale_in_data_size, bias_data_size, scale_out_data_size, elemcount, elempack); - float32x4_t _scale_in = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 4); - float32x4_t _scale_out = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 4); - float32x4_t _bias = bias_data_size == 1 ? 
vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + i * 4); + // int8(leakyrelu(v * scale_in, slope) * scale_out) + // int8_leakyrelu(v * (scale_in * scale_out), slope) - for (int j = 0; j < w; j++) - { - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vmlaq_f32(_bias, _v, _scale_in); - _v = activation_ps(_v, activation_type, activation_params); - _v = vmulq_f32(_v, _scale_out); - int8x8_t v = float2int8(_v, _v); - ptr0[0] = vget_lane_s8(v, 0); - ptr1[0] = vget_lane_s8(v, 1); - ptr2[0] = vget_lane_s8(v, 2); - ptr3[0] = vget_lane_s8(v, 3); + // int8(leakyrelu(v * scale_in + bias, slope) * scale_out) + // int8_leakyrelu(v * (scale_in * scale_out) + (bias * scale_out), slope) - intptr += 4; - ptr0 += 1; - ptr1 += 1; - ptr2 += 1; - ptr3 += 1; - } - } - } - } + float scale_in = scale_in_data[0]; +#if __ARM_NEON + float32x4_t _scale_in0 = vdupq_n_f32(scale_in); + float32x4_t _scale_in1 = _scale_in0; + if (scale_in_data_size > 1) + { + if (elempack == 8) + { + _scale_in0 = vld1q_f32((const float*)scale_in_data); + _scale_in1 = vld1q_f32((const float*)scale_in_data + 4); + } + if (elempack == 4) + { + _scale_in0 = vld1q_f32((const float*)scale_in_data); + _scale_in1 = _scale_in0; } + } +#endif // __ARM_NEON - if (dims == 3) + float scale_out = scale_out_data[0]; +#if __ARM_NEON + float32x4_t _scale_out0 = vdupq_n_f32(scale_out); + float32x4_t _scale_out1 = _scale_out0; + if (scale_out_data_size > 1) + { + if (elempack == 8) + { + _scale_out0 = vld1q_f32((const float*)scale_out_data); + _scale_out1 = vld1q_f32((const float*)scale_out_data + 4); + } + if (elempack == 4) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; - int out_elempack = opt.use_packing_layout && channels * elempack % 8 == 0 ? 
8 : 1; - int outc = channels * elempack / out_elempack; + _scale_out0 = vld1q_f32((const float*)scale_out_data); + _scale_out1 = _scale_out0; + } + } +#endif // __ARM_NEON - top_blob.create(w, h, outc, (size_t)out_elempack, out_elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; + float scale = scale_in * scale_out; +#if __ARM_NEON + float32x4_t _scale0 = vmulq_f32(_scale_in0, _scale_out0); + float32x4_t _scale1 = vmulq_f32(_scale_in1, _scale_out1); + float32x4_t _slope = vdupq_n_f32(slope); +#endif // __ARM_NEON - if (activation_type == 1) + if (bias_data_size == 0) + { + int i = 0; +#if __ARM_NEON + for (; i + 7 < size; i += 8) + { + float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); + float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4)); + _v0 = vmulq_f32(_v0, _scale0); + _v1 = vmulq_f32(_v1, _scale1); + vst1_s8(ptr, float2int8leakyrelu(_v0, _v1, _slope)); + intptr += 8; + ptr += 8; + } + for (; i + 3 < size; i += 4) + { + float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); + _v = vmulq_f32(_v, _scale0); + int8x8_t v = float2int8leakyrelu(_v, _v, _slope); + ptr[0] = vget_lane_s8(v, 0); + ptr[1] = vget_lane_s8(v, 1); + ptr[2] = vget_lane_s8(v, 2); + ptr[3] = vget_lane_s8(v, 3); + intptr += 4; + ptr += 4; + } +#endif // __ARM_NEON + for (; i < size; i++) + { + float v = *intptr * scale; + *ptr = float2int8(v); + if (*ptr < 0) *ptr *= slope; + intptr++; + ptr++; + } + } + else + { + float bias = bias_data[0]; +#if __ARM_NEON + float32x4_t _bias0 = vdupq_n_f32(bias); + float32x4_t _bias1 = _bias0; + if (bias_data_size > 1) + { + if (elempack == 8) { - requantize_relu_pack4_neon(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, opt); - return 0; + _bias0 = vld1q_f32((const float*)bias_data); + _bias1 = vld1q_f32((const float*)bias_data + 4); } - - if (activation_type == 2 && activation_params[0] > 0.f) + if (elempack == 4) { - requantize_leakyrelu_pack4_neon(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, activation_params[0], opt); - return 0; + _bias0 = vld1q_f32((const float*)bias_data); + _bias1 = _bias0; } + } +#endif // __ARM_NEON - if (out_elempack == 8) - { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) - { - const int* intptr0 = bottom_blob.channel(q * 2); - const int* intptr1 = bottom_blob.channel(q * 2 + 1); - signed char* ptr = top_blob.channel(q); - - float32x4_t _scale_in0 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + q * 8); - float32x4_t _scale_in1 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + q * 8 + 4); - float32x4_t _scale_out0 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + q * 8); - float32x4_t _scale_out1 = scale_out_data_size == 1 ? 
vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + q * 8 + 4); - - for (int i = 0; i < size; i++) - { - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr0)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr1)); - _v0 = vmulq_f32(_v0, _scale_in0); - _v1 = vmulq_f32(_v1, _scale_in1); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = vmulq_f32(_v0, _scale_out0); - _v1 = vmulq_f32(_v1, _scale_out1); - vst1_s8(ptr, float2int8(_v0, _v1)); - - intptr0 += 4; - intptr1 += 4; - ptr += 8; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) - { - const int* intptr0 = bottom_blob.channel(q * 2); - const int* intptr1 = bottom_blob.channel(q * 2 + 1); - signed char* ptr = top_blob.channel(q); - - float32x4_t _scale_in0 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + q * 8); - float32x4_t _scale_in1 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + q * 8 + 4); - float32x4_t _scale_out0 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + q * 8); - float32x4_t _scale_out1 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + q * 8 + 4); - float32x4_t _bias0 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + q * 8); - float32x4_t _bias1 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + q * 8 + 4); - - for (int i = 0; i < size; i++) - { - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr0)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr1)); - _v0 = vmlaq_f32(_bias0, _v0, _scale_in0); - _v1 = vmlaq_f32(_bias1, _v1, _scale_in1); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = vmulq_f32(_v0, _scale_out0); - _v1 = vmulq_f32(_v1, _scale_out1); - vst1_s8(ptr, float2int8(_v0, _v1)); - - intptr0 += 4; - intptr1 += 4; - ptr += 8; - } - } - } - } - if (out_elempack == 1) - { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - signed char* ptr0 = top_blob.channel(q * 4); - signed char* ptr1 = top_blob.channel(q * 4 + 1); - signed char* ptr2 = top_blob.channel(q * 4 + 2); - signed char* ptr3 = top_blob.channel(q * 4 + 3); + bias = bias * scale_out; +#if __ARM_NEON + _bias0 = vmulq_f32(_bias0, _scale_out0); + _bias1 = vmulq_f32(_bias1, _scale_out1); +#endif // __ARM_NEON - float32x4_t _scale_in = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + q * 4); - float32x4_t _scale_out = scale_out_data_size == 1 ? 
vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + q * 4); + int i = 0; +#if __ARM_NEON + for (; i + 7 < size; i += 8) + { + float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); + float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4)); +#if __aarch64__ + _v0 = vfmaq_f32(_bias0, _v0, _scale0); + _v1 = vfmaq_f32(_bias1, _v1, _scale1); +#else // __aarch64__ + _v0 = vmlaq_f32(_bias0, _v0, _scale0); + _v1 = vmlaq_f32(_bias1, _v1, _scale1); +#endif // __aarch64__ + vst1_s8(ptr, float2int8leakyrelu(_v0, _v1, _slope)); + intptr += 8; + ptr += 8; + } + for (; i + 3 < size; i += 4) + { + float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); +#if __aarch64__ + _v = vfmaq_f32(_bias0, _v, _scale0); +#else // __aarch64__ + _v = vmlaq_f32(_bias0, _v, _scale0); +#endif // __aarch64__ + int8x8_t v = float2int8leakyrelu(_v, _v, _slope); + ptr[0] = vget_lane_s8(v, 0); + ptr[1] = vget_lane_s8(v, 1); + ptr[2] = vget_lane_s8(v, 2); + ptr[3] = vget_lane_s8(v, 3); + intptr += 4; + ptr += 4; + } +#endif // __ARM_NEON + for (; i < size; i++) + { + float v = *intptr * scale + bias; + *ptr = float2int8(v); + if (*ptr < 0) *ptr *= slope; + intptr++; + ptr++; + } + } +} - for (int i = 0; i < size; i++) - { - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vmulq_f32(_v, _scale_in); - _v = activation_ps(_v, activation_type, activation_params); - _v = vmulq_f32(_v, _scale_out); - int8x8_t v = float2int8(_v, _v); - ptr0[0] = vget_lane_s8(v, 0); - ptr1[0] = vget_lane_s8(v, 1); - ptr2[0] = vget_lane_s8(v, 2); - ptr3[0] = vget_lane_s8(v, 3); +static void requantize(const int* intptr, signed char* ptr, const Mat& scale_in_data, const Mat& bias_data, const Mat& scale_out_data, int activation_type, const Mat& activation_params, int elemcount, int elempack) +{ + if (activation_type == 1) + { + requantize_relu(intptr, ptr, scale_in_data, bias_data, scale_out_data, elemcount, elempack); + return; + } - intptr += 4; - ptr0 += 1; - ptr1 += 1; - ptr2 += 1; - ptr3 += 1; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - signed char* ptr0 = top_blob.channel(q * 4); - signed char* ptr1 = top_blob.channel(q * 4 + 1); - signed char* ptr2 = top_blob.channel(q * 4 + 2); - signed char* ptr3 = top_blob.channel(q * 4 + 3); + if (activation_type == 2 && activation_params[0] > 0.f) + { + const float slope = activation_params[0]; + requantize_leakyrelu(intptr, ptr, scale_in_data, bias_data, scale_out_data, slope, elemcount, elempack); + return; + } - float32x4_t _scale_in = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + q * 4); - float32x4_t _scale_out = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + q * 4); - float32x4_t _bias = bias_data_size == 1 ? 
vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + q * 4); + const int scale_in_data_size = scale_in_data.w; + const int bias_data_size = bias_data.w; + const int scale_out_data_size = scale_out_data.w; + const int size = elemcount * elempack; - for (int i = 0; i < size; i++) - { - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vmlaq_f32(_bias, _v, _scale_in); - _v = activation_ps(_v, activation_type, activation_params); - _v = vmulq_f32(_v, _scale_out); - int8x8_t v = float2int8(_v, _v); - ptr0[0] = vget_lane_s8(v, 0); - ptr1[0] = vget_lane_s8(v, 1); - ptr2[0] = vget_lane_s8(v, 2); - ptr3[0] = vget_lane_s8(v, 3); + // NCNN_LOGE("requantize %d %d %d %d %d", scale_in_data_size, bias_data_size, scale_out_data_size, elemcount, elempack); - intptr += 4; - ptr0 += 1; - ptr1 += 1; - ptr2 += 1; - ptr3 += 1; - } - } - } - } + float scale_in = scale_in_data[0]; +#if __ARM_NEON + float32x4_t _scale_in0 = vdupq_n_f32(scale_in); + float32x4_t _scale_in1 = _scale_in0; + if (scale_in_data_size > 1) + { + if (elempack == 8) + { + _scale_in0 = vld1q_f32((const float*)scale_in_data); + _scale_in1 = vld1q_f32((const float*)scale_in_data + 4); + } + if (elempack == 4) + { + _scale_in0 = vld1q_f32((const float*)scale_in_data); + _scale_in1 = _scale_in0; } - - return 0; } #endif // __ARM_NEON - if (dims == 1) + float scale_out = scale_out_data[0]; +#if __ARM_NEON + float32x4_t _scale_out0 = vdupq_n_f32(scale_out); + float32x4_t _scale_out1 = _scale_out0; + if (scale_out_data_size > 1) { - int w = bottom_blob.w; - - top_blob.create(w, (size_t)1u, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - const int* intptr = bottom_blob; - signed char* ptr = top_blob; - - if (scale_in_data_size == 1 && scale_out_data_size == 1) + if (elempack == 8) { - const float scale_in = scale_in_data[0]; - const float scale_out = scale_out_data[0]; - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float v = intptr[i] * scale_in; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); - } - } - else if (bias_data_size == 1) - { - const float bias = bias_data[0]; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float v = intptr[i] * scale_in + bias; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float v = intptr[i] * scale_in + bias_data[i]; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); - } - } + _scale_out0 = vld1q_f32((const float*)scale_out_data); + _scale_out1 = vld1q_f32((const float*)scale_out_data + 4); } - else if (scale_in_data_size == 1 && scale_out_data_size > 1) + if (elempack == 4) { - const float scale_in = scale_in_data[0]; - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float v = intptr[i] * scale_in; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); - } - } - else if (bias_data_size == 1) - { - const float bias = bias_data[0]; + _scale_out0 = vld1q_f32((const float*)scale_out_data); + _scale_out1 = _scale_out0; + } + } +#endif // __ARM_NEON - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float v = intptr[i] * scale_in + bias; - ptr[i] = float2int8(activation_ss(v, 
activation_type, activation_params) * scale_out_data[i]); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float v = intptr[i] * scale_in + bias_data[i]; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); - } - } + if (bias_data_size == 0) + { + int i = 0; +#if __ARM_NEON + for (; i + 7 < size; i += 8) + { + float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); + float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4)); + _v0 = vmulq_f32(_v0, _scale_in0); + _v1 = vmulq_f32(_v1, _scale_in1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = vmulq_f32(_v0, _scale_out0); + _v1 = vmulq_f32(_v1, _scale_out1); + vst1_s8(ptr, float2int8(_v0, _v1)); + intptr += 8; + ptr += 8; } - else if (scale_in_data_size > 1 && scale_out_data_size == 1) + for (; i + 3 < size; i += 4) { - const float scale_out = scale_out_data[0]; - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float v = intptr[i] * scale_in_data[i]; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); - } - } - else if (bias_data_size == 1) - { - const float bias = bias_data[0]; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float v = intptr[i] * scale_in_data[i] + bias; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float v = intptr[i] * scale_in_data[i] + bias_data[i]; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); - } - } + float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); + _v = vmulq_f32(_v, _scale_in0); + _v = activation_ps(_v, activation_type, activation_params); + _v = vmulq_f32(_v, _scale_out0); + int8x8_t v = float2int8(_v, _v); + ptr[0] = vget_lane_s8(v, 0); + ptr[1] = vget_lane_s8(v, 1); + ptr[2] = vget_lane_s8(v, 2); + ptr[3] = vget_lane_s8(v, 3); + intptr += 4; + ptr += 4; } - else // if (scale_in_data_size > 1 && scale_out_data_size > 1) +#endif // __ARM_NEON + for (; i < size; i++) { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float v = intptr[i] * scale_in_data[i]; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); - } - } - else if (bias_data_size == 1) + float v = *intptr * scale_in; + v = activation_ss(v, activation_type, activation_params); + *ptr = float2int8(v * scale_out); + intptr++; + ptr++; + } + } + else + { + float bias = bias_data[0]; +#if __ARM_NEON + float32x4_t _bias0 = vdupq_n_f32(bias); + float32x4_t _bias1 = _bias0; + if (bias_data_size > 1) + { + if (elempack == 8) { - const float bias = bias_data[0]; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float v = intptr[i] * scale_in_data[i] + bias; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); - } + _bias0 = vld1q_f32((const float*)bias_data); + _bias1 = vld1q_f32((const float*)bias_data + 4); } - else + if (elempack == 4) { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float v = intptr[i] * scale_in_data[i] + bias_data[i]; - ptr[i] = float2int8(activation_ss(v, 
activation_type, activation_params) * scale_out_data[i]); - } + _bias0 = vld1q_f32((const float*)bias_data); + _bias1 = _bias0; } } +#endif // __ARM_NEON + + int i = 0; +#if __ARM_NEON + for (; i + 7 < size; i += 8) + { + float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); + float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4)); +#if __aarch64__ + _v0 = vfmaq_f32(_bias0, _v0, _scale_in0); + _v1 = vfmaq_f32(_bias1, _v1, _scale_in1); +#else // __aarch64__ + _v0 = vmlaq_f32(_bias0, _v0, _scale_in0); + _v1 = vmlaq_f32(_bias1, _v1, _scale_in1); +#endif // __aarch64__ + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = vmulq_f32(_v0, _scale_out0); + _v1 = vmulq_f32(_v1, _scale_out1); + vst1_s8(ptr, float2int8(_v0, _v1)); + intptr += 8; + ptr += 8; + } + for (; i + 3 < size; i += 4) + { + float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); +#if __aarch64__ + _v = vfmaq_f32(_bias0, _v, _scale_in0); +#else // __aarch64__ + _v = vmlaq_f32(_bias0, _v, _scale_in0); +#endif // __aarch64__ + _v = activation_ps(_v, activation_type, activation_params); + _v = vmulq_f32(_v, _scale_out0); + int8x8_t v = float2int8(_v, _v); + ptr[0] = vget_lane_s8(v, 0); + ptr[1] = vget_lane_s8(v, 1); + ptr[2] = vget_lane_s8(v, 2); + ptr[3] = vget_lane_s8(v, 3); + intptr += 4; + ptr += 4; + } +#endif // __ARM_NEON + for (; i < size; i++) + { + float v = *intptr * scale_in + bias; + v = activation_ss(v, activation_type, activation_params); + *ptr = float2int8(v * scale_out); + intptr++; + ptr++; + } } +} - if (dims == 2) - { - int w = bottom_blob.w; - int h = bottom_blob.h; +int Requantize_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + const int dims = bottom_blob.dims; + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int channels = bottom_blob.c; + const int elempack = bottom_blob.elempack; + const size_t out_elemsize = elempack * 1u; - top_blob.create(w, h, (size_t)1u, opt.blob_allocator); + if (dims == 1) + { + top_blob.create(w, out_elemsize, elempack, opt.blob_allocator); if (top_blob.empty()) return -100; - if (bias_data_size == 0) + const int wp = std::max(1, w / opt.num_threads); + const int nn_w = (w + wp - 1) / wp; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_w; ii++) { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - signed char* ptr = top_blob.row(i); + const int i = ii * wp; - const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[i]; - const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[i]; + const int* intptr = (const int*)bottom_blob + i * elempack; + signed char* ptr = (signed char*)top_blob + i * elempack; - for (int j = 0; j < w; j++) - { - float v = intptr[j] * scale_in; - ptr[j] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - signed char* ptr = top_blob.row(i); + // assert scale_in_data_size == 1 + // assert bias_data_size == 0 || bias_data_size == 1 + // assert scale_out_data_size == 1 - const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[i]; - const float scale_out = scale_out_data_size == 1 ? 
scale_out_data[0] : scale_out_data[i]; - const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[i]; + const int size = std::min(w - i, wp) * elempack; - for (int j = 0; j < w; j++) - { - float v = intptr[j] * scale_in + bias; - ptr[j] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); - } - } + requantize(intptr, ptr, scale_in_data, bias_data, scale_out_data, activation_type, activation_params, size, 1); } } - if (dims == 3) + if (dims == 2) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; - - top_blob.create(w, h, channels, (size_t)1u, opt.blob_allocator); + top_blob.create(w, h, out_elemsize, elempack, opt.blob_allocator); if (top_blob.empty()) return -100; - if (bias_data_size == 0) + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - signed char* ptr = top_blob.channel(q); + const int* intptr = bottom_blob.row(i); + signed char* ptr = top_blob.row(i); - const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[q]; - const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[q]; + const Mat scale_in_data_i = scale_in_data_size > 1 ? scale_in_data.range(i * elempack, elempack) : scale_in_data; + const Mat bias_data_i = bias_data_size > 1 ? bias_data.range(i * elempack, elempack) : bias_data; + const Mat scale_out_data_i = scale_out_data_size > 1 ? scale_out_data.range(i * elempack, elempack) : scale_out_data; - for (int i = 0; i < size; i++) - { - float v = intptr[i] * scale_in; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); - } - } + requantize(intptr, ptr, scale_in_data_i, bias_data_i, scale_out_data_i, activation_type, activation_params, w, elempack); } - else + } + + if (dims == 3) + { + top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - signed char* ptr = top_blob.channel(q); + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); - const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[q]; - const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[q]; - const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[q]; + const Mat scale_in_data_q = scale_in_data_size > 1 ? scale_in_data.range(q * elempack, elempack) : scale_in_data; + const Mat bias_data_q = bias_data_size > 1 ? bias_data.range(q * elempack, elempack) : bias_data; + const Mat scale_out_data_q = scale_out_data_size > 1 ? 
scale_out_data.range(q * elempack, elempack) : scale_out_data; - for (int i = 0; i < size; i++) - { - float v = intptr[i] * scale_in + bias; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); - } - } + requantize(intptr, ptr, scale_in_data_q, bias_data_q, scale_out_data_q, activation_type, activation_params, w * h, elempack); } } diff --git a/src/layer/arm/requantize_leakyrelu_pack4.h b/src/layer/arm/requantize_leakyrelu_pack4.h deleted file mode 100644 index 98482ac02ad..00000000000 --- a/src/layer/arm/requantize_leakyrelu_pack4.h +++ /dev/null @@ -1,275 +0,0 @@ -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. - -static void requantize_leakyrelu_pack4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& scale_in_data, const Mat& scale_out_data, const Mat& bias_data, float slope, const Option& opt) -{ - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; - int outc = top_blob.c; - int out_elempack = top_blob.elempack; - - int scale_in_data_size = scale_in_data.w; - int scale_out_data_size = scale_out_data.w; - int bias_data_size = bias_data.w; - - // int8(leakyrelu(v * scale_in, slope) * scale_out) - // int8_leakyrelu(v * (scale_in * scale_out), slope) - - // int8(leakyrelu(v * scale_in + bias, slope) * scale_out) - // int8_leakyrelu(v * (scale_in * scale_out) + (bias * scale_out), slope) - - if (out_elempack == 8) - { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) - { - const int* intptr0 = bottom_blob.channel(q * 2); - const int* intptr1 = bottom_blob.channel(q * 2 + 1); - signed char* ptr = top_blob.channel(q); - - float32x4_t _scale_in0 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + q * 8); - float32x4_t _scale_in1 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + q * 8 + 4); - float32x4_t _scale_out0 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + q * 8); - float32x4_t _scale_out1 = scale_out_data_size == 1 ? 
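// Editor's note: a worked example of the dims == 1 chunking used by the new forward()
// above, with illustrative numbers that are not taken from the patch. chunk_size() is a
// hypothetical helper restating the per-task size computation.
#include <algorithm>

static int chunk_size(int w, int wp, int ii, int elempack)
{
    const int i = ii * wp;
    return std::min(w - i, wp) * elempack;
}
// Example: w = 100, opt.num_threads = 8
//   wp   = std::max(1, 100 / 8)  = 12   // elements per parallel task
//   nn_w = (100 + 12 - 1) / 12   = 9    // number of tasks
// Tasks 0..7 each cover 12 elements starting at i = ii * wp; the last task gets
// chunk_size(100, 12, 8, 1) = 4. For dims == 2 and dims == 3 the same kernel is
// reused per row / per channel, with Mat::range(q * elempack, elempack) slicing out
// the per-row or per-channel scale_in / bias / scale_out values whenever those
// tables hold more than a single scalar.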
vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + q * 8 + 4); - - float32x4_t _scale0 = vmulq_f32(_scale_in0, _scale_out0); - float32x4_t _scale1 = vmulq_f32(_scale_in1, _scale_out1); - float32x4_t _slope = vdupq_n_f32(slope); - - int i = 0; -#if __aarch64__ - for (; i + 3 < size; i += 4) - { - float32x4_t _v00 = vcvtq_f32_s32(vld1q_s32(intptr0)); - float32x4_t _v01 = vcvtq_f32_s32(vld1q_s32(intptr0 + 4)); - float32x4_t _v02 = vcvtq_f32_s32(vld1q_s32(intptr0 + 8)); - float32x4_t _v03 = vcvtq_f32_s32(vld1q_s32(intptr0 + 12)); - float32x4_t _v10 = vcvtq_f32_s32(vld1q_s32(intptr1)); - float32x4_t _v11 = vcvtq_f32_s32(vld1q_s32(intptr1 + 4)); - float32x4_t _v12 = vcvtq_f32_s32(vld1q_s32(intptr1 + 8)); - float32x4_t _v13 = vcvtq_f32_s32(vld1q_s32(intptr1 + 12)); - _v00 = vmulq_f32(_v00, _scale0); - _v01 = vmulq_f32(_v01, _scale0); - _v02 = vmulq_f32(_v02, _scale0); - _v03 = vmulq_f32(_v03, _scale0); - _v10 = vmulq_f32(_v10, _scale1); - _v11 = vmulq_f32(_v11, _scale1); - _v12 = vmulq_f32(_v12, _scale1); - _v13 = vmulq_f32(_v13, _scale1); - vst1_s8(ptr, float2int8leakyrelu(_v00, _v10, _slope)); - vst1_s8(ptr + 8, float2int8leakyrelu(_v01, _v11, _slope)); - vst1_s8(ptr + 16, float2int8leakyrelu(_v02, _v12, _slope)); - vst1_s8(ptr + 24, float2int8leakyrelu(_v03, _v13, _slope)); - - intptr0 += 16; - intptr1 += 16; - ptr += 32; - } -#endif // __aarch64__ - for (; i < size; i++) - { - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr0)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr1)); - _v0 = vmulq_f32(_v0, _scale0); - _v1 = vmulq_f32(_v1, _scale1); - vst1_s8(ptr, float2int8leakyrelu(_v0, _v1, _slope)); - - intptr0 += 4; - intptr1 += 4; - ptr += 8; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) - { - const int* intptr0 = bottom_blob.channel(q * 2); - const int* intptr1 = bottom_blob.channel(q * 2 + 1); - signed char* ptr = top_blob.channel(q); - - float32x4_t _scale_in0 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + q * 8); - float32x4_t _scale_in1 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + q * 8 + 4); - float32x4_t _scale_out0 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + q * 8); - float32x4_t _scale_out1 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + q * 8 + 4); - float32x4_t _bias0 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + q * 8); - float32x4_t _bias1 = bias_data_size == 1 ? 
vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + q * 8 + 4); - - float32x4_t _scale0 = vmulq_f32(_scale_in0, _scale_out0); - float32x4_t _scale1 = vmulq_f32(_scale_in1, _scale_out1); - _bias0 = vmulq_f32(_bias0, _scale_out0); - _bias1 = vmulq_f32(_bias1, _scale_out1); - float32x4_t _slope = vdupq_n_f32(slope); - - int i = 0; -#if __aarch64__ - for (; i + 3 < size; i += 4) - { - float32x4_t _v00 = vcvtq_f32_s32(vld1q_s32(intptr0)); - float32x4_t _v01 = vcvtq_f32_s32(vld1q_s32(intptr0 + 4)); - float32x4_t _v02 = vcvtq_f32_s32(vld1q_s32(intptr0 + 8)); - float32x4_t _v03 = vcvtq_f32_s32(vld1q_s32(intptr0 + 12)); - float32x4_t _v10 = vcvtq_f32_s32(vld1q_s32(intptr1)); - float32x4_t _v11 = vcvtq_f32_s32(vld1q_s32(intptr1 + 4)); - float32x4_t _v12 = vcvtq_f32_s32(vld1q_s32(intptr1 + 8)); - float32x4_t _v13 = vcvtq_f32_s32(vld1q_s32(intptr1 + 12)); - _v00 = vfmaq_f32(_bias0, _v00, _scale0); - _v01 = vfmaq_f32(_bias0, _v01, _scale0); - _v02 = vfmaq_f32(_bias0, _v02, _scale0); - _v03 = vfmaq_f32(_bias0, _v03, _scale0); - _v10 = vfmaq_f32(_bias1, _v10, _scale1); - _v11 = vfmaq_f32(_bias1, _v11, _scale1); - _v12 = vfmaq_f32(_bias1, _v12, _scale1); - _v13 = vfmaq_f32(_bias1, _v13, _scale1); - vst1_s8(ptr, float2int8leakyrelu(_v00, _v10, _slope)); - vst1_s8(ptr + 8, float2int8leakyrelu(_v01, _v11, _slope)); - vst1_s8(ptr + 16, float2int8leakyrelu(_v02, _v12, _slope)); - vst1_s8(ptr + 24, float2int8leakyrelu(_v03, _v13, _slope)); - - intptr0 += 16; - intptr1 += 16; - ptr += 32; - } -#endif // __aarch64__ - for (; i + 1 < size; i += 2) - { - float32x4_t _v00 = vcvtq_f32_s32(vld1q_s32(intptr0)); - float32x4_t _v01 = vcvtq_f32_s32(vld1q_s32(intptr0 + 4)); - float32x4_t _v10 = vcvtq_f32_s32(vld1q_s32(intptr1)); - float32x4_t _v11 = vcvtq_f32_s32(vld1q_s32(intptr1 + 4)); -#if __aarch64__ - _v00 = vfmaq_f32(_bias0, _v00, _scale0); - _v01 = vfmaq_f32(_bias0, _v01, _scale0); - _v10 = vfmaq_f32(_bias1, _v10, _scale1); - _v11 = vfmaq_f32(_bias1, _v11, _scale1); -#else // __aarch64__ - _v00 = vmlaq_f32(_bias0, _v00, _scale0); - _v01 = vmlaq_f32(_bias0, _v01, _scale0); - _v10 = vmlaq_f32(_bias1, _v10, _scale1); - _v11 = vmlaq_f32(_bias1, _v11, _scale1); -#endif // __aarch64__ - vst1_s8(ptr, float2int8leakyrelu(_v00, _v10, _slope)); - vst1_s8(ptr + 8, float2int8leakyrelu(_v01, _v11, _slope)); - - intptr0 += 8; - intptr1 += 8; - ptr += 16; - } - for (; i < size; i++) - { - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr0)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr1)); -#if __aarch64__ - _v0 = vfmaq_f32(_bias0, _v0, _scale0); - _v1 = vfmaq_f32(_bias1, _v1, _scale1); -#else // __aarch64__ - _v0 = vmlaq_f32(_bias0, _v0, _scale0); - _v1 = vmlaq_f32(_bias1, _v1, _scale1); -#endif // __aarch64__ - vst1_s8(ptr, float2int8leakyrelu(_v0, _v1, _slope)); - - intptr0 += 4; - intptr1 += 4; - ptr += 8; - } - } - } - } - if (out_elempack == 1) - { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - signed char* ptr0 = top_blob.channel(q * 4); - signed char* ptr1 = top_blob.channel(q * 4 + 1); - signed char* ptr2 = top_blob.channel(q * 4 + 2); - signed char* ptr3 = top_blob.channel(q * 4 + 3); - - float32x4_t _scale_in = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + q * 4); - float32x4_t _scale_out = scale_out_data_size == 1 ? 
vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + q * 4); - - float32x4_t _scale = vmulq_f32(_scale_in, _scale_out); - float32x4_t _slope = vdupq_n_f32(slope); - - int i = 0; - for (; i < size; i++) - { - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vmulq_f32(_v, _scale); - int8x8_t v = float2int8leakyrelu(_v, _v, _slope); - ptr0[0] = vget_lane_s8(v, 0); - ptr1[0] = vget_lane_s8(v, 1); - ptr2[0] = vget_lane_s8(v, 2); - ptr3[0] = vget_lane_s8(v, 3); - - intptr += 4; - ptr0 += 1; - ptr1 += 1; - ptr2 += 1; - ptr3 += 1; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - signed char* ptr0 = top_blob.channel(q * 4); - signed char* ptr1 = top_blob.channel(q * 4 + 1); - signed char* ptr2 = top_blob.channel(q * 4 + 2); - signed char* ptr3 = top_blob.channel(q * 4 + 3); - - float32x4_t _scale_in = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + q * 4); - float32x4_t _scale_out = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + q * 4); - float32x4_t _bias = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + q * 4); - - float32x4_t _scale = vmulq_f32(_scale_in, _scale_out); - _bias = vmulq_f32(_bias, _scale_out); - float32x4_t _slope = vdupq_n_f32(slope); - - int i = 0; - for (; i < size; i++) - { - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); -#if __aarch64__ - _v = vfmaq_f32(_bias, _v, _scale); -#else - _v = vmlaq_f32(_bias, _v, _scale); -#endif - int8x8_t v = float2int8leakyrelu(_v, _v, _slope); - ptr0[0] = vget_lane_s8(v, 0); - ptr1[0] = vget_lane_s8(v, 1); - ptr2[0] = vget_lane_s8(v, 2); - ptr3[0] = vget_lane_s8(v, 3); - - intptr += 4; - ptr0 += 1; - ptr1 += 1; - ptr2 += 1; - ptr3 += 1; - } - } - } - } -} diff --git a/src/layer/arm/requantize_leakyrelu_pack8.h b/src/layer/arm/requantize_leakyrelu_pack8.h deleted file mode 100644 index 9efee4dd36d..00000000000 --- a/src/layer/arm/requantize_leakyrelu_pack8.h +++ /dev/null @@ -1,202 +0,0 @@ -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. 
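// Editor's note: the deleted specialized kernels rely on the folding identity stated in
// their header comments: because scale_out is positive, multiplying by it commutes with
// (leaky) ReLU, so scale_out can be folded into the multiplier and the bias before the
// activation. A scalar sketch of the equivalence (illustrative names, not patch code):
static inline float leakyrelu_ref(float x, float slope)
{
    return x > 0.f ? x : x * slope;
}

// Unfused form: activation first, then the output scale.
static inline float requant_leakyrelu_unfused(int v, float scale_in, float bias,
                                              float scale_out, float slope)
{
    return leakyrelu_ref(v * scale_in + bias, slope) * scale_out;
}

// Fused form used by the kernels: scale_out folded into scale and bias, activation last.
// For scale_out > 0 both forms agree (up to floating-point rounding), because scaling by
// a positive constant does not change the sign test inside leaky ReLU.
static inline float requant_leakyrelu_fused(int v, float scale_in, float bias,
                                            float scale_out, float slope)
{
    const float scale = scale_in * scale_out;
    const float fused_bias = bias * scale_out;
    return leakyrelu_ref(v * scale + fused_bias, slope);
}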
- -static void requantize_leakyrelu_pack8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& scale_in_data, const Mat& scale_out_data, const Mat& bias_data, float slope, const Option& opt) -{ - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; - - int scale_in_data_size = scale_in_data.w; - int scale_out_data_size = scale_out_data.w; - int bias_data_size = bias_data.w; - - // int8(leakyrelu(v * scale_in, slope) * scale_out) - // int8_leakyrelu(v * (scale_in * scale_out), slope) - - // int8(leakyrelu(v * scale_in + bias, slope) * scale_out) - // int8_leakyrelu(v * (scale_in * scale_out) + (bias * scale_out), slope) - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - signed char* ptr = top_blob.channel(q); - - float32x4_t _scale_in0 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + q * 8); - float32x4_t _scale_in1 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + q * 8 + 4); - float32x4_t _scale_out0 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + q * 8); - float32x4_t _scale_out1 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + q * 8 + 4); - - float32x4_t _scale0 = vmulq_f32(_scale_in0, _scale_out0); - float32x4_t _scale1 = vmulq_f32(_scale_in1, _scale_out1); - float32x4_t _slope = vdupq_n_f32(slope); - - int i = 0; -#if __aarch64__ - for (; i + 3 < size; i += 4) - { - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4)); - float32x4_t _v2 = vcvtq_f32_s32(vld1q_s32(intptr + 8)); - float32x4_t _v3 = vcvtq_f32_s32(vld1q_s32(intptr + 12)); - float32x4_t _v4 = vcvtq_f32_s32(vld1q_s32(intptr + 16)); - float32x4_t _v5 = vcvtq_f32_s32(vld1q_s32(intptr + 20)); - float32x4_t _v6 = vcvtq_f32_s32(vld1q_s32(intptr + 24)); - float32x4_t _v7 = vcvtq_f32_s32(vld1q_s32(intptr + 28)); - _v0 = vmulq_f32(_v0, _scale0); - _v1 = vmulq_f32(_v1, _scale1); - _v2 = vmulq_f32(_v2, _scale0); - _v3 = vmulq_f32(_v3, _scale1); - _v4 = vmulq_f32(_v4, _scale0); - _v5 = vmulq_f32(_v5, _scale1); - _v6 = vmulq_f32(_v6, _scale0); - _v7 = vmulq_f32(_v7, _scale1); - vst1_s8(ptr, float2int8leakyrelu(_v0, _v1, _slope)); - vst1_s8(ptr + 8, float2int8leakyrelu(_v2, _v3, _slope)); - vst1_s8(ptr + 16, float2int8leakyrelu(_v4, _v5, _slope)); - vst1_s8(ptr + 24, float2int8leakyrelu(_v6, _v7, _slope)); - - intptr += 32; - ptr += 32; - } -#endif // __aarch64__ - for (; i + 1 < size; i += 2) - { - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4)); - float32x4_t _v2 = vcvtq_f32_s32(vld1q_s32(intptr + 8)); - float32x4_t _v3 = vcvtq_f32_s32(vld1q_s32(intptr + 12)); - _v0 = vmulq_f32(_v0, _scale0); - _v1 = vmulq_f32(_v1, _scale1); - _v2 = vmulq_f32(_v2, _scale0); - _v3 = vmulq_f32(_v3, _scale1); - vst1_s8(ptr, float2int8leakyrelu(_v0, _v1, _slope)); - vst1_s8(ptr + 8, float2int8leakyrelu(_v2, _v3, _slope)); - - intptr += 16; - ptr += 16; - } - for (; i < size; i++) - { - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4)); - _v0 = vmulq_f32(_v0, _scale0); - _v1 = vmulq_f32(_v1, _scale1); - vst1_s8(ptr, float2int8leakyrelu(_v0, _v1, _slope)); - - intptr += 8; - ptr += 8; - } - } - } - else - { 
- #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - signed char* ptr = top_blob.channel(q); - - float32x4_t _scale_in0 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + q * 8); - float32x4_t _scale_in1 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + q * 8 + 4); - float32x4_t _scale_out0 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + q * 8); - float32x4_t _scale_out1 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + q * 8 + 4); - float32x4_t _bias0 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + q * 8); - float32x4_t _bias1 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + q * 8 + 4); - - float32x4_t _scale0 = vmulq_f32(_scale_in0, _scale_out0); - float32x4_t _scale1 = vmulq_f32(_scale_in1, _scale_out1); - _bias0 = vmulq_f32(_bias0, _scale_out0); - _bias1 = vmulq_f32(_bias1, _scale_out1); - float32x4_t _slope = vdupq_n_f32(slope); - - int i = 0; -#if __aarch64__ - for (; i + 3 < size; i += 4) - { - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4)); - float32x4_t _v2 = vcvtq_f32_s32(vld1q_s32(intptr + 8)); - float32x4_t _v3 = vcvtq_f32_s32(vld1q_s32(intptr + 12)); - float32x4_t _v4 = vcvtq_f32_s32(vld1q_s32(intptr + 16)); - float32x4_t _v5 = vcvtq_f32_s32(vld1q_s32(intptr + 20)); - float32x4_t _v6 = vcvtq_f32_s32(vld1q_s32(intptr + 24)); - float32x4_t _v7 = vcvtq_f32_s32(vld1q_s32(intptr + 28)); - - _v0 = vfmaq_f32(_bias0, _v0, _scale0); - _v1 = vfmaq_f32(_bias1, _v1, _scale1); - _v2 = vfmaq_f32(_bias0, _v2, _scale0); - _v3 = vfmaq_f32(_bias1, _v3, _scale1); - _v4 = vfmaq_f32(_bias0, _v4, _scale0); - _v5 = vfmaq_f32(_bias1, _v5, _scale1); - _v6 = vfmaq_f32(_bias0, _v6, _scale0); - _v7 = vfmaq_f32(_bias1, _v7, _scale1); - - vst1_s8(ptr, float2int8leakyrelu(_v0, _v1, _slope)); - vst1_s8(ptr + 8, float2int8leakyrelu(_v2, _v3, _slope)); - vst1_s8(ptr + 16, float2int8leakyrelu(_v4, _v5, _slope)); - vst1_s8(ptr + 24, float2int8leakyrelu(_v6, _v7, _slope)); - - intptr += 32; - ptr += 32; - } -#endif // __aarch64__ - for (; i + 1 < size; i += 2) - { - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4)); - float32x4_t _v2 = vcvtq_f32_s32(vld1q_s32(intptr + 8)); - float32x4_t _v3 = vcvtq_f32_s32(vld1q_s32(intptr + 12)); - -#if __aarch64__ - _v0 = vfmaq_f32(_bias0, _v0, _scale0); - _v1 = vfmaq_f32(_bias1, _v1, _scale1); - _v2 = vfmaq_f32(_bias0, _v2, _scale0); - _v3 = vfmaq_f32(_bias1, _v3, _scale1); -#else // __aarch64__ - _v0 = vmlaq_f32(_bias0, _v0, _scale0); - _v1 = vmlaq_f32(_bias1, _v1, _scale1); - _v2 = vmlaq_f32(_bias0, _v2, _scale0); - _v3 = vmlaq_f32(_bias1, _v3, _scale1); -#endif // __aarch64__ - - vst1_s8(ptr, float2int8leakyrelu(_v0, _v1, _slope)); - vst1_s8(ptr + 8, float2int8leakyrelu(_v2, _v3, _slope)); - - intptr += 16; - ptr += 16; - } - for (; i < size; i++) - { - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4)); -#if __aarch64__ - _v0 = vfmaq_f32(_bias0, _v0, _scale0); - _v1 = vfmaq_f32(_bias1, _v1, _scale1); -#else // __aarch64__ - _v0 = vmlaq_f32(_bias0, _v0, _scale0); - _v1 = vmlaq_f32(_bias1, _v1, _scale1); -#endif // __aarch64__ - 
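// Editor's note: the #if __aarch64__ blocks in these kernels all follow one pattern:
// use the fused multiply-add intrinsic vfmaq_f32 on AArch64 builds and fall back to
// vmlaq_f32 on 32-bit ARM builds. A standalone restatement of that pattern, as a sketch
// (madd_f32 is an illustrative wrapper, not part of the patch):
#include <arm_neon.h>

static inline float32x4_t madd_f32(float32x4_t acc, float32x4_t a, float32x4_t b)
{
#if __aarch64__
    return vfmaq_f32(acc, a, b); // acc + a * b, fused multiply-add
#else
    return vmlaq_f32(acc, a, b); // acc + a * b, multiply-accumulate fallback
#endif
}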
vst1_s8(ptr, float2int8leakyrelu(_v0, _v1, _slope)); - - intptr += 8; - ptr += 8; - } - } - } -} diff --git a/src/layer/arm/requantize_relu_pack4.h b/src/layer/arm/requantize_relu_pack4.h deleted file mode 100644 index 961562a3184..00000000000 --- a/src/layer/arm/requantize_relu_pack4.h +++ /dev/null @@ -1,271 +0,0 @@ -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. - -static void requantize_relu_pack4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& scale_in_data, const Mat& scale_out_data, const Mat& bias_data, const Option& opt) -{ - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; - int outc = top_blob.c; - int out_elempack = top_blob.elempack; - - int scale_in_data_size = scale_in_data.w; - int scale_out_data_size = scale_out_data.w; - int bias_data_size = bias_data.w; - - // int8(relu(v * scale_in) * scale_out) - // int8_relu(v * (scale_in * scale_out)) - - // int8(relu(v * scale_in + bias) * scale_out) - // int8_relu(v * (scale_in * scale_out) + (bias * scale_out)) - - if (out_elempack == 8) - { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) - { - const int* intptr0 = bottom_blob.channel(q * 2); - const int* intptr1 = bottom_blob.channel(q * 2 + 1); - signed char* ptr = top_blob.channel(q); - - float32x4_t _scale_in0 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + q * 8); - float32x4_t _scale_in1 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + q * 8 + 4); - float32x4_t _scale_out0 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + q * 8); - float32x4_t _scale_out1 = scale_out_data_size == 1 ? 
vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + q * 8 + 4); - - float32x4_t _scale0 = vmulq_f32(_scale_in0, _scale_out0); - float32x4_t _scale1 = vmulq_f32(_scale_in1, _scale_out1); - - int i = 0; -#if __aarch64__ - for (; i + 3 < size; i += 4) - { - float32x4_t _v00 = vcvtq_f32_s32(vld1q_s32(intptr0)); - float32x4_t _v01 = vcvtq_f32_s32(vld1q_s32(intptr0 + 4)); - float32x4_t _v02 = vcvtq_f32_s32(vld1q_s32(intptr0 + 8)); - float32x4_t _v03 = vcvtq_f32_s32(vld1q_s32(intptr0 + 12)); - float32x4_t _v10 = vcvtq_f32_s32(vld1q_s32(intptr1)); - float32x4_t _v11 = vcvtq_f32_s32(vld1q_s32(intptr1 + 4)); - float32x4_t _v12 = vcvtq_f32_s32(vld1q_s32(intptr1 + 8)); - float32x4_t _v13 = vcvtq_f32_s32(vld1q_s32(intptr1 + 12)); - _v00 = vmulq_f32(_v00, _scale0); - _v01 = vmulq_f32(_v01, _scale0); - _v02 = vmulq_f32(_v02, _scale0); - _v03 = vmulq_f32(_v03, _scale0); - _v10 = vmulq_f32(_v10, _scale1); - _v11 = vmulq_f32(_v11, _scale1); - _v12 = vmulq_f32(_v12, _scale1); - _v13 = vmulq_f32(_v13, _scale1); - vst1_s8(ptr, float2int8relu(_v00, _v10)); - vst1_s8(ptr + 8, float2int8relu(_v01, _v11)); - vst1_s8(ptr + 16, float2int8relu(_v02, _v12)); - vst1_s8(ptr + 24, float2int8relu(_v03, _v13)); - - intptr0 += 16; - intptr1 += 16; - ptr += 32; - } -#endif // __aarch64__ - for (; i < size; i++) - { - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr0)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr1)); - _v0 = vmulq_f32(_v0, _scale0); - _v1 = vmulq_f32(_v1, _scale1); - vst1_s8(ptr, float2int8relu(_v0, _v1)); - - intptr0 += 4; - intptr1 += 4; - ptr += 8; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) - { - const int* intptr0 = bottom_blob.channel(q * 2); - const int* intptr1 = bottom_blob.channel(q * 2 + 1); - signed char* ptr = top_blob.channel(q); - - float32x4_t _scale_in0 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + q * 8); - float32x4_t _scale_in1 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + q * 8 + 4); - float32x4_t _scale_out0 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + q * 8); - float32x4_t _scale_out1 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + q * 8 + 4); - float32x4_t _bias0 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + q * 8); - float32x4_t _bias1 = bias_data_size == 1 ? 
vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + q * 8 + 4); - - float32x4_t _scale0 = vmulq_f32(_scale_in0, _scale_out0); - float32x4_t _scale1 = vmulq_f32(_scale_in1, _scale_out1); - _bias0 = vmulq_f32(_bias0, _scale_out0); - _bias1 = vmulq_f32(_bias1, _scale_out1); - - int i = 0; -#if __aarch64__ - for (; i + 3 < size; i += 4) - { - float32x4_t _v00 = vcvtq_f32_s32(vld1q_s32(intptr0)); - float32x4_t _v01 = vcvtq_f32_s32(vld1q_s32(intptr0 + 4)); - float32x4_t _v02 = vcvtq_f32_s32(vld1q_s32(intptr0 + 8)); - float32x4_t _v03 = vcvtq_f32_s32(vld1q_s32(intptr0 + 12)); - float32x4_t _v10 = vcvtq_f32_s32(vld1q_s32(intptr1)); - float32x4_t _v11 = vcvtq_f32_s32(vld1q_s32(intptr1 + 4)); - float32x4_t _v12 = vcvtq_f32_s32(vld1q_s32(intptr1 + 8)); - float32x4_t _v13 = vcvtq_f32_s32(vld1q_s32(intptr1 + 12)); - _v00 = vfmaq_f32(_bias0, _v00, _scale0); - _v01 = vfmaq_f32(_bias0, _v01, _scale0); - _v02 = vfmaq_f32(_bias0, _v02, _scale0); - _v03 = vfmaq_f32(_bias0, _v03, _scale0); - _v10 = vfmaq_f32(_bias1, _v10, _scale1); - _v11 = vfmaq_f32(_bias1, _v11, _scale1); - _v12 = vfmaq_f32(_bias1, _v12, _scale1); - _v13 = vfmaq_f32(_bias1, _v13, _scale1); - vst1_s8(ptr, float2int8relu(_v00, _v10)); - vst1_s8(ptr + 8, float2int8relu(_v01, _v11)); - vst1_s8(ptr + 16, float2int8relu(_v02, _v12)); - vst1_s8(ptr + 24, float2int8relu(_v03, _v13)); - - intptr0 += 16; - intptr1 += 16; - ptr += 32; - } -#endif // __aarch64__ - for (; i + 1 < size; i += 2) - { - float32x4_t _v00 = vcvtq_f32_s32(vld1q_s32(intptr0)); - float32x4_t _v01 = vcvtq_f32_s32(vld1q_s32(intptr0 + 4)); - float32x4_t _v10 = vcvtq_f32_s32(vld1q_s32(intptr1)); - float32x4_t _v11 = vcvtq_f32_s32(vld1q_s32(intptr1 + 4)); -#if __aarch64__ - _v00 = vfmaq_f32(_bias0, _v00, _scale0); - _v01 = vfmaq_f32(_bias0, _v01, _scale0); - _v10 = vfmaq_f32(_bias1, _v10, _scale1); - _v11 = vfmaq_f32(_bias1, _v11, _scale1); -#else // __aarch64__ - _v00 = vmlaq_f32(_bias0, _v00, _scale0); - _v01 = vmlaq_f32(_bias0, _v01, _scale0); - _v10 = vmlaq_f32(_bias1, _v10, _scale1); - _v11 = vmlaq_f32(_bias1, _v11, _scale1); -#endif // __aarch64__ - vst1_s8(ptr, float2int8relu(_v00, _v10)); - vst1_s8(ptr + 8, float2int8relu(_v01, _v11)); - - intptr0 += 8; - intptr1 += 8; - ptr += 16; - } - for (; i < size; i++) - { - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr0)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr1)); -#if __aarch64__ - _v0 = vfmaq_f32(_bias0, _v0, _scale0); - _v1 = vfmaq_f32(_bias1, _v1, _scale1); -#else // __aarch64__ - _v0 = vmlaq_f32(_bias0, _v0, _scale0); - _v1 = vmlaq_f32(_bias1, _v1, _scale1); -#endif // __aarch64__ - vst1_s8(ptr, float2int8relu(_v0, _v1)); - - intptr0 += 4; - intptr1 += 4; - ptr += 8; - } - } - } - } - if (out_elempack == 1) - { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - signed char* ptr0 = top_blob.channel(q * 4); - signed char* ptr1 = top_blob.channel(q * 4 + 1); - signed char* ptr2 = top_blob.channel(q * 4 + 2); - signed char* ptr3 = top_blob.channel(q * 4 + 3); - - float32x4_t _scale_in = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + q * 4); - float32x4_t _scale_out = scale_out_data_size == 1 ? 
vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + q * 4); - - float32x4_t _scale = vmulq_f32(_scale_in, _scale_out); - - int i = 0; - for (; i < size; i++) - { - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vmulq_f32(_v, _scale); - int8x8_t v = float2int8relu(_v, _v); - ptr0[0] = vget_lane_s8(v, 0); - ptr1[0] = vget_lane_s8(v, 1); - ptr2[0] = vget_lane_s8(v, 2); - ptr3[0] = vget_lane_s8(v, 3); - - intptr += 4; - ptr0 += 1; - ptr1 += 1; - ptr2 += 1; - ptr3 += 1; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - signed char* ptr0 = top_blob.channel(q * 4); - signed char* ptr1 = top_blob.channel(q * 4 + 1); - signed char* ptr2 = top_blob.channel(q * 4 + 2); - signed char* ptr3 = top_blob.channel(q * 4 + 3); - - float32x4_t _scale_in = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + q * 4); - float32x4_t _scale_out = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + q * 4); - float32x4_t _bias = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + q * 4); - - float32x4_t _scale = vmulq_f32(_scale_in, _scale_out); - _bias = vmulq_f32(_bias, _scale_out); - - int i = 0; - for (; i < size; i++) - { - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); -#if __aarch64__ - _v = vfmaq_f32(_bias, _v, _scale); -#else - _v = vmlaq_f32(_bias, _v, _scale); -#endif - int8x8_t v = float2int8relu(_v, _v); - ptr0[0] = vget_lane_s8(v, 0); - ptr1[0] = vget_lane_s8(v, 1); - ptr2[0] = vget_lane_s8(v, 2); - ptr3[0] = vget_lane_s8(v, 3); - - intptr += 4; - ptr0 += 1; - ptr1 += 1; - ptr2 += 1; - ptr3 += 1; - } - } - } - } -} diff --git a/src/layer/arm/requantize_relu_pack8.h b/src/layer/arm/requantize_relu_pack8.h deleted file mode 100644 index e93ec83ba3f..00000000000 --- a/src/layer/arm/requantize_relu_pack8.h +++ /dev/null @@ -1,200 +0,0 @@ -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. 
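// Editor's note: index sketch of the repacking done by the deleted *_pack4 kernels above,
// assuming the usual ncnn packed layout where element e of pixel i in channel q lives at
// offset i * elempack + e of that channel (f() stands for the requantize step).
//
// out_elempack == 8: output channel q interleaves input pack4 channels 2q and 2q+1:
//   out[q][i * 8 + e]     = f(in[2q    ][i * 4 + e])   // e = 0..3
//   out[q][i * 8 + 4 + e] = f(in[2q + 1][i * 4 + e])   // e = 0..3
//
// out_elempack == 1: input pack4 channel q scatters its four lanes to four channels:
//   out[4q + e][i] = f(in[q][i * 4 + e])               // e = 0..3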
- -static void requantize_relu_pack8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& scale_in_data, const Mat& scale_out_data, const Mat& bias_data, const Option& opt) -{ - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; - - int scale_in_data_size = scale_in_data.w; - int scale_out_data_size = scale_out_data.w; - int bias_data_size = bias_data.w; - - // int8(relu(v * scale_in) * scale_out) - // int8_relu(v * (scale_in * scale_out)) - - // int8(relu(v * scale_in + bias) * scale_out) - // int8_relu(v * (scale_in * scale_out) + (bias * scale_out)) - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - signed char* ptr = top_blob.channel(q); - - float32x4_t _scale_in0 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + q * 8); - float32x4_t _scale_in1 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + q * 8 + 4); - float32x4_t _scale_out0 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + q * 8); - float32x4_t _scale_out1 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + q * 8 + 4); - - float32x4_t _scale0 = vmulq_f32(_scale_in0, _scale_out0); - float32x4_t _scale1 = vmulq_f32(_scale_in1, _scale_out1); - - int i = 0; -#if __aarch64__ - for (; i + 3 < size; i += 4) - { - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4)); - float32x4_t _v2 = vcvtq_f32_s32(vld1q_s32(intptr + 8)); - float32x4_t _v3 = vcvtq_f32_s32(vld1q_s32(intptr + 12)); - float32x4_t _v4 = vcvtq_f32_s32(vld1q_s32(intptr + 16)); - float32x4_t _v5 = vcvtq_f32_s32(vld1q_s32(intptr + 20)); - float32x4_t _v6 = vcvtq_f32_s32(vld1q_s32(intptr + 24)); - float32x4_t _v7 = vcvtq_f32_s32(vld1q_s32(intptr + 28)); - _v0 = vmulq_f32(_v0, _scale0); - _v1 = vmulq_f32(_v1, _scale1); - _v2 = vmulq_f32(_v2, _scale0); - _v3 = vmulq_f32(_v3, _scale1); - _v4 = vmulq_f32(_v4, _scale0); - _v5 = vmulq_f32(_v5, _scale1); - _v6 = vmulq_f32(_v6, _scale0); - _v7 = vmulq_f32(_v7, _scale1); - vst1_s8(ptr, float2int8relu(_v0, _v1)); - vst1_s8(ptr + 8, float2int8relu(_v2, _v3)); - vst1_s8(ptr + 16, float2int8relu(_v4, _v5)); - vst1_s8(ptr + 24, float2int8relu(_v6, _v7)); - - intptr += 32; - ptr += 32; - } -#endif // __aarch64__ - for (; i + 1 < size; i += 2) - { - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4)); - float32x4_t _v2 = vcvtq_f32_s32(vld1q_s32(intptr + 8)); - float32x4_t _v3 = vcvtq_f32_s32(vld1q_s32(intptr + 12)); - _v0 = vmulq_f32(_v0, _scale0); - _v1 = vmulq_f32(_v1, _scale1); - _v2 = vmulq_f32(_v2, _scale0); - _v3 = vmulq_f32(_v3, _scale1); - vst1_s8(ptr, float2int8relu(_v0, _v1)); - vst1_s8(ptr + 8, float2int8relu(_v2, _v3)); - - intptr += 16; - ptr += 16; - } - for (; i < size; i++) - { - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4)); - _v0 = vmulq_f32(_v0, _scale0); - _v1 = vmulq_f32(_v1, _scale1); - vst1_s8(ptr, float2int8relu(_v0, _v1)); - - intptr += 8; - ptr += 8; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - signed char* ptr = top_blob.channel(q); - - 
float32x4_t _scale_in0 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + q * 8); - float32x4_t _scale_in1 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + q * 8 + 4); - float32x4_t _scale_out0 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + q * 8); - float32x4_t _scale_out1 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + q * 8 + 4); - float32x4_t _bias0 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + q * 8); - float32x4_t _bias1 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + q * 8 + 4); - - float32x4_t _scale0 = vmulq_f32(_scale_in0, _scale_out0); - float32x4_t _scale1 = vmulq_f32(_scale_in1, _scale_out1); - _bias0 = vmulq_f32(_bias0, _scale_out0); - _bias1 = vmulq_f32(_bias1, _scale_out1); - - int i = 0; -#if __aarch64__ - for (; i + 3 < size; i += 4) - { - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4)); - float32x4_t _v2 = vcvtq_f32_s32(vld1q_s32(intptr + 8)); - float32x4_t _v3 = vcvtq_f32_s32(vld1q_s32(intptr + 12)); - float32x4_t _v4 = vcvtq_f32_s32(vld1q_s32(intptr + 16)); - float32x4_t _v5 = vcvtq_f32_s32(vld1q_s32(intptr + 20)); - float32x4_t _v6 = vcvtq_f32_s32(vld1q_s32(intptr + 24)); - float32x4_t _v7 = vcvtq_f32_s32(vld1q_s32(intptr + 28)); - - _v0 = vfmaq_f32(_bias0, _v0, _scale0); - _v1 = vfmaq_f32(_bias1, _v1, _scale1); - _v2 = vfmaq_f32(_bias0, _v2, _scale0); - _v3 = vfmaq_f32(_bias1, _v3, _scale1); - _v4 = vfmaq_f32(_bias0, _v4, _scale0); - _v5 = vfmaq_f32(_bias1, _v5, _scale1); - _v6 = vfmaq_f32(_bias0, _v6, _scale0); - _v7 = vfmaq_f32(_bias1, _v7, _scale1); - - vst1_s8(ptr, float2int8relu(_v0, _v1)); - vst1_s8(ptr + 8, float2int8relu(_v2, _v3)); - vst1_s8(ptr + 16, float2int8relu(_v4, _v5)); - vst1_s8(ptr + 24, float2int8relu(_v6, _v7)); - - intptr += 32; - ptr += 32; - } -#endif // __aarch64__ - for (; i + 1 < size; i += 2) - { - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4)); - float32x4_t _v2 = vcvtq_f32_s32(vld1q_s32(intptr + 8)); - float32x4_t _v3 = vcvtq_f32_s32(vld1q_s32(intptr + 12)); - -#if __aarch64__ - _v0 = vfmaq_f32(_bias0, _v0, _scale0); - _v1 = vfmaq_f32(_bias1, _v1, _scale1); - _v2 = vfmaq_f32(_bias0, _v2, _scale0); - _v3 = vfmaq_f32(_bias1, _v3, _scale1); -#else // __aarch64__ - _v0 = vmlaq_f32(_bias0, _v0, _scale0); - _v1 = vmlaq_f32(_bias1, _v1, _scale1); - _v2 = vmlaq_f32(_bias0, _v2, _scale0); - _v3 = vmlaq_f32(_bias1, _v3, _scale1); -#endif // __aarch64__ - - vst1_s8(ptr, float2int8relu(_v0, _v1)); - vst1_s8(ptr + 8, float2int8relu(_v2, _v3)); - - intptr += 16; - ptr += 16; - } - for (; i < size; i++) - { - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4)); -#if __aarch64__ - _v0 = vfmaq_f32(_bias0, _v0, _scale0); - _v1 = vfmaq_f32(_bias1, _v1, _scale1); -#else // __aarch64__ - _v0 = vmlaq_f32(_bias0, _v0, _scale0); - _v1 = vmlaq_f32(_bias1, _v1, _scale1); -#endif // __aarch64__ - vst1_s8(ptr, float2int8relu(_v0, _v1)); - - intptr += 8; - ptr += 8; - } - } - } -} diff --git a/src/layer/loongarch/requantize_leakyrelu_pack4.h b/src/layer/loongarch/requantize_leakyrelu_pack4.h deleted file mode 100644 index d6b49942660..00000000000 --- a/src/layer/loongarch/requantize_leakyrelu_pack4.h 
+++ /dev/null @@ -1,271 +0,0 @@ -// yala is pleased to support the open source community by making ncnn available. -// -// -// Copyright (C) 2022 yala ;. All rights reserved. -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. - -static void requantize_leakyrelu_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& scale_in_data, const Mat& scale_out_data, const Mat& bias_data, float slope, const Option& opt) -{ - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; - int outc = top_blob.c; - int out_elempack = top_blob.elempack; - - int scale_in_data_size = scale_in_data.w; - int scale_out_data_size = scale_out_data.w; - int bias_data_size = bias_data.w; - - // int8(leakyrelu(v * scale_in, slope) * scale_out) - // int8_leakyrelu(v * (scale_in * scale_out), slope) - - // int8(leakyrelu(v * scale_in + bias, slope) * scale_out) - // int8_leakyrelu(v * (scale_in * scale_out) + (bias * scale_out), slope) - - if (out_elempack == 8) - { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) - { - const int* intptr0 = bottom_blob.channel(q * 2); - const int* intptr1 = bottom_blob.channel(q * 2 + 1); - signed char* ptr = top_blob.channel(q); - - __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); - __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); - __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); - __m128 _scale_out1 = scale_out_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); - - __m128 _scale0 = __lsx_vfmul_s(_scale_in0, _scale_out0); - __m128 _scale1 = __lsx_vfmul_s(_scale_in1, _scale_out1); - __m128 _slope = (__m128)__lsx_vreplfr2vr_s(slope); - - int i = 0; - for (; i + 3 < size; i += 4) - { - __builtin_prefetch(intptr0 + 64); - __builtin_prefetch(intptr1 + 64); - __m128 _v00 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); - __m128 _v01 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 4, 0)); - __m128 _v02 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 8, 0)); - __m128 _v03 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 12, 0)); - __m128 _v10 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); - __m128 _v11 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 4, 0)); - __m128 _v12 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 8, 0)); - __m128 _v13 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 12, 0)); - _v00 = __lsx_vfmul_s(_v00, _scale0); - _v01 = __lsx_vfmul_s(_v01, _scale0); - _v02 = __lsx_vfmul_s(_v02, _scale0); - _v03 = __lsx_vfmul_s(_v03, _scale0); - _v10 = __lsx_vfmul_s(_v10, _scale1); - _v11 = __lsx_vfmul_s(_v11, _scale1); - _v12 = __lsx_vfmul_s(_v12, _scale1); - _v13 = __lsx_vfmul_s(_v13, _scale1); - *((int64_t*)ptr) = float2int8leakyrelu(_v00, _v10, _slope); - *((int64_t*)(ptr + 8)) = float2int8leakyrelu(_v01, _v11, _slope); - *((int64_t*)(ptr + 16)) = float2int8leakyrelu(_v02, _v12, _slope); - *((int64_t*)(ptr + 24)) = float2int8leakyrelu(_v03, _v13, _slope); - - intptr0 += 16; - intptr1 += 16; - ptr += 32; - } - for (; i < size; i++) - { - __builtin_prefetch(intptr0 + 16); - __builtin_prefetch(intptr1 + 16); - __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); - __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); - _v0 = __lsx_vfmul_s(_v0, _scale0); - _v1 = __lsx_vfmul_s(_v1, _scale1); - *((int64_t*)ptr) = float2int8leakyrelu(_v0, _v1, _slope); - - intptr0 += 4; - intptr1 += 4; - ptr += 8; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) - { - const int* intptr0 = bottom_blob.channel(q * 2); - const int* intptr1 = bottom_blob.channel(q * 2 + 1); - signed char* ptr = top_blob.channel(q); - - __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); - __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); - __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); - __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); - __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8, 0); - __m128 _bias1 = bias_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8 + 4, 0); - - __m128 _scale0 = __lsx_vfmul_s(_scale_in0, _scale_out0); - __m128 _scale1 = __lsx_vfmul_s(_scale_in1, _scale_out1); - _bias0 = __lsx_vfmul_s(_bias0, _scale_out0); - _bias1 = __lsx_vfmul_s(_bias1, _scale_out1); - __m128 _slope = (__m128)__lsx_vreplfr2vr_s(slope); - - int i = 0; - for (; i + 3 < size; i += 4) - { - __builtin_prefetch(intptr0 + 64); - __builtin_prefetch(intptr1 + 64); - __m128 _v00 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); - __m128 _v01 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 4, 0)); - __m128 _v02 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 8, 0)); - __m128 _v03 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 12, 0)); - __m128 _v10 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); - __m128 _v11 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 4, 0)); - __m128 _v12 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 8, 0)); - __m128 _v13 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 12, 0)); - _v00 = __lsx_vfmadd_s(_scale0, _v00, _bias0); - _v01 = __lsx_vfmadd_s(_scale0, _v01, _bias0); - _v02 = __lsx_vfmadd_s(_scale0, _v02, _bias0); - _v03 = __lsx_vfmadd_s(_scale0, _v03, _bias0); - _v10 = __lsx_vfmadd_s(_scale1, _v10, _bias1); - _v11 = __lsx_vfmadd_s(_scale1, _v11, _bias1); - _v12 = __lsx_vfmadd_s(_scale1, _v12, _bias1); - _v13 = __lsx_vfmadd_s(_scale1, _v13, _bias1); - *((int64_t*)ptr) = float2int8leakyrelu(_v00, _v10, _slope); - *((int64_t*)(ptr + 8)) = float2int8leakyrelu(_v01, _v11, _slope); - *((int64_t*)(ptr + 16)) = float2int8leakyrelu(_v02, _v12, _slope); - *((int64_t*)(ptr + 24)) = float2int8leakyrelu(_v03, _v13, _slope); - - intptr0 += 16; - intptr1 += 16; - ptr += 32; - } - for (; i + 1 < size; i += 2) - { - __builtin_prefetch(intptr0 + 32); - __builtin_prefetch(intptr1 + 32); - __m128 _v00 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); - __m128 _v01 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 4, 0)); - __m128 _v10 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); - __m128 _v11 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 4, 0)); - _v00 = __lsx_vfmadd_s(_scale0, _v00, _bias0); - _v01 = __lsx_vfmadd_s(_scale0, _v01, _bias0); - _v10 = __lsx_vfmadd_s(_scale1, _v10, _bias1); - _v11 = __lsx_vfmadd_s(_scale1, _v11, _bias1); - *((int64_t*)ptr) = float2int8leakyrelu(_v00, _v10, _slope); - *((int64_t*)(ptr + 8)) = float2int8leakyrelu(_v01, _v11, _slope); - - intptr0 += 8; - intptr1 += 8; - ptr += 16; - } - for (; i < size; i++) - { - __builtin_prefetch(intptr0 + 16); - __builtin_prefetch(intptr1 + 16); - __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); - __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); - _v0 = __lsx_vfmadd_s(_scale0, _v0, _bias0); - _v1 = __lsx_vfmadd_s(_scale1, _v1, _bias1); - *((int64_t*)ptr) = float2int8leakyrelu(_v0, _v1, _slope); - - intptr0 += 4; - intptr1 += 4; - ptr += 8; - } - } - } - } - if (out_elempack == 1) - { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - signed char* ptr0 = top_blob.channel(q * 4); - signed char* ptr1 = top_blob.channel(q * 4 + 1); - signed char* ptr2 = top_blob.channel(q * 4 + 2); - signed char* ptr3 = top_blob.channel(q * 4 + 3); - signed char* vp; - - __m128 _scale_in = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 4, 0); - __m128 _scale_out = scale_out_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 4, 0); - - __m128 _scale = __lsx_vfmul_s(_scale_in, _scale_out); - __m128 _slope = (__m128)__lsx_vreplfr2vr_s(slope); - - int i = 0; - for (; i < size; i++) - { - __builtin_prefetch(intptr + 16); - __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - _v = __lsx_vfmul_s(_v, _scale); - __m128i v = float2int8leakyrelu(_v, _slope); - vp = (signed char*)&v; - ptr0[0] = vp[0]; - ptr1[0] = vp[1]; - ptr2[0] = vp[2]; - ptr3[0] = vp[3]; - - intptr += 4; - ptr0 += 1; - ptr1 += 1; - ptr2 += 1; - ptr3 += 1; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - signed char* ptr0 = top_blob.channel(q * 4); - signed char* ptr1 = top_blob.channel(q * 4 + 1); - signed char* ptr2 = top_blob.channel(q * 4 + 2); - signed char* ptr3 = top_blob.channel(q * 4 + 3); - signed char* vp; - - __m128 _scale_in = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 4, 0); - __m128 _scale_out = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 4, 0); - __m128 _bias = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 4, 0); - - __m128 _scale = __lsx_vfmul_s(_scale_in, _scale_out); - _bias = __lsx_vfmul_s(_bias, _scale_out); - __m128 _slope = (__m128)__lsx_vreplfr2vr_s(slope); - - int i = 0; - for (; i < size; i++) - { - __builtin_prefetch(intptr + 16); - __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - _v = __lsx_vfmadd_s(_scale, _v, _bias); - __m128i v = float2int8leakyrelu(_v, _slope); - vp = (signed char*)&v; - ptr0[0] = vp[0]; - ptr1[0] = vp[1]; - ptr2[0] = vp[2]; - ptr3[0] = vp[3]; - - intptr += 4; - ptr0 += 1; - ptr1 += 1; - ptr2 += 1; - ptr3 += 1; - } - } - } - } -} diff --git a/src/layer/loongarch/requantize_leakyrelu_pack8.h b/src/layer/loongarch/requantize_leakyrelu_pack8.h deleted file mode 100644 index a2c4faed4f2..00000000000 --- a/src/layer/loongarch/requantize_leakyrelu_pack8.h +++ /dev/null @@ -1,188 +0,0 @@ -// yala is pleased to support the open source community by making ncnn available. -// -// -// Copyright (C) 2022 yala ;. All rights reserved. -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. 
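// Editor's note: two LoongArch LSX details worth calling out in the deleted kernels above,
// stated as a sketch rather than a definitive reference.
// 1) Operand order: as used here, __lsx_vfmadd_s(_scale, _v, _bias) computes
//    _scale * _v + _bias per lane, i.e. it plays the same role as NEON's
//    vmlaq_f32(_bias, _v, _scale) in the ARM kernels.
// 2) Lane extraction: the out_elempack == 1 path stores the __m128i result to a local and
//    reads it back through a signed char* alias (vp) to scatter the four int8 lanes to
//    four output channels, instead of using a per-lane extract intrinsic.
static inline float lsx_fmadd_lane_ref(float scale, float v, float bias)
{
    return scale * v + bias; // per-lane effect of __lsx_vfmadd_s(_scale, _v, _bias)
}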
- -static void requantize_leakyrelu_pack8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& scale_in_data, const Mat& scale_out_data, const Mat& bias_data, float slope, const Option& opt) -{ - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; - - int scale_in_data_size = scale_in_data.w; - int scale_out_data_size = scale_out_data.w; - int bias_data_size = bias_data.w; - - // int8(leakyrelu(v * scale_in, slope) * scale_out) - // int8_leakyrelu(v * (scale_in * scale_out), slope) - - // int8(leakyrelu(v * scale_in + bias, slope) * scale_out) - // int8_leakyrelu(v * (scale_in * scale_out) + (bias * scale_out), slope) - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - signed char* ptr = top_blob.channel(q); - - __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); - __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); - __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); - __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); - - __m128 _scale0 = __lsx_vfmul_s(_scale_in0, _scale_out0); - __m128 _scale1 = __lsx_vfmul_s(_scale_in1, _scale_out1); - __m128 _slope = (__m128)__lsx_vreplfr2vr_s(slope); - - int i = 0; - for (; i + 3 < size; i += 4) - { - __builtin_prefetch(intptr + 128); - __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); - __m128 _v2 = __lsx_vffint_s_w(__lsx_vld(intptr + 8, 0)); - __m128 _v3 = __lsx_vffint_s_w(__lsx_vld(intptr + 12, 0)); - __m128 _v4 = __lsx_vffint_s_w(__lsx_vld(intptr + 16, 0)); - __m128 _v5 = __lsx_vffint_s_w(__lsx_vld(intptr + 20, 0)); - __m128 _v6 = __lsx_vffint_s_w(__lsx_vld(intptr + 24, 0)); - __m128 _v7 = __lsx_vffint_s_w(__lsx_vld(intptr + 28, 0)); - _v0 = __lsx_vfmul_s(_v0, _scale0); - _v1 = __lsx_vfmul_s(_v1, _scale1); - _v2 = __lsx_vfmul_s(_v2, _scale0); - _v3 = __lsx_vfmul_s(_v3, _scale1); - _v4 = __lsx_vfmul_s(_v4, _scale0); - _v5 = __lsx_vfmul_s(_v5, _scale1); - _v6 = __lsx_vfmul_s(_v6, _scale0); - _v7 = __lsx_vfmul_s(_v7, _scale1); - *((int64_t*)ptr) = float2int8leakyrelu(_v0, _v1, _slope); - *((int64_t*)(ptr + 8)) = float2int8leakyrelu(_v2, _v3, _slope); - *((int64_t*)(ptr + 16)) = float2int8leakyrelu(_v4, _v5, _slope); - *((int64_t*)(ptr + 24)) = float2int8leakyrelu(_v6, _v7, _slope); - - intptr += 32; - ptr += 32; - } - for (; i + 1 < size; i += 2) - { - __builtin_prefetch(intptr + 64); - __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); - __m128 _v2 = __lsx_vffint_s_w(__lsx_vld(intptr + 8, 0)); - __m128 _v3 = __lsx_vffint_s_w(__lsx_vld(intptr + 12, 0)); - _v0 = __lsx_vfmul_s(_v0, _scale0); - _v1 = __lsx_vfmul_s(_v1, _scale1); - _v2 = __lsx_vfmul_s(_v2, _scale0); - _v3 = __lsx_vfmul_s(_v3, _scale1); - *((int64_t*)ptr) = float2int8leakyrelu(_v0, _v1, _slope); - *((int64_t*)(ptr + 8)) = float2int8leakyrelu(_v2, _v3, _slope); - - intptr += 16; - ptr += 16; - } - for (; i < size; i++) - { - __builtin_prefetch(intptr + 32); - __m128 _v0 = 
__lsx_vffint_s_w(__lsx_vld(intptr, 0)); - __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); - _v0 = __lsx_vfmul_s(_v0, _scale0); - _v1 = __lsx_vfmul_s(_v1, _scale1); - *((int64_t*)ptr) = float2int8leakyrelu(_v0, _v1, _slope); - - intptr += 8; - ptr += 8; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - signed char* ptr = top_blob.channel(q); - - __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); - __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); - __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); - __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); - __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8, 0); - __m128 _bias1 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8 + 4, 0); - - __m128 _scale0 = __lsx_vfmul_s(_scale_in0, _scale_out0); - __m128 _scale1 = __lsx_vfmul_s(_scale_in1, _scale_out1); - _bias0 = __lsx_vfmul_s(_bias0, _scale_out0); - _bias1 = __lsx_vfmul_s(_bias1, _scale_out1); - __m128 _slope = (__m128)__lsx_vreplfr2vr_s(slope); - - int i = 0; - for (; i + 3 < size; i += 4) - { - __builtin_prefetch(intptr + 128); - __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); - __m128 _v2 = __lsx_vffint_s_w(__lsx_vld(intptr + 8, 0)); - __m128 _v3 = __lsx_vffint_s_w(__lsx_vld(intptr + 12, 0)); - __m128 _v4 = __lsx_vffint_s_w(__lsx_vld(intptr + 16, 0)); - __m128 _v5 = __lsx_vffint_s_w(__lsx_vld(intptr + 20, 0)); - __m128 _v6 = __lsx_vffint_s_w(__lsx_vld(intptr + 24, 0)); - __m128 _v7 = __lsx_vffint_s_w(__lsx_vld(intptr + 28, 0)); - _v0 = __lsx_vfmadd_s(_scale0, _v0, _bias0); - _v1 = __lsx_vfmadd_s(_scale1, _v1, _bias1); - _v2 = __lsx_vfmadd_s(_scale0, _v2, _bias0); - _v3 = __lsx_vfmadd_s(_scale1, _v3, _bias1); - _v4 = __lsx_vfmadd_s(_scale0, _v4, _bias0); - _v5 = __lsx_vfmadd_s(_scale1, _v5, _bias1); - _v6 = __lsx_vfmadd_s(_scale0, _v6, _bias0); - _v7 = __lsx_vfmadd_s(_scale1, _v7, _bias1); - *((int64_t*)ptr) = float2int8leakyrelu(_v0, _v1, _slope); - *((int64_t*)(ptr + 8)) = float2int8leakyrelu(_v2, _v3, _slope); - *((int64_t*)(ptr + 16)) = float2int8leakyrelu(_v4, _v5, _slope); - *((int64_t*)(ptr + 24)) = float2int8leakyrelu(_v6, _v7, _slope); - - intptr += 32; - ptr += 32; - } - for (; i + 1 < size; i += 2) - { - __builtin_prefetch(intptr + 64); - __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); - __m128 _v2 = __lsx_vffint_s_w(__lsx_vld(intptr + 8, 0)); - __m128 _v3 = __lsx_vffint_s_w(__lsx_vld(intptr + 12, 0)); - _v0 = __lsx_vfmadd_s(_scale0, _v0, _bias0); - _v1 = __lsx_vfmadd_s(_scale1, _v1, _bias1); - _v2 = __lsx_vfmadd_s(_scale0, _v2, _bias0); - _v3 = __lsx_vfmadd_s(_scale1, _v3, _bias1); - *((int64_t*)ptr) = float2int8leakyrelu(_v0, _v1, _slope); - *((int64_t*)(ptr + 8)) = float2int8leakyrelu(_v2, _v3, _slope); - - intptr += 16; - ptr += 16; - } - for (; i < size; i++) - { - 
__builtin_prefetch(intptr + 32); - __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); - _v0 = __lsx_vfmadd_s(_scale0, _v0, _bias0); - _v1 = __lsx_vfmadd_s(_scale1, _v1, _bias1); - *((int64_t*)ptr) = float2int8leakyrelu(_v0, _v1, _slope); - - intptr += 8; - ptr += 8; - } - } - } -} diff --git a/src/layer/loongarch/requantize_loongarch.cpp b/src/layer/loongarch/requantize_loongarch.cpp index 3399ac096b6..9b7f2130cf4 100644 --- a/src/layer/loongarch/requantize_loongarch.cpp +++ b/src/layer/loongarch/requantize_loongarch.cpp @@ -23,13 +23,6 @@ namespace ncnn { -#if __loongarch_sx -#include "requantize_leakyrelu_pack4.h" -#include "requantize_leakyrelu_pack8.h" -#include "requantize_relu_pack4.h" -#include "requantize_relu_pack8.h" -#endif // __loongarch_sx - Requantize_loongarch::Requantize_loongarch() { #if __loongarch_sx @@ -37,1344 +30,567 @@ Requantize_loongarch::Requantize_loongarch() #endif } -int Requantize_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +static void requantize_relu(const int* intptr, signed char* ptr, const Mat& scale_in_data, const Mat& bias_data, const Mat& scale_out_data, int elemcount, int elempack) { - int dims = bottom_blob.dims; - int elempack = bottom_blob.elempack; - -#if __loongarch_sx - if (elempack == 8) - { - if (dims == 1) - { - int w = bottom_blob.w; - - top_blob.create(w, (size_t)8u, 8, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (scale_in_data_size == 1 && scale_out_data_size == 1) - { - __m128 _scale_in = (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]); - __m128 _scale_out = (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]); - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - signed char* ptr = (signed char*)top_blob + i * 8; - - __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); - _v0 = __lsx_vfmul_s(_v0, _scale_in); - _v1 = __lsx_vfmul_s(_v1, _scale_in); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = __lsx_vfmul_s(_v0, _scale_out); - _v1 = __lsx_vfmul_s(_v1, _scale_out); - *((int64_t*)ptr) = float2int8(_v0, _v1); - } - } - else if (bias_data_size == 1) - { - __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - signed char* ptr = (signed char*)top_blob + i * 8; - - __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); - _v0 = __lsx_vfmadd_s(_scale_in, _v0, _bias); - _v1 = __lsx_vfmadd_s(_scale_in, _v1, _bias); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = __lsx_vfmul_s(_v0, _scale_out); - _v1 = __lsx_vfmul_s(_v1, _scale_out); - *((int64_t*)ptr) = float2int8(_v0, _v1); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - signed char* ptr = (signed char*)top_blob + i * 8; - - __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8, 0); - __m128 _bias1 = bias_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8 + 4, 0); - __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); - _v0 = __lsx_vfmadd_s(_scale_in, _v0, _bias0); - _v1 = __lsx_vfmadd_s(_scale_in, _v1, _bias1); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = __lsx_vfmul_s(_v0, _scale_out); - _v1 = __lsx_vfmul_s(_v1, _scale_out); - *((int64_t*)ptr) = float2int8(_v0, _v1); - } - } - } - else if (scale_in_data_size == 1 && scale_out_data_size > 1) - { - __m128 _scale_in = (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]); + const int scale_in_data_size = scale_in_data.w; + const int bias_data_size = bias_data.w; + const int scale_out_data_size = scale_out_data.w; + const int size = elemcount * elempack; - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - signed char* ptr = (signed char*)top_blob + i * 8; + // NCNN_LOGE("requantize_relu %d %d %d %d %d", scale_in_data_size, bias_data_size, scale_out_data_size, elemcount, elempack); - __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8, 0); - __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8 + 4, 0); - __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); - _v0 = __lsx_vfmul_s(_v0, _scale_in); - _v1 = __lsx_vfmul_s(_v1, _scale_in); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = __lsx_vfmul_s(_v0, _scale_out0); - _v1 = __lsx_vfmul_s(_v1, _scale_out1); - *((int64_t*)ptr) = float2int8(_v0, _v1); - } - } - else if (bias_data_size == 1) - { - __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); + // int8(relu(v * scale_in) * scale_out) + // int8_relu(v * (scale_in * scale_out)) - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - signed char* ptr = (signed char*)top_blob + i * 8; + // int8(relu(v * scale_in + bias) * scale_out) + // int8_relu(v * (scale_in * scale_out) + (bias * scale_out)) - __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8, 0); - __m128 _scale_out1 = scale_out_data_size == 1 ? 
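// The two rewrites in the comments above rely on relu being positively homogeneous:
// relu(x) * s == relu(x * s) for any s >= 0, and the int8 output scale_out is a
// non-negative quantization scale. Folding scale_out (and bias * scale_out) into the
// multiply leaves one fused multiply-add per element before the clamp in
// float2int8relu(). A minimal scalar sketch of the fused form, for illustration only,
// using the same float2int8() helper (round to nearest, saturate to [-127, 127]):
//
//   signed char requantize_relu_scalar(int v, float scale_in, float scale_out, float bias)
//   {
//       const float scale = scale_in * scale_out; // fold the two scales
//       const float b = bias * scale_out;         // bias is folded the same way
//       float f = v * scale + b;
//       if (f < 0.f) f = 0.f;                     // relu applied after folding
//       return float2int8(f);
//   }
//
// The scalar tail below clamps the already-quantized value instead, which gives the
// same result because float2int8() maps every negative input to a value <= 0.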
(__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8 + 4, 0); - __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); - _v0 = __lsx_vfmadd_s(_scale_in, _v0, _bias); - _v1 = __lsx_vfmadd_s(_scale_in, _v1, _bias); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = __lsx_vfmul_s(_v0, _scale_out0); - _v1 = __lsx_vfmul_s(_v1, _scale_out1); - *((int64_t*)ptr) = float2int8(_v0, _v1); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - signed char* ptr = (signed char*)top_blob + i * 8; - - __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8, 0); - __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8 + 4, 0); - __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8, 0); - __m128 _bias1 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8 + 4, 0); - __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); - _v0 = __lsx_vfmadd_s(_scale_in, _v0, _bias0); - _v1 = __lsx_vfmadd_s(_scale_in, _v1, _bias1); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = __lsx_vfmul_s(_v0, _scale_out0); - _v1 = __lsx_vfmul_s(_v1, _scale_out1); - *((int64_t*)ptr) = float2int8(_v0, _v1); - } - } - } - else if (scale_in_data_size > 1 && scale_out_data_size == 1) - { - __m128 _scale_out = (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]); - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - signed char* ptr = (signed char*)top_blob + i * 8; - - __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8, 0); - __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8 + 4, 0); - __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); - _v0 = __lsx_vfmul_s(_v0, _scale_in0); - _v1 = __lsx_vfmul_s(_v1, _scale_in1); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = __lsx_vfmul_s(_v0, _scale_out); - _v1 = __lsx_vfmul_s(_v1, _scale_out); - *((int64_t*)ptr) = float2int8(_v0, _v1); - } - } - else if (bias_data_size == 1) - { - __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - signed char* ptr = (signed char*)top_blob + i * 8; - - __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8, 0); - __m128 _scale_in1 = scale_in_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8 + 4, 0); - __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); - _v0 = __lsx_vfmadd_s(_scale_in0, _v0, _bias); - _v1 = __lsx_vfmadd_s(_scale_in1, _v1, _bias); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = __lsx_vfmul_s(_v0, _scale_out); - _v1 = __lsx_vfmul_s(_v1, _scale_out); - *((int64_t*)ptr) = float2int8(_v0, _v1); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - signed char* ptr = (signed char*)top_blob + i * 8; - - __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8, 0); - __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8 + 4, 0); - __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8, 0); - __m128 _bias1 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8 + 4, 0); - __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); - _v0 = __lsx_vfmadd_s(_scale_in0, _v0, _bias0); - _v1 = __lsx_vfmadd_s(_scale_in1, _v1, _bias1); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = __lsx_vfmul_s(_v0, _scale_out); - _v1 = __lsx_vfmul_s(_v1, _scale_out); - *((int64_t*)ptr) = float2int8(_v0, _v1); - } - } - } - else // if (scale_in_data_size > 1 && scale_out_data_size > 1) - { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - signed char* ptr = (signed char*)top_blob + i * 8; - - __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8, 0); - __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8 + 4, 0); - __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8, 0); - __m128 _scale_out1 = scale_out_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8 + 4, 0); - __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); - _v0 = __lsx_vfmul_s(_v0, _scale_in0); - _v1 = __lsx_vfmul_s(_v1, _scale_in1); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = __lsx_vfmul_s(_v0, _scale_out0); - _v1 = __lsx_vfmul_s(_v1, _scale_out1); - *((int64_t*)ptr) = float2int8(_v0, _v1); - } - } - else if (bias_data_size == 1) - { - __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - signed char* ptr = (signed char*)top_blob + i * 8; - - __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8, 0); - __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8 + 4, 0); - __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8, 0); - __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8 + 4, 0); - __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); - _v0 = __lsx_vfmadd_s(_scale_in0, _v0, _bias); - _v1 = __lsx_vfmadd_s(_scale_in1, _v1, _bias); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = __lsx_vfmul_s(_v0, _scale_out0); - _v1 = __lsx_vfmul_s(_v1, _scale_out1); - *((int64_t*)ptr) = float2int8(_v0, _v1); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - signed char* ptr = (signed char*)top_blob + i * 8; - - __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8, 0); - __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8 + 4, 0); - __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8, 0); - __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8 + 4, 0); - __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8, 0); - __m128 _bias1 = bias_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8 + 4, 0); - __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); - _v0 = __lsx_vfmadd_s(_scale_in0, _v0, _bias0); - _v1 = __lsx_vfmadd_s(_scale_in1, _v1, _bias1); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = __lsx_vfmul_s(_v0, _scale_out0); - _v1 = __lsx_vfmul_s(_v1, _scale_out1); - *((int64_t*)ptr) = float2int8(_v0, _v1); - } - } - } + float scale_in = scale_in_data[0]; +#if __loongarch_sx + __m128 _scale_in0 = (__m128)__lsx_vreplfr2vr_s(scale_in); + __m128 _scale_in1 = _scale_in0; + if (scale_in_data_size > 1) + { + if (elempack == 8) + { + _scale_in0 = (__m128)__lsx_vld((const float*)scale_in_data, 0); + _scale_in1 = (__m128)__lsx_vld((const float*)scale_in_data + 4, 0); } - - if (dims == 2) + if (elempack == 4) { - int w = bottom_blob.w; - int h = bottom_blob.h; - - top_blob.create(w, h, (size_t)8u, 8, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - signed char* ptr = top_blob.row(i); - - __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8, 0); - __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8 + 4, 0); - __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8, 0); - __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8 + 4, 0); - - for (int j = 0; j < w; j++) - { - __builtin_prefetch(intptr + 32); - __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); - _v0 = __lsx_vfmul_s(_v0, _scale_in0); - _v1 = __lsx_vfmul_s(_v1, _scale_in1); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = __lsx_vfmul_s(_v0, _scale_out0); - _v1 = __lsx_vfmul_s(_v1, _scale_out1); - *((int64_t*)ptr) = float2int8(_v0, _v1); - - intptr += 8; - ptr += 8; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - signed char* ptr = top_blob.row(i); - - __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8, 0); - __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8 + 4, 0); - __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8, 0); - __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8 + 4, 0); - __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8, 0); - __m128 _bias1 = bias_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8 + 4, 0); - - for (int j = 0; j < w; j++) - { - __builtin_prefetch(intptr + 32); - __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); - _v0 = __lsx_vfmadd_s(_scale_in0, _v0, _bias0); - _v1 = __lsx_vfmadd_s(_scale_in1, _v1, _bias1); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = __lsx_vfmul_s(_v0, _scale_out0); - _v1 = __lsx_vfmul_s(_v1, _scale_out1); - *((int64_t*)ptr) = float2int8(_v0, _v1); - - intptr += 8; - ptr += 8; - } - } - } + _scale_in0 = (__m128)__lsx_vld((const float*)scale_in_data, 0); + _scale_in1 = _scale_in0; } + } +#endif // __loongarch_sx - if (dims == 3) + float scale_out = scale_out_data[0]; +#if __loongarch_sx + __m128 _scale_out0 = (__m128)__lsx_vreplfr2vr_s(scale_out); + __m128 _scale_out1 = _scale_out0; + if (scale_out_data_size > 1) + { + if (elempack == 8) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; - - top_blob.create(w, h, channels, (size_t)8u, 8, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (activation_type == 1) - { - requantize_relu_pack8_lsx(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, opt); - return 0; - } - - if (activation_type == 2 && activation_params[0] > 0.f) - { - requantize_leakyrelu_pack8_lsx(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, activation_params[0], opt); - return 0; - } - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - signed char* ptr = top_blob.channel(q); - - __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); - __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); - __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); - __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); - - for (int i = 0; i < size; i++) - { - __builtin_prefetch(intptr + 32); - __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); - _v0 = __lsx_vfmul_s(_v0, _scale_in0); - _v1 = __lsx_vfmul_s(_v1, _scale_in1); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = __lsx_vfmul_s(_v0, _scale_out0); - _v1 = __lsx_vfmul_s(_v1, _scale_out1); - *((int64_t*)ptr) = float2int8(_v0, _v1); - - intptr += 8; - ptr += 8; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - signed char* ptr = top_blob.channel(q); - - __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); - __m128 _scale_in1 = scale_in_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); - __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); - __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); - __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8, 0); - __m128 _bias1 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8 + 4, 0); + _scale_out0 = (__m128)__lsx_vld((const float*)scale_out_data, 0); + _scale_out1 = (__m128)__lsx_vld((const float*)scale_out_data + 4, 0); + } + if (elempack == 4) + { + _scale_out0 = (__m128)__lsx_vld((const float*)scale_out_data, 0); + _scale_out1 = _scale_out0; + } + } +#endif // __loongarch_sx - for (int i = 0; i < size; i++) - { - __builtin_prefetch(intptr + 32); - __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); - _v0 = __lsx_vfmadd_s(_scale_in0, _v0, _bias0); - _v1 = __lsx_vfmadd_s(_scale_in1, _v1, _bias1); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = __lsx_vfmul_s(_v0, _scale_out0); - _v1 = __lsx_vfmul_s(_v1, _scale_out1); - *((int64_t*)ptr) = float2int8(_v0, _v1); + float scale = scale_in * scale_out; +#if __loongarch_sx + __m128 _scale0 = __lsx_vfmul_s(_scale_in0, _scale_out0); + __m128 _scale1 = __lsx_vfmul_s(_scale_in1, _scale_out1); +#endif // __loongarch_sx - intptr += 8; - ptr += 8; - } - } - } + if (bias_data_size == 0) + { + int i = 0; +#if __loongarch_sx + for (; i + 7 < size; i += 8) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale0); + _v1 = __lsx_vfmul_s(_v1, _scale1); + *((int64_t*)ptr) = float2int8relu(_v0, _v1); + intptr += 8; + ptr += 8; + } + for (; i + 3 < size; i += 4) + { + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale0); + v16i8 v = (v16i8)float2int8relu(_v); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + intptr += 4; + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + float v = *intptr * scale; + *ptr = float2int8(v); + if (*ptr < 0) *ptr = 0; + intptr++; + ptr++; } - - return 0; } - - if (elempack == 4) + else { - if (dims == 1) + float bias = bias_data[0]; +#if __loongarch_sx + __m128 _bias0 = (__m128)__lsx_vreplfr2vr_s(bias); + __m128 _bias1 = _bias0; + if (bias_data_size > 1) { - int w = bottom_blob.w; - int out_elempack = opt.use_packing_layout && w * elempack % 8 == 0 ? 
8 : 1; - int outw = w * elempack / out_elempack; - - top_blob.create(outw, (size_t)out_elempack, out_elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (scale_in_data_size == 1 && scale_out_data_size == 1) - { - __m128 _scale_in = (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]); - __m128 _scale_out = (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]); - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - signed char* ptr = (signed char*)top_blob + i * 4; - - __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - _v = __lsx_vfmul_s(_v, _scale_in); - _v = activation_ps(_v, activation_type, activation_params); - _v = __lsx_vfmul_s(_v, _scale_out); - v16i8 v = (v16i8)float2int8(_v); - ptr[0] = v[0]; - ptr[1] = v[1]; - ptr[2] = v[2]; - ptr[3] = v[3]; - } - } - else if (bias_data_size == 1) - { - __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - signed char* ptr = (signed char*)top_blob + i * 4; - - __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - _v = __lsx_vfmadd_s(_scale_in, _v, _bias); - _v = activation_ps(_v, activation_type, activation_params); - _v = __lsx_vfmul_s(_v, _scale_out); - v16i8 v = (v16i8)float2int8(_v); - ptr[0] = v[0]; - ptr[1] = v[1]; - ptr[2] = v[2]; - ptr[3] = v[3]; - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - signed char* ptr = (signed char*)top_blob + i * 4; - - __m128 _bias = (__m128)__lsx_vld((const float*)bias_data + i * 4, 0); - __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - _v = __lsx_vfmadd_s(_scale_in, _v, _bias); - _v = activation_ps(_v, activation_type, activation_params); - _v = __lsx_vfmul_s(_v, _scale_out); - v16i8 v = (v16i8)float2int8(_v); - ptr[0] = v[0]; - ptr[1] = v[1]; - ptr[2] = v[2]; - ptr[3] = v[3]; - } - } - } - else if (scale_in_data_size == 1 && scale_out_data_size > 1) + if (elempack == 8) { - __m128 _scale_in = (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]); - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - signed char* ptr = (signed char*)top_blob + i * 4; - - __m128 _scale_out = (__m128)__lsx_vld((const float*)scale_out_data + i * 4, 0); - __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - _v = __lsx_vfmul_s(_v, _scale_in); - _v = activation_ps(_v, activation_type, activation_params); - _v = __lsx_vfmul_s(_v, _scale_out); - v16i8 v = (v16i8)float2int8(_v); - ptr[0] = v[0]; - ptr[1] = v[1]; - ptr[2] = v[2]; - ptr[3] = v[3]; - } - } - else if (bias_data_size == 1) - { - __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - signed char* ptr = (signed char*)top_blob + i * 4; - - __m128 _scale_out = (__m128)__lsx_vld((const float*)scale_out_data + i * 4, 0); - __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - _v = __lsx_vfmadd_s(_scale_in, _v, _bias); - _v = activation_ps(_v, activation_type, activation_params); - _v = __lsx_vfmul_s(_v, _scale_out); - v16i8 v = (v16i8)float2int8(_v); - ptr[0] = v[0]; - ptr[1] = v[1]; - ptr[2] = v[2]; - ptr[3] 
= v[3]; - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - signed char* ptr = (signed char*)top_blob + i * 4; - - __m128 _scale_out = (__m128)__lsx_vld((const float*)scale_out_data + i * 4, 0); - __m128 _bias = (__m128)__lsx_vld((const float*)bias_data + i * 4, 0); - __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - _v = __lsx_vfmadd_s(_scale_in, _v, _bias); - _v = activation_ps(_v, activation_type, activation_params); - _v = __lsx_vfmul_s(_v, _scale_out); - v16i8 v = (v16i8)float2int8(_v); - ptr[0] = v[0]; - ptr[1] = v[1]; - ptr[2] = v[2]; - ptr[3] = v[3]; - } - } + _bias0 = (__m128)__lsx_vld((const float*)bias_data, 0); + _bias1 = (__m128)__lsx_vld((const float*)bias_data + 4, 0); } - else if (scale_in_data_size > 1 && scale_out_data_size == 1) + if (elempack == 4) { - __m128 _scale_out = (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]); - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - signed char* ptr = (signed char*)top_blob + i * 4; - - __m128 _scale_in = (__m128)__lsx_vld((const float*)scale_in_data + i * 4, 0); - __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - _v = __lsx_vfmul_s(_v, _scale_in); - _v = activation_ps(_v, activation_type, activation_params); - _v = __lsx_vfmul_s(_v, _scale_out); - v16i8 v = (v16i8)float2int8(_v); - ptr[0] = v[0]; - ptr[1] = v[1]; - ptr[2] = v[2]; - ptr[3] = v[3]; - } - } - else if (bias_data_size == 1) - { - __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - signed char* ptr = (signed char*)top_blob + i * 4; - - __m128 _scale_in = (__m128)__lsx_vld((const float*)scale_in_data + i * 4, 0); - __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - _v = __lsx_vfmadd_s(_scale_in, _v, _bias); - _v = activation_ps(_v, activation_type, activation_params); - _v = __lsx_vfmul_s(_v, _scale_out); - v16i8 v = (v16i8)float2int8(_v); - ptr[0] = v[0]; - ptr[1] = v[1]; - ptr[2] = v[2]; - ptr[3] = v[3]; - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - signed char* ptr = (signed char*)top_blob + i * 4; - - __m128 _scale_in = (__m128)__lsx_vld((const float*)scale_in_data + i * 4, 0); - __m128 _bias = (__m128)__lsx_vld((const float*)bias_data + i * 4, 0); - __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - _v = __lsx_vfmadd_s(_scale_in, _v, _bias); - _v = activation_ps(_v, activation_type, activation_params); - _v = __lsx_vfmul_s(_v, _scale_out); - v16i8 v = (v16i8)float2int8(_v); - ptr[0] = v[0]; - ptr[1] = v[1]; - ptr[2] = v[2]; - ptr[3] = v[3]; - } - } + _bias0 = (__m128)__lsx_vld((const float*)bias_data, 0); + _bias1 = _bias0; } - else // if (scale_in_data_size > 1 && scale_out_data_size > 1) - { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - signed char* ptr = (signed char*)top_blob + i * 4; - - __m128 _scale_in = (__m128)__lsx_vld((const float*)scale_in_data + i * 4, 0); - __m128 _scale_out = (__m128)__lsx_vld((const float*)scale_out_data + i * 4, 0); - __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - _v = 
__lsx_vfmul_s(_v, _scale_in); - _v = activation_ps(_v, activation_type, activation_params); - _v = __lsx_vfmul_s(_v, _scale_out); - v16i8 v = (v16i8)float2int8(_v); - ptr[0] = v[0]; - ptr[1] = v[1]; - ptr[2] = v[2]; - ptr[3] = v[3]; - } - } - else if (bias_data_size == 1) - { - __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - signed char* ptr = (signed char*)top_blob + i * 4; + } +#endif // __loongarch_sx - __m128 _scale_in = (__m128)__lsx_vld((const float*)scale_in_data + i * 4, 0); - __m128 _scale_out = (__m128)__lsx_vld((const float*)scale_out_data + i * 4, 0); - __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - _v = __lsx_vfmadd_s(_scale_in, _v, _bias); - _v = activation_ps(_v, activation_type, activation_params); - _v = __lsx_vfmul_s(_v, _scale_out); - v16i8 v = (v16i8)float2int8(_v); - ptr[0] = v[0]; - ptr[1] = v[1]; - ptr[2] = v[2]; - ptr[3] = v[3]; - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - signed char* ptr = (signed char*)top_blob + i * 4; + bias = bias * scale_out; +#if __loongarch_sx + _bias0 = __lsx_vfmul_s(_bias0, _scale_out0); + _bias1 = __lsx_vfmul_s(_bias1, _scale_out1); +#endif // __loongarch_sx - __m128 _scale_in = (__m128)__lsx_vld((const float*)scale_in_data + i * 4, 0); - __m128 _scale_out = (__m128)__lsx_vld((const float*)scale_out_data + i * 4, 0); - __m128 _bias = (__m128)__lsx_vld((const float*)bias_data + i * 4, 0); - __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - _v = __lsx_vfmadd_s(_scale_in, _v, _bias); - _v = activation_ps(_v, activation_type, activation_params); - _v = __lsx_vfmul_s(_v, _scale_out); - v16i8 v = (v16i8)float2int8(_v); - ptr[0] = v[0]; - ptr[1] = v[1]; - ptr[2] = v[2]; - ptr[3] = v[3]; - } - } - } + int i = 0; +#if __loongarch_sx + for (; i + 7 < size; i += 8) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_v0, _scale0, _bias0); + _v1 = __lsx_vfmadd_s(_v1, _scale1, _bias1); + *((int64_t*)ptr) = float2int8relu(_v0, _v1); + intptr += 8; + ptr += 8; } - - if (dims == 2) + for (; i + 3 < size; i += 4) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int out_elempack = opt.use_packing_layout && h * elempack % 8 == 0 ? 8 : 1; - int outh = h * elempack / out_elempack; - - top_blob.create(w, outh, (size_t)out_elempack, out_elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (out_elempack == 8) - { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) - { - const int* intptr0 = bottom_blob.row(i * 2); - const int* intptr1 = bottom_blob.row(i * 2 + 1); - signed char* ptr = top_blob.row(i); - - __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8, 0); - __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8 + 4, 0); - __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8, 0); - __m128 _scale_out1 = scale_out_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8 + 4, 0); - - for (int j = 0; j < w; j++) - { - __builtin_prefetch(intptr0 + 16); - __builtin_prefetch(intptr1 + 16); - __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); - __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); - _v0 = __lsx_vfmul_s(_v0, _scale_in0); - _v1 = __lsx_vfmul_s(_v1, _scale_in1); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = __lsx_vfmul_s(_v0, _scale_out0); - _v1 = __lsx_vfmul_s(_v1, _scale_out1); - *((int64_t*)ptr) = float2int8(_v0, _v1); - - intptr0 += 4; - intptr1 += 4; - ptr += 8; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) - { - const int* intptr0 = bottom_blob.row(i * 2); - const int* intptr1 = bottom_blob.row(i * 2 + 1); - signed char* ptr = top_blob.row(i); - - __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8, 0); - __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8 + 4, 0); - __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8, 0); - __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8 + 4, 0); - __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8, 0); - __m128 _bias1 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8 + 4, 0); - - for (int j = 0; j < w; j++) - { - __builtin_prefetch(intptr0 + 16); - __builtin_prefetch(intptr1 + 16); - __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); - __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); - _v0 = __lsx_vfmadd_s(_scale_in0, _v0, _bias0); - _v1 = __lsx_vfmadd_s(_scale_in1, _v1, _bias1); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = __lsx_vfmul_s(_v0, _scale_out0); - _v1 = __lsx_vfmul_s(_v1, _scale_out1); - *((int64_t*)ptr) = float2int8(_v0, _v1); - - intptr0 += 4; - intptr1 += 4; - ptr += 8; - } - } - } - } - if (out_elempack == 1) - { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - signed char* ptr0 = top_blob.row(i * 4); - signed char* ptr1 = top_blob.row(i * 4 + 1); - signed char* ptr2 = top_blob.row(i * 4 + 2); - signed char* ptr3 = top_blob.row(i * 4 + 3); - - __m128 _scale_in = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 4, 0); - __m128 _scale_out = scale_out_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_v, _scale0, _bias0); + v16i8 v = (v16i8)float2int8relu(_v); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + intptr += 4; + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + float v = *intptr * scale + bias; + *ptr = float2int8(v); + if (*ptr < 0) *ptr = 0; + intptr++; + ptr++; + } + } +} - for (int j = 0; j < w; j++) - { - __builtin_prefetch(intptr + 16); - __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - _v = __lsx_vfmul_s(_v, _scale_in); - _v = activation_ps(_v, activation_type, activation_params); - _v = __lsx_vfmul_s(_v, _scale_out); - v16i8 v = (v16i8)float2int8(_v); - ptr0[0] = v[0]; - ptr1[0] = v[1]; - ptr2[0] = v[2]; - ptr3[0] = v[3]; +static void requantize_leakyrelu(const int* intptr, signed char* ptr, const Mat& scale_in_data, const Mat& bias_data, const Mat& scale_out_data, float slope, int elemcount, int elempack) +{ + const int scale_in_data_size = scale_in_data.w; + const int bias_data_size = bias_data.w; + const int scale_out_data_size = scale_out_data.w; + const int size = elemcount * elempack; - intptr += 4; - ptr0 += 1; - ptr1 += 1; - ptr2 += 1; - ptr3 += 1; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - signed char* ptr0 = top_blob.row(i * 4); - signed char* ptr1 = top_blob.row(i * 4 + 1); - signed char* ptr2 = top_blob.row(i * 4 + 2); - signed char* ptr3 = top_blob.row(i * 4 + 3); + // NCNN_LOGE("requantize_leakyrelu %d %d %d %d %d", scale_in_data_size, bias_data_size, scale_out_data_size, elemcount, elempack); - __m128 _scale_in = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 4, 0); - __m128 _scale_out = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 4, 0); - __m128 _bias = bias_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 4, 0); + // int8(leakyrelu(v * scale_in, slope) * scale_out) + // int8_leakyrelu(v * (scale_in * scale_out), slope) - for (int j = 0; j < w; j++) - { - __builtin_prefetch(intptr + 16); - __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - _v = __lsx_vfmadd_s(_scale_in, _v, _bias); - _v = activation_ps(_v, activation_type, activation_params); - _v = __lsx_vfmul_s(_v, _scale_out); - v16i8 v = (v16i8)float2int8(_v); - ptr0[0] = v[0]; - ptr1[0] = v[1]; - ptr2[0] = v[2]; - ptr3[0] = v[3]; + // int8(leakyrelu(v * scale_in + bias, slope) * scale_out) + // int8_leakyrelu(v * (scale_in * scale_out) + (bias * scale_out), slope) - intptr += 4; - ptr0 += 1; - ptr1 += 1; - ptr2 += 1; - ptr3 += 1; - } - } - } - } + float scale_in = scale_in_data[0]; +#if __loongarch_sx + __m128 _scale_in0 = (__m128)__lsx_vreplfr2vr_s(scale_in); + __m128 _scale_in1 = _scale_in0; + if (scale_in_data_size > 1) + { + if (elempack == 8) + { + _scale_in0 = (__m128)__lsx_vld((const float*)scale_in_data, 0); + _scale_in1 = (__m128)__lsx_vld((const float*)scale_in_data + 4, 0); + } + if (elempack == 4) + { + _scale_in0 = (__m128)__lsx_vld((const float*)scale_in_data, 0); + _scale_in1 = _scale_in0; } + } +#endif // __loongarch_sx - if (dims == 3) + float scale_out = scale_out_data[0]; +#if __loongarch_sx + __m128 _scale_out0 = (__m128)__lsx_vreplfr2vr_s(scale_out); + __m128 _scale_out1 = _scale_out0; + if (scale_out_data_size > 1) + { + if (elempack == 8) + { + _scale_out0 = (__m128)__lsx_vld((const float*)scale_out_data, 0); + _scale_out1 = (__m128)__lsx_vld((const float*)scale_out_data + 4, 0); + } + if (elempack == 4) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; - int out_elempack = opt.use_packing_layout && channels * elempack % 8 == 0 ? 
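// The same folding applies to leakyrelu: for any s >= 0,
//   leakyrelu(x, slope) * s == leakyrelu(x * s, slope)
// since multiplying by a non-negative s never changes the sign of x.
// Worked check with v = -4, scale_in = 0.5, scale_out = 10, slope = 0.1:
//   unfused: leakyrelu(-4 * 0.5, 0.1) * 10 = (-2 * 0.1) * 10 = -2
//   fused:   leakyrelu(-4 * (0.5 * 10), 0.1) = leakyrelu(-20, 0.1) = -2
// Assuming positive quantization scales, as ncnn's int8 scales are in practice,
// scale_out and bias * scale_out can be folded ahead of the activation, which is the
// form float2int8leakyrelu() consumes in the vector loops below.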
8 : 1; - int outc = channels * elempack / out_elempack; + _scale_out0 = (__m128)__lsx_vld((const float*)scale_out_data, 0); + _scale_out1 = _scale_out0; + } + } +#endif // __loongarch_sx - top_blob.create(w, h, outc, (size_t)out_elempack, out_elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; + float scale = scale_in * scale_out; +#if __loongarch_sx + __m128 _scale0 = __lsx_vfmul_s(_scale_in0, _scale_out0); + __m128 _scale1 = __lsx_vfmul_s(_scale_in1, _scale_out1); + __m128 _slope = (__m128)__lsx_vreplfr2vr_s(slope); +#endif // __loongarch_sx - if (activation_type == 1) + if (bias_data_size == 0) + { + int i = 0; +#if __loongarch_sx + for (; i + 7 < size; i += 8) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale0); + _v1 = __lsx_vfmul_s(_v1, _scale1); + *((int64_t*)ptr) = float2int8leakyrelu(_v0, _v1, _slope); + intptr += 8; + ptr += 8; + } + for (; i + 3 < size; i += 4) + { + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale0); + v16i8 v = (v16i8)float2int8leakyrelu(_v, _slope); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + intptr += 4; + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + float v = *intptr * scale; + *ptr = float2int8(v); + if (*ptr < 0) *ptr *= slope; + intptr++; + ptr++; + } + } + else + { + float bias = bias_data[0]; +#if __loongarch_sx + __m128 _bias0 = (__m128)__lsx_vreplfr2vr_s(bias); + __m128 _bias1 = _bias0; + if (bias_data_size > 1) + { + if (elempack == 8) { - requantize_relu_pack4_lsx(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, opt); - return 0; + _bias0 = (__m128)__lsx_vld((const float*)bias_data, 0); + _bias1 = (__m128)__lsx_vld((const float*)bias_data + 4, 0); } - - if (activation_type == 2 && activation_params[0] > 0.f) + if (elempack == 4) { - requantize_leakyrelu_pack4_lsx(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, activation_params[0], opt); - return 0; + _bias0 = (__m128)__lsx_vld((const float*)bias_data, 0); + _bias1 = _bias0; } + } +#endif // __loongarch_sx - if (out_elempack == 8) - { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) - { - const int* intptr0 = bottom_blob.channel(q * 2); - const int* intptr1 = bottom_blob.channel(q * 2 + 1); - signed char* ptr = top_blob.channel(q); - - __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); - __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); - __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); - __m128 _scale_out1 = scale_out_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); - - for (int i = 0; i < size; i++) - { - __builtin_prefetch(intptr0 + 16); - __builtin_prefetch(intptr1 + 16); - __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); - __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); - _v0 = __lsx_vfmul_s(_v0, _scale_in0); - _v1 = __lsx_vfmul_s(_v1, _scale_in1); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = __lsx_vfmul_s(_v0, _scale_out0); - _v1 = __lsx_vfmul_s(_v1, _scale_out1); - *((int64_t*)ptr) = float2int8(_v0, _v1); - - intptr0 += 4; - intptr1 += 4; - ptr += 8; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) - { - const int* intptr0 = bottom_blob.channel(q * 2); - const int* intptr1 = bottom_blob.channel(q * 2 + 1); - signed char* ptr = top_blob.channel(q); - - __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); - __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); - __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); - __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); - __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8, 0); - __m128 _bias1 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8 + 4, 0); - - for (int i = 0; i < size; i++) - { - __builtin_prefetch(intptr0 + 16); - __builtin_prefetch(intptr1 + 16); - __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); - __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); - _v0 = __lsx_vfmadd_s(_scale_in0, _v0, _bias0); - _v1 = __lsx_vfmadd_s(_scale_in1, _v1, _bias1); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = __lsx_vfmul_s(_v0, _scale_out0); - _v1 = __lsx_vfmul_s(_v1, _scale_out1); - *((int64_t*)ptr) = float2int8(_v0, _v1); - - intptr0 += 4; - intptr1 += 4; - ptr += 8; - } - } - } - } - if (out_elempack == 1) - { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - signed char* ptr0 = top_blob.channel(q * 4); - signed char* ptr1 = top_blob.channel(q * 4 + 1); - signed char* ptr2 = top_blob.channel(q * 4 + 2); - signed char* ptr3 = top_blob.channel(q * 4 + 3); + bias = bias * scale_out; +#if __loongarch_sx + _bias0 = __lsx_vfmul_s(_bias0, _scale_out0); + _bias1 = __lsx_vfmul_s(_bias1, _scale_out1); +#endif // __loongarch_sx - __m128 _scale_in = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 4, 0); - __m128 _scale_out = scale_out_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 4, 0); + int i = 0; +#if __loongarch_sx + for (; i + 7 < size; i += 8) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_v0, _scale0, _bias0); + _v1 = __lsx_vfmadd_s(_v1, _scale1, _bias1); + *((int64_t*)ptr) = float2int8leakyrelu(_v0, _v1, _slope); + intptr += 8; + ptr += 8; + } + for (; i + 3 < size; i += 4) + { + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_v, _scale0, _bias0); + v16i8 v = (v16i8)float2int8leakyrelu(_v, _slope); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + intptr += 4; + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + float v = *intptr * scale + bias; + *ptr = float2int8(v); + if (*ptr < 0) *ptr *= slope; + intptr++; + ptr++; + } + } +} - for (int i = 0; i < size; i++) - { - __builtin_prefetch(intptr + 16); - __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - _v = __lsx_vfmul_s(_v, _scale_in); - _v = activation_ps(_v, activation_type, activation_params); - _v = __lsx_vfmul_s(_v, _scale_out); - v16i8 v = (v16i8)float2int8(_v); - ptr0[0] = v[0]; - ptr1[0] = v[1]; - ptr2[0] = v[2]; - ptr3[0] = v[3]; +static void requantize(const int* intptr, signed char* ptr, const Mat& scale_in_data, const Mat& bias_data, const Mat& scale_out_data, int activation_type, const Mat& activation_params, int elemcount, int elempack) +{ + if (activation_type == 1) + { + requantize_relu(intptr, ptr, scale_in_data, bias_data, scale_out_data, elemcount, elempack); + return; + } - intptr += 4; - ptr0 += 1; - ptr1 += 1; - ptr2 += 1; - ptr3 += 1; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - signed char* ptr0 = top_blob.channel(q * 4); - signed char* ptr1 = top_blob.channel(q * 4 + 1); - signed char* ptr2 = top_blob.channel(q * 4 + 2); - signed char* ptr3 = top_blob.channel(q * 4 + 3); + if (activation_type == 2 && activation_params[0] > 0.f) + { + const float slope = activation_params[0]; + requantize_leakyrelu(intptr, ptr, scale_in_data, bias_data, scale_out_data, slope, elemcount, elempack); + return; + } - __m128 _scale_in = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 4, 0); - __m128 _scale_out = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 4, 0); - __m128 _bias = bias_data_size == 1 ? 
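// activation_type follows ncnn's convention (1 = ReLU, 2 = Leaky ReLU with the slope
// in activation_params[0]). ReLU, and Leaky ReLU when the slope is positive, are
// routed to the fused kernels above; every other activation falls through to the
// generic path below, which keeps the unfused order: multiply by scale_in, apply
// activation_ps()/activation_ss(), then multiply by scale_out.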
(__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 4, 0); + const int scale_in_data_size = scale_in_data.w; + const int bias_data_size = bias_data.w; + const int scale_out_data_size = scale_out_data.w; + const int size = elemcount * elempack; - for (int i = 0; i < size; i++) - { - __builtin_prefetch(intptr + 16); - __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - _v = __lsx_vfmadd_s(_scale_in, _v, _bias); - _v = activation_ps(_v, activation_type, activation_params); - _v = __lsx_vfmul_s(_v, _scale_out); - v16i8 v = (v16i8)float2int8(_v); - ptr0[0] = v[0]; - ptr1[0] = v[1]; - ptr2[0] = v[2]; - ptr3[0] = v[3]; + // NCNN_LOGE("requantize %d %d %d %d %d", scale_in_data_size, bias_data_size, scale_out_data_size, elemcount, elempack); - intptr += 4; - ptr0 += 1; - ptr1 += 1; - ptr2 += 1; - ptr3 += 1; - } - } - } - } + float scale_in = scale_in_data[0]; +#if __loongarch_sx + __m128 _scale_in0 = (__m128)__lsx_vreplfr2vr_s(scale_in); + __m128 _scale_in1 = _scale_in0; + if (scale_in_data_size > 1) + { + if (elempack == 8) + { + _scale_in0 = (__m128)__lsx_vld((const float*)scale_in_data, 0); + _scale_in1 = (__m128)__lsx_vld((const float*)scale_in_data + 4, 0); + } + if (elempack == 4) + { + _scale_in0 = (__m128)__lsx_vld((const float*)scale_in_data, 0); + _scale_in1 = _scale_in0; } - - return 0; } #endif // __loongarch_sx - if (dims == 1) + float scale_out = scale_out_data[0]; +#if __loongarch_sx + __m128 _scale_out0 = (__m128)__lsx_vreplfr2vr_s(scale_out); + __m128 _scale_out1 = _scale_out0; + if (scale_out_data_size > 1) { - int w = bottom_blob.w; - - top_blob.create(w, (size_t)1u, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - const int* intptr = bottom_blob; - signed char* ptr = top_blob; - - if (scale_in_data_size == 1 && scale_out_data_size == 1) + if (elempack == 8) { - const float scale_in = scale_in_data[0]; - const float scale_out = scale_out_data[0]; - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float v = intptr[i] * scale_in; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); - } - } - else if (bias_data_size == 1) - { - const float bias = bias_data[0]; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float v = intptr[i] * scale_in + bias; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float v = intptr[i] * scale_in + bias_data[i]; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); - } - } + _scale_out0 = (__m128)__lsx_vld((const float*)scale_out_data, 0); + _scale_out1 = (__m128)__lsx_vld((const float*)scale_out_data + 4, 0); } - else if (scale_in_data_size == 1 && scale_out_data_size > 1) + if (elempack == 4) { - const float scale_in = scale_in_data[0]; - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float v = intptr[i] * scale_in; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); - } - } - else if (bias_data_size == 1) - { - const float bias = bias_data[0]; + _scale_out0 = (__m128)__lsx_vld((const float*)scale_out_data, 0); + _scale_out1 = _scale_out0; + } + } +#endif // __loongarch_sx - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 
0; i < w; i++) - { - float v = intptr[i] * scale_in + bias; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float v = intptr[i] * scale_in + bias_data[i]; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); - } - } + if (bias_data_size == 0) + { + int i = 0; +#if __loongarch_sx + for (; i + 7 < size; i += 8) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale_in0); + _v1 = __lsx_vfmul_s(_v1, _scale_in1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out0); + _v1 = __lsx_vfmul_s(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + intptr += 8; + ptr += 8; } - else if (scale_in_data_size > 1 && scale_out_data_size == 1) + for (; i + 3 < size; i += 4) { - const float scale_out = scale_out_data[0]; - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float v = intptr[i] * scale_in_data[i]; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); - } - } - else if (bias_data_size == 1) - { - const float bias = bias_data[0]; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float v = intptr[i] * scale_in_data[i] + bias; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float v = intptr[i] * scale_in_data[i] + bias_data[i]; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); - } - } + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale_in0); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out0); + v16i8 v = (v16i8)float2int8(_v); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + intptr += 4; + ptr += 4; } - else // if (scale_in_data_size > 1 && scale_out_data_size > 1) +#endif // __loongarch_sx + for (; i < size; i++) { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float v = intptr[i] * scale_in_data[i]; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); - } - } - else if (bias_data_size == 1) + float v = *intptr * scale_in; + v = activation_ss(v, activation_type, activation_params); + *ptr = float2int8(v * scale_out); + intptr++; + ptr++; + } + } + else + { + float bias = bias_data[0]; +#if __loongarch_sx + __m128 _bias0 = (__m128)__lsx_vreplfr2vr_s(bias); + __m128 _bias1 = _bias0; + if (bias_data_size > 1) + { + if (elempack == 8) { - const float bias = bias_data[0]; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float v = intptr[i] * scale_in_data[i] + bias; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); - } + _bias0 = (__m128)__lsx_vld((const float*)bias_data, 0); + _bias1 = (__m128)__lsx_vld((const float*)bias_data + 4, 0); } - else + if (elempack == 4) { - #pragma omp parallel for 
num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float v = intptr[i] * scale_in_data[i] + bias_data[i]; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); - } + _bias0 = (__m128)__lsx_vld((const float*)bias_data, 0); + _bias1 = _bias0; } } +#endif // __loongarch_sx + + int i = 0; +#if __loongarch_sx + for (; i + 7 < size; i += 8) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_v0, _scale_in0, _bias0); + _v1 = __lsx_vfmadd_s(_v1, _scale_in1, _bias1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out0); + _v1 = __lsx_vfmul_s(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + intptr += 8; + ptr += 8; + } + for (; i + 3 < size; i += 4) + { + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_v, _scale_in0, _bias0); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out0); + v16i8 v = (v16i8)float2int8(_v); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + intptr += 4; + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + float v = *intptr * scale_in + bias; + v = activation_ss(v, activation_type, activation_params); + *ptr = float2int8(v * scale_out); + intptr++; + ptr++; + } } +} - if (dims == 2) - { - int w = bottom_blob.w; - int h = bottom_blob.h; +int Requantize_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + const int dims = bottom_blob.dims; + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int channels = bottom_blob.c; + const int elempack = bottom_blob.elempack; + const size_t out_elemsize = elempack * 1u; - top_blob.create(w, h, (size_t)1u, opt.blob_allocator); + if (dims == 1) + { + top_blob.create(w, out_elemsize, elempack, opt.blob_allocator); if (top_blob.empty()) return -100; - if (bias_data_size == 0) + const int wp = std::max(1, w / opt.num_threads); + const int nn_w = (w + wp - 1) / wp; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_w; ii++) { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - signed char* ptr = top_blob.row(i); + const int i = ii * wp; - const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[i]; - const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[i]; + const int* intptr = (const int*)bottom_blob + i * elempack; + signed char* ptr = (signed char*)top_blob + i * elempack; - for (int j = 0; j < w; j++) - { - float v = intptr[j] * scale_in; - ptr[j] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - signed char* ptr = top_blob.row(i); + // assert scale_in_data_size == 1 + // assert bias_data_size == 0 || bias_data_size == 1 + // assert scale_out_data_size == 1 - const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[i]; - const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[i]; - const float bias = bias_data_size == 1 ? 
bias_data[0] : bias_data[i]; + const int size = std::min(w - i, wp) * elempack; - for (int j = 0; j < w; j++) - { - float v = intptr[j] * scale_in + bias; - ptr[j] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); - } - } + requantize(intptr, ptr, scale_in_data, bias_data, scale_out_data, activation_type, activation_params, size, 1); } } - if (dims == 3) + if (dims == 2) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; - - top_blob.create(w, h, channels, (size_t)1u, opt.blob_allocator); + top_blob.create(w, h, out_elemsize, elempack, opt.blob_allocator); if (top_blob.empty()) return -100; - if (bias_data_size == 0) + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - signed char* ptr = top_blob.channel(q); + const int* intptr = bottom_blob.row(i); + signed char* ptr = top_blob.row(i); - const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[q]; - const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[q]; + const Mat scale_in_data_i = scale_in_data_size > 1 ? scale_in_data.range(i * elempack, elempack) : scale_in_data; + const Mat bias_data_i = bias_data_size > 1 ? bias_data.range(i * elempack, elempack) : bias_data; + const Mat scale_out_data_i = scale_out_data_size > 1 ? scale_out_data.range(i * elempack, elempack) : scale_out_data; - for (int i = 0; i < size; i++) - { - float v = intptr[i] * scale_in; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); - } - } + requantize(intptr, ptr, scale_in_data_i, bias_data_i, scale_out_data_i, activation_type, activation_params, w, elempack); } - else + } + + if (dims == 3) + { + top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - signed char* ptr = top_blob.channel(q); + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); - const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[q]; - const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[q]; - const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[q]; + const Mat scale_in_data_q = scale_in_data_size > 1 ? scale_in_data.range(q * elempack, elempack) : scale_in_data; + const Mat bias_data_q = bias_data_size > 1 ? bias_data.range(q * elempack, elempack) : bias_data; + const Mat scale_out_data_q = scale_out_data_size > 1 ? 
scale_out_data.range(q * elempack, elempack) : scale_out_data; - for (int i = 0; i < size; i++) - { - float v = intptr[i] * scale_in + bias; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); - } - } + requantize(intptr, ptr, scale_in_data_q, bias_data_q, scale_out_data_q, activation_type, activation_params, w * h, elempack); } } diff --git a/src/layer/loongarch/requantize_relu_pack4.h b/src/layer/loongarch/requantize_relu_pack4.h deleted file mode 100644 index 2fba8dfc2e4..00000000000 --- a/src/layer/loongarch/requantize_relu_pack4.h +++ /dev/null @@ -1,267 +0,0 @@ -// yala is pleased to support the open source community by making ncnn available. -// -// -// Copyright (C) 2022 yala ;. All rights reserved. -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. - -static void requantize_relu_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& scale_in_data, const Mat& scale_out_data, const Mat& bias_data, const Option& opt) -{ - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; - int outc = top_blob.c; - int out_elempack = top_blob.elempack; - - int scale_in_data_size = scale_in_data.w; - int scale_out_data_size = scale_out_data.w; - int bias_data_size = bias_data.w; - - // int8(relu(v * scale_in) * scale_out) - // int8_relu(v * (scale_in * scale_out)) - - // int8(relu(v * scale_in + bias) * scale_out) - // int8_relu(v * (scale_in * scale_out) + (bias * scale_out)) - - if (out_elempack == 8) - { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) - { - const int* intptr0 = bottom_blob.channel(q * 2); - const int* intptr1 = bottom_blob.channel(q * 2 + 1); - signed char* ptr = top_blob.channel(q); - - __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); - __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); - __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); - __m128 _scale_out1 = scale_out_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); - - __m128 _scale0 = __lsx_vfmul_s(_scale_in0, _scale_out0); - __m128 _scale1 = __lsx_vfmul_s(_scale_in1, _scale_out1); - - int i = 0; - for (; i + 3 < size; i += 4) - { - __builtin_prefetch(intptr0 + 64); - __builtin_prefetch(intptr1 + 64); - __m128 _v00 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); - __m128 _v01 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 4, 0)); - __m128 _v02 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 8, 0)); - __m128 _v03 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 12, 0)); - __m128 _v10 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); - __m128 _v11 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 4, 0)); - __m128 _v12 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 8, 0)); - __m128 _v13 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 12, 0)); - _v00 = __lsx_vfmul_s(_v00, _scale0); - _v01 = __lsx_vfmul_s(_v01, _scale0); - _v02 = __lsx_vfmul_s(_v02, _scale0); - _v03 = __lsx_vfmul_s(_v03, _scale0); - _v10 = __lsx_vfmul_s(_v10, _scale1); - _v11 = __lsx_vfmul_s(_v11, _scale1); - _v12 = __lsx_vfmul_s(_v12, _scale1); - _v13 = __lsx_vfmul_s(_v13, _scale1); - *((int64_t*)ptr) = float2int8relu(_v00, _v10); - *((int64_t*)(ptr + 8)) = float2int8relu(_v01, _v11); - *((int64_t*)(ptr + 16)) = float2int8relu(_v02, _v12); - *((int64_t*)(ptr + 24)) = float2int8relu(_v03, _v13); - - intptr0 += 16; - intptr1 += 16; - ptr += 32; - } - for (; i < size; i++) - { - __builtin_prefetch(intptr0 + 16); - __builtin_prefetch(intptr1 + 16); - __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); - __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); - _v0 = __lsx_vfmul_s(_v0, _scale0); - _v1 = __lsx_vfmul_s(_v1, _scale1); - *((int64_t*)ptr) = float2int8relu(_v0, _v1); - - intptr0 += 4; - intptr1 += 4; - ptr += 8; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) - { - const int* intptr0 = bottom_blob.channel(q * 2); - const int* intptr1 = bottom_blob.channel(q * 2 + 1); - signed char* ptr = top_blob.channel(q); - - __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); - __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); - __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); - __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); - __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8, 0); - __m128 _bias1 = bias_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8 + 4, 0); - - __m128 _scale0 = __lsx_vfmul_s(_scale_in0, _scale_out0); - __m128 _scale1 = __lsx_vfmul_s(_scale_in1, _scale_out1); - _bias0 = __lsx_vfmul_s(_bias0, _scale_out0); - _bias1 = __lsx_vfmul_s(_bias1, _scale_out1); - - int i = 0; - for (; i + 3 < size; i += 4) - { - __builtin_prefetch(intptr0 + 64); - __builtin_prefetch(intptr1 + 64); - __m128 _v00 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); - __m128 _v01 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 4, 0)); - __m128 _v02 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 8, 0)); - __m128 _v03 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 12, 0)); - __m128 _v10 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); - __m128 _v11 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 4, 0)); - __m128 _v12 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 8, 0)); - __m128 _v13 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 12, 0)); - _v00 = __lsx_vfmadd_s(_scale0, _v00, _bias0); - _v01 = __lsx_vfmadd_s(_scale0, _v01, _bias0); - _v02 = __lsx_vfmadd_s(_scale0, _v02, _bias0); - _v03 = __lsx_vfmadd_s(_scale0, _v03, _bias0); - _v10 = __lsx_vfmadd_s(_scale1, _v10, _bias1); - _v11 = __lsx_vfmadd_s(_scale1, _v11, _bias1); - _v12 = __lsx_vfmadd_s(_scale1, _v12, _bias1); - _v13 = __lsx_vfmadd_s(_scale1, _v13, _bias1); - *((int64_t*)ptr) = float2int8relu(_v00, _v10); - *((int64_t*)(ptr + 8)) = float2int8relu(_v01, _v11); - *((int64_t*)(ptr + 16)) = float2int8relu(_v02, _v12); - *((int64_t*)(ptr + 24)) = float2int8relu(_v03, _v13); - - intptr0 += 16; - intptr1 += 16; - ptr += 32; - } - for (; i + 1 < size; i += 2) - { - __builtin_prefetch(intptr0 + 32); - __builtin_prefetch(intptr1 + 32); - __m128 _v00 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); - __m128 _v01 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 4, 0)); - __m128 _v10 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); - __m128 _v11 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 4, 0)); - _v00 = __lsx_vfmadd_s(_scale0, _v00, _bias0); - _v01 = __lsx_vfmadd_s(_scale0, _v01, _bias0); - _v10 = __lsx_vfmadd_s(_scale1, _v10, _bias1); - _v11 = __lsx_vfmadd_s(_scale1, _v11, _bias1); - *((int64_t*)ptr) = float2int8relu(_v00, _v10); - *((int64_t*)(ptr + 8)) = float2int8relu(_v01, _v11); - - intptr0 += 8; - intptr1 += 8; - ptr += 16; - } - for (; i < size; i++) - { - __builtin_prefetch(intptr0 + 16); - __builtin_prefetch(intptr1 + 16); - __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); - __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); - _v0 = __lsx_vfmadd_s(_scale0, _v0, _bias0); - _v1 = __lsx_vfmadd_s(_scale1, _v1, _bias1); - *((int64_t*)ptr) = float2int8relu(_v0, _v1); - - intptr0 += 4; - intptr1 += 4; - ptr += 8; - } - } - } - } - if (out_elempack == 1) - { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - signed char* ptr0 = top_blob.channel(q * 4); - signed char* ptr1 = top_blob.channel(q * 4 + 1); - signed char* ptr2 = top_blob.channel(q * 4 + 2); - signed char* ptr3 = top_blob.channel(q * 4 + 3); - signed char* vp; - - __m128 _scale_in = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 4, 0); - __m128 _scale_out = scale_out_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 4, 0); - - __m128 _scale = __lsx_vfmul_s(_scale_in, _scale_out); - - int i = 0; - for (; i < size; i++) - { - __builtin_prefetch(intptr + 16); - __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - _v = __lsx_vfmul_s(_v, _scale); - __m128i v = float2int8relu(_v); - vp = (signed char*)&v; - ptr0[0] = vp[0]; - ptr1[0] = vp[1]; - ptr2[0] = vp[2]; - ptr3[0] = vp[3]; - - intptr += 4; - ptr0 += 1; - ptr1 += 1; - ptr2 += 1; - ptr3 += 1; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - signed char* ptr0 = top_blob.channel(q * 4); - signed char* ptr1 = top_blob.channel(q * 4 + 1); - signed char* ptr2 = top_blob.channel(q * 4 + 2); - signed char* ptr3 = top_blob.channel(q * 4 + 3); - signed char* vp; - - __m128 _scale_in = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 4, 0); - __m128 _scale_out = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 4, 0); - __m128 _bias = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 4, 0); - - __m128 _scale = __lsx_vfmul_s(_scale_in, _scale_out); - _bias = __lsx_vfmul_s(_bias, _scale_out); - - int i = 0; - for (; i < size; i++) - { - __builtin_prefetch(intptr + 16); - __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - _v = __lsx_vfmadd_s(_scale, _v, _bias); - __m128i v = float2int8relu(_v); - vp = (signed char*)&v; - ptr0[0] = vp[0]; - ptr1[0] = vp[1]; - ptr2[0] = vp[2]; - ptr3[0] = vp[3]; - - intptr += 4; - ptr0 += 1; - ptr1 += 1; - ptr2 += 1; - ptr3 += 1; - } - } - } - } -} diff --git a/src/layer/loongarch/requantize_relu_pack8.h b/src/layer/loongarch/requantize_relu_pack8.h deleted file mode 100644 index 3d2a45b45d0..00000000000 --- a/src/layer/loongarch/requantize_relu_pack8.h +++ /dev/null @@ -1,186 +0,0 @@ -// yala is pleased to support the open source community by making ncnn available. -// -// -// Copyright (C) 2022 yala ;. All rights reserved. -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. 
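Note on the deleted kernels: the LSX ReLU kernels below (like their MSA counterparts further down) rest on the folding spelled out in their own comments. For a non-negative scale_out, relu(x) * scale_out == relu(x * scale_out), so the input scale, bias and output scale collapse into one fused multiply-add followed by a clamp-at-zero int8 store. A minimal scalar sketch of that folding, for orientation only; the helper name and the final rounding/saturation here are illustrative and not ncnn's float2int8relu:

    static inline signed char requantize_relu_scalar(int v, float scale_in, float bias, float scale_out)
    {
        const float scale = scale_in * scale_out; // folded scale
        const float b = bias * scale_out;         // folded bias
        float f = v * scale + b;
        f = f > 0.f ? f : 0.f;                    // relu folded into the clamp
        int i = (int)(f + 0.5f);                  // round to nearest, f is non-negative here
        return (signed char)(i > 127 ? 127 : i);  // saturate to int8 (illustrative)
    }

The deleted requantize_relu_pack8_lsx below is the vectorized form of this idea: eight int32 lanes per step, packed to int8 with float2int8relu.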
- -static void requantize_relu_pack8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& scale_in_data, const Mat& scale_out_data, const Mat& bias_data, const Option& opt) -{ - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; - - int scale_in_data_size = scale_in_data.w; - int scale_out_data_size = scale_out_data.w; - int bias_data_size = bias_data.w; - - // int8(relu(v * scale_in) * scale_out) - // int8_relu(v * (scale_in * scale_out)) - - // int8(relu(v * scale_in + bias) * scale_out) - // int8_relu(v * (scale_in * scale_out) + (bias * scale_out)) - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - signed char* ptr = top_blob.channel(q); - - __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); - __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); - __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); - __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); - - __m128 _scale0 = __lsx_vfmul_s(_scale_in0, _scale_out0); - __m128 _scale1 = __lsx_vfmul_s(_scale_in1, _scale_out1); - - int i = 0; - for (; i + 3 < size; i += 4) - { - __builtin_prefetch(intptr + 128); - __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); - __m128 _v2 = __lsx_vffint_s_w(__lsx_vld(intptr + 8, 0)); - __m128 _v3 = __lsx_vffint_s_w(__lsx_vld(intptr + 12, 0)); - __m128 _v4 = __lsx_vffint_s_w(__lsx_vld(intptr + 16, 0)); - __m128 _v5 = __lsx_vffint_s_w(__lsx_vld(intptr + 20, 0)); - __m128 _v6 = __lsx_vffint_s_w(__lsx_vld(intptr + 24, 0)); - __m128 _v7 = __lsx_vffint_s_w(__lsx_vld(intptr + 28, 0)); - _v0 = __lsx_vfmul_s(_v0, _scale0); - _v1 = __lsx_vfmul_s(_v1, _scale1); - _v2 = __lsx_vfmul_s(_v2, _scale0); - _v3 = __lsx_vfmul_s(_v3, _scale1); - _v4 = __lsx_vfmul_s(_v4, _scale0); - _v5 = __lsx_vfmul_s(_v5, _scale1); - _v6 = __lsx_vfmul_s(_v6, _scale0); - _v7 = __lsx_vfmul_s(_v7, _scale1); - *((int64_t*)ptr) = float2int8relu(_v0, _v1); - *((int64_t*)(ptr + 8)) = float2int8relu(_v2, _v3); - *((int64_t*)(ptr + 16)) = float2int8relu(_v4, _v5); - *((int64_t*)(ptr + 24)) = float2int8relu(_v6, _v7); - - intptr += 32; - ptr += 32; - } - for (; i + 1 < size; i += 2) - { - __builtin_prefetch(intptr + 64); - __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); - __m128 _v2 = __lsx_vffint_s_w(__lsx_vld(intptr + 8, 0)); - __m128 _v3 = __lsx_vffint_s_w(__lsx_vld(intptr + 12, 0)); - _v0 = __lsx_vfmul_s(_v0, _scale0); - _v1 = __lsx_vfmul_s(_v1, _scale1); - _v2 = __lsx_vfmul_s(_v2, _scale0); - _v3 = __lsx_vfmul_s(_v3, _scale1); - *((int64_t*)ptr) = float2int8relu(_v0, _v1); - *((int64_t*)(ptr + 8)) = float2int8relu(_v2, _v3); - - intptr += 16; - ptr += 16; - } - for (; i < size; i++) - { - __builtin_prefetch(intptr + 32); - __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); - _v0 = __lsx_vfmul_s(_v0, _scale0); - _v1 = __lsx_vfmul_s(_v1, _scale1); - *((int64_t*)ptr) = 
float2int8relu(_v0, _v1); - - intptr += 8; - ptr += 8; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - signed char* ptr = top_blob.channel(q); - - __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); - __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); - __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); - __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); - __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8, 0); - __m128 _bias1 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8 + 4, 0); - - __m128 _scale0 = __lsx_vfmul_s(_scale_in0, _scale_out0); - __m128 _scale1 = __lsx_vfmul_s(_scale_in1, _scale_out1); - _bias0 = __lsx_vfmul_s(_bias0, _scale_out0); - _bias1 = __lsx_vfmul_s(_bias1, _scale_out1); - - int i = 0; - for (; i + 3 < size; i += 4) - { - __builtin_prefetch(intptr + 128); - __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); - __m128 _v2 = __lsx_vffint_s_w(__lsx_vld(intptr + 8, 0)); - __m128 _v3 = __lsx_vffint_s_w(__lsx_vld(intptr + 12, 0)); - __m128 _v4 = __lsx_vffint_s_w(__lsx_vld(intptr + 16, 0)); - __m128 _v5 = __lsx_vffint_s_w(__lsx_vld(intptr + 20, 0)); - __m128 _v6 = __lsx_vffint_s_w(__lsx_vld(intptr + 24, 0)); - __m128 _v7 = __lsx_vffint_s_w(__lsx_vld(intptr + 28, 0)); - _v0 = __lsx_vfmadd_s(_scale0, _v0, _bias0); - _v1 = __lsx_vfmadd_s(_scale1, _v1, _bias1); - _v2 = __lsx_vfmadd_s(_scale0, _v2, _bias0); - _v3 = __lsx_vfmadd_s(_scale1, _v3, _bias1); - _v4 = __lsx_vfmadd_s(_scale0, _v4, _bias0); - _v5 = __lsx_vfmadd_s(_scale1, _v5, _bias1); - _v6 = __lsx_vfmadd_s(_scale0, _v6, _bias0); - _v7 = __lsx_vfmadd_s(_scale1, _v7, _bias1); - *((int64_t*)ptr) = float2int8relu(_v0, _v1); - *((int64_t*)(ptr + 8)) = float2int8relu(_v2, _v3); - *((int64_t*)(ptr + 16)) = float2int8relu(_v4, _v5); - *((int64_t*)(ptr + 24)) = float2int8relu(_v6, _v7); - - intptr += 32; - ptr += 32; - } - for (; i + 1 < size; i += 2) - { - __builtin_prefetch(intptr + 64); - __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); - __m128 _v2 = __lsx_vffint_s_w(__lsx_vld(intptr + 8, 0)); - __m128 _v3 = __lsx_vffint_s_w(__lsx_vld(intptr + 12, 0)); - _v0 = __lsx_vfmadd_s(_scale0, _v0, _bias0); - _v1 = __lsx_vfmadd_s(_scale1, _v1, _bias1); - _v2 = __lsx_vfmadd_s(_scale0, _v2, _bias0); - _v3 = __lsx_vfmadd_s(_scale1, _v3, _bias1); - *((int64_t*)ptr) = float2int8relu(_v0, _v1); - *((int64_t*)(ptr + 8)) = float2int8relu(_v2, _v3); - - intptr += 16; - ptr += 16; - } - for (; i < size; i++) - { - __builtin_prefetch(intptr + 32); - __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); - _v0 = __lsx_vfmadd_s(_scale0, _v0, _bias0); - _v1 = __lsx_vfmadd_s(_scale1, _v1, _bias1); - *((int64_t*)ptr) = float2int8relu(_v0, _v1); - - intptr += 8; - ptr += 8; - } - } - } -} diff --git 
a/src/layer/mips/requantize_leakyrelu_pack4.h b/src/layer/mips/requantize_leakyrelu_pack4.h deleted file mode 100644 index 89bc14bd08b..00000000000 --- a/src/layer/mips/requantize_leakyrelu_pack4.h +++ /dev/null @@ -1,267 +0,0 @@ -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. - -static void requantize_leakyrelu_pack4_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& scale_in_data, const Mat& scale_out_data, const Mat& bias_data, float slope, const Option& opt) -{ - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; - int outc = top_blob.c; - int out_elempack = top_blob.elempack; - - int scale_in_data_size = scale_in_data.w; - int scale_out_data_size = scale_out_data.w; - int bias_data_size = bias_data.w; - - // int8(leakyrelu(v * scale_in, slope) * scale_out) - // int8_leakyrelu(v * (scale_in * scale_out), slope) - - // int8(leakyrelu(v * scale_in + bias, slope) * scale_out) - // int8_leakyrelu(v * (scale_in * scale_out) + (bias * scale_out), slope) - - if (out_elempack == 8) - { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) - { - const int* intptr0 = bottom_blob.channel(q * 2); - const int* intptr1 = bottom_blob.channel(q * 2 + 1); - signed char* ptr = top_blob.channel(q); - - v4f32 _scale_in0 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + q * 8, 0); - v4f32 _scale_in1 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + q * 8 + 4, 0); - v4f32 _scale_out0 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + q * 8, 0); - v4f32 _scale_out1 = scale_out_data_size == 1 ? 
(v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + q * 8 + 4, 0); - - v4f32 _scale0 = __msa_fmul_w(_scale_in0, _scale_out0); - v4f32 _scale1 = __msa_fmul_w(_scale_in1, _scale_out1); - v4f32 _slope = (v4f32)__msa_fill_w_f32(slope); - - int i = 0; - for (; i + 3 < size; i += 4) - { - __builtin_prefetch(intptr0 + 64); - __builtin_prefetch(intptr1 + 64); - v4f32 _v00 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr0, 0)); - v4f32 _v01 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr0 + 4, 0)); - v4f32 _v02 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr0 + 8, 0)); - v4f32 _v03 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr0 + 12, 0)); - v4f32 _v10 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr1, 0)); - v4f32 _v11 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr1 + 4, 0)); - v4f32 _v12 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr1 + 8, 0)); - v4f32 _v13 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr1 + 12, 0)); - _v00 = __msa_fmul_w(_v00, _scale0); - _v01 = __msa_fmul_w(_v01, _scale0); - _v02 = __msa_fmul_w(_v02, _scale0); - _v03 = __msa_fmul_w(_v03, _scale0); - _v10 = __msa_fmul_w(_v10, _scale1); - _v11 = __msa_fmul_w(_v11, _scale1); - _v12 = __msa_fmul_w(_v12, _scale1); - _v13 = __msa_fmul_w(_v13, _scale1); - *((int64_t*)ptr) = float2int8leakyrelu(_v00, _v10, _slope); - *((int64_t*)(ptr + 8)) = float2int8leakyrelu(_v01, _v11, _slope); - *((int64_t*)(ptr + 16)) = float2int8leakyrelu(_v02, _v12, _slope); - *((int64_t*)(ptr + 24)) = float2int8leakyrelu(_v03, _v13, _slope); - - intptr0 += 16; - intptr1 += 16; - ptr += 32; - } - for (; i < size; i++) - { - __builtin_prefetch(intptr0 + 16); - __builtin_prefetch(intptr1 + 16); - v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr0, 0)); - v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr1, 0)); - _v0 = __msa_fmul_w(_v0, _scale0); - _v1 = __msa_fmul_w(_v1, _scale1); - *((int64_t*)ptr) = float2int8leakyrelu(_v0, _v1, _slope); - - intptr0 += 4; - intptr1 += 4; - ptr += 8; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) - { - const int* intptr0 = bottom_blob.channel(q * 2); - const int* intptr1 = bottom_blob.channel(q * 2 + 1); - signed char* ptr = top_blob.channel(q); - - v4f32 _scale_in0 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + q * 8, 0); - v4f32 _scale_in1 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + q * 8 + 4, 0); - v4f32 _scale_out0 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + q * 8, 0); - v4f32 _scale_out1 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + q * 8 + 4, 0); - v4f32 _bias0 = bias_data_size == 1 ? (v4f32)__msa_fill_w_f32(bias_data[0]) : (v4f32)__msa_ld_w((const float*)bias_data + q * 8, 0); - v4f32 _bias1 = bias_data_size == 1 ? 
(v4f32)__msa_fill_w_f32(bias_data[0]) : (v4f32)__msa_ld_w((const float*)bias_data + q * 8 + 4, 0); - - v4f32 _scale0 = __msa_fmul_w(_scale_in0, _scale_out0); - v4f32 _scale1 = __msa_fmul_w(_scale_in1, _scale_out1); - _bias0 = __msa_fmul_w(_bias0, _scale_out0); - _bias1 = __msa_fmul_w(_bias1, _scale_out1); - v4f32 _slope = (v4f32)__msa_fill_w_f32(slope); - - int i = 0; - for (; i + 3 < size; i += 4) - { - __builtin_prefetch(intptr0 + 64); - __builtin_prefetch(intptr1 + 64); - v4f32 _v00 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr0, 0)); - v4f32 _v01 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr0 + 4, 0)); - v4f32 _v02 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr0 + 8, 0)); - v4f32 _v03 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr0 + 12, 0)); - v4f32 _v10 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr1, 0)); - v4f32 _v11 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr1 + 4, 0)); - v4f32 _v12 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr1 + 8, 0)); - v4f32 _v13 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr1 + 12, 0)); - _v00 = __msa_fmadd_w(_bias0, _v00, _scale0); - _v01 = __msa_fmadd_w(_bias0, _v01, _scale0); - _v02 = __msa_fmadd_w(_bias0, _v02, _scale0); - _v03 = __msa_fmadd_w(_bias0, _v03, _scale0); - _v10 = __msa_fmadd_w(_bias1, _v10, _scale1); - _v11 = __msa_fmadd_w(_bias1, _v11, _scale1); - _v12 = __msa_fmadd_w(_bias1, _v12, _scale1); - _v13 = __msa_fmadd_w(_bias1, _v13, _scale1); - *((int64_t*)ptr) = float2int8leakyrelu(_v00, _v10, _slope); - *((int64_t*)(ptr + 8)) = float2int8leakyrelu(_v01, _v11, _slope); - *((int64_t*)(ptr + 16)) = float2int8leakyrelu(_v02, _v12, _slope); - *((int64_t*)(ptr + 24)) = float2int8leakyrelu(_v03, _v13, _slope); - - intptr0 += 16; - intptr1 += 16; - ptr += 32; - } - for (; i + 1 < size; i += 2) - { - __builtin_prefetch(intptr0 + 32); - __builtin_prefetch(intptr1 + 32); - v4f32 _v00 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr0, 0)); - v4f32 _v01 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr0 + 4, 0)); - v4f32 _v10 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr1, 0)); - v4f32 _v11 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr1 + 4, 0)); - _v00 = __msa_fmadd_w(_bias0, _v00, _scale0); - _v01 = __msa_fmadd_w(_bias0, _v01, _scale0); - _v10 = __msa_fmadd_w(_bias1, _v10, _scale1); - _v11 = __msa_fmadd_w(_bias1, _v11, _scale1); - *((int64_t*)ptr) = float2int8leakyrelu(_v00, _v10, _slope); - *((int64_t*)(ptr + 8)) = float2int8leakyrelu(_v01, _v11, _slope); - - intptr0 += 8; - intptr1 += 8; - ptr += 16; - } - for (; i < size; i++) - { - __builtin_prefetch(intptr0 + 16); - __builtin_prefetch(intptr1 + 16); - v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr0, 0)); - v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr1, 0)); - _v0 = __msa_fmadd_w(_bias0, _v0, _scale0); - _v1 = __msa_fmadd_w(_bias1, _v1, _scale1); - *((int64_t*)ptr) = float2int8leakyrelu(_v0, _v1, _slope); - - intptr0 += 4; - intptr1 += 4; - ptr += 8; - } - } - } - } - if (out_elempack == 1) - { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - signed char* ptr0 = top_blob.channel(q * 4); - signed char* ptr1 = top_blob.channel(q * 4 + 1); - signed char* ptr2 = top_blob.channel(q * 4 + 2); - signed char* ptr3 = top_blob.channel(q * 4 + 3); - - v4f32 _scale_in = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + q * 4, 0); - v4f32 _scale_out = scale_out_data_size == 1 ? 
(v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + q * 4, 0); - - v4f32 _scale = __msa_fmul_w(_scale_in, _scale_out); - v4f32 _slope = (v4f32)__msa_fill_w_f32(slope); - - int i = 0; - for (; i < size; i++) - { - __builtin_prefetch(intptr + 16); - v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - _v = __msa_fmul_w(_v, _scale); - v16i8 v = float2int8leakyrelu(_v, _slope); - ptr0[0] = v[0]; - ptr1[0] = v[1]; - ptr2[0] = v[2]; - ptr3[0] = v[3]; - - intptr += 4; - ptr0 += 1; - ptr1 += 1; - ptr2 += 1; - ptr3 += 1; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - signed char* ptr0 = top_blob.channel(q * 4); - signed char* ptr1 = top_blob.channel(q * 4 + 1); - signed char* ptr2 = top_blob.channel(q * 4 + 2); - signed char* ptr3 = top_blob.channel(q * 4 + 3); - - v4f32 _scale_in = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + q * 4, 0); - v4f32 _scale_out = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + q * 4, 0); - v4f32 _bias = bias_data_size == 1 ? (v4f32)__msa_fill_w_f32(bias_data[0]) : (v4f32)__msa_ld_w((const float*)bias_data + q * 4, 0); - - v4f32 _scale = __msa_fmul_w(_scale_in, _scale_out); - _bias = __msa_fmul_w(_bias, _scale_out); - v4f32 _slope = (v4f32)__msa_fill_w_f32(slope); - - int i = 0; - for (; i < size; i++) - { - __builtin_prefetch(intptr + 16); - v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - _v = __msa_fmadd_w(_bias, _v, _scale); - v16i8 v = float2int8leakyrelu(_v, _slope); - ptr0[0] = v[0]; - ptr1[0] = v[1]; - ptr2[0] = v[2]; - ptr3[0] = v[3]; - - intptr += 4; - ptr0 += 1; - ptr1 += 1; - ptr2 += 1; - ptr3 += 1; - } - } - } - } -} diff --git a/src/layer/mips/requantize_leakyrelu_pack8.h b/src/layer/mips/requantize_leakyrelu_pack8.h deleted file mode 100644 index f7968c9df70..00000000000 --- a/src/layer/mips/requantize_leakyrelu_pack8.h +++ /dev/null @@ -1,188 +0,0 @@ -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. 
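The same folding carries over to the LeakyReLU kernels deleted below: with a non-negative scale_out, leakyrelu(x, slope) * scale_out == leakyrelu(x * scale_out, slope), which is the identity noted in their header comments. An illustrative scalar equivalent; the name and the rounding/clamping are a sketch, not ncnn's float2int8leakyrelu:

    static inline signed char requantize_leakyrelu_scalar(int v, float scale_in, float bias, float scale_out, float slope)
    {
        const float scale = scale_in * scale_out; // folded scale
        const float b = bias * scale_out;         // folded bias
        float f = v * scale + b;
        f = f > 0.f ? f : f * slope;              // leakyrelu applied after folding
        int i = (int)(f + (f >= 0.f ? 0.5f : -0.5f)); // round to nearest
        if (i > 127) i = 127;                     // saturate to int8 (illustrative)
        if (i < -127) i = -127;
        return (signed char)i;
    }

The MSA kernels below vectorize this with float2int8leakyrelu over eight lanes per iteration.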
- -static void requantize_leakyrelu_pack8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& scale_in_data, const Mat& scale_out_data, const Mat& bias_data, float slope, const Option& opt) -{ - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; - - int scale_in_data_size = scale_in_data.w; - int scale_out_data_size = scale_out_data.w; - int bias_data_size = bias_data.w; - - // int8(leakyrelu(v * scale_in, slope) * scale_out) - // int8_leakyrelu(v * (scale_in * scale_out), slope) - - // int8(leakyrelu(v * scale_in + bias, slope) * scale_out) - // int8_leakyrelu(v * (scale_in * scale_out) + (bias * scale_out), slope) - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - signed char* ptr = top_blob.channel(q); - - v4f32 _scale_in0 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + q * 8, 0); - v4f32 _scale_in1 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + q * 8 + 4, 0); - v4f32 _scale_out0 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + q * 8, 0); - v4f32 _scale_out1 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + q * 8 + 4, 0); - - v4f32 _scale0 = __msa_fmul_w(_scale_in0, _scale_out0); - v4f32 _scale1 = __msa_fmul_w(_scale_in1, _scale_out1); - v4f32 _slope = (v4f32)__msa_fill_w_f32(slope); - - int i = 0; - for (; i + 3 < size; i += 4) - { - __builtin_prefetch(intptr + 128); - v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0)); - v4f32 _v2 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 8, 0)); - v4f32 _v3 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 12, 0)); - v4f32 _v4 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 16, 0)); - v4f32 _v5 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 20, 0)); - v4f32 _v6 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 24, 0)); - v4f32 _v7 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 28, 0)); - _v0 = __msa_fmul_w(_v0, _scale0); - _v1 = __msa_fmul_w(_v1, _scale1); - _v2 = __msa_fmul_w(_v2, _scale0); - _v3 = __msa_fmul_w(_v3, _scale1); - _v4 = __msa_fmul_w(_v4, _scale0); - _v5 = __msa_fmul_w(_v5, _scale1); - _v6 = __msa_fmul_w(_v6, _scale0); - _v7 = __msa_fmul_w(_v7, _scale1); - *((int64_t*)ptr) = float2int8leakyrelu(_v0, _v1, _slope); - *((int64_t*)(ptr + 8)) = float2int8leakyrelu(_v2, _v3, _slope); - *((int64_t*)(ptr + 16)) = float2int8leakyrelu(_v4, _v5, _slope); - *((int64_t*)(ptr + 24)) = float2int8leakyrelu(_v6, _v7, _slope); - - intptr += 32; - ptr += 32; - } - for (; i + 1 < size; i += 2) - { - __builtin_prefetch(intptr + 64); - v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0)); - v4f32 _v2 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 8, 0)); - v4f32 _v3 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 12, 0)); - _v0 = __msa_fmul_w(_v0, _scale0); - _v1 = __msa_fmul_w(_v1, _scale1); - _v2 = __msa_fmul_w(_v2, _scale0); - _v3 = __msa_fmul_w(_v3, _scale1); - *((int64_t*)ptr) = float2int8leakyrelu(_v0, _v1, _slope); - *((int64_t*)(ptr + 8)) = float2int8leakyrelu(_v2, _v3, _slope); - - intptr += 16; - ptr += 16; - } - for (; i < size; i++) - { - __builtin_prefetch(intptr 
+ 32); - v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0)); - _v0 = __msa_fmul_w(_v0, _scale0); - _v1 = __msa_fmul_w(_v1, _scale1); - *((int64_t*)ptr) = float2int8leakyrelu(_v0, _v1, _slope); - - intptr += 8; - ptr += 8; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - signed char* ptr = top_blob.channel(q); - - v4f32 _scale_in0 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + q * 8, 0); - v4f32 _scale_in1 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + q * 8 + 4, 0); - v4f32 _scale_out0 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + q * 8, 0); - v4f32 _scale_out1 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + q * 8 + 4, 0); - v4f32 _bias0 = bias_data_size == 1 ? (v4f32)__msa_fill_w_f32(bias_data[0]) : (v4f32)__msa_ld_w((const float*)bias_data + q * 8, 0); - v4f32 _bias1 = bias_data_size == 1 ? (v4f32)__msa_fill_w_f32(bias_data[0]) : (v4f32)__msa_ld_w((const float*)bias_data + q * 8 + 4, 0); - - v4f32 _scale0 = __msa_fmul_w(_scale_in0, _scale_out0); - v4f32 _scale1 = __msa_fmul_w(_scale_in1, _scale_out1); - _bias0 = __msa_fmul_w(_bias0, _scale_out0); - _bias1 = __msa_fmul_w(_bias1, _scale_out1); - v4f32 _slope = (v4f32)__msa_fill_w_f32(slope); - - int i = 0; - for (; i + 3 < size; i += 4) - { - __builtin_prefetch(intptr + 128); - v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0)); - v4f32 _v2 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 8, 0)); - v4f32 _v3 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 12, 0)); - v4f32 _v4 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 16, 0)); - v4f32 _v5 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 20, 0)); - v4f32 _v6 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 24, 0)); - v4f32 _v7 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 28, 0)); - _v0 = __msa_fmadd_w(_bias0, _v0, _scale0); - _v1 = __msa_fmadd_w(_bias1, _v1, _scale1); - _v2 = __msa_fmadd_w(_bias0, _v2, _scale0); - _v3 = __msa_fmadd_w(_bias1, _v3, _scale1); - _v4 = __msa_fmadd_w(_bias0, _v4, _scale0); - _v5 = __msa_fmadd_w(_bias1, _v5, _scale1); - _v6 = __msa_fmadd_w(_bias0, _v6, _scale0); - _v7 = __msa_fmadd_w(_bias1, _v7, _scale1); - *((int64_t*)ptr) = float2int8leakyrelu(_v0, _v1, _slope); - *((int64_t*)(ptr + 8)) = float2int8leakyrelu(_v2, _v3, _slope); - *((int64_t*)(ptr + 16)) = float2int8leakyrelu(_v4, _v5, _slope); - *((int64_t*)(ptr + 24)) = float2int8leakyrelu(_v6, _v7, _slope); - - intptr += 32; - ptr += 32; - } - for (; i + 1 < size; i += 2) - { - __builtin_prefetch(intptr + 64); - v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0)); - v4f32 _v2 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 8, 0)); - v4f32 _v3 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 12, 0)); - _v0 = __msa_fmadd_w(_bias0, _v0, _scale0); - _v1 = __msa_fmadd_w(_bias1, _v1, _scale1); - _v2 = __msa_fmadd_w(_bias0, _v2, _scale0); - _v3 = __msa_fmadd_w(_bias1, _v3, _scale1); - *((int64_t*)ptr) = float2int8leakyrelu(_v0, _v1, _slope); - *((int64_t*)(ptr + 8)) = float2int8leakyrelu(_v2, _v3, _slope); - - intptr 
+= 16; - ptr += 16; - } - for (; i < size; i++) - { - __builtin_prefetch(intptr + 32); - v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0)); - _v0 = __msa_fmadd_w(_bias0, _v0, _scale0); - _v1 = __msa_fmadd_w(_bias1, _v1, _scale1); - *((int64_t*)ptr) = float2int8leakyrelu(_v0, _v1, _slope); - - intptr += 8; - ptr += 8; - } - } - } -} diff --git a/src/layer/mips/requantize_mips.cpp b/src/layer/mips/requantize_mips.cpp index 44e55f89477..9c502362d3a 100644 --- a/src/layer/mips/requantize_mips.cpp +++ b/src/layer/mips/requantize_mips.cpp @@ -23,13 +23,6 @@ namespace ncnn { -#if __mips_msa -#include "requantize_leakyrelu_pack4.h" -#include "requantize_leakyrelu_pack8.h" -#include "requantize_relu_pack4.h" -#include "requantize_relu_pack8.h" -#endif // __mips_msa - Requantize_mips::Requantize_mips() { #if __mips_msa @@ -37,1344 +30,567 @@ Requantize_mips::Requantize_mips() #endif } -int Requantize_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +static void requantize_relu(const int* intptr, signed char* ptr, const Mat& scale_in_data, const Mat& bias_data, const Mat& scale_out_data, int elemcount, int elempack) { - int dims = bottom_blob.dims; - int elempack = bottom_blob.elempack; - -#if __mips_msa - if (elempack == 8) - { - if (dims == 1) - { - int w = bottom_blob.w; - - top_blob.create(w, (size_t)8u, 8, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (scale_in_data_size == 1 && scale_out_data_size == 1) - { - v4f32 _scale_in = (v4f32)__msa_fill_w_f32(scale_in_data[0]); - v4f32 _scale_out = (v4f32)__msa_fill_w_f32(scale_out_data[0]); - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - signed char* ptr = (signed char*)top_blob + i * 8; - - v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0)); - _v0 = __msa_fmul_w(_v0, _scale_in); - _v1 = __msa_fmul_w(_v1, _scale_in); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = __msa_fmul_w(_v0, _scale_out); - _v1 = __msa_fmul_w(_v1, _scale_out); - *((int64_t*)ptr) = float2int8(_v0, _v1); - } - } - else if (bias_data_size == 1) - { - v4f32 _bias = (v4f32)__msa_fill_w_f32(bias_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - signed char* ptr = (signed char*)top_blob + i * 8; - - v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0)); - _v0 = __msa_fmadd_w(_bias, _v0, _scale_in); - _v1 = __msa_fmadd_w(_bias, _v1, _scale_in); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = __msa_fmul_w(_v0, _scale_out); - _v1 = __msa_fmul_w(_v1, _scale_out); - *((int64_t*)ptr) = float2int8(_v0, _v1); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - signed char* ptr = (signed char*)top_blob + i * 8; - - v4f32 _bias0 = bias_data_size == 1 ? (v4f32)__msa_fill_w_f32(bias_data[0]) : (v4f32)__msa_ld_w((const float*)bias_data + i * 8, 0); - v4f32 _bias1 = bias_data_size == 1 ? 
(v4f32)__msa_fill_w_f32(bias_data[0]) : (v4f32)__msa_ld_w((const float*)bias_data + i * 8 + 4, 0); - v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0)); - _v0 = __msa_fmadd_w(_bias0, _v0, _scale_in); - _v1 = __msa_fmadd_w(_bias1, _v1, _scale_in); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = __msa_fmul_w(_v0, _scale_out); - _v1 = __msa_fmul_w(_v1, _scale_out); - *((int64_t*)ptr) = float2int8(_v0, _v1); - } - } - } - else if (scale_in_data_size == 1 && scale_out_data_size > 1) - { - v4f32 _scale_in = (v4f32)__msa_fill_w_f32(scale_in_data[0]); + const int scale_in_data_size = scale_in_data.w; + const int bias_data_size = bias_data.w; + const int scale_out_data_size = scale_out_data.w; + const int size = elemcount * elempack; - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - signed char* ptr = (signed char*)top_blob + i * 8; + // NCNN_LOGE("requantize_relu %d %d %d %d %d", scale_in_data_size, bias_data_size, scale_out_data_size, elemcount, elempack); - v4f32 _scale_out0 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + i * 8, 0); - v4f32 _scale_out1 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + i * 8 + 4, 0); - v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0)); - _v0 = __msa_fmul_w(_v0, _scale_in); - _v1 = __msa_fmul_w(_v1, _scale_in); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = __msa_fmul_w(_v0, _scale_out0); - _v1 = __msa_fmul_w(_v1, _scale_out1); - *((int64_t*)ptr) = float2int8(_v0, _v1); - } - } - else if (bias_data_size == 1) - { - v4f32 _bias = (v4f32)__msa_fill_w_f32(bias_data[0]); + // int8(relu(v * scale_in) * scale_out) + // int8_relu(v * (scale_in * scale_out)) - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - signed char* ptr = (signed char*)top_blob + i * 8; + // int8(relu(v * scale_in + bias) * scale_out) + // int8_relu(v * (scale_in * scale_out) + (bias * scale_out)) - v4f32 _scale_out0 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + i * 8, 0); - v4f32 _scale_out1 = scale_out_data_size == 1 ? 
(v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + i * 8 + 4, 0); - v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0)); - _v0 = __msa_fmadd_w(_bias, _v0, _scale_in); - _v1 = __msa_fmadd_w(_bias, _v1, _scale_in); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = __msa_fmul_w(_v0, _scale_out0); - _v1 = __msa_fmul_w(_v1, _scale_out1); - *((int64_t*)ptr) = float2int8(_v0, _v1); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - signed char* ptr = (signed char*)top_blob + i * 8; - - v4f32 _scale_out0 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + i * 8, 0); - v4f32 _scale_out1 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + i * 8 + 4, 0); - v4f32 _bias0 = bias_data_size == 1 ? (v4f32)__msa_fill_w_f32(bias_data[0]) : (v4f32)__msa_ld_w((const float*)bias_data + i * 8, 0); - v4f32 _bias1 = bias_data_size == 1 ? (v4f32)__msa_fill_w_f32(bias_data[0]) : (v4f32)__msa_ld_w((const float*)bias_data + i * 8 + 4, 0); - v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0)); - _v0 = __msa_fmadd_w(_bias0, _v0, _scale_in); - _v1 = __msa_fmadd_w(_bias1, _v1, _scale_in); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = __msa_fmul_w(_v0, _scale_out0); - _v1 = __msa_fmul_w(_v1, _scale_out1); - *((int64_t*)ptr) = float2int8(_v0, _v1); - } - } - } - else if (scale_in_data_size > 1 && scale_out_data_size == 1) - { - v4f32 _scale_out = (v4f32)__msa_fill_w_f32(scale_out_data[0]); - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - signed char* ptr = (signed char*)top_blob + i * 8; - - v4f32 _scale_in0 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + i * 8, 0); - v4f32 _scale_in1 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + i * 8 + 4, 0); - v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0)); - _v0 = __msa_fmul_w(_v0, _scale_in0); - _v1 = __msa_fmul_w(_v1, _scale_in1); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = __msa_fmul_w(_v0, _scale_out); - _v1 = __msa_fmul_w(_v1, _scale_out); - *((int64_t*)ptr) = float2int8(_v0, _v1); - } - } - else if (bias_data_size == 1) - { - v4f32 _bias = (v4f32)__msa_fill_w_f32(bias_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - signed char* ptr = (signed char*)top_blob + i * 8; - - v4f32 _scale_in0 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + i * 8, 0); - v4f32 _scale_in1 = scale_in_data_size == 1 ? 
(v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + i * 8 + 4, 0); - v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0)); - _v0 = __msa_fmadd_w(_bias, _v0, _scale_in0); - _v1 = __msa_fmadd_w(_bias, _v1, _scale_in1); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = __msa_fmul_w(_v0, _scale_out); - _v1 = __msa_fmul_w(_v1, _scale_out); - *((int64_t*)ptr) = float2int8(_v0, _v1); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - signed char* ptr = (signed char*)top_blob + i * 8; - - v4f32 _scale_in0 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + i * 8, 0); - v4f32 _scale_in1 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + i * 8 + 4, 0); - v4f32 _bias0 = bias_data_size == 1 ? (v4f32)__msa_fill_w_f32(bias_data[0]) : (v4f32)__msa_ld_w((const float*)bias_data + i * 8, 0); - v4f32 _bias1 = bias_data_size == 1 ? (v4f32)__msa_fill_w_f32(bias_data[0]) : (v4f32)__msa_ld_w((const float*)bias_data + i * 8 + 4, 0); - v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0)); - _v0 = __msa_fmadd_w(_bias0, _v0, _scale_in0); - _v1 = __msa_fmadd_w(_bias1, _v1, _scale_in1); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = __msa_fmul_w(_v0, _scale_out); - _v1 = __msa_fmul_w(_v1, _scale_out); - *((int64_t*)ptr) = float2int8(_v0, _v1); - } - } - } - else // if (scale_in_data_size > 1 && scale_out_data_size > 1) - { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - signed char* ptr = (signed char*)top_blob + i * 8; - - v4f32 _scale_in0 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + i * 8, 0); - v4f32 _scale_in1 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + i * 8 + 4, 0); - v4f32 _scale_out0 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + i * 8, 0); - v4f32 _scale_out1 = scale_out_data_size == 1 ? 
(v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + i * 8 + 4, 0); - v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0)); - _v0 = __msa_fmul_w(_v0, _scale_in0); - _v1 = __msa_fmul_w(_v1, _scale_in1); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = __msa_fmul_w(_v0, _scale_out0); - _v1 = __msa_fmul_w(_v1, _scale_out1); - *((int64_t*)ptr) = float2int8(_v0, _v1); - } - } - else if (bias_data_size == 1) - { - v4f32 _bias = (v4f32)__msa_fill_w_f32(bias_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - signed char* ptr = (signed char*)top_blob + i * 8; - - v4f32 _scale_in0 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + i * 8, 0); - v4f32 _scale_in1 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + i * 8 + 4, 0); - v4f32 _scale_out0 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + i * 8, 0); - v4f32 _scale_out1 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + i * 8 + 4, 0); - v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0)); - _v0 = __msa_fmadd_w(_bias, _v0, _scale_in0); - _v1 = __msa_fmadd_w(_bias, _v1, _scale_in1); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = __msa_fmul_w(_v0, _scale_out0); - _v1 = __msa_fmul_w(_v1, _scale_out1); - *((int64_t*)ptr) = float2int8(_v0, _v1); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - signed char* ptr = (signed char*)top_blob + i * 8; - - v4f32 _scale_in0 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + i * 8, 0); - v4f32 _scale_in1 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + i * 8 + 4, 0); - v4f32 _scale_out0 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + i * 8, 0); - v4f32 _scale_out1 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + i * 8 + 4, 0); - v4f32 _bias0 = bias_data_size == 1 ? (v4f32)__msa_fill_w_f32(bias_data[0]) : (v4f32)__msa_ld_w((const float*)bias_data + i * 8, 0); - v4f32 _bias1 = bias_data_size == 1 ? 
(v4f32)__msa_fill_w_f32(bias_data[0]) : (v4f32)__msa_ld_w((const float*)bias_data + i * 8 + 4, 0); - v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0)); - _v0 = __msa_fmadd_w(_bias0, _v0, _scale_in0); - _v1 = __msa_fmadd_w(_bias1, _v1, _scale_in1); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = __msa_fmul_w(_v0, _scale_out0); - _v1 = __msa_fmul_w(_v1, _scale_out1); - *((int64_t*)ptr) = float2int8(_v0, _v1); - } - } - } + float scale_in = scale_in_data[0]; +#if __mips_msa + v4f32 _scale_in0 = (v4f32)__msa_fill_w_f32(scale_in); + v4f32 _scale_in1 = _scale_in0; + if (scale_in_data_size > 1) + { + if (elempack == 8) + { + _scale_in0 = (v4f32)__msa_ld_w((const float*)scale_in_data, 0); + _scale_in1 = (v4f32)__msa_ld_w((const float*)scale_in_data + 4, 0); } - - if (dims == 2) + if (elempack == 4) { - int w = bottom_blob.w; - int h = bottom_blob.h; - - top_blob.create(w, h, (size_t)8u, 8, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - signed char* ptr = top_blob.row(i); - - v4f32 _scale_in0 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + i * 8, 0); - v4f32 _scale_in1 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + i * 8 + 4, 0); - v4f32 _scale_out0 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + i * 8, 0); - v4f32 _scale_out1 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + i * 8 + 4, 0); - - for (int j = 0; j < w; j++) - { - __builtin_prefetch(intptr + 32); - v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0)); - _v0 = __msa_fmul_w(_v0, _scale_in0); - _v1 = __msa_fmul_w(_v1, _scale_in1); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = __msa_fmul_w(_v0, _scale_out0); - _v1 = __msa_fmul_w(_v1, _scale_out1); - *((int64_t*)ptr) = float2int8(_v0, _v1); - - intptr += 8; - ptr += 8; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - signed char* ptr = top_blob.row(i); - - v4f32 _scale_in0 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + i * 8, 0); - v4f32 _scale_in1 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + i * 8 + 4, 0); - v4f32 _scale_out0 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + i * 8, 0); - v4f32 _scale_out1 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + i * 8 + 4, 0); - v4f32 _bias0 = bias_data_size == 1 ? (v4f32)__msa_fill_w_f32(bias_data[0]) : (v4f32)__msa_ld_w((const float*)bias_data + i * 8, 0); - v4f32 _bias1 = bias_data_size == 1 ? 
(v4f32)__msa_fill_w_f32(bias_data[0]) : (v4f32)__msa_ld_w((const float*)bias_data + i * 8 + 4, 0); - - for (int j = 0; j < w; j++) - { - __builtin_prefetch(intptr + 32); - v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0)); - _v0 = __msa_fmadd_w(_bias0, _v0, _scale_in0); - _v1 = __msa_fmadd_w(_bias1, _v1, _scale_in1); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = __msa_fmul_w(_v0, _scale_out0); - _v1 = __msa_fmul_w(_v1, _scale_out1); - *((int64_t*)ptr) = float2int8(_v0, _v1); - - intptr += 8; - ptr += 8; - } - } - } + _scale_in0 = (v4f32)__msa_ld_w((const float*)scale_in_data, 0); + _scale_in1 = _scale_in0; } + } +#endif // __mips_msa - if (dims == 3) + float scale_out = scale_out_data[0]; +#if __mips_msa + v4f32 _scale_out0 = (v4f32)__msa_fill_w_f32(scale_out); + v4f32 _scale_out1 = _scale_out0; + if (scale_out_data_size > 1) + { + if (elempack == 8) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; - - top_blob.create(w, h, channels, (size_t)8u, 8, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (activation_type == 1) - { - requantize_relu_pack8_msa(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, opt); - return 0; - } - - if (activation_type == 2 && activation_params[0] > 0.f) - { - requantize_leakyrelu_pack8_msa(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, activation_params[0], opt); - return 0; - } - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - signed char* ptr = top_blob.channel(q); - - v4f32 _scale_in0 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + q * 8, 0); - v4f32 _scale_in1 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + q * 8 + 4, 0); - v4f32 _scale_out0 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + q * 8, 0); - v4f32 _scale_out1 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + q * 8 + 4, 0); - - for (int i = 0; i < size; i++) - { - __builtin_prefetch(intptr + 32); - v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0)); - _v0 = __msa_fmul_w(_v0, _scale_in0); - _v1 = __msa_fmul_w(_v1, _scale_in1); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = __msa_fmul_w(_v0, _scale_out0); - _v1 = __msa_fmul_w(_v1, _scale_out1); - *((int64_t*)ptr) = float2int8(_v0, _v1); - - intptr += 8; - ptr += 8; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - signed char* ptr = top_blob.channel(q); - - v4f32 _scale_in0 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + q * 8, 0); - v4f32 _scale_in1 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + q * 8 + 4, 0); - v4f32 _scale_out0 = scale_out_data_size == 1 ? 
(v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + q * 8, 0); - v4f32 _scale_out1 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + q * 8 + 4, 0); - v4f32 _bias0 = bias_data_size == 1 ? (v4f32)__msa_fill_w_f32(bias_data[0]) : (v4f32)__msa_ld_w((const float*)bias_data + q * 8, 0); - v4f32 _bias1 = bias_data_size == 1 ? (v4f32)__msa_fill_w_f32(bias_data[0]) : (v4f32)__msa_ld_w((const float*)bias_data + q * 8 + 4, 0); + _scale_out0 = (v4f32)__msa_ld_w((const float*)scale_out_data, 0); + _scale_out1 = (v4f32)__msa_ld_w((const float*)scale_out_data + 4, 0); + } + if (elempack == 4) + { + _scale_out0 = (v4f32)__msa_ld_w((const float*)scale_out_data, 0); + _scale_out1 = _scale_out0; + } + } +#endif // __mips_msa - for (int i = 0; i < size; i++) - { - __builtin_prefetch(intptr + 32); - v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0)); - _v0 = __msa_fmadd_w(_bias0, _v0, _scale_in0); - _v1 = __msa_fmadd_w(_bias1, _v1, _scale_in1); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = __msa_fmul_w(_v0, _scale_out0); - _v1 = __msa_fmul_w(_v1, _scale_out1); - *((int64_t*)ptr) = float2int8(_v0, _v1); + float scale = scale_in * scale_out; +#if __mips_msa + v4f32 _scale0 = __msa_fmul_w(_scale_in0, _scale_out0); + v4f32 _scale1 = __msa_fmul_w(_scale_in1, _scale_out1); +#endif // __mips_msa - intptr += 8; - ptr += 8; - } - } - } + if (bias_data_size == 0) + { + int i = 0; +#if __mips_msa + for (; i + 7 < size; i += 8) + { + __builtin_prefetch(intptr + 32); + v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); + v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0)); + _v0 = __msa_fmul_w(_v0, _scale0); + _v1 = __msa_fmul_w(_v1, _scale1); + *((int64_t*)ptr) = float2int8relu(_v0, _v1); + intptr += 8; + ptr += 8; + } + for (; i + 3 < size; i += 4) + { + v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); + _v = __msa_fmul_w(_v, _scale0); + v16i8 v = float2int8relu(_v); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + intptr += 4; + ptr += 4; + } +#endif // __mips_msa + for (; i < size; i++) + { + float v = *intptr * scale; + *ptr = float2int8(v); + if (*ptr < 0) *ptr = 0; + intptr++; + ptr++; } - - return 0; } - - if (elempack == 4) + else { - if (dims == 1) + float bias = bias_data[0]; +#if __mips_msa + v4f32 _bias0 = (v4f32)__msa_fill_w_f32(bias); + v4f32 _bias1 = _bias0; + if (bias_data_size > 1) { - int w = bottom_blob.w; - int out_elempack = opt.use_packing_layout && w * elempack % 8 == 0 ? 
8 : 1; - int outw = w * elempack / out_elempack; - - top_blob.create(outw, (size_t)out_elempack, out_elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (scale_in_data_size == 1 && scale_out_data_size == 1) - { - v4f32 _scale_in = (v4f32)__msa_fill_w_f32(scale_in_data[0]); - v4f32 _scale_out = (v4f32)__msa_fill_w_f32(scale_out_data[0]); - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - signed char* ptr = (signed char*)top_blob + i * 4; - - v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - _v = __msa_fmul_w(_v, _scale_in); - _v = activation_ps(_v, activation_type, activation_params); - _v = __msa_fmul_w(_v, _scale_out); - v16i8 v = float2int8(_v); - ptr[0] = v[0]; - ptr[1] = v[1]; - ptr[2] = v[2]; - ptr[3] = v[3]; - } - } - else if (bias_data_size == 1) - { - v4f32 _bias = (v4f32)__msa_fill_w_f32(bias_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - signed char* ptr = (signed char*)top_blob + i * 4; - - v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - _v = __msa_fmadd_w(_bias, _v, _scale_in); - _v = activation_ps(_v, activation_type, activation_params); - _v = __msa_fmul_w(_v, _scale_out); - v16i8 v = float2int8(_v); - ptr[0] = v[0]; - ptr[1] = v[1]; - ptr[2] = v[2]; - ptr[3] = v[3]; - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - signed char* ptr = (signed char*)top_blob + i * 4; - - v4f32 _bias = (v4f32)__msa_ld_w((const float*)bias_data + i * 4, 0); - v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - _v = __msa_fmadd_w(_bias, _v, _scale_in); - _v = activation_ps(_v, activation_type, activation_params); - _v = __msa_fmul_w(_v, _scale_out); - v16i8 v = float2int8(_v); - ptr[0] = v[0]; - ptr[1] = v[1]; - ptr[2] = v[2]; - ptr[3] = v[3]; - } - } - } - else if (scale_in_data_size == 1 && scale_out_data_size > 1) + if (elempack == 8) { - v4f32 _scale_in = (v4f32)__msa_fill_w_f32(scale_in_data[0]); - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - signed char* ptr = (signed char*)top_blob + i * 4; - - v4f32 _scale_out = (v4f32)__msa_ld_w((const float*)scale_out_data + i * 4, 0); - v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - _v = __msa_fmul_w(_v, _scale_in); - _v = activation_ps(_v, activation_type, activation_params); - _v = __msa_fmul_w(_v, _scale_out); - v16i8 v = float2int8(_v); - ptr[0] = v[0]; - ptr[1] = v[1]; - ptr[2] = v[2]; - ptr[3] = v[3]; - } - } - else if (bias_data_size == 1) - { - v4f32 _bias = (v4f32)__msa_fill_w_f32(bias_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - signed char* ptr = (signed char*)top_blob + i * 4; - - v4f32 _scale_out = (v4f32)__msa_ld_w((const float*)scale_out_data + i * 4, 0); - v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - _v = __msa_fmadd_w(_bias, _v, _scale_in); - _v = activation_ps(_v, activation_type, activation_params); - _v = __msa_fmul_w(_v, _scale_out); - v16i8 v = float2int8(_v); - ptr[0] = v[0]; - ptr[1] = v[1]; - ptr[2] = v[2]; - ptr[3] = v[3]; - } - } - else - { - #pragma 
omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - signed char* ptr = (signed char*)top_blob + i * 4; - - v4f32 _scale_out = (v4f32)__msa_ld_w((const float*)scale_out_data + i * 4, 0); - v4f32 _bias = (v4f32)__msa_ld_w((const float*)bias_data + i * 4, 0); - v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - _v = __msa_fmadd_w(_bias, _v, _scale_in); - _v = activation_ps(_v, activation_type, activation_params); - _v = __msa_fmul_w(_v, _scale_out); - v16i8 v = float2int8(_v); - ptr[0] = v[0]; - ptr[1] = v[1]; - ptr[2] = v[2]; - ptr[3] = v[3]; - } - } + _bias0 = (v4f32)__msa_ld_w((const float*)bias_data, 0); + _bias1 = (v4f32)__msa_ld_w((const float*)bias_data + 4, 0); } - else if (scale_in_data_size > 1 && scale_out_data_size == 1) + if (elempack == 4) { - v4f32 _scale_out = (v4f32)__msa_fill_w_f32(scale_out_data[0]); - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - signed char* ptr = (signed char*)top_blob + i * 4; - - v4f32 _scale_in = (v4f32)__msa_ld_w((const float*)scale_in_data + i * 4, 0); - v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - _v = __msa_fmul_w(_v, _scale_in); - _v = activation_ps(_v, activation_type, activation_params); - _v = __msa_fmul_w(_v, _scale_out); - v16i8 v = float2int8(_v); - ptr[0] = v[0]; - ptr[1] = v[1]; - ptr[2] = v[2]; - ptr[3] = v[3]; - } - } - else if (bias_data_size == 1) - { - v4f32 _bias = (v4f32)__msa_fill_w_f32(bias_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - signed char* ptr = (signed char*)top_blob + i * 4; - - v4f32 _scale_in = (v4f32)__msa_ld_w((const float*)scale_in_data + i * 4, 0); - v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - _v = __msa_fmadd_w(_bias, _v, _scale_in); - _v = activation_ps(_v, activation_type, activation_params); - _v = __msa_fmul_w(_v, _scale_out); - v16i8 v = float2int8(_v); - ptr[0] = v[0]; - ptr[1] = v[1]; - ptr[2] = v[2]; - ptr[3] = v[3]; - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - signed char* ptr = (signed char*)top_blob + i * 4; - - v4f32 _scale_in = (v4f32)__msa_ld_w((const float*)scale_in_data + i * 4, 0); - v4f32 _bias = (v4f32)__msa_ld_w((const float*)bias_data + i * 4, 0); - v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - _v = __msa_fmadd_w(_bias, _v, _scale_in); - _v = activation_ps(_v, activation_type, activation_params); - _v = __msa_fmul_w(_v, _scale_out); - v16i8 v = float2int8(_v); - ptr[0] = v[0]; - ptr[1] = v[1]; - ptr[2] = v[2]; - ptr[3] = v[3]; - } - } + _bias0 = (v4f32)__msa_ld_w((const float*)bias_data, 0); + _bias1 = _bias0; } - else // if (scale_in_data_size > 1 && scale_out_data_size > 1) - { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - signed char* ptr = (signed char*)top_blob + i * 4; - - v4f32 _scale_in = (v4f32)__msa_ld_w((const float*)scale_in_data + i * 4, 0); - v4f32 _scale_out = (v4f32)__msa_ld_w((const float*)scale_out_data + i * 4, 0); - v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - _v = __msa_fmul_w(_v, _scale_in); - _v = activation_ps(_v, 
activation_type, activation_params); - _v = __msa_fmul_w(_v, _scale_out); - v16i8 v = float2int8(_v); - ptr[0] = v[0]; - ptr[1] = v[1]; - ptr[2] = v[2]; - ptr[3] = v[3]; - } - } - else if (bias_data_size == 1) - { - v4f32 _bias = (v4f32)__msa_fill_w_f32(bias_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - signed char* ptr = (signed char*)top_blob + i * 4; + } +#endif // __mips_msa - v4f32 _scale_in = (v4f32)__msa_ld_w((const float*)scale_in_data + i * 4, 0); - v4f32 _scale_out = (v4f32)__msa_ld_w((const float*)scale_out_data + i * 4, 0); - v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - _v = __msa_fmadd_w(_bias, _v, _scale_in); - _v = activation_ps(_v, activation_type, activation_params); - _v = __msa_fmul_w(_v, _scale_out); - v16i8 v = float2int8(_v); - ptr[0] = v[0]; - ptr[1] = v[1]; - ptr[2] = v[2]; - ptr[3] = v[3]; - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - signed char* ptr = (signed char*)top_blob + i * 4; + bias = bias * scale_out; +#if __mips_msa + _bias0 = __msa_fmul_w(_bias0, _scale_out0); + _bias1 = __msa_fmul_w(_bias1, _scale_out1); +#endif // __mips_msa - v4f32 _scale_in = (v4f32)__msa_ld_w((const float*)scale_in_data + i * 4, 0); - v4f32 _scale_out = (v4f32)__msa_ld_w((const float*)scale_out_data + i * 4, 0); - v4f32 _bias = (v4f32)__msa_ld_w((const float*)bias_data + i * 4, 0); - v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - _v = __msa_fmadd_w(_bias, _v, _scale_in); - _v = activation_ps(_v, activation_type, activation_params); - _v = __msa_fmul_w(_v, _scale_out); - v16i8 v = float2int8(_v); - ptr[0] = v[0]; - ptr[1] = v[1]; - ptr[2] = v[2]; - ptr[3] = v[3]; - } - } - } + int i = 0; +#if __mips_msa + for (; i + 7 < size; i += 8) + { + __builtin_prefetch(intptr + 32); + v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); + v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0)); + _v0 = __msa_fmadd_w(_bias0, _v0, _scale0); + _v1 = __msa_fmadd_w(_bias1, _v1, _scale1); + *((int64_t*)ptr) = float2int8relu(_v0, _v1); + intptr += 8; + ptr += 8; } - - if (dims == 2) + for (; i + 3 < size; i += 4) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int out_elempack = opt.use_packing_layout && h * elempack % 8 == 0 ? 8 : 1; - int outh = h * elempack / out_elempack; - - top_blob.create(w, outh, (size_t)out_elempack, out_elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (out_elempack == 8) - { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) - { - const int* intptr0 = bottom_blob.row(i * 2); - const int* intptr1 = bottom_blob.row(i * 2 + 1); - signed char* ptr = top_blob.row(i); - - v4f32 _scale_in0 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + i * 8, 0); - v4f32 _scale_in1 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + i * 8 + 4, 0); - v4f32 _scale_out0 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + i * 8, 0); - v4f32 _scale_out1 = scale_out_data_size == 1 ? 
(v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + i * 8 + 4, 0); - - for (int j = 0; j < w; j++) - { - __builtin_prefetch(intptr0 + 16); - __builtin_prefetch(intptr1 + 16); - v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr0, 0)); - v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr1, 0)); - _v0 = __msa_fmul_w(_v0, _scale_in0); - _v1 = __msa_fmul_w(_v1, _scale_in1); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = __msa_fmul_w(_v0, _scale_out0); - _v1 = __msa_fmul_w(_v1, _scale_out1); - *((int64_t*)ptr) = float2int8(_v0, _v1); - - intptr0 += 4; - intptr1 += 4; - ptr += 8; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) - { - const int* intptr0 = bottom_blob.row(i * 2); - const int* intptr1 = bottom_blob.row(i * 2 + 1); - signed char* ptr = top_blob.row(i); - - v4f32 _scale_in0 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + i * 8, 0); - v4f32 _scale_in1 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + i * 8 + 4, 0); - v4f32 _scale_out0 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + i * 8, 0); - v4f32 _scale_out1 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + i * 8 + 4, 0); - v4f32 _bias0 = bias_data_size == 1 ? (v4f32)__msa_fill_w_f32(bias_data[0]) : (v4f32)__msa_ld_w((const float*)bias_data + i * 8, 0); - v4f32 _bias1 = bias_data_size == 1 ? (v4f32)__msa_fill_w_f32(bias_data[0]) : (v4f32)__msa_ld_w((const float*)bias_data + i * 8 + 4, 0); - - for (int j = 0; j < w; j++) - { - __builtin_prefetch(intptr0 + 16); - __builtin_prefetch(intptr1 + 16); - v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr0, 0)); - v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr1, 0)); - _v0 = __msa_fmadd_w(_bias0, _v0, _scale_in0); - _v1 = __msa_fmadd_w(_bias1, _v1, _scale_in1); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = __msa_fmul_w(_v0, _scale_out0); - _v1 = __msa_fmul_w(_v1, _scale_out1); - *((int64_t*)ptr) = float2int8(_v0, _v1); - - intptr0 += 4; - intptr1 += 4; - ptr += 8; - } - } - } - } - if (out_elempack == 1) - { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - signed char* ptr0 = top_blob.row(i * 4); - signed char* ptr1 = top_blob.row(i * 4 + 1); - signed char* ptr2 = top_blob.row(i * 4 + 2); - signed char* ptr3 = top_blob.row(i * 4 + 3); - - v4f32 _scale_in = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + i * 4, 0); - v4f32 _scale_out = scale_out_data_size == 1 ? 
(v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + i * 4, 0); + v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); + _v = __msa_fmadd_w(_bias0, _v, _scale0); + v16i8 v = float2int8relu(_v); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + intptr += 4; + ptr += 4; + } +#endif // __mips_msa + for (; i < size; i++) + { + float v = *intptr * scale + bias; + *ptr = float2int8(v); + if (*ptr < 0) *ptr = 0; + intptr++; + ptr++; + } + } +} - for (int j = 0; j < w; j++) - { - __builtin_prefetch(intptr + 16); - v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - _v = __msa_fmul_w(_v, _scale_in); - _v = activation_ps(_v, activation_type, activation_params); - _v = __msa_fmul_w(_v, _scale_out); - v16i8 v = float2int8(_v); - ptr0[0] = v[0]; - ptr1[0] = v[1]; - ptr2[0] = v[2]; - ptr3[0] = v[3]; +static void requantize_leakyrelu(const int* intptr, signed char* ptr, const Mat& scale_in_data, const Mat& bias_data, const Mat& scale_out_data, float slope, int elemcount, int elempack) +{ + const int scale_in_data_size = scale_in_data.w; + const int bias_data_size = bias_data.w; + const int scale_out_data_size = scale_out_data.w; + const int size = elemcount * elempack; - intptr += 4; - ptr0 += 1; - ptr1 += 1; - ptr2 += 1; - ptr3 += 1; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - signed char* ptr0 = top_blob.row(i * 4); - signed char* ptr1 = top_blob.row(i * 4 + 1); - signed char* ptr2 = top_blob.row(i * 4 + 2); - signed char* ptr3 = top_blob.row(i * 4 + 3); + // NCNN_LOGE("requantize_leakyrelu %d %d %d %d %d", scale_in_data_size, bias_data_size, scale_out_data_size, elemcount, elempack); - v4f32 _scale_in = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + i * 4, 0); - v4f32 _scale_out = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + i * 4, 0); - v4f32 _bias = bias_data_size == 1 ? 
(v4f32)__msa_fill_w_f32(bias_data[0]) : (v4f32)__msa_ld_w((const float*)bias_data + i * 4, 0); + // int8(leakyrelu(v * scale_in, slope) * scale_out) + // int8_leakyrelu(v * (scale_in * scale_out), slope) - for (int j = 0; j < w; j++) - { - __builtin_prefetch(intptr + 16); - v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - _v = __msa_fmadd_w(_bias, _v, _scale_in); - _v = activation_ps(_v, activation_type, activation_params); - _v = __msa_fmul_w(_v, _scale_out); - v16i8 v = float2int8(_v); - ptr0[0] = v[0]; - ptr1[0] = v[1]; - ptr2[0] = v[2]; - ptr3[0] = v[3]; + // int8(leakyrelu(v * scale_in + bias, slope) * scale_out) + // int8_leakyrelu(v * (scale_in * scale_out) + (bias * scale_out), slope) - intptr += 4; - ptr0 += 1; - ptr1 += 1; - ptr2 += 1; - ptr3 += 1; - } - } - } - } + float scale_in = scale_in_data[0]; +#if __mips_msa + v4f32 _scale_in0 = (v4f32)__msa_fill_w_f32(scale_in); + v4f32 _scale_in1 = _scale_in0; + if (scale_in_data_size > 1) + { + if (elempack == 8) + { + _scale_in0 = (v4f32)__msa_ld_w((const float*)scale_in_data, 0); + _scale_in1 = (v4f32)__msa_ld_w((const float*)scale_in_data + 4, 0); + } + if (elempack == 4) + { + _scale_in0 = (v4f32)__msa_ld_w((const float*)scale_in_data, 0); + _scale_in1 = _scale_in0; } + } +#endif // __mips_msa - if (dims == 3) + float scale_out = scale_out_data[0]; +#if __mips_msa + v4f32 _scale_out0 = (v4f32)__msa_fill_w_f32(scale_out); + v4f32 _scale_out1 = _scale_out0; + if (scale_out_data_size > 1) + { + if (elempack == 8) + { + _scale_out0 = (v4f32)__msa_ld_w((const float*)scale_out_data, 0); + _scale_out1 = (v4f32)__msa_ld_w((const float*)scale_out_data + 4, 0); + } + if (elempack == 4) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; - int out_elempack = opt.use_packing_layout && channels * elempack % 8 == 0 ? 
8 : 1; - int outc = channels * elempack / out_elempack; + _scale_out0 = (v4f32)__msa_ld_w((const float*)scale_out_data, 0); + _scale_out1 = _scale_out0; + } + } +#endif // __mips_msa - top_blob.create(w, h, outc, (size_t)out_elempack, out_elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; + float scale = scale_in * scale_out; +#if __mips_msa + v4f32 _scale0 = __msa_fmul_w(_scale_in0, _scale_out0); + v4f32 _scale1 = __msa_fmul_w(_scale_in1, _scale_out1); + v4f32 _slope = (v4f32)__msa_fill_w_f32(slope); +#endif // __mips_msa - if (activation_type == 1) + if (bias_data_size == 0) + { + int i = 0; +#if __mips_msa + for (; i + 7 < size; i += 8) + { + __builtin_prefetch(intptr + 32); + v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); + v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0)); + _v0 = __msa_fmul_w(_v0, _scale0); + _v1 = __msa_fmul_w(_v1, _scale1); + *((int64_t*)ptr) = float2int8leakyrelu(_v0, _v1, _slope); + intptr += 8; + ptr += 8; + } + for (; i + 3 < size; i += 4) + { + v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); + _v = __msa_fmul_w(_v, _scale0); + v16i8 v = float2int8leakyrelu(_v, _slope); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + intptr += 4; + ptr += 4; + } +#endif // __mips_msa + for (; i < size; i++) + { + float v = *intptr * scale; + *ptr = float2int8(v); + if (*ptr < 0) *ptr *= slope; + intptr++; + ptr++; + } + } + else + { + float bias = bias_data[0]; +#if __mips_msa + v4f32 _bias0 = (v4f32)__msa_fill_w_f32(bias); + v4f32 _bias1 = _bias0; + if (bias_data_size > 1) + { + if (elempack == 8) { - requantize_relu_pack4_msa(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, opt); - return 0; + _bias0 = (v4f32)__msa_ld_w((const float*)bias_data, 0); + _bias1 = (v4f32)__msa_ld_w((const float*)bias_data + 4, 0); } - - if (activation_type == 2 && activation_params[0] > 0.f) + if (elempack == 4) { - requantize_leakyrelu_pack4_msa(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, activation_params[0], opt); - return 0; + _bias0 = (v4f32)__msa_ld_w((const float*)bias_data, 0); + _bias1 = _bias0; } + } +#endif // __mips_msa - if (out_elempack == 8) - { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) - { - const int* intptr0 = bottom_blob.channel(q * 2); - const int* intptr1 = bottom_blob.channel(q * 2 + 1); - signed char* ptr = top_blob.channel(q); - - v4f32 _scale_in0 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + q * 8, 0); - v4f32 _scale_in1 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + q * 8 + 4, 0); - v4f32 _scale_out0 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + q * 8, 0); - v4f32 _scale_out1 = scale_out_data_size == 1 ? 
(v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + q * 8 + 4, 0); - - for (int i = 0; i < size; i++) - { - __builtin_prefetch(intptr0 + 16); - __builtin_prefetch(intptr1 + 16); - v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr0, 0)); - v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr1, 0)); - _v0 = __msa_fmul_w(_v0, _scale_in0); - _v1 = __msa_fmul_w(_v1, _scale_in1); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = __msa_fmul_w(_v0, _scale_out0); - _v1 = __msa_fmul_w(_v1, _scale_out1); - *((int64_t*)ptr) = float2int8(_v0, _v1); - - intptr0 += 4; - intptr1 += 4; - ptr += 8; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) - { - const int* intptr0 = bottom_blob.channel(q * 2); - const int* intptr1 = bottom_blob.channel(q * 2 + 1); - signed char* ptr = top_blob.channel(q); - - v4f32 _scale_in0 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + q * 8, 0); - v4f32 _scale_in1 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + q * 8 + 4, 0); - v4f32 _scale_out0 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + q * 8, 0); - v4f32 _scale_out1 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + q * 8 + 4, 0); - v4f32 _bias0 = bias_data_size == 1 ? (v4f32)__msa_fill_w_f32(bias_data[0]) : (v4f32)__msa_ld_w((const float*)bias_data + q * 8, 0); - v4f32 _bias1 = bias_data_size == 1 ? (v4f32)__msa_fill_w_f32(bias_data[0]) : (v4f32)__msa_ld_w((const float*)bias_data + q * 8 + 4, 0); - - for (int i = 0; i < size; i++) - { - __builtin_prefetch(intptr0 + 16); - __builtin_prefetch(intptr1 + 16); - v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr0, 0)); - v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr1, 0)); - _v0 = __msa_fmadd_w(_bias0, _v0, _scale_in0); - _v1 = __msa_fmadd_w(_bias1, _v1, _scale_in1); - _v0 = activation_ps(_v0, activation_type, activation_params); - _v1 = activation_ps(_v1, activation_type, activation_params); - _v0 = __msa_fmul_w(_v0, _scale_out0); - _v1 = __msa_fmul_w(_v1, _scale_out1); - *((int64_t*)ptr) = float2int8(_v0, _v1); - - intptr0 += 4; - intptr1 += 4; - ptr += 8; - } - } - } - } - if (out_elempack == 1) - { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - signed char* ptr0 = top_blob.channel(q * 4); - signed char* ptr1 = top_blob.channel(q * 4 + 1); - signed char* ptr2 = top_blob.channel(q * 4 + 2); - signed char* ptr3 = top_blob.channel(q * 4 + 3); + bias = bias * scale_out; +#if __mips_msa + _bias0 = __msa_fmul_w(_bias0, _scale_out0); + _bias1 = __msa_fmul_w(_bias1, _scale_out1); +#endif // __mips_msa - v4f32 _scale_in = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + q * 4, 0); - v4f32 _scale_out = scale_out_data_size == 1 ? 
(v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + q * 4, 0); + int i = 0; +#if __mips_msa + for (; i + 7 < size; i += 8) + { + __builtin_prefetch(intptr + 32); + v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); + v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0)); + _v0 = __msa_fmadd_w(_bias0, _v0, _scale0); + _v1 = __msa_fmadd_w(_bias1, _v1, _scale1); + *((int64_t*)ptr) = float2int8leakyrelu(_v0, _v1, _slope); + intptr += 8; + ptr += 8; + } + for (; i + 3 < size; i += 4) + { + v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); + _v = __msa_fmadd_w(_bias0, _v, _scale0); + v16i8 v = float2int8leakyrelu(_v, _slope); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + intptr += 4; + ptr += 4; + } +#endif // __mips_msa + for (; i < size; i++) + { + float v = *intptr * scale + bias; + *ptr = float2int8(v); + if (*ptr < 0) *ptr *= slope; + intptr++; + ptr++; + } + } +} - for (int i = 0; i < size; i++) - { - __builtin_prefetch(intptr + 16); - v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - _v = __msa_fmul_w(_v, _scale_in); - _v = activation_ps(_v, activation_type, activation_params); - _v = __msa_fmul_w(_v, _scale_out); - v16i8 v = float2int8(_v); - ptr0[0] = v[0]; - ptr1[0] = v[1]; - ptr2[0] = v[2]; - ptr3[0] = v[3]; +static void requantize(const int* intptr, signed char* ptr, const Mat& scale_in_data, const Mat& bias_data, const Mat& scale_out_data, int activation_type, const Mat& activation_params, int elemcount, int elempack) +{ + if (activation_type == 1) + { + requantize_relu(intptr, ptr, scale_in_data, bias_data, scale_out_data, elemcount, elempack); + return; + } - intptr += 4; - ptr0 += 1; - ptr1 += 1; - ptr2 += 1; - ptr3 += 1; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - signed char* ptr0 = top_blob.channel(q * 4); - signed char* ptr1 = top_blob.channel(q * 4 + 1); - signed char* ptr2 = top_blob.channel(q * 4 + 2); - signed char* ptr3 = top_blob.channel(q * 4 + 3); + if (activation_type == 2 && activation_params[0] > 0.f) + { + const float slope = activation_params[0]; + requantize_leakyrelu(intptr, ptr, scale_in_data, bias_data, scale_out_data, slope, elemcount, elempack); + return; + } - v4f32 _scale_in = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + q * 4, 0); - v4f32 _scale_out = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + q * 4, 0); - v4f32 _bias = bias_data_size == 1 ? 
(v4f32)__msa_fill_w_f32(bias_data[0]) : (v4f32)__msa_ld_w((const float*)bias_data + q * 4, 0); + const int scale_in_data_size = scale_in_data.w; + const int bias_data_size = bias_data.w; + const int scale_out_data_size = scale_out_data.w; + const int size = elemcount * elempack; - for (int i = 0; i < size; i++) - { - __builtin_prefetch(intptr + 16); - v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - _v = __msa_fmadd_w(_bias, _v, _scale_in); - _v = activation_ps(_v, activation_type, activation_params); - _v = __msa_fmul_w(_v, _scale_out); - v16i8 v = float2int8(_v); - ptr0[0] = v[0]; - ptr1[0] = v[1]; - ptr2[0] = v[2]; - ptr3[0] = v[3]; + // NCNN_LOGE("requantize %d %d %d %d %d", scale_in_data_size, bias_data_size, scale_out_data_size, elemcount, elempack); - intptr += 4; - ptr0 += 1; - ptr1 += 1; - ptr2 += 1; - ptr3 += 1; - } - } - } - } + float scale_in = scale_in_data[0]; +#if __mips_msa + v4f32 _scale_in0 = (v4f32)__msa_fill_w_f32(scale_in); + v4f32 _scale_in1 = _scale_in0; + if (scale_in_data_size > 1) + { + if (elempack == 8) + { + _scale_in0 = (v4f32)__msa_ld_w((const float*)scale_in_data, 0); + _scale_in1 = (v4f32)__msa_ld_w((const float*)scale_in_data + 4, 0); + } + if (elempack == 4) + { + _scale_in0 = (v4f32)__msa_ld_w((const float*)scale_in_data, 0); + _scale_in1 = _scale_in0; } - - return 0; } #endif // __mips_msa - if (dims == 1) + float scale_out = scale_out_data[0]; +#if __mips_msa + v4f32 _scale_out0 = (v4f32)__msa_fill_w_f32(scale_out); + v4f32 _scale_out1 = _scale_out0; + if (scale_out_data_size > 1) { - int w = bottom_blob.w; - - top_blob.create(w, (size_t)1u, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - const int* intptr = bottom_blob; - signed char* ptr = top_blob; - - if (scale_in_data_size == 1 && scale_out_data_size == 1) + if (elempack == 8) { - const float scale_in = scale_in_data[0]; - const float scale_out = scale_out_data[0]; - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float v = intptr[i] * scale_in; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); - } - } - else if (bias_data_size == 1) - { - const float bias = bias_data[0]; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float v = intptr[i] * scale_in + bias; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float v = intptr[i] * scale_in + bias_data[i]; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); - } - } + _scale_out0 = (v4f32)__msa_ld_w((const float*)scale_out_data, 0); + _scale_out1 = (v4f32)__msa_ld_w((const float*)scale_out_data + 4, 0); } - else if (scale_in_data_size == 1 && scale_out_data_size > 1) + if (elempack == 4) { - const float scale_in = scale_in_data[0]; - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float v = intptr[i] * scale_in; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); - } - } - else if (bias_data_size == 1) - { - const float bias = bias_data[0]; + _scale_out0 = (v4f32)__msa_ld_w((const float*)scale_out_data, 0); + _scale_out1 = _scale_out0; + } + } +#endif // __mips_msa - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float v = 
intptr[i] * scale_in + bias; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float v = intptr[i] * scale_in + bias_data[i]; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); - } - } + if (bias_data_size == 0) + { + int i = 0; +#if __mips_msa + for (; i + 7 < size; i += 8) + { + __builtin_prefetch(intptr + 32); + v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); + v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0)); + _v0 = __msa_fmul_w(_v0, _scale_in0); + _v1 = __msa_fmul_w(_v1, _scale_in1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __msa_fmul_w(_v0, _scale_out0); + _v1 = __msa_fmul_w(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + intptr += 8; + ptr += 8; } - else if (scale_in_data_size > 1 && scale_out_data_size == 1) + for (; i + 3 < size; i += 4) { - const float scale_out = scale_out_data[0]; - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float v = intptr[i] * scale_in_data[i]; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); - } - } - else if (bias_data_size == 1) - { - const float bias = bias_data[0]; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float v = intptr[i] * scale_in_data[i] + bias; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float v = intptr[i] * scale_in_data[i] + bias_data[i]; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); - } - } + v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); + _v = __msa_fmul_w(_v, _scale_in0); + _v = activation_ps(_v, activation_type, activation_params); + _v = __msa_fmul_w(_v, _scale_out0); + v16i8 v = float2int8(_v); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + intptr += 4; + ptr += 4; } - else // if (scale_in_data_size > 1 && scale_out_data_size > 1) +#endif // __mips_msa + for (; i < size; i++) { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float v = intptr[i] * scale_in_data[i]; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); - } - } - else if (bias_data_size == 1) + float v = *intptr * scale_in; + v = activation_ss(v, activation_type, activation_params); + *ptr = float2int8(v * scale_out); + intptr++; + ptr++; + } + } + else + { + float bias = bias_data[0]; +#if __mips_msa + v4f32 _bias0 = (v4f32)__msa_fill_w_f32(bias); + v4f32 _bias1 = _bias0; + if (bias_data_size > 1) + { + if (elempack == 8) { - const float bias = bias_data[0]; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float v = intptr[i] * scale_in_data[i] + bias; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); - } + _bias0 = (v4f32)__msa_ld_w((const float*)bias_data, 0); + _bias1 = (v4f32)__msa_ld_w((const float*)bias_data + 4, 0); } - else + if (elempack == 4) { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; 
i++) - { - float v = intptr[i] * scale_in_data[i] + bias_data[i]; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); - } + _bias0 = (v4f32)__msa_ld_w((const float*)bias_data, 0); + _bias1 = _bias0; } } +#endif // __mips_msa + + int i = 0; +#if __mips_msa + for (; i + 7 < size; i += 8) + { + __builtin_prefetch(intptr + 32); + v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); + v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0)); + _v0 = __msa_fmadd_w(_bias0, _v0, _scale_in0); + _v1 = __msa_fmadd_w(_bias1, _v1, _scale_in1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __msa_fmul_w(_v0, _scale_out0); + _v1 = __msa_fmul_w(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + intptr += 8; + ptr += 8; + } + for (; i + 3 < size; i += 4) + { + v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); + _v = __msa_fmadd_w(_bias0, _v, _scale_in0); + _v = activation_ps(_v, activation_type, activation_params); + _v = __msa_fmul_w(_v, _scale_out0); + v16i8 v = float2int8(_v); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + intptr += 4; + ptr += 4; + } +#endif // __mips_msa + for (; i < size; i++) + { + float v = *intptr * scale_in + bias; + v = activation_ss(v, activation_type, activation_params); + *ptr = float2int8(v * scale_out); + intptr++; + ptr++; + } } +} - if (dims == 2) - { - int w = bottom_blob.w; - int h = bottom_blob.h; +int Requantize_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + const int dims = bottom_blob.dims; + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int channels = bottom_blob.c; + const int elempack = bottom_blob.elempack; + const size_t out_elemsize = elempack * 1u; - top_blob.create(w, h, (size_t)1u, opt.blob_allocator); + if (dims == 1) + { + top_blob.create(w, out_elemsize, elempack, opt.blob_allocator); if (top_blob.empty()) return -100; - if (bias_data_size == 0) + const int wp = std::max(1, w / opt.num_threads); + const int nn_w = (w + wp - 1) / wp; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_w; ii++) { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - signed char* ptr = top_blob.row(i); + const int i = ii * wp; - const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[i]; - const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[i]; + const int* intptr = (const int*)bottom_blob + i * elempack; + signed char* ptr = (signed char*)top_blob + i * elempack; - for (int j = 0; j < w; j++) - { - float v = intptr[j] * scale_in; - ptr[j] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - signed char* ptr = top_blob.row(i); + // assert scale_in_data_size == 1 + // assert bias_data_size == 0 || bias_data_size == 1 + // assert scale_out_data_size == 1 - const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[i]; - const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[i]; - const float bias = bias_data_size == 1 ? 
bias_data[0] : bias_data[i]; + const int size = std::min(w - i, wp) * elempack; - for (int j = 0; j < w; j++) - { - float v = intptr[j] * scale_in + bias; - ptr[j] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); - } - } + requantize(intptr, ptr, scale_in_data, bias_data, scale_out_data, activation_type, activation_params, size, 1); } } - if (dims == 3) + if (dims == 2) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; - - top_blob.create(w, h, channels, (size_t)1u, opt.blob_allocator); + top_blob.create(w, h, out_elemsize, elempack, opt.blob_allocator); if (top_blob.empty()) return -100; - if (bias_data_size == 0) + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - signed char* ptr = top_blob.channel(q); + const int* intptr = bottom_blob.row(i); + signed char* ptr = top_blob.row(i); - const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[q]; - const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[q]; + const Mat scale_in_data_i = scale_in_data_size > 1 ? scale_in_data.range(i * elempack, elempack) : scale_in_data; + const Mat bias_data_i = bias_data_size > 1 ? bias_data.range(i * elempack, elempack) : bias_data; + const Mat scale_out_data_i = scale_out_data_size > 1 ? scale_out_data.range(i * elempack, elempack) : scale_out_data; - for (int i = 0; i < size; i++) - { - float v = intptr[i] * scale_in; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); - } - } + requantize(intptr, ptr, scale_in_data_i, bias_data_i, scale_out_data_i, activation_type, activation_params, w, elempack); } - else + } + + if (dims == 3) + { + top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - signed char* ptr = top_blob.channel(q); + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); - const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[q]; - const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[q]; - const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[q]; + const Mat scale_in_data_q = scale_in_data_size > 1 ? scale_in_data.range(q * elempack, elempack) : scale_in_data; + const Mat bias_data_q = bias_data_size > 1 ? bias_data.range(q * elempack, elempack) : bias_data; + const Mat scale_out_data_q = scale_out_data_size > 1 ? 
scale_out_data.range(q * elempack, elempack) : scale_out_data; - for (int i = 0; i < size; i++) - { - float v = intptr[i] * scale_in + bias; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); - } - } + requantize(intptr, ptr, scale_in_data_q, bias_data_q, scale_out_data_q, activation_type, activation_params, w * h, elempack); } } diff --git a/src/layer/mips/requantize_relu_pack4.h b/src/layer/mips/requantize_relu_pack4.h deleted file mode 100644 index e43797bd8b8..00000000000 --- a/src/layer/mips/requantize_relu_pack4.h +++ /dev/null @@ -1,263 +0,0 @@ -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. - -static void requantize_relu_pack4_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& scale_in_data, const Mat& scale_out_data, const Mat& bias_data, const Option& opt) -{ - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; - int outc = top_blob.c; - int out_elempack = top_blob.elempack; - - int scale_in_data_size = scale_in_data.w; - int scale_out_data_size = scale_out_data.w; - int bias_data_size = bias_data.w; - - // int8(relu(v * scale_in) * scale_out) - // int8_relu(v * (scale_in * scale_out)) - - // int8(relu(v * scale_in + bias) * scale_out) - // int8_relu(v * (scale_in * scale_out) + (bias * scale_out)) - - if (out_elempack == 8) - { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) - { - const int* intptr0 = bottom_blob.channel(q * 2); - const int* intptr1 = bottom_blob.channel(q * 2 + 1); - signed char* ptr = top_blob.channel(q); - - v4f32 _scale_in0 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + q * 8, 0); - v4f32 _scale_in1 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + q * 8 + 4, 0); - v4f32 _scale_out0 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + q * 8, 0); - v4f32 _scale_out1 = scale_out_data_size == 1 ? 
(v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + q * 8 + 4, 0); - - v4f32 _scale0 = __msa_fmul_w(_scale_in0, _scale_out0); - v4f32 _scale1 = __msa_fmul_w(_scale_in1, _scale_out1); - - int i = 0; - for (; i + 3 < size; i += 4) - { - __builtin_prefetch(intptr0 + 64); - __builtin_prefetch(intptr1 + 64); - v4f32 _v00 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr0, 0)); - v4f32 _v01 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr0 + 4, 0)); - v4f32 _v02 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr0 + 8, 0)); - v4f32 _v03 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr0 + 12, 0)); - v4f32 _v10 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr1, 0)); - v4f32 _v11 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr1 + 4, 0)); - v4f32 _v12 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr1 + 8, 0)); - v4f32 _v13 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr1 + 12, 0)); - _v00 = __msa_fmul_w(_v00, _scale0); - _v01 = __msa_fmul_w(_v01, _scale0); - _v02 = __msa_fmul_w(_v02, _scale0); - _v03 = __msa_fmul_w(_v03, _scale0); - _v10 = __msa_fmul_w(_v10, _scale1); - _v11 = __msa_fmul_w(_v11, _scale1); - _v12 = __msa_fmul_w(_v12, _scale1); - _v13 = __msa_fmul_w(_v13, _scale1); - *((int64_t*)ptr) = float2int8relu(_v00, _v10); - *((int64_t*)(ptr + 8)) = float2int8relu(_v01, _v11); - *((int64_t*)(ptr + 16)) = float2int8relu(_v02, _v12); - *((int64_t*)(ptr + 24)) = float2int8relu(_v03, _v13); - - intptr0 += 16; - intptr1 += 16; - ptr += 32; - } - for (; i < size; i++) - { - __builtin_prefetch(intptr0 + 16); - __builtin_prefetch(intptr1 + 16); - v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr0, 0)); - v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr1, 0)); - _v0 = __msa_fmul_w(_v0, _scale0); - _v1 = __msa_fmul_w(_v1, _scale1); - *((int64_t*)ptr) = float2int8relu(_v0, _v1); - - intptr0 += 4; - intptr1 += 4; - ptr += 8; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) - { - const int* intptr0 = bottom_blob.channel(q * 2); - const int* intptr1 = bottom_blob.channel(q * 2 + 1); - signed char* ptr = top_blob.channel(q); - - v4f32 _scale_in0 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + q * 8, 0); - v4f32 _scale_in1 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + q * 8 + 4, 0); - v4f32 _scale_out0 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + q * 8, 0); - v4f32 _scale_out1 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + q * 8 + 4, 0); - v4f32 _bias0 = bias_data_size == 1 ? (v4f32)__msa_fill_w_f32(bias_data[0]) : (v4f32)__msa_ld_w((const float*)bias_data + q * 8, 0); - v4f32 _bias1 = bias_data_size == 1 ? 
(v4f32)__msa_fill_w_f32(bias_data[0]) : (v4f32)__msa_ld_w((const float*)bias_data + q * 8 + 4, 0); - - v4f32 _scale0 = __msa_fmul_w(_scale_in0, _scale_out0); - v4f32 _scale1 = __msa_fmul_w(_scale_in1, _scale_out1); - _bias0 = __msa_fmul_w(_bias0, _scale_out0); - _bias1 = __msa_fmul_w(_bias1, _scale_out1); - - int i = 0; - for (; i + 3 < size; i += 4) - { - __builtin_prefetch(intptr0 + 64); - __builtin_prefetch(intptr1 + 64); - v4f32 _v00 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr0, 0)); - v4f32 _v01 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr0 + 4, 0)); - v4f32 _v02 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr0 + 8, 0)); - v4f32 _v03 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr0 + 12, 0)); - v4f32 _v10 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr1, 0)); - v4f32 _v11 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr1 + 4, 0)); - v4f32 _v12 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr1 + 8, 0)); - v4f32 _v13 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr1 + 12, 0)); - _v00 = __msa_fmadd_w(_bias0, _v00, _scale0); - _v01 = __msa_fmadd_w(_bias0, _v01, _scale0); - _v02 = __msa_fmadd_w(_bias0, _v02, _scale0); - _v03 = __msa_fmadd_w(_bias0, _v03, _scale0); - _v10 = __msa_fmadd_w(_bias1, _v10, _scale1); - _v11 = __msa_fmadd_w(_bias1, _v11, _scale1); - _v12 = __msa_fmadd_w(_bias1, _v12, _scale1); - _v13 = __msa_fmadd_w(_bias1, _v13, _scale1); - *((int64_t*)ptr) = float2int8relu(_v00, _v10); - *((int64_t*)(ptr + 8)) = float2int8relu(_v01, _v11); - *((int64_t*)(ptr + 16)) = float2int8relu(_v02, _v12); - *((int64_t*)(ptr + 24)) = float2int8relu(_v03, _v13); - - intptr0 += 16; - intptr1 += 16; - ptr += 32; - } - for (; i + 1 < size; i += 2) - { - __builtin_prefetch(intptr0 + 32); - __builtin_prefetch(intptr1 + 32); - v4f32 _v00 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr0, 0)); - v4f32 _v01 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr0 + 4, 0)); - v4f32 _v10 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr1, 0)); - v4f32 _v11 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr1 + 4, 0)); - _v00 = __msa_fmadd_w(_bias0, _v00, _scale0); - _v01 = __msa_fmadd_w(_bias0, _v01, _scale0); - _v10 = __msa_fmadd_w(_bias1, _v10, _scale1); - _v11 = __msa_fmadd_w(_bias1, _v11, _scale1); - *((int64_t*)ptr) = float2int8relu(_v00, _v10); - *((int64_t*)(ptr + 8)) = float2int8relu(_v01, _v11); - - intptr0 += 8; - intptr1 += 8; - ptr += 16; - } - for (; i < size; i++) - { - __builtin_prefetch(intptr0 + 16); - __builtin_prefetch(intptr1 + 16); - v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr0, 0)); - v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr1, 0)); - _v0 = __msa_fmadd_w(_bias0, _v0, _scale0); - _v1 = __msa_fmadd_w(_bias1, _v1, _scale1); - *((int64_t*)ptr) = float2int8relu(_v0, _v1); - - intptr0 += 4; - intptr1 += 4; - ptr += 8; - } - } - } - } - if (out_elempack == 1) - { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - signed char* ptr0 = top_blob.channel(q * 4); - signed char* ptr1 = top_blob.channel(q * 4 + 1); - signed char* ptr2 = top_blob.channel(q * 4 + 2); - signed char* ptr3 = top_blob.channel(q * 4 + 3); - - v4f32 _scale_in = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + q * 4, 0); - v4f32 _scale_out = scale_out_data_size == 1 ? 
(v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + q * 4, 0); - - v4f32 _scale = __msa_fmul_w(_scale_in, _scale_out); - - int i = 0; - for (; i < size; i++) - { - __builtin_prefetch(intptr + 16); - v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - _v = __msa_fmul_w(_v, _scale); - v16i8 v = float2int8relu(_v); - ptr0[0] = v[0]; - ptr1[0] = v[1]; - ptr2[0] = v[2]; - ptr3[0] = v[3]; - - intptr += 4; - ptr0 += 1; - ptr1 += 1; - ptr2 += 1; - ptr3 += 1; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - signed char* ptr0 = top_blob.channel(q * 4); - signed char* ptr1 = top_blob.channel(q * 4 + 1); - signed char* ptr2 = top_blob.channel(q * 4 + 2); - signed char* ptr3 = top_blob.channel(q * 4 + 3); - - v4f32 _scale_in = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + q * 4, 0); - v4f32 _scale_out = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + q * 4, 0); - v4f32 _bias = bias_data_size == 1 ? (v4f32)__msa_fill_w_f32(bias_data[0]) : (v4f32)__msa_ld_w((const float*)bias_data + q * 4, 0); - - v4f32 _scale = __msa_fmul_w(_scale_in, _scale_out); - _bias = __msa_fmul_w(_bias, _scale_out); - - int i = 0; - for (; i < size; i++) - { - __builtin_prefetch(intptr + 16); - v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - _v = __msa_fmadd_w(_bias, _v, _scale); - v16i8 v = float2int8relu(_v); - ptr0[0] = v[0]; - ptr1[0] = v[1]; - ptr2[0] = v[2]; - ptr3[0] = v[3]; - - intptr += 4; - ptr0 += 1; - ptr1 += 1; - ptr2 += 1; - ptr3 += 1; - } - } - } - } -} diff --git a/src/layer/mips/requantize_relu_pack8.h b/src/layer/mips/requantize_relu_pack8.h deleted file mode 100644 index 824b668cb73..00000000000 --- a/src/layer/mips/requantize_relu_pack8.h +++ /dev/null @@ -1,186 +0,0 @@ -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. 
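The MSA pack8 helper deleted below relies on the algebraic folding spelled out in its own comments: because the output scale is positive, the ReLU can be applied after multiplying by scale_in * scale_out (and after adding bias * scale_out), so each element needs only one fused multiply-add before the int8 conversion. A minimal scalar sketch of that identity, assuming scale_out > 0 and a float2int8 that rounds to nearest and saturates to the int8 range (the function name below is illustrative, not from the patch):

static inline signed char requantize_relu_scalar(int v, float scale_in, float bias, float scale_out)
{
    // unfused form: int8(relu(v * scale_in + bias) * scale_out)
    // fused form:   int8_relu(v * (scale_in * scale_out) + bias * scale_out)
    // the two agree because scale_out > 0, so scaling commutes with max(x, 0)
    const float scale = scale_in * scale_out;
    const float bias_out = bias * scale_out;
    float f = v * scale + bias_out;
    f = f > 0.f ? f : 0.f;                   // relu applied after the folding
    int q = (int)(f + 0.5f);                 // round to nearest (f is non-negative here)
    return (signed char)(q > 127 ? 127 : q); // saturate to the int8 range
}
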
- -static void requantize_relu_pack8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& scale_in_data, const Mat& scale_out_data, const Mat& bias_data, const Option& opt) -{ - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; - - int scale_in_data_size = scale_in_data.w; - int scale_out_data_size = scale_out_data.w; - int bias_data_size = bias_data.w; - - // int8(relu(v * scale_in) * scale_out) - // int8_relu(v * (scale_in * scale_out)) - - // int8(relu(v * scale_in + bias) * scale_out) - // int8_relu(v * (scale_in * scale_out) + (bias * scale_out)) - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - signed char* ptr = top_blob.channel(q); - - v4f32 _scale_in0 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + q * 8, 0); - v4f32 _scale_in1 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + q * 8 + 4, 0); - v4f32 _scale_out0 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + q * 8, 0); - v4f32 _scale_out1 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + q * 8 + 4, 0); - - v4f32 _scale0 = __msa_fmul_w(_scale_in0, _scale_out0); - v4f32 _scale1 = __msa_fmul_w(_scale_in1, _scale_out1); - - int i = 0; - for (; i + 3 < size; i += 4) - { - __builtin_prefetch(intptr + 128); - v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0)); - v4f32 _v2 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 8, 0)); - v4f32 _v3 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 12, 0)); - v4f32 _v4 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 16, 0)); - v4f32 _v5 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 20, 0)); - v4f32 _v6 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 24, 0)); - v4f32 _v7 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 28, 0)); - _v0 = __msa_fmul_w(_v0, _scale0); - _v1 = __msa_fmul_w(_v1, _scale1); - _v2 = __msa_fmul_w(_v2, _scale0); - _v3 = __msa_fmul_w(_v3, _scale1); - _v4 = __msa_fmul_w(_v4, _scale0); - _v5 = __msa_fmul_w(_v5, _scale1); - _v6 = __msa_fmul_w(_v6, _scale0); - _v7 = __msa_fmul_w(_v7, _scale1); - *((int64_t*)ptr) = float2int8relu(_v0, _v1); - *((int64_t*)(ptr + 8)) = float2int8relu(_v2, _v3); - *((int64_t*)(ptr + 16)) = float2int8relu(_v4, _v5); - *((int64_t*)(ptr + 24)) = float2int8relu(_v6, _v7); - - intptr += 32; - ptr += 32; - } - for (; i + 1 < size; i += 2) - { - __builtin_prefetch(intptr + 64); - v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0)); - v4f32 _v2 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 8, 0)); - v4f32 _v3 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 12, 0)); - _v0 = __msa_fmul_w(_v0, _scale0); - _v1 = __msa_fmul_w(_v1, _scale1); - _v2 = __msa_fmul_w(_v2, _scale0); - _v3 = __msa_fmul_w(_v3, _scale1); - *((int64_t*)ptr) = float2int8relu(_v0, _v1); - *((int64_t*)(ptr + 8)) = float2int8relu(_v2, _v3); - - intptr += 16; - ptr += 16; - } - for (; i < size; i++) - { - __builtin_prefetch(intptr + 32); - v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0)); - _v0 = __msa_fmul_w(_v0, _scale0); - _v1 = 
__msa_fmul_w(_v1, _scale1); - *((int64_t*)ptr) = float2int8relu(_v0, _v1); - - intptr += 8; - ptr += 8; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - signed char* ptr = top_blob.channel(q); - - v4f32 _scale_in0 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + q * 8, 0); - v4f32 _scale_in1 = scale_in_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_in_data[0]) : (v4f32)__msa_ld_w((const float*)scale_in_data + q * 8 + 4, 0); - v4f32 _scale_out0 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + q * 8, 0); - v4f32 _scale_out1 = scale_out_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_out_data[0]) : (v4f32)__msa_ld_w((const float*)scale_out_data + q * 8 + 4, 0); - v4f32 _bias0 = bias_data_size == 1 ? (v4f32)__msa_fill_w_f32(bias_data[0]) : (v4f32)__msa_ld_w((const float*)bias_data + q * 8, 0); - v4f32 _bias1 = bias_data_size == 1 ? (v4f32)__msa_fill_w_f32(bias_data[0]) : (v4f32)__msa_ld_w((const float*)bias_data + q * 8 + 4, 0); - - v4f32 _scale0 = __msa_fmul_w(_scale_in0, _scale_out0); - v4f32 _scale1 = __msa_fmul_w(_scale_in1, _scale_out1); - _bias0 = __msa_fmul_w(_bias0, _scale_out0); - _bias1 = __msa_fmul_w(_bias1, _scale_out1); - - int i = 0; - for (; i + 3 < size; i += 4) - { - __builtin_prefetch(intptr + 128); - v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0)); - v4f32 _v2 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 8, 0)); - v4f32 _v3 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 12, 0)); - v4f32 _v4 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 16, 0)); - v4f32 _v5 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 20, 0)); - v4f32 _v6 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 24, 0)); - v4f32 _v7 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 28, 0)); - _v0 = __msa_fmadd_w(_bias0, _v0, _scale0); - _v1 = __msa_fmadd_w(_bias1, _v1, _scale1); - _v2 = __msa_fmadd_w(_bias0, _v2, _scale0); - _v3 = __msa_fmadd_w(_bias1, _v3, _scale1); - _v4 = __msa_fmadd_w(_bias0, _v4, _scale0); - _v5 = __msa_fmadd_w(_bias1, _v5, _scale1); - _v6 = __msa_fmadd_w(_bias0, _v6, _scale0); - _v7 = __msa_fmadd_w(_bias1, _v7, _scale1); - *((int64_t*)ptr) = float2int8relu(_v0, _v1); - *((int64_t*)(ptr + 8)) = float2int8relu(_v2, _v3); - *((int64_t*)(ptr + 16)) = float2int8relu(_v4, _v5); - *((int64_t*)(ptr + 24)) = float2int8relu(_v6, _v7); - - intptr += 32; - ptr += 32; - } - for (; i + 1 < size; i += 2) - { - __builtin_prefetch(intptr + 64); - v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0)); - v4f32 _v2 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 8, 0)); - v4f32 _v3 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 12, 0)); - _v0 = __msa_fmadd_w(_bias0, _v0, _scale0); - _v1 = __msa_fmadd_w(_bias1, _v1, _scale1); - _v2 = __msa_fmadd_w(_bias0, _v2, _scale0); - _v3 = __msa_fmadd_w(_bias1, _v3, _scale1); - *((int64_t*)ptr) = float2int8relu(_v0, _v1); - *((int64_t*)(ptr + 8)) = float2int8relu(_v2, _v3); - - intptr += 16; - ptr += 16; - } - for (; i < size; i++) - { - __builtin_prefetch(intptr + 32); - v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0)); - _v0 = __msa_fmadd_w(_bias0, _v0, _scale0); - _v1 = __msa_fmadd_w(_bias1, _v1, _scale1); - 
*((int64_t*)ptr) = float2int8relu(_v0, _v1); - - intptr += 8; - ptr += 8; - } - } - } -} diff --git a/src/layer/requantize.cpp b/src/layer/requantize.cpp index f98e7a320c6..79502de0809 100644 --- a/src/layer/requantize.cpp +++ b/src/layer/requantize.cpp @@ -34,11 +34,6 @@ Requantize::Requantize() int Requantize::load_param(const ParamDict& pd) { - // scale_in = pd.get(0, 1.f); // bottom_blob_scale * weight_scale - // scale_out = pd.get(1, 1.f); // top_blob_scale - // bias_term = pd.get(2, 0); - // bias_data_size = pd.get(3, 0); - scale_in_data_size = pd.get(0, 1); scale_out_data_size = pd.get(1, 1); bias_data_size = pd.get(2, 0); @@ -68,253 +63,82 @@ int Requantize::load_model(const ModelBin& mb) return 0; } +static void requantize(const int* intptr, signed char* ptr, float scale_in, float bias, float scale_out, int activation_type, const Mat& activation_params, int size) +{ + for (int i = 0; i < size; i++) + { + float v = *intptr * scale_in + bias; + v = activation_ss(v, activation_type, activation_params); + *ptr = float2int8(v * scale_out); + intptr++; + ptr++; + } +} + int Requantize::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { - int dims = bottom_blob.dims; + const int dims = bottom_blob.dims; + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int channels = bottom_blob.c; if (dims == 1) { - int w = bottom_blob.w; - top_blob.create(w, (size_t)1u, opt.blob_allocator); if (top_blob.empty()) return -100; + // assert scale_in_data_size == 1 + // assert bias_data_size == 0 || bias_data_size == 1 + // assert scale_out_data_size == 1 + const int* intptr = bottom_blob; signed char* ptr = top_blob; - if (scale_in_data_size == 1 && scale_out_data_size == 1) - { - const float scale_in = scale_in_data[0]; - const float scale_out = scale_out_data[0]; - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float v = intptr[i] * scale_in; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); - } - } - else if (bias_data_size == 1) - { - const float bias = bias_data[0]; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float v = intptr[i] * scale_in + bias; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float v = intptr[i] * scale_in + bias_data[i]; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); - } - } - } - else if (scale_in_data_size == 1 && scale_out_data_size > 1) - { - const float scale_in = scale_in_data[0]; - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float v = intptr[i] * scale_in; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); - } - } - else if (bias_data_size == 1) - { - const float bias = bias_data[0]; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float v = intptr[i] * scale_in + bias; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float v = intptr[i] * scale_in + bias_data[i]; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * 
scale_out_data[i]); - } - } - } - else if (scale_in_data_size > 1 && scale_out_data_size == 1) - { - const float scale_out = scale_out_data[0]; - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float v = intptr[i] * scale_in_data[i]; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); - } - } - else if (bias_data_size == 1) - { - const float bias = bias_data[0]; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float v = intptr[i] * scale_in_data[i] + bias; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float v = intptr[i] * scale_in_data[i] + bias_data[i]; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); - } - } - } - else // if (scale_in_data_size > 1 && scale_out_data_size > 1) - { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float v = intptr[i] * scale_in_data[i]; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); - } - } - else if (bias_data_size == 1) - { - const float bias = bias_data[0]; + const float scale_in = scale_in_data[0]; + const float bias = bias_data_size == 0 ? 0.f : bias_data[0]; + const float scale_out = scale_out_data[0]; - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float v = intptr[i] * scale_in_data[i] + bias; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float v = intptr[i] * scale_in_data[i] + bias_data[i]; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); - } - } - } + requantize(intptr, ptr, scale_in, bias, scale_out, activation_type, activation_params, w); } if (dims == 2) { - int w = bottom_blob.w; - int h = bottom_blob.h; - top_blob.create(w, h, (size_t)1u, opt.blob_allocator); if (top_blob.empty()) return -100; - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - signed char* ptr = top_blob.row(i); - - const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[i]; - const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[i]; - - for (int j = 0; j < w; j++) - { - float v = intptr[j] * scale_in; - ptr[j] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); - } - } - } - else + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - signed char* ptr = top_blob.row(i); + const int* intptr = bottom_blob.row(i); + signed char* ptr = top_blob.row(i); - const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[i]; - const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[i]; - const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[i]; + const float scale_in = scale_in_data_size == 1 ? 
scale_in_data[0] : scale_in_data[i]; + const float bias = bias_data_size == 0 ? 0.f : bias_data_size == 1 ? bias_data[0] : bias_data[i]; + const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[i]; - for (int j = 0; j < w; j++) - { - float v = intptr[j] * scale_in + bias; - ptr[j] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); - } - } + requantize(intptr, ptr, scale_in, bias, scale_out, activation_type, activation_params, w); } } if (dims == 3) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; - top_blob.create(w, h, channels, (size_t)1u, opt.blob_allocator); if (top_blob.empty()) return -100; - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - signed char* ptr = top_blob.channel(q); - - const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[q]; - const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[q]; - - for (int i = 0; i < size; i++) - { - float v = intptr[i] * scale_in; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); - } - } - } - else + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - signed char* ptr = top_blob.channel(q); + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); - const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[q]; - const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[q]; - const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[q]; + const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[q]; + const float bias = bias_data_size == 0 ? 0.f : bias_data_size == 1 ? bias_data[0] : bias_data[q]; + const float scale_out = scale_out_data_size == 1 ? 
scale_out_data[0] : scale_out_data[q]; - for (int i = 0; i < size; i++) - { - float v = intptr[i] * scale_in + bias; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); - } - } + requantize(intptr, ptr, scale_in, bias, scale_out, activation_type, activation_params, w * h); } } diff --git a/src/layer/x86/requantize_x86.cpp b/src/layer/x86/requantize_x86.cpp index 0d672086270..996681e5e42 100644 --- a/src/layer/x86/requantize_x86.cpp +++ b/src/layer/x86/requantize_x86.cpp @@ -33,1511 +33,355 @@ Requantize_x86::Requantize_x86() #endif // __SSE2__ } -int Requantize_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +static void requantize(const int* intptr, signed char* ptr, const Mat& scale_in_data, const Mat& bias_data, const Mat& scale_out_data, int activation_type, const Mat& activation_params, int elemcount, int elempack) { - int dims = bottom_blob.dims; - int elempack = bottom_blob.elempack; + const int scale_in_data_size = scale_in_data.w; + const int bias_data_size = bias_data.w; + const int scale_out_data_size = scale_out_data.w; + const int size = elemcount * elempack; + + // NCNN_LOGE("requantize %d %d %d %d %d", scale_in_data_size, bias_data_size, scale_out_data_size, elemcount, elempack); + float scale_in = scale_in_data[0]; #if __SSE2__ + __m128 _scale_in = _mm_set1_ps(scale_in); +#if __AVX__ + __m256 _scale_in_avx = _mm256_set1_ps(scale_in); #if __AVX512F__ - if (elempack == 16) - { - Mat tmp; - convert_packing(bottom_blob, tmp, 8, opt); - - forward(tmp, top_blob, opt); - - return 0; - } + __m512 _scale_in_avx512 = _mm512_set1_ps(scale_in); #endif // __AVX512F__ - - if (elempack == 8) +#endif // __AVX__ + if (scale_in_data_size > 1) { - if (dims == 1) - { - int w = bottom_blob.w; - - top_blob.create(w, (size_t)8u, 8, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (scale_in_data_size == 1 && scale_out_data_size == 1) - { -#if __AVX__ - __m256 _scale_in = _mm256_set1_ps(scale_in_data[0]); - __m256 _scale_out = _mm256_set1_ps(scale_out_data[0]); -#else - __m128 _scale_in = _mm_set1_ps(scale_in_data[0]); - __m128 _scale_out = _mm_set1_ps(scale_out_data[0]); -#endif - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - signed char* ptr = (signed char*)top_blob + i * 8; - -#if __AVX__ - __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr)); - _v = _mm256_mul_ps(_v, _scale_in); - _v = activation_avx(_v, activation_type, activation_params); - _v = _mm256_mul_ps(_v, _scale_out); - *(int64_t*)ptr = float2int8_avx(_v); -#else - __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); - __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4))); - _v0 = _mm_mul_ps(_v0, _scale_in); - _v1 = _mm_mul_ps(_v1, _scale_in); - _v0 = activation_sse(_v0, activation_type, activation_params); - _v1 = activation_sse(_v1, activation_type, activation_params); - _v0 = _mm_mul_ps(_v0, _scale_out); - _v1 = _mm_mul_ps(_v1, _scale_out); - *(int64_t*)ptr = float2int8_sse(_v0, _v1); -#endif - } - } - else if (bias_data_size == 1) - { -#if __AVX__ - __m256 _bias = _mm256_set1_ps(bias_data[0]); -#else - __m128 _bias = _mm_set1_ps(bias_data[0]); -#endif - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - signed char* ptr = (signed char*)top_blob + i * 8; - -#if 
__AVX__ - __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr)); - _v = _mm256_comp_fmadd_ps(_v, _scale_in, _bias); - _v = activation_avx(_v, activation_type, activation_params); - _v = _mm256_mul_ps(_v, _scale_out); - *(int64_t*)ptr = float2int8_avx(_v); -#else - __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); - __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4))); - _v0 = _mm_add_ps(_bias, _mm_mul_ps(_v0, _scale_in)); - _v1 = _mm_add_ps(_bias, _mm_mul_ps(_v1, _scale_in)); - _v0 = activation_sse(_v0, activation_type, activation_params); - _v1 = activation_sse(_v1, activation_type, activation_params); - _v0 = _mm_mul_ps(_v0, _scale_out); - _v1 = _mm_mul_ps(_v1, _scale_out); - *(int64_t*)ptr = float2int8_sse(_v0, _v1); -#endif - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - signed char* ptr = (signed char*)top_blob + i * 8; - -#if __AVX__ - __m256 _bias = _mm256_loadu_ps((const float*)bias_data + i * 8); - __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr)); - _v = _mm256_comp_fmadd_ps(_v, _scale_in, _bias); - _v = activation_avx(_v, activation_type, activation_params); - _v = _mm256_mul_ps(_v, _scale_out); - *(int64_t*)ptr = float2int8_avx(_v); -#else - __m128 _bias0 = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + i * 8); - __m128 _bias1 = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + i * 8 + 4); - __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); - __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4))); - _v0 = _mm_add_ps(_bias0, _mm_mul_ps(_v0, _scale_in)); - _v1 = _mm_add_ps(_bias1, _mm_mul_ps(_v1, _scale_in)); - _v0 = activation_sse(_v0, activation_type, activation_params); - _v1 = activation_sse(_v1, activation_type, activation_params); - _v0 = _mm_mul_ps(_v0, _scale_out); - _v1 = _mm_mul_ps(_v1, _scale_out); - *(int64_t*)ptr = float2int8_sse(_v0, _v1); -#endif - } - } - } - else if (scale_in_data_size == 1 && scale_out_data_size > 1) - { -#if __AVX__ - __m256 _scale_in = _mm256_set1_ps(scale_in_data[0]); -#else - __m128 _scale_in = _mm_set1_ps(scale_in_data[0]); -#endif - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - signed char* ptr = (signed char*)top_blob + i * 8; - -#if __AVX__ - __m256 _scale_out = _mm256_loadu_ps((const float*)scale_out_data + i * 8); - __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr)); - _v = _mm256_mul_ps(_v, _scale_in); - _v = activation_avx(_v, activation_type, activation_params); - _v = _mm256_mul_ps(_v, _scale_out); - *(int64_t*)ptr = float2int8_avx(_v); -#else - __m128 _scale_out0 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8); - __m128 _scale_out1 = scale_out_data_size == 1 ? 
_mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8 + 4); - __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); - __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4))); - _v0 = _mm_mul_ps(_v0, _scale_in); - _v1 = _mm_mul_ps(_v1, _scale_in); - _v0 = activation_sse(_v0, activation_type, activation_params); - _v1 = activation_sse(_v1, activation_type, activation_params); - _v0 = _mm_mul_ps(_v0, _scale_out0); - _v1 = _mm_mul_ps(_v1, _scale_out1); - *(int64_t*)ptr = float2int8_sse(_v0, _v1); -#endif - } - } - else if (bias_data_size == 1) - { -#if __AVX__ - __m256 _bias = _mm256_set1_ps(bias_data[0]); -#else - __m128 _bias = _mm_set1_ps(bias_data[0]); -#endif - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - signed char* ptr = (signed char*)top_blob + i * 8; - -#if __AVX__ - __m256 _scale_out = _mm256_loadu_ps((const float*)scale_out_data + i * 8); - __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr)); - _v = _mm256_comp_fmadd_ps(_v, _scale_in, _bias); - _v = activation_avx(_v, activation_type, activation_params); - _v = _mm256_mul_ps(_v, _scale_out); - *(int64_t*)ptr = float2int8_avx(_v); -#else - __m128 _scale_out0 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8); - __m128 _scale_out1 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8 + 4); - __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); - __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4))); - _v0 = _mm_add_ps(_bias, _mm_mul_ps(_v0, _scale_in)); - _v1 = _mm_add_ps(_bias, _mm_mul_ps(_v1, _scale_in)); - _v0 = activation_sse(_v0, activation_type, activation_params); - _v1 = activation_sse(_v1, activation_type, activation_params); - _v0 = _mm_mul_ps(_v0, _scale_out0); - _v1 = _mm_mul_ps(_v1, _scale_out1); - *(int64_t*)ptr = float2int8_sse(_v0, _v1); -#endif - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - signed char* ptr = (signed char*)top_blob + i * 8; - -#if __AVX__ - __m256 _scale_out = _mm256_loadu_ps((const float*)scale_out_data + i * 8); - __m256 _bias = _mm256_loadu_ps((const float*)bias_data + i * 8); - __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr)); - _v = _mm256_comp_fmadd_ps(_v, _scale_in, _bias); - _v = activation_avx(_v, activation_type, activation_params); - _v = _mm256_mul_ps(_v, _scale_out); - *(int64_t*)ptr = float2int8_avx(_v); -#else - __m128 _scale_out0 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8); - __m128 _scale_out1 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8 + 4); - __m128 _bias0 = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + i * 8); - __m128 _bias1 = bias_data_size == 1 ? 
_mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + i * 8 + 4); - __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); - __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4))); - _v0 = _mm_add_ps(_bias0, _mm_mul_ps(_v0, _scale_in)); - _v1 = _mm_add_ps(_bias1, _mm_mul_ps(_v1, _scale_in)); - _v0 = activation_sse(_v0, activation_type, activation_params); - _v1 = activation_sse(_v1, activation_type, activation_params); - _v0 = _mm_mul_ps(_v0, _scale_out0); - _v1 = _mm_mul_ps(_v1, _scale_out1); - *(int64_t*)ptr = float2int8_sse(_v0, _v1); -#endif - } - } - } - else if (scale_in_data_size > 1 && scale_out_data_size == 1) - { -#if __AVX__ - __m256 _scale_out = _mm256_set1_ps(scale_out_data[0]); -#else - __m128 _scale_out = _mm_set1_ps(scale_out_data[0]); -#endif - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - signed char* ptr = (signed char*)top_blob + i * 8; - -#if __AVX__ - __m256 _scale_in = _mm256_loadu_ps((const float*)scale_in_data + i * 8); - __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr)); - _v = _mm256_mul_ps(_v, _scale_in); - _v = activation_avx(_v, activation_type, activation_params); - _v = _mm256_mul_ps(_v, _scale_out); - *(int64_t*)ptr = float2int8_avx(_v); -#else - __m128 _scale_in0 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8); - __m128 _scale_in1 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8 + 4); - __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); - __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4))); - _v0 = _mm_mul_ps(_v0, _scale_in0); - _v1 = _mm_mul_ps(_v1, _scale_in1); - _v0 = activation_sse(_v0, activation_type, activation_params); - _v1 = activation_sse(_v1, activation_type, activation_params); - _v0 = _mm_mul_ps(_v0, _scale_out); - _v1 = _mm_mul_ps(_v1, _scale_out); - *(int64_t*)ptr = float2int8_sse(_v0, _v1); -#endif - } - } - else if (bias_data_size == 1) - { -#if __AVX__ - __m256 _bias = _mm256_set1_ps(bias_data[0]); -#else - __m128 _bias = _mm_set1_ps(bias_data[0]); -#endif - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - signed char* ptr = (signed char*)top_blob + i * 8; - #if __AVX__ - __m256 _scale_in = _mm256_loadu_ps((const float*)scale_in_data + i * 8); - __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr)); - _v = _mm256_comp_fmadd_ps(_v, _scale_in, _bias); - _v = activation_avx(_v, activation_type, activation_params); - _v = _mm256_mul_ps(_v, _scale_out); - *(int64_t*)ptr = float2int8_avx(_v); -#else - __m128 _scale_in0 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8); - __m128 _scale_in1 = scale_in_data_size == 1 ? 
_mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8 + 4); - __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); - __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4))); - _v0 = _mm_add_ps(_bias, _mm_mul_ps(_v0, _scale_in0)); - _v1 = _mm_add_ps(_bias, _mm_mul_ps(_v1, _scale_in1)); - _v0 = activation_sse(_v0, activation_type, activation_params); - _v1 = activation_sse(_v1, activation_type, activation_params); - _v0 = _mm_mul_ps(_v0, _scale_out); - _v1 = _mm_mul_ps(_v1, _scale_out); - *(int64_t*)ptr = float2int8_sse(_v0, _v1); -#endif - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - signed char* ptr = (signed char*)top_blob + i * 8; - -#if __AVX__ - __m256 _scale_in = _mm256_loadu_ps((const float*)scale_in_data + i * 8); - __m256 _bias = _mm256_loadu_ps((const float*)bias_data + i * 8); - __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr)); - _v = _mm256_comp_fmadd_ps(_v, _scale_in, _bias); - _v = activation_avx(_v, activation_type, activation_params); - _v = _mm256_mul_ps(_v, _scale_out); - *(int64_t*)ptr = float2int8_avx(_v); -#else - __m128 _scale_in0 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8); - __m128 _scale_in1 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8 + 4); - __m128 _bias0 = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + i * 8); - __m128 _bias1 = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + i * 8 + 4); - __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); - __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4))); - _v0 = _mm_add_ps(_bias0, _mm_mul_ps(_v0, _scale_in0)); - _v1 = _mm_add_ps(_bias1, _mm_mul_ps(_v1, _scale_in1)); - _v0 = activation_sse(_v0, activation_type, activation_params); - _v1 = activation_sse(_v1, activation_type, activation_params); - _v0 = _mm_mul_ps(_v0, _scale_out); - _v1 = _mm_mul_ps(_v1, _scale_out); - *(int64_t*)ptr = float2int8_sse(_v0, _v1); -#endif - } - } - } - else // if (scale_in_data_size > 1 && scale_out_data_size > 1) - { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - signed char* ptr = (signed char*)top_blob + i * 8; - -#if __AVX__ - __m256 _scale_in = _mm256_loadu_ps((const float*)scale_in_data + i * 8); - __m256 _scale_out = _mm256_loadu_ps((const float*)scale_out_data + i * 8); - __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr)); - _v = _mm256_mul_ps(_v, _scale_in); - _v = activation_avx(_v, activation_type, activation_params); - _v = _mm256_mul_ps(_v, _scale_out); - *(int64_t*)ptr = float2int8_avx(_v); -#else - __m128 _scale_in0 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8); - __m128 _scale_in1 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8 + 4); - __m128 _scale_out0 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8); - __m128 _scale_out1 = scale_out_data_size == 1 ? 
_mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8 + 4); - __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); - __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4))); - _v0 = _mm_mul_ps(_v0, _scale_in0); - _v1 = _mm_mul_ps(_v1, _scale_in1); - _v0 = activation_sse(_v0, activation_type, activation_params); - _v1 = activation_sse(_v1, activation_type, activation_params); - _v0 = _mm_mul_ps(_v0, _scale_out0); - _v1 = _mm_mul_ps(_v1, _scale_out1); - *(int64_t*)ptr = float2int8_sse(_v0, _v1); -#endif - } - } - else if (bias_data_size == 1) - { -#if __AVX__ - __m256 _bias = _mm256_set1_ps(bias_data[0]); -#else - __m128 _bias = _mm_set1_ps(bias_data[0]); -#endif - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - signed char* ptr = (signed char*)top_blob + i * 8; - -#if __AVX__ - __m256 _scale_in = _mm256_loadu_ps((const float*)scale_in_data + i * 8); - __m256 _scale_out = _mm256_loadu_ps((const float*)scale_out_data + i * 8); - __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr)); - _v = _mm256_comp_fmadd_ps(_v, _scale_in, _bias); - _v = activation_avx(_v, activation_type, activation_params); - _v = _mm256_mul_ps(_v, _scale_out); - *(int64_t*)ptr = float2int8_avx(_v); -#else - __m128 _scale_in0 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8); - __m128 _scale_in1 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8 + 4); - __m128 _scale_out0 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8); - __m128 _scale_out1 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8 + 4); - __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); - __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4))); - _v0 = _mm_add_ps(_bias, _mm_mul_ps(_v0, _scale_in0)); - _v1 = _mm_add_ps(_bias, _mm_mul_ps(_v1, _scale_in1)); - _v0 = activation_sse(_v0, activation_type, activation_params); - _v1 = activation_sse(_v1, activation_type, activation_params); - _v0 = _mm_mul_ps(_v0, _scale_out0); - _v1 = _mm_mul_ps(_v1, _scale_out1); - *(int64_t*)ptr = float2int8_sse(_v0, _v1); -#endif - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - signed char* ptr = (signed char*)top_blob + i * 8; - -#if __AVX__ - __m256 _scale_in = _mm256_loadu_ps((const float*)scale_in_data + i * 8); - __m256 _scale_out = _mm256_loadu_ps((const float*)scale_out_data + i * 8); - __m256 _bias = _mm256_loadu_ps((const float*)bias_data + i * 8); - __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr)); - _v = _mm256_comp_fmadd_ps(_v, _scale_in, _bias); - _v = activation_avx(_v, activation_type, activation_params); - _v = _mm256_mul_ps(_v, _scale_out); - *(int64_t*)ptr = float2int8_avx(_v); -#else - __m128 _scale_in0 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8); - __m128 _scale_in1 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8 + 4); - __m128 _scale_out0 = scale_out_data_size == 1 ? 
_mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8); - __m128 _scale_out1 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8 + 4); - __m128 _bias0 = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + i * 8); - __m128 _bias1 = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + i * 8 + 4); - __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); - __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4))); - _v0 = _mm_add_ps(_bias0, _mm_mul_ps(_v0, _scale_in0)); - _v1 = _mm_add_ps(_bias1, _mm_mul_ps(_v1, _scale_in1)); - _v0 = activation_sse(_v0, activation_type, activation_params); - _v1 = activation_sse(_v1, activation_type, activation_params); - _v0 = _mm_mul_ps(_v0, _scale_out0); - _v1 = _mm_mul_ps(_v1, _scale_out1); - *(int64_t*)ptr = float2int8_sse(_v0, _v1); -#endif - } - } - } +#if __AVX512F__ + if (elempack == 16) + { + _scale_in_avx512 = _mm512_loadu_ps((const float*)scale_in_data); } - - if (dims == 2) +#endif // __AVX512F__ + if (elempack == 8) { - int w = bottom_blob.w; - int h = bottom_blob.h; - - top_blob.create(w, h, (size_t)8u, 8, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - signed char* ptr = top_blob.row(i); - -#if __AVX__ - __m256 _scale_in = scale_in_data_size == 1 ? _mm256_set1_ps(scale_in_data[0]) : _mm256_loadu_ps((const float*)scale_in_data + i * 8); - __m256 _scale_out = scale_out_data_size == 1 ? _mm256_set1_ps(scale_out_data[0]) : _mm256_loadu_ps((const float*)scale_out_data + i * 8); -#else - __m128 _scale_in0 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8); - __m128 _scale_in1 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8 + 4); - __m128 _scale_out0 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8); - __m128 _scale_out1 = scale_out_data_size == 1 ? 
_mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8 + 4); -#endif - - for (int j = 0; j < w; j++) - { + _scale_in_avx = _mm256_loadu_ps((const float*)scale_in_data); +#if __AVX512F__ + _scale_in_avx512 = combine8x2_ps(_scale_in_avx, _scale_in_avx); +#endif // __AVX512F__ + } +#endif // __AVX__ + if (elempack == 4) + { + _scale_in = _mm_loadu_ps((const float*)scale_in_data); #if __AVX__ - __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr)); - _v = _mm256_mul_ps(_v, _scale_in); - _v = activation_avx(_v, activation_type, activation_params); - _v = _mm256_mul_ps(_v, _scale_out); - *(int64_t*)ptr = float2int8_avx(_v); -#else - __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); - __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4))); - _v0 = _mm_mul_ps(_v0, _scale_in0); - _v1 = _mm_mul_ps(_v1, _scale_in1); - _v0 = activation_sse(_v0, activation_type, activation_params); - _v1 = activation_sse(_v1, activation_type, activation_params); - _v0 = _mm_mul_ps(_v0, _scale_out0); - _v1 = _mm_mul_ps(_v1, _scale_out1); - *(int64_t*)ptr = float2int8_sse(_v0, _v1); -#endif - - intptr += 8; - ptr += 8; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - signed char* ptr = top_blob.row(i); + _scale_in_avx = combine4x2_ps(_scale_in, _scale_in); +#if __AVX512F__ + _scale_in_avx512 = combine8x2_ps(_scale_in_avx, _scale_in_avx); +#endif // __AVX512F__ +#endif // __AVX__ + } + } +#endif // __SSE2__ + float scale_out = scale_out_data[0]; +#if __SSE2__ + __m128 _scale_out = _mm_set1_ps(scale_out); #if __AVX__ - __m256 _scale_in = scale_in_data_size == 1 ? _mm256_set1_ps(scale_in_data[0]) : _mm256_loadu_ps((const float*)scale_in_data + i * 8); - __m256 _scale_out = scale_out_data_size == 1 ? _mm256_set1_ps(scale_out_data[0]) : _mm256_loadu_ps((const float*)scale_out_data + i * 8); - __m256 _bias = bias_data_size == 1 ? _mm256_set1_ps(bias_data[0]) : _mm256_loadu_ps((const float*)bias_data + i * 8); -#else - __m128 _scale_in0 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8); - __m128 _scale_in1 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8 + 4); - __m128 _scale_out0 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8); - __m128 _scale_out1 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8 + 4); - __m128 _bias0 = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + i * 8); - __m128 _bias1 = bias_data_size == 1 ? 
_mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + i * 8 + 4); -#endif - - for (int j = 0; j < w; j++) - { + __m256 _scale_out_avx = _mm256_set1_ps(scale_out); +#if __AVX512F__ + __m512 _scale_out_avx512 = _mm512_set1_ps(scale_out); +#endif // __AVX512F__ +#endif // __AVX__ + if (scale_out_data_size > 1) + { #if __AVX__ - __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr)); - _v = _mm256_comp_fmadd_ps(_v, _scale_in, _bias); - _v = activation_avx(_v, activation_type, activation_params); - _v = _mm256_mul_ps(_v, _scale_out); - *(int64_t*)ptr = float2int8_avx(_v); -#else - __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); - __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4))); - _v0 = _mm_add_ps(_bias0, _mm_mul_ps(_v0, _scale_in0)); - _v1 = _mm_add_ps(_bias1, _mm_mul_ps(_v1, _scale_in1)); - _v0 = activation_sse(_v0, activation_type, activation_params); - _v1 = activation_sse(_v1, activation_type, activation_params); - _v0 = _mm_mul_ps(_v0, _scale_out0); - _v1 = _mm_mul_ps(_v1, _scale_out1); - *(int64_t*)ptr = float2int8_sse(_v0, _v1); -#endif - - intptr += 8; - ptr += 8; - } - } - } +#if __AVX512F__ + if (elempack == 16) + { + _scale_out_avx512 = _mm512_loadu_ps((const float*)scale_out_data); } - - if (dims == 3) +#endif // __AVX512F__ + if (elempack == 8) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; - - top_blob.create(w, h, channels, (size_t)8u, 8, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - signed char* ptr = top_blob.channel(q); - -#if __AVX__ - __m256 _scale_in = scale_in_data_size == 1 ? _mm256_set1_ps(scale_in_data[0]) : _mm256_loadu_ps((const float*)scale_in_data + q * 8); - __m256 _scale_out = scale_out_data_size == 1 ? _mm256_set1_ps(scale_out_data[0]) : _mm256_loadu_ps((const float*)scale_out_data + q * 8); -#else - __m128 _scale_in0 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + q * 8); - __m128 _scale_in1 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + q * 8 + 4); - __m128 _scale_out0 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + q * 8); - __m128 _scale_out1 = scale_out_data_size == 1 ? 
_mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + q * 8 + 4); -#endif - - for (int i = 0; i < size; i++) - { + _scale_out_avx = _mm256_loadu_ps((const float*)scale_out_data); +#if __AVX512F__ + _scale_out_avx512 = combine8x2_ps(_scale_out_avx, _scale_out_avx); +#endif // __AVX512F__ + } +#endif // __AVX__ + if (elempack == 4) + { + _scale_out = _mm_loadu_ps((const float*)scale_out_data); #if __AVX__ - __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr)); - _v = _mm256_mul_ps(_v, _scale_in); - _v = activation_avx(_v, activation_type, activation_params); - _v = _mm256_mul_ps(_v, _scale_out); - *(int64_t*)ptr = float2int8_avx(_v); -#else - __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); - __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4))); - _v0 = _mm_mul_ps(_v0, _scale_in0); - _v1 = _mm_mul_ps(_v1, _scale_in1); - _v0 = activation_sse(_v0, activation_type, activation_params); - _v1 = activation_sse(_v1, activation_type, activation_params); - _v0 = _mm_mul_ps(_v0, _scale_out0); - _v1 = _mm_mul_ps(_v1, _scale_out1); - *(int64_t*)ptr = float2int8_sse(_v0, _v1); -#endif - - intptr += 8; - ptr += 8; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - signed char* ptr = top_blob.channel(q); + _scale_out_avx = combine4x2_ps(_scale_out, _scale_out); +#if __AVX512F__ + _scale_out_avx512 = combine8x2_ps(_scale_out_avx, _scale_out_avx); +#endif // __AVX512F__ +#endif // __AVX__ + } + } +#endif // __SSE2__ + if (bias_data_size == 0) + { + int i = 0; +#if __SSE2__ #if __AVX__ - __m256 _scale_in = scale_in_data_size == 1 ? _mm256_set1_ps(scale_in_data[0]) : _mm256_loadu_ps((const float*)scale_in_data + q * 8); - __m256 _scale_out = scale_out_data_size == 1 ? _mm256_set1_ps(scale_out_data[0]) : _mm256_loadu_ps((const float*)scale_out_data + q * 8); - __m256 _bias = bias_data_size == 1 ? _mm256_set1_ps(bias_data[0]) : _mm256_loadu_ps((const float*)bias_data + q * 8); -#else - __m128 _scale_in0 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + q * 8); - __m128 _scale_in1 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + q * 8 + 4); - __m128 _scale_out0 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + q * 8); - __m128 _scale_out1 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + q * 8 + 4); - __m128 _bias0 = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + q * 8); - __m128 _bias1 = bias_data_size == 1 ? 
_mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + q * 8 + 4); -#endif - - for (int i = 0; i < size; i++) - { + for (; i + 15 < size; i += 16) + { +#if __AVX512F__ + __m512 _v = _mm512_cvtepi32_ps(_mm512_loadu_si512((const __m512i*)intptr)); + _v = _mm512_mul_ps(_v, _scale_in_avx512); + _v = activation_avx512(_v, activation_type, activation_params); + _v = _mm512_mul_ps(_v, _scale_out_avx512); + _mm_storeu_si128((__m128i*)ptr, float2int8_avx512(_v)); +#else // __AVX512F__ + __m256 _v0 = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr)); + __m256 _v1 = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)(intptr + 8))); + _v0 = _mm256_mul_ps(_v0, _scale_in_avx); + _v1 = _mm256_mul_ps(_v1, _scale_in_avx); + _v0 = activation_avx(_v0, activation_type, activation_params); + _v1 = activation_avx(_v1, activation_type, activation_params); + _v0 = _mm256_mul_ps(_v0, _scale_out_avx); + _v1 = _mm256_mul_ps(_v1, _scale_out_avx); + _mm_storeu_si128((__m128i*)ptr, float2int8_avx(_v0, _v1)); +#endif // __AVX512F__ + intptr += 16; + ptr += 16; + } +#endif // __AVX__ + for (; i + 7 < size; i += 8) + { #if __AVX__ - __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr)); - _v = _mm256_comp_fmadd_ps(_v, _scale_in, _bias); - _v = activation_avx(_v, activation_type, activation_params); - _v = _mm256_mul_ps(_v, _scale_out); - *(int64_t*)ptr = float2int8_avx(_v); -#else - __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); - __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4))); - _v0 = _mm_add_ps(_bias0, _mm_mul_ps(_v0, _scale_in0)); - _v1 = _mm_add_ps(_bias1, _mm_mul_ps(_v1, _scale_in1)); - _v0 = activation_sse(_v0, activation_type, activation_params); - _v1 = activation_sse(_v1, activation_type, activation_params); - _v0 = _mm_mul_ps(_v0, _scale_out0); - _v1 = _mm_mul_ps(_v1, _scale_out1); - *(int64_t*)ptr = float2int8_sse(_v0, _v1); -#endif - - intptr += 8; - ptr += 8; - } - } - } + __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr)); + _v = _mm256_mul_ps(_v, _scale_in_avx); + _v = activation_avx(_v, activation_type, activation_params); + _v = _mm256_mul_ps(_v, _scale_out_avx); + *(int64_t*)ptr = float2int8_avx(_v); +#else // __AVX__ + __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); + __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4))); + _v0 = _mm_mul_ps(_v0, _scale_in); + _v1 = _mm_mul_ps(_v1, _scale_in); + _v0 = activation_sse(_v0, activation_type, activation_params); + _v1 = activation_sse(_v1, activation_type, activation_params); + _v0 = _mm_mul_ps(_v0, _scale_out); + _v1 = _mm_mul_ps(_v1, _scale_out); + *(int64_t*)ptr = float2int8_sse(_v0, _v1); +#endif // __AVX__ + intptr += 8; + ptr += 8; + } + for (; i + 3 < size; i += 4) + { + __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); + _v = _mm_mul_ps(_v, _scale_in); + _v = activation_sse(_v, activation_type, activation_params); + _v = _mm_mul_ps(_v, _scale_out); + int32_t v = float2int8_sse(_v); + ptr[0] = (v >> 0) & 0xff; + ptr[1] = (v >> 8) & 0xff; + ptr[2] = (v >> 16) & 0xff; + ptr[3] = (v >> 24) & 0xff; + intptr += 4; + ptr += 4; + } +#endif // __SSE2__ + for (; i < size; i++) + { + float v = *intptr * scale_in; + v = activation_ss(v, activation_type, activation_params); + *ptr = float2int8(v * scale_out); + intptr++; + ptr++; } - - return 0; } - - if (elempack == 4) + else { - if (dims == 1) + float bias = bias_data[0]; +#if __SSE2__ + __m128 _bias = _mm_set1_ps(bias); 
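// Descriptive note (not patch content): the scalar bias has just been broadcast to SSE
// width; the #if ladder that resumes below mirrors the scale_in / scale_out setup earlier
// in this helper. When bias_data_size > 1 the per-channel constants are loaded once,
// outside the element loop, at the register width matching elempack (4, 8 or 16 floats),
// and a pack4 vector is widened to AVX / AVX-512 width by repeating it via
// combine4x2_ps / combine8x2_ps. The repetition is valid because every elempack-wide
// group of ints in the packed row/channel being processed shares the same elempack
// per-channel constants. My reading of the widening step in plain AVX intrinsics
// (a sketch, assuming combine4x2_ps concatenates its two __m128 arguments):
//   __m256 wide = _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 1);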
+#if __AVX__ + __m256 _bias_avx = _mm256_set1_ps(bias); +#if __AVX512F__ + __m512 _bias_avx512 = _mm512_set1_ps(bias); +#endif // __AVX512F__ +#endif // __AVX__ + if (bias_data_size > 1) { - int w = bottom_blob.w; - int out_elempack = opt.use_packing_layout && w * elempack % 8 == 0 ? 8 : 1; - int outw = w * elempack / out_elempack; - - top_blob.create(outw, (size_t)out_elempack, out_elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (scale_in_data_size == 1 && scale_out_data_size == 1) - { - __m128 _scale_in = _mm_set1_ps(scale_in_data[0]); - __m128 _scale_out = _mm_set1_ps(scale_out_data[0]); - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - signed char* ptr = (signed char*)top_blob + i * 4; - - __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); - _v = _mm_mul_ps(_v, _scale_in); - _v = activation_sse(_v, activation_type, activation_params); - _v = _mm_mul_ps(_v, _scale_out); - int64_t v = float2int8_sse(_v, _v); - ptr[0] = (v >> 32) & 0xff; - ptr[1] = (v >> 40) & 0xff; - ptr[2] = (v >> 48) & 0xff; - ptr[3] = (v >> 56) & 0xff; - } - } - else if (bias_data_size == 1) - { - __m128 _bias = _mm_set1_ps(bias_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - signed char* ptr = (signed char*)top_blob + i * 4; - - __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); - _v = _mm_add_ps(_bias, _mm_mul_ps(_v, _scale_in)); - _v = activation_sse(_v, activation_type, activation_params); - _v = _mm_mul_ps(_v, _scale_out); - int64_t v = float2int8_sse(_v, _v); - ptr[0] = (v >> 32) & 0xff; - ptr[1] = (v >> 40) & 0xff; - ptr[2] = (v >> 48) & 0xff; - ptr[3] = (v >> 56) & 0xff; - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - signed char* ptr = (signed char*)top_blob + i * 4; - - __m128 _bias = _mm_loadu_ps((const float*)bias_data + i * 4); - __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); - _v = _mm_add_ps(_bias, _mm_mul_ps(_v, _scale_in)); - _v = activation_sse(_v, activation_type, activation_params); - _v = _mm_mul_ps(_v, _scale_out); - int64_t v = float2int8_sse(_v, _v); - ptr[0] = (v >> 32) & 0xff; - ptr[1] = (v >> 40) & 0xff; - ptr[2] = (v >> 48) & 0xff; - ptr[3] = (v >> 56) & 0xff; - } - } - } - else if (scale_in_data_size == 1 && scale_out_data_size > 1) +#if __AVX__ +#if __AVX512F__ + if (elempack == 16) { - __m128 _scale_in = _mm_set1_ps(scale_in_data[0]); - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - signed char* ptr = (signed char*)top_blob + i * 4; - - __m128 _scale_out = _mm_loadu_ps((const float*)scale_out_data + i * 4); - __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); - _v = _mm_mul_ps(_v, _scale_in); - _v = activation_sse(_v, activation_type, activation_params); - _v = _mm_mul_ps(_v, _scale_out); - int64_t v = float2int8_sse(_v, _v); - ptr[0] = (v >> 32) & 0xff; - ptr[1] = (v >> 40) & 0xff; - ptr[2] = (v >> 48) & 0xff; - ptr[3] = (v >> 56) & 0xff; - } - } - else if (bias_data_size == 1) - { - __m128 _bias = _mm_set1_ps(bias_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; 
i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - signed char* ptr = (signed char*)top_blob + i * 4; - - __m128 _scale_out = _mm_loadu_ps((const float*)scale_out_data + i * 4); - __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); - _v = _mm_add_ps(_bias, _mm_mul_ps(_v, _scale_in)); - _v = activation_sse(_v, activation_type, activation_params); - _v = _mm_mul_ps(_v, _scale_out); - int64_t v = float2int8_sse(_v, _v); - ptr[0] = (v >> 32) & 0xff; - ptr[1] = (v >> 40) & 0xff; - ptr[2] = (v >> 48) & 0xff; - ptr[3] = (v >> 56) & 0xff; - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - signed char* ptr = (signed char*)top_blob + i * 4; - - __m128 _scale_out = _mm_loadu_ps((const float*)scale_out_data + i * 4); - __m128 _bias = _mm_loadu_ps((const float*)bias_data + i * 4); - __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); - _v = _mm_add_ps(_bias, _mm_mul_ps(_v, _scale_in)); - _v = activation_sse(_v, activation_type, activation_params); - _v = _mm_mul_ps(_v, _scale_out); - int64_t v = float2int8_sse(_v, _v); - ptr[0] = (v >> 32) & 0xff; - ptr[1] = (v >> 40) & 0xff; - ptr[2] = (v >> 48) & 0xff; - ptr[3] = (v >> 56) & 0xff; - } - } + _bias_avx512 = _mm512_loadu_ps((const float*)bias_data); } - else if (scale_in_data_size > 1 && scale_out_data_size == 1) +#endif // __AVX512F__ + if (elempack == 8) { - __m128 _scale_out = _mm_set1_ps(scale_out_data[0]); - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - signed char* ptr = (signed char*)top_blob + i * 4; - - __m128 _scale_in = _mm_loadu_ps((const float*)scale_in_data + i * 4); - __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); - _v = _mm_mul_ps(_v, _scale_in); - _v = activation_sse(_v, activation_type, activation_params); - _v = _mm_mul_ps(_v, _scale_out); - int64_t v = float2int8_sse(_v, _v); - ptr[0] = (v >> 32) & 0xff; - ptr[1] = (v >> 40) & 0xff; - ptr[2] = (v >> 48) & 0xff; - ptr[3] = (v >> 56) & 0xff; - } - } - else if (bias_data_size == 1) - { - __m128 _bias = _mm_set1_ps(bias_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - signed char* ptr = (signed char*)top_blob + i * 4; - - __m128 _scale_in = _mm_loadu_ps((const float*)scale_in_data + i * 4); - __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); - _v = _mm_add_ps(_bias, _mm_mul_ps(_v, _scale_in)); - _v = activation_sse(_v, activation_type, activation_params); - _v = _mm_mul_ps(_v, _scale_out); - int64_t v = float2int8_sse(_v, _v); - ptr[0] = (v >> 32) & 0xff; - ptr[1] = (v >> 40) & 0xff; - ptr[2] = (v >> 48) & 0xff; - ptr[3] = (v >> 56) & 0xff; - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - signed char* ptr = (signed char*)top_blob + i * 4; - - __m128 _scale_in = _mm_loadu_ps((const float*)scale_in_data + i * 4); - __m128 _bias = _mm_loadu_ps((const float*)bias_data + i * 4); - __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); - _v = _mm_add_ps(_bias, _mm_mul_ps(_v, _scale_in)); - _v = activation_sse(_v, activation_type, activation_params); - _v = _mm_mul_ps(_v, _scale_out); - int64_t v = 
float2int8_sse(_v, _v); - ptr[0] = (v >> 32) & 0xff; - ptr[1] = (v >> 40) & 0xff; - ptr[2] = (v >> 48) & 0xff; - ptr[3] = (v >> 56) & 0xff; - } - } + _bias_avx = _mm256_loadu_ps((const float*)bias_data); +#if __AVX512F__ + _bias_avx512 = combine8x2_ps(_bias_avx, _bias_avx); +#endif // __AVX512F__ } - else // if (scale_in_data_size > 1 && scale_out_data_size > 1) +#endif // __AVX__ + if (elempack == 4) { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - signed char* ptr = (signed char*)top_blob + i * 4; - - __m128 _scale_in = _mm_loadu_ps((const float*)scale_in_data + i * 4); - __m128 _scale_out = _mm_loadu_ps((const float*)scale_out_data + i * 4); - __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); - _v = _mm_mul_ps(_v, _scale_in); - _v = activation_sse(_v, activation_type, activation_params); - _v = _mm_mul_ps(_v, _scale_out); - int64_t v = float2int8_sse(_v, _v); - ptr[0] = (v >> 32) & 0xff; - ptr[1] = (v >> 40) & 0xff; - ptr[2] = (v >> 48) & 0xff; - ptr[3] = (v >> 56) & 0xff; - } - } - else if (bias_data_size == 1) - { - __m128 _bias = _mm_set1_ps(bias_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - signed char* ptr = (signed char*)top_blob + i * 4; - - __m128 _scale_in = _mm_loadu_ps((const float*)scale_in_data + i * 4); - __m128 _scale_out = _mm_loadu_ps((const float*)scale_out_data + i * 4); - __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); - _v = _mm_add_ps(_bias, _mm_mul_ps(_v, _scale_in)); - _v = activation_sse(_v, activation_type, activation_params); - _v = _mm_mul_ps(_v, _scale_out); - int64_t v = float2int8_sse(_v, _v); - ptr[0] = (v >> 32) & 0xff; - ptr[1] = (v >> 40) & 0xff; - ptr[2] = (v >> 48) & 0xff; - ptr[3] = (v >> 56) & 0xff; - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - signed char* ptr = (signed char*)top_blob + i * 4; - - __m128 _scale_in = _mm_loadu_ps((const float*)scale_in_data + i * 4); - __m128 _scale_out = _mm_loadu_ps((const float*)scale_out_data + i * 4); - __m128 _bias = _mm_loadu_ps((const float*)bias_data + i * 4); - __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); - _v = _mm_add_ps(_bias, _mm_mul_ps(_v, _scale_in)); - _v = activation_sse(_v, activation_type, activation_params); - _v = _mm_mul_ps(_v, _scale_out); - int64_t v = float2int8_sse(_v, _v); - ptr[0] = (v >> 32) & 0xff; - ptr[1] = (v >> 40) & 0xff; - ptr[2] = (v >> 48) & 0xff; - ptr[3] = (v >> 56) & 0xff; - } - } + _bias = _mm_loadu_ps((const float*)bias_data); +#if __AVX__ + _bias_avx = combine4x2_ps(_bias, _bias); +#if __AVX512F__ + _bias_avx512 = combine8x2_ps(_bias_avx, _bias_avx); +#endif // __AVX512F__ +#endif // __AVX__ } } +#endif // __SSE2__ - if (dims == 2) + int i = 0; +#if __SSE2__ +#if __AVX__ + for (; i + 15 < size; i += 16) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int out_elempack = opt.use_packing_layout && h * elempack % 8 == 0 ? 
8 : 1; - int outh = h * elempack / out_elempack; - - top_blob.create(w, outh, (size_t)out_elempack, out_elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (out_elempack == 8) - { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) - { - const int* intptr0 = bottom_blob.row(i * 2); - const int* intptr1 = bottom_blob.row(i * 2 + 1); - signed char* ptr = top_blob.row(i); - - __m128 _scale_in0 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8); - __m128 _scale_in1 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8 + 4); - __m128 _scale_out0 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8); - __m128 _scale_out1 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8 + 4); - - for (int j = 0; j < w; j++) - { - __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr0)); - __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr1)); - _v0 = _mm_mul_ps(_v0, _scale_in0); - _v1 = _mm_mul_ps(_v1, _scale_in1); - _v0 = activation_sse(_v0, activation_type, activation_params); - _v1 = activation_sse(_v1, activation_type, activation_params); - _v0 = _mm_mul_ps(_v0, _scale_out0); - _v1 = _mm_mul_ps(_v1, _scale_out1); - *(int64_t*)ptr = float2int8_sse(_v0, _v1); - - intptr0 += 4; - intptr1 += 4; - ptr += 8; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outh; i++) - { - const int* intptr0 = bottom_blob.row(i * 2); - const int* intptr1 = bottom_blob.row(i * 2 + 1); - signed char* ptr = top_blob.row(i); - - __m128 _scale_in0 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8); - __m128 _scale_in1 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8 + 4); - __m128 _scale_out0 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8); - __m128 _scale_out1 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8 + 4); - __m128 _bias0 = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + i * 8); - __m128 _bias1 = bias_data_size == 1 ? 
_mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + i * 8 + 4); - - for (int j = 0; j < w; j++) - { - __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr0)); - __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr1)); - _v0 = _mm_add_ps(_bias0, _mm_mul_ps(_v0, _scale_in0)); - _v1 = _mm_add_ps(_bias1, _mm_mul_ps(_v1, _scale_in1)); - _v0 = activation_sse(_v0, activation_type, activation_params); - _v1 = activation_sse(_v1, activation_type, activation_params); - _v0 = _mm_mul_ps(_v0, _scale_out0); - _v1 = _mm_mul_ps(_v1, _scale_out1); - *(int64_t*)ptr = float2int8_sse(_v0, _v1); - - intptr0 += 4; - intptr1 += 4; - ptr += 8; - } - } - } - } - if (out_elempack == 1) - { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - signed char* ptr0 = top_blob.row(i * 4); - signed char* ptr1 = top_blob.row(i * 4 + 1); - signed char* ptr2 = top_blob.row(i * 4 + 2); - signed char* ptr3 = top_blob.row(i * 4 + 3); - - __m128 _scale_in = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 4); - __m128 _scale_out = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 4); - - for (int j = 0; j < w; j++) - { - __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); - _v = _mm_mul_ps(_v, _scale_in); - _v = activation_sse(_v, activation_type, activation_params); - _v = _mm_mul_ps(_v, _scale_out); - int64_t v = float2int8_sse(_v, _v); - ptr0[0] = (v >> 32) & 0xff; - ptr1[0] = (v >> 40) & 0xff; - ptr2[0] = (v >> 48) & 0xff; - ptr3[0] = (v >> 56) & 0xff; - - intptr += 4; - ptr0 += 1; - ptr1 += 1; - ptr2 += 1; - ptr3 += 1; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - signed char* ptr0 = top_blob.row(i * 4); - signed char* ptr1 = top_blob.row(i * 4 + 1); - signed char* ptr2 = top_blob.row(i * 4 + 2); - signed char* ptr3 = top_blob.row(i * 4 + 3); - - __m128 _scale_in = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 4); - __m128 _scale_out = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 4); - __m128 _bias = bias_data_size == 1 ? 
_mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + i * 4); - - for (int j = 0; j < w; j++) - { - __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); - _v = _mm_add_ps(_bias, _mm_mul_ps(_v, _scale_in)); - _v = activation_sse(_v, activation_type, activation_params); - _v = _mm_mul_ps(_v, _scale_out); - int64_t v = float2int8_sse(_v, _v); - ptr0[0] = (v >> 32) & 0xff; - ptr1[0] = (v >> 40) & 0xff; - ptr2[0] = (v >> 48) & 0xff; - ptr3[0] = (v >> 56) & 0xff; - - intptr += 4; - ptr0 += 1; - ptr1 += 1; - ptr2 += 1; - ptr3 += 1; - } - } - } - } +#if __AVX512F__ + __m512 _v = _mm512_cvtepi32_ps(_mm512_loadu_si512((const __m512i*)intptr)); + _v = _mm512_fmadd_ps(_v, _scale_in_avx512, _bias_avx512); + _v = activation_avx512(_v, activation_type, activation_params); + _v = _mm512_mul_ps(_v, _scale_out_avx512); + _mm_storeu_si128((__m128i*)ptr, float2int8_avx512(_v)); +#else // __AVX512F__ + __m256 _v0 = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr)); + __m256 _v1 = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)(intptr + 8))); + _v0 = _mm256_comp_fmadd_ps(_v0, _scale_in_avx, _bias_avx); + _v1 = _mm256_comp_fmadd_ps(_v1, _scale_in_avx, _bias_avx); + _v0 = activation_avx(_v0, activation_type, activation_params); + _v1 = activation_avx(_v1, activation_type, activation_params); + _v0 = _mm256_mul_ps(_v0, _scale_out_avx); + _v1 = _mm256_mul_ps(_v1, _scale_out_avx); + _mm_storeu_si128((__m128i*)ptr, float2int8_avx(_v0, _v1)); +#endif // __AVX512F__ + intptr += 16; + ptr += 16; } - - if (dims == 3) +#endif // __AVX__ + for (; i + 7 < size; i += 8) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; - int out_elempack = opt.use_packing_layout && channels * elempack % 8 == 0 ? 8 : 1; - int outc = channels * elempack / out_elempack; - - top_blob.create(w, h, outc, (size_t)out_elempack, out_elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (out_elempack == 8) - { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) - { - const int* intptr0 = bottom_blob.channel(q * 2); - const int* intptr1 = bottom_blob.channel(q * 2 + 1); - signed char* ptr = top_blob.channel(q); - - __m128 _scale_in0 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + q * 8); - __m128 _scale_in1 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + q * 8 + 4); - __m128 _scale_out0 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + q * 8); - __m128 _scale_out1 = scale_out_data_size == 1 ? 
_mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + q * 8 + 4); - - for (int i = 0; i < size; i++) - { - __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr0)); - __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr1)); - _v0 = _mm_mul_ps(_v0, _scale_in0); - _v1 = _mm_mul_ps(_v1, _scale_in1); - _v0 = activation_sse(_v0, activation_type, activation_params); - _v1 = activation_sse(_v1, activation_type, activation_params); - _v0 = _mm_mul_ps(_v0, _scale_out0); - _v1 = _mm_mul_ps(_v1, _scale_out1); - *(int64_t*)ptr = float2int8_sse(_v0, _v1); - - intptr0 += 4; - intptr1 += 4; - ptr += 8; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < outc; q++) - { - const int* intptr0 = bottom_blob.channel(q * 2); - const int* intptr1 = bottom_blob.channel(q * 2 + 1); - signed char* ptr = top_blob.channel(q); - - __m128 _scale_in0 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + q * 8); - __m128 _scale_in1 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + q * 8 + 4); - __m128 _scale_out0 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + q * 8); - __m128 _scale_out1 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + q * 8 + 4); - __m128 _bias0 = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + q * 8); - __m128 _bias1 = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + q * 8 + 4); - - for (int i = 0; i < size; i++) - { - __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr0)); - __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr1)); - _v0 = _mm_add_ps(_bias0, _mm_mul_ps(_v0, _scale_in0)); - _v1 = _mm_add_ps(_bias1, _mm_mul_ps(_v1, _scale_in1)); - _v0 = activation_sse(_v0, activation_type, activation_params); - _v1 = activation_sse(_v1, activation_type, activation_params); - _v0 = _mm_mul_ps(_v0, _scale_out0); - _v1 = _mm_mul_ps(_v1, _scale_out1); - *(int64_t*)ptr = float2int8_sse(_v0, _v1); - - intptr0 += 4; - intptr1 += 4; - ptr += 8; - } - } - } - } - if (out_elempack == 1) - { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - signed char* ptr0 = top_blob.channel(q * 4); - signed char* ptr1 = top_blob.channel(q * 4 + 1); - signed char* ptr2 = top_blob.channel(q * 4 + 2); - signed char* ptr3 = top_blob.channel(q * 4 + 3); - - __m128 _scale_in = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + q * 4); - __m128 _scale_out = scale_out_data_size == 1 ? 
_mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + q * 4); - - for (int i = 0; i < size; i++) - { - __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); - _v = _mm_mul_ps(_v, _scale_in); - _v = activation_sse(_v, activation_type, activation_params); - _v = _mm_mul_ps(_v, _scale_out); - int32_t v = float2int8_sse(_v); - ptr0[0] = (v >> 0) & 0xff; - ptr1[0] = (v >> 8) & 0xff; - ptr2[0] = (v >> 16) & 0xff; - ptr3[0] = (v >> 24) & 0xff; - - intptr += 4; - ptr0 += 1; - ptr1 += 1; - ptr2 += 1; - ptr3 += 1; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - signed char* ptr0 = top_blob.channel(q * 4); - signed char* ptr1 = top_blob.channel(q * 4 + 1); - signed char* ptr2 = top_blob.channel(q * 4 + 2); - signed char* ptr3 = top_blob.channel(q * 4 + 3); - - __m128 _scale_in = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + q * 4); - __m128 _scale_out = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + q * 4); - __m128 _bias = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + q * 4); - - for (int i = 0; i < size; i++) - { - __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); - _v = _mm_add_ps(_bias, _mm_mul_ps(_v, _scale_in)); - _v = activation_sse(_v, activation_type, activation_params); - _v = _mm_mul_ps(_v, _scale_out); - int32_t v = float2int8_sse(_v); - ptr0[0] = (v >> 0) & 0xff; - ptr1[0] = (v >> 8) & 0xff; - ptr2[0] = (v >> 16) & 0xff; - ptr3[0] = (v >> 24) & 0xff; - - intptr += 4; - ptr0 += 1; - ptr1 += 1; - ptr2 += 1; - ptr3 += 1; - } - } - } - } +#if __AVX__ + __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr)); + _v = _mm256_comp_fmadd_ps(_v, _scale_in_avx, _bias_avx); + _v = activation_avx(_v, activation_type, activation_params); + _v = _mm256_mul_ps(_v, _scale_out_avx); + *(int64_t*)ptr = float2int8_avx(_v); +#else // __AVX__ + __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); + __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4))); + _v0 = _mm_comp_fmadd_ps(_v0, _scale_in, _bias); + _v1 = _mm_comp_fmadd_ps(_v1, _scale_in, _bias); + _v0 = activation_sse(_v0, activation_type, activation_params); + _v1 = activation_sse(_v1, activation_type, activation_params); + _v0 = _mm_mul_ps(_v0, _scale_out); + _v1 = _mm_mul_ps(_v1, _scale_out); + *(int64_t*)ptr = float2int8_sse(_v0, _v1); +#endif // __AVX__ + intptr += 8; + ptr += 8; + } + for (; i + 3 < size; i += 4) + { + __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr)); + _v = _mm_comp_fmadd_ps(_v, _scale_in, _bias); + _v = activation_sse(_v, activation_type, activation_params); + _v = _mm_mul_ps(_v, _scale_out); + int32_t v = float2int8_sse(_v); + ptr[0] = (v >> 0) & 0xff; + ptr[1] = (v >> 8) & 0xff; + ptr[2] = (v >> 16) & 0xff; + ptr[3] = (v >> 24) & 0xff; + intptr += 4; + ptr += 4; } - - return 0; - } #endif // __SSE2__ + for (; i < size; i++) + { + float v = *intptr * scale_in + bias; + v = activation_ss(v, activation_type, activation_params); + *ptr = float2int8(v * scale_out); + intptr++; + ptr++; + } + } +} + +int Requantize_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + const int dims = bottom_blob.dims; + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int channels = bottom_blob.c; 
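+ // int8 output: one byte per element, so the top blob keeps the input elempack and out_elemsize below is elempack bytes
+ // each dims branch then creates the int8 top blob and hands per-row / per-channel slices to the requantize kernels above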
+ const int elempack = bottom_blob.elempack; + const size_t out_elemsize = elempack * 1u; if (dims == 1) { - int w = bottom_blob.w; - - top_blob.create(w, (size_t)1u, opt.blob_allocator); + top_blob.create(w, out_elemsize, elempack, opt.blob_allocator); if (top_blob.empty()) return -100; - const int* intptr = bottom_blob; - signed char* ptr = top_blob; + const int wp = std::max(1, w / opt.num_threads); + const int nn_w = (w + wp - 1) / wp; - if (scale_in_data_size == 1 && scale_out_data_size == 1) + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_w; ii++) { - const float scale_in = scale_in_data[0]; - const float scale_out = scale_out_data[0]; + const int i = ii * wp; - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float v = intptr[i] * scale_in; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); - } - } - else if (bias_data_size == 1) - { - const float bias = bias_data[0]; + const int* intptr = (const int*)bottom_blob + i * elempack; + signed char* ptr = (signed char*)top_blob + i * elempack; - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float v = intptr[i] * scale_in + bias; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float v = intptr[i] * scale_in + bias_data[i]; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); - } - } - } - else if (scale_in_data_size == 1 && scale_out_data_size > 1) - { - const float scale_in = scale_in_data[0]; + // assert scale_in_data_size == 1 + // assert bias_data_size == 0 || bias_data_size == 1 + // assert scale_out_data_size == 1 - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float v = intptr[i] * scale_in; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); - } - } - else if (bias_data_size == 1) - { - const float bias = bias_data[0]; + const int size = std::min(w - i, wp) * elempack; - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float v = intptr[i] * scale_in + bias; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float v = intptr[i] * scale_in + bias_data[i]; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); - } - } - } - else if (scale_in_data_size > 1 && scale_out_data_size == 1) - { - const float scale_out = scale_out_data[0]; - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float v = intptr[i] * scale_in_data[i]; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); - } - } - else if (bias_data_size == 1) - { - const float bias = bias_data[0]; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float v = intptr[i] * scale_in_data[i] + bias; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float v = intptr[i] 
* scale_in_data[i] + bias_data[i]; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); - } - } - } - else // if (scale_in_data_size > 1 && scale_out_data_size > 1) - { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float v = intptr[i] * scale_in_data[i]; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); - } - } - else if (bias_data_size == 1) - { - const float bias = bias_data[0]; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float v = intptr[i] * scale_in_data[i] + bias; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float v = intptr[i] * scale_in_data[i] + bias_data[i]; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); - } - } + requantize(intptr, ptr, scale_in_data, bias_data, scale_out_data, activation_type, activation_params, size, 1); } } if (dims == 2) { - int w = bottom_blob.w; - int h = bottom_blob.h; - - top_blob.create(w, h, (size_t)1u, opt.blob_allocator); + top_blob.create(w, h, out_elemsize, elempack, opt.blob_allocator); if (top_blob.empty()) return -100; - if (bias_data_size == 0) + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - signed char* ptr = top_blob.row(i); + const int* intptr = bottom_blob.row(i); + signed char* ptr = top_blob.row(i); - const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[i]; - const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[i]; + const Mat scale_in_data_i = scale_in_data_size > 1 ? scale_in_data.range(i * elempack, elempack) : scale_in_data; + const Mat bias_data_i = bias_data_size > 1 ? bias_data.range(i * elempack, elempack) : bias_data; + const Mat scale_out_data_i = scale_out_data_size > 1 ? scale_out_data.range(i * elempack, elempack) : scale_out_data; - for (int j = 0; j < w; j++) - { - float v = intptr[j] * scale_in; - ptr[j] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - signed char* ptr = top_blob.row(i); - - const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[i]; - const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[i]; - const float bias = bias_data_size == 1 ? 
bias_data[0] : bias_data[i]; - - for (int j = 0; j < w; j++) - { - float v = intptr[j] * scale_in + bias; - ptr[j] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); - } - } + requantize(intptr, ptr, scale_in_data_i, bias_data_i, scale_out_data_i, activation_type, activation_params, w, elempack); } } if (dims == 3) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; - - top_blob.create(w, h, channels, (size_t)1u, opt.blob_allocator); + top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator); if (top_blob.empty()) return -100; - if (bias_data_size == 0) + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - signed char* ptr = top_blob.channel(q); - - const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[q]; - const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[q]; - - for (int i = 0; i < size; i++) - { - float v = intptr[i] * scale_in; - ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - signed char* ptr = top_blob.channel(q); + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); - const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[q]; - const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[q]; - const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[q]; + const Mat scale_in_data_q = scale_in_data_size > 1 ? scale_in_data.range(q * elempack, elempack) : scale_in_data; + const Mat bias_data_q = bias_data_size > 1 ? bias_data.range(q * elempack, elempack) : bias_data; + const Mat scale_out_data_q = scale_out_data_size > 1 ? 
scale_out_data.range(q * elempack, elempack) : scale_out_data;
-                for (int i = 0; i < size; i++)
-                {
-                    float v = intptr[i] * scale_in + bias;
-                    ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out);
-                }
-            }
+            requantize(intptr, ptr, scale_in_data_q, bias_data_q, scale_out_data_q, activation_type, activation_params, w * h, elempack);
         }
     }
diff --git a/src/layer/x86/x86_usability.h b/src/layer/x86/x86_usability.h
index 8628249e76a..4a9d2f3739a 100644
--- a/src/layer/x86/x86_usability.h
+++ b/src/layer/x86/x86_usability.h
@@ -725,7 +725,9 @@ static NCNN_FORCEINLINE int64_t float2int8_avx(const __m256& _v0)
     __m256 _v0_p5 = _mm256_or_ps(_p5, _sign);
     __m256 _v0_adj = _mm256_add_ps(_v0, _v0_p5);
     __m256i _v0_i = _mm256_cvttps_epi32(_v0_adj);
-
+#if __AVX512F__
+    __m128i _v8 = _mm256_cvtsepi32_epi8(_v0_i);
+#else // __AVX512F__
 #if __AVX2__
     __m256i _v01_s16 = _mm256_packs_epi32(_v0_i, _v0_i);
     _v01_s16 = _mm256_permute4x64_epi64(_v01_s16, 0xd8);
@@ -742,6 +744,7 @@ static NCNN_FORCEINLINE int64_t float2int8_avx(const __m256& _v0)
     _v01_s16low = _mm_max_epi16(_v01_s16low, _mm_set1_epi16(-127));

     __m128i _v8 = _mm_packs_epi16(_v01_s16low, _v01_s16low);
+#endif // __AVX512F__

 #if defined(__x86_64__) || defined(_M_X64)
     return _mm_cvtsi128_si64(_v8);
@@ -1454,7 +1457,7 @@ static NCNN_FORCEINLINE __m128i float2int8_avx512(const __m512& _v0)
     __m512 _v0_p5 = _mm512_or_ps(_p5, _sign);
     __m512 _v0_adj = _mm512_add_ps(_v0, _v0_p5);
     __m512i _v0_i = _mm512_cvttps_epi32(_v0_adj);
-    return _mm512_cvtepi32_epi8(_v0_i);
+    return _mm512_cvtsepi32_epi8(_v0_i);
 }

 static NCNN_FORCEINLINE __m512 bfloat2float_avx512(const __m256i& v0)
diff --git a/tests/test_requantize.cpp b/tests/test_requantize.cpp
index 1032d529ea6..3e4fe148828 100644
--- a/tests/test_requantize.cpp
+++ b/tests/test_requantize.cpp
@@ -185,42 +185,12 @@ static int test_requantize_1()
 static int test_requantize_2()
 {
     return 0
-           || test_requantize(RandomIntMat(128), 1, 1, 128)
            || test_requantize(RandomIntMat(128), 1, 1, 1)
            || test_requantize(RandomIntMat(128), 1, 1, 0)
-           || test_requantize(RandomIntMat(128), 128, 128, 128)
-           || test_requantize(RandomIntMat(128), 128, 128, 1)
-           || test_requantize(RandomIntMat(128), 128, 128, 0)
-           || test_requantize(RandomIntMat(128), 1, 128, 128)
-           || test_requantize(RandomIntMat(128), 1, 128, 1)
-           || test_requantize(RandomIntMat(128), 1, 128, 0)
-           || test_requantize(RandomIntMat(128), 128, 1, 128)
-           || test_requantize(RandomIntMat(128), 128, 1, 1)
-           || test_requantize(RandomIntMat(128), 128, 1, 0)
-           || test_requantize(RandomIntMat(124), 1, 1, 124)
            || test_requantize(RandomIntMat(124), 1, 1, 1)
            || test_requantize(RandomIntMat(124), 1, 1, 0)
-           || test_requantize(RandomIntMat(124), 124, 124, 124)
-           || test_requantize(RandomIntMat(124), 124, 124, 1)
-           || test_requantize(RandomIntMat(124), 124, 124, 0)
-           || test_requantize(RandomIntMat(124), 1, 124, 124)
-           || test_requantize(RandomIntMat(124), 1, 124, 1)
-           || test_requantize(RandomIntMat(124), 1, 124, 0)
-           || test_requantize(RandomIntMat(124), 124, 1, 124)
-           || test_requantize(RandomIntMat(124), 124, 1, 1)
-           || test_requantize(RandomIntMat(124), 124, 1, 0)
-           || test_requantize(RandomIntMat(127), 1, 1, 127)
            || test_requantize(RandomIntMat(127), 1, 1, 1)
-           || test_requantize(RandomIntMat(127), 1, 1, 0)
-           || test_requantize(RandomIntMat(127), 127, 127, 127)
-           || test_requantize(RandomIntMat(127), 127, 127, 1)
-           || test_requantize(RandomIntMat(127), 127, 127, 0)
-           || test_requantize(RandomIntMat(127), 1, 127, 127)
-           || test_requantize(RandomIntMat(127), 1, 127, 1)
-           || test_requantize(RandomIntMat(127), 1, 127, 0)
-           || test_requantize(RandomIntMat(127), 127, 1, 127)
-           || test_requantize(RandomIntMat(127), 127, 1, 1)
-           || test_requantize(RandomIntMat(127), 127, 1, 0);
+           || test_requantize(RandomIntMat(127), 1, 1, 0);
 }

 static int test_requantize_3()
@@ -250,18 +220,8 @@ static int test_requantize_3()
            || test_requantize_pack8(RandomIntMat(15, 24), 24, 1, 24)
            || test_requantize_pack8(RandomIntMat(15, 24), 24, 1, 1)
            || test_requantize_pack8(RandomIntMat(15, 24), 24, 1, 0)
-           || test_requantize_pack8(RandomIntMat(128), 1, 1, 128)
            || test_requantize_pack8(RandomIntMat(128), 1, 1, 1)
-           || test_requantize_pack8(RandomIntMat(128), 1, 1, 0)
-           || test_requantize_pack8(RandomIntMat(128), 128, 128, 128)
-           || test_requantize_pack8(RandomIntMat(128), 128, 128, 1)
-           || test_requantize_pack8(RandomIntMat(128), 128, 128, 0)
-           || test_requantize_pack8(RandomIntMat(128), 1, 128, 128)
-           || test_requantize_pack8(RandomIntMat(128), 1, 128, 1)
-           || test_requantize_pack8(RandomIntMat(128), 1, 128, 0)
-           || test_requantize_pack8(RandomIntMat(128), 128, 1, 128)
-           || test_requantize_pack8(RandomIntMat(128), 128, 1, 1)
-           || test_requantize_pack8(RandomIntMat(128), 128, 1, 0);
+           || test_requantize_pack8(RandomIntMat(128), 1, 1, 0);
 }

 int main()