Merge pull request #45 from tum-ei-eda/sync

Complete Sync with CMSIS-NN
tum-ei-eda · Jun 24, 2024 · 76a5065 · 76a5065
2 parents 0021d3f + 3db1e13
commit 76a5065
Show file tree

Hide file tree

Showing 8 changed files with 2,212 additions and 6 deletions.
diff --git a/Include/CMSIS/NN/Include/arm_nnfunctions.h b/Include/CMSIS/NN/Include/arm_nnfunctions.h
@@ -46,6 +46,7 @@ extern "C" {
 #define arm_convolve_wrapper_s16_get_buffer_size_dsp muriscv_nn_convolve_wrapper_s16_get_buffer_size_dsp
 #define arm_convolve_wrapper_s16_get_buffer_size_mve muriscv_nn_convolve_wrapper_s16_get_buffer_size_mve
 #define arm_convolve_s4 muriscv_nn_convolve_s4
+#define arm_convolve_even_s4 muriscv_nn_convolve_even_s4
 #define arm_convolve_s8 muriscv_nn_convolve_s8
 #define arm_convolve_s4_get_buffer_size muriscv_nn_convolve_s4_get_buffer_size
 #define arm_convolve_s8_get_buffer_size muriscv_nn_convolve_s8_get_buffer_size

diff --git a/Include/muriscv_nn_functions.h b/Include/muriscv_nn_functions.h
@@ -22,8 +22,8 @@
  * Title:        muriscv_nn_functions.h
  * Description:  Public header file for MURISCV NN Library
  *
- * $Date:        23 April 2024
- * $Revision:    V.16.0.0
+ * $Date:        04 Jun 2024
+ * $Revision:    V.16.1.0
  *
  * Target :  Arm(R) M-Profile Architecture
  * -------------------------------------------------------------------- */
@@ -361,6 +361,48 @@ muriscv_nn_status muriscv_nn_convolve_s4(const muriscv_nn_context *ctx,
                                     const int32_t *bias_data,
                                     const muriscv_nn_dims *output_dims,
                                     int8_t *output_data);
+
+/**
+ * @brief Basic s4 convolution function with a requirement of even number of kernels.
+ * @param[in, out] ctx            Function context that contains the additional buffer if required by the function.
+ *                                muriscv_nn_convolve_s4_get_buffer_size will return the buffer_size if required.
+ *                                The caller is expected to clear the buffer, if applicable, for security reasons.
+ * @param[in]      conv_params    Convolution parameters (e.g. strides, dilations, pads,...).
+ *                                Range of conv_params->input_offset  : [-127, 128]
+ *                                Range of conv_params->output_offset : [-128, 127]
+ * @param[in]      quant_params   Per-channel quantization info.
+ *                                It contains the multiplier and shift values to be applied to each output channel
+ * @param[in]      input_dims     Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
+ * @param[in]      input_data     Input (activation) data pointer. Data type: int8
+ * @param[in]      filter_dims    Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the
+ *                                spatial filter dimensions. Note the product must be even.
+ * @param[in]      filter_data    Packed Filter data pointer. Data type: int8 packed with 2x int4
+ * @param[in]      bias_dims      Bias tensor dimensions. Format: [C_OUT]
+ * @param[in]      bias_data      Optional bias data pointer. Data type: int32
+ * @param[in]      output_dims    Output tensor dimensions. Format: [N, H, W, C_OUT]
+ * @param[out]     output_data    Output data pointer. Data type: int8
+ *
+ * @return     The function returns <code>MURISCV_NN_SUCCESS</code> if successful or
+ *                                  <code>MURISCV_NN_ARG_ERROR</code> if incorrect arguments or
+ *                                  <code>MURISCV_NN_NO_IMPL_ERROR</code> if no optimized implementation is available
+ *
+ * @details
+ *    1. Supported framework: TensorFlow Lite micro
+ *    2. Additional memory is required for optimization. Refer to argument 'ctx' for details.
+ *
+ */
+muriscv_nn_status muriscv_nn_convolve_even_s4(const muriscv_nn_context *ctx,
+                                         const muriscv_nn_conv_params *conv_params,
+                                         const muriscv_nn_per_channel_quant_params *quant_params,
+                                         const muriscv_nn_dims *input_dims,
+                                         const int8_t *input_data,
+                                         const muriscv_nn_dims *filter_dims,
+                                         const int8_t *filter_data,
+                                         const muriscv_nn_dims *bias_dims,
+                                         const int32_t *bias_data,
+                                         const muriscv_nn_dims *output_dims,
+                                         int8_t *output_data);
+
 /**
  * @brief Basic s8 convolution function
  * @param[in, out] ctx            Function context that contains the additional buffer if required by the function.

diff --git a/Include/muriscv_nn_support_functions.h b/Include/muriscv_nn_support_functions.h
@@ -22,8 +22,8 @@
  * Title:        muriscv_nn_support_functions.h
  * Description:  Public header file of support functions for MURISCV NN Library
  *
- * $Date:        30 April 2024
- * $Revision:    V.22.0.0
+ * $Date:        27 May 2024
+ * $Revision:    V.22.1.0
  *
  * Target :  Arm(R) M-Profile Architecture
  * -------------------------------------------------------------------- */
@@ -604,6 +604,55 @@ muriscv_nn_status muriscv_nn_mat_mult_nt_t_s4(const int8_t *lhs,
                                             const int32_t activation_max,
                                             const int32_t lhs_cols_offset);
 
+/**
+ * @brief General Matrix-multiplication function with per-channel requantization.
+ *        This function assumes:
+ *        - LHS input matrix NOT transposed (nt)
+ *        - RHS input matrix transposed (t)
+ *        - RHS is int8 packed with 2x int4
+ *        - LHS is int8
+ *        - LHS/RHS input columns must be even numbered
+ *        - LHS must be interleaved. Compare to muriscv_nn_mat_mult_nt_t_s4 where LHS is not interleaved.
+ *
+ *  @note This operation also performs the broadcast bias addition before the requantization
+ *
+ * @param[in]  lhs                Pointer to the LHS input matrix
+ * @param[in]  rhs                Pointer to the RHS input matrix
+ * @param[in]  bias               Pointer to the bias vector. The length of this vector is equal to the number of
+ *                                output columns (or RHS input rows)
+ * @param[out] dst                Pointer to the output matrix with "m" rows and "n" columns
+ * @param[in]  dst_multipliers    Pointer to the multipliers vector needed for the per-channel requantization.
+ *                                The length of this vector is equal to the number of output columns (or RHS input
+ *                                rows)
+ * @param[in]  dst_shifts         Pointer to the shifts vector needed for the per-channel requantization. The length
+ *                                of this vector is equal to the number of output columns (or RHS input rows)
+ * @param[in]  lhs_rows           Number of LHS input rows
+ * @param[in]  rhs_rows           Number of RHS input rows
+ * @param[in]  rhs_cols           Number of LHS/RHS input columns. Note this must be even.
+ * @param[in]  lhs_offset         Offset to be applied to the LHS input value
+ * @param[in]  dst_offset         Offset to be applied to the output result
+ * @param[in]  activation_min     Minimum value to clamp down the output. Range : int8
+ * @param[in]  activation_max     Maximum value to clamp up the output. Range : int8
+ * @param[in]  lhs_cols_offset    Column offset between subsequent lhs_rows
+ *
+ * @return     The function returns <code>MURISCV_NN_SUCCESS</code>
+ *
+ */
+muriscv_nn_status muriscv_nn_mat_mult_nt_interleaved_t_even_s4(const int8_t *lhs,
+                                                             const int8_t *rhs,
+                                                             const int32_t *bias,
+                                                             int8_t *dst,
+                                                             const int32_t *dst_multipliers,
+                                                             const int32_t *dst_shifts,
+                                                             const int32_t lhs_rows,
+                                                             const int32_t rhs_rows,
+                                                             const int32_t rhs_cols,
+                                                             const int32_t lhs_offset,
+                                                             const int32_t dst_offset,
+                                                             const int32_t activation_min,
+                                                             const int32_t activation_max,
+                                                             const int32_t lhs_cols_offset);
+
 /**
  * @brief General Matrix-multiplication function with per-channel requantization.
  *        This function assumes:

diff --git a/Integration/tvm/setup_tvm.sh b/Integration/tvm/setup_tvm.sh
@@ -32,6 +32,7 @@ BUILDS=(mlf mlf_vext mlf_pext)
 echo "Download and install TVM sources."
 python3 -m venv .venv
 source .venv/bin/activate
+pip install numpy==1.26.4
 pip install apache-tvm
 pip install -r requirements.txt
 pip install typing-extensions

diff --git a/Source/ConvolutionFunctions/CMakeLists.txt b/Source/ConvolutionFunctions/CMakeLists.txt
@@ -54,4 +54,5 @@ target_sources(${MURISCVNN_LIB} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/muriscv_nn_c
                                         ${CMAKE_CURRENT_SOURCE_DIR}/muriscv_nn_convolve_wrapper_s4.c
                                         ${CMAKE_CURRENT_SOURCE_DIR}/muriscv_nn_mat_mult_kernel_row_offset_s8_s16.c
                                         ${CMAKE_CURRENT_SOURCE_DIR}/muriscv_nn_mat_mult_kernel_s16.c
-                                        ${CMAKE_CURRENT_SOURCE_DIR}/muriscv_nn_convolve_1_x_n_s4.c)
+                                        ${CMAKE_CURRENT_SOURCE_DIR}/muriscv_nn_convolve_1_x_n_s4.c
+                                        ${CMAKE_CURRENT_SOURCE_DIR}/muriscv_nn_convolve_even_s4.c)
diff --git a/Source/ConvolutionFunctions/muriscv_nn_convolve_even_s4.c b/Source/ConvolutionFunctions/muriscv_nn_convolve_even_s4.c
@@ -0,0 +1,231 @@
+// Modifications copyright (C) 2024 Chair of Electronic Design Automation, TUM
+/*
+ * SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates <[email protected]>
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS NN Library
+ * Title:        muriscv_nn_convolve_even_s4.c
+ * Description:  s8 version of convolution using symmetric quantization with 4 bit weights.
+ *
+ * $Date:        05 Jun 2024
+ * $Revision:    V.1.0.0
+ *
+ * Target :  Arm(R) M-Profile Architecture
+ *
+ * -------------------------------------------------------------------- */
+
+#include "muriscv_nn_functions.h"
+#include "muriscv_nn_support_functions.h"
+
+/**
+ *  @ingroup Public
+ */
+
+/**
+ * @addtogroup NNConv
+ * @{
+ */
+
+/*
+ * Basic s8 convolution function with int4 packed RHS (weights) and even RHS columns.
+ * No implementation is enabled in this file: every call returns MURISCV_NN_NO_IMPL_ERROR.
+ * Refer to the header file for details.
+ *
+ */
+muriscv_nn_status muriscv_nn_convolve_even_s4(const muriscv_nn_context *ctx,
+                                         const muriscv_nn_conv_params *conv_params,
+                                         const muriscv_nn_per_channel_quant_params *quant_params,
+                                         const muriscv_nn_dims *input_dims,
+                                         const int8_t *input_data,
+                                         const muriscv_nn_dims *filter_dims,
+                                         const int8_t *packed_filter_data,
+                                         const muriscv_nn_dims *bias_dims,
+                                         const int32_t *bias_data,
+                                         const muriscv_nn_dims *output_dims,
+                                         int8_t *output_data)
+{
+    (void)bias_dims;
+    /* NOTE(review): the USE_VEXT path below is commented out; it still uses Arm intrinsics (vld2q_s8, vstrbq_s8). */
+//#if defined(USE_VEXT)
+//
+//    if (ctx->buf == NULL)
+//    {
+//        return MURISCV_NN_ARG_ERROR;
+//    }
+//
+//    int16_t *buffer_a = (int16_t *)ctx->buf;
+//
+//    const int32_t input_batches = input_dims->n;
+//    const uint16_t input_x = input_dims->w;
+//    const uint16_t input_y = input_dims->h;
+//    const uint16_t input_ch = input_dims->c;
+//    const uint16_t kernel_x = filter_dims->w;
+//    const uint16_t kernel_y = filter_dims->h;
+//    const uint16_t output_x = output_dims->w;
+//    const uint16_t output_y = output_dims->h;
+//    const uint16_t output_ch = output_dims->c;
+//
+//    const uint16_t pad_x = conv_params->padding.w;
+//    const uint16_t pad_y = conv_params->padding.h;
+//    const uint16_t stride_x = conv_params->stride.w;
+//    const uint16_t stride_y = conv_params->stride.h;
+//    const int32_t dilation_x = conv_params->dilation.w;
+//    const int32_t dilation_y = conv_params->dilation.h;
+//    const int32_t out_offset = conv_params->output_offset;
+//    const int32_t out_activation_min = conv_params->activation.min;
+//    const int32_t out_activation_max = conv_params->activation.max;
+//    const int32_t rhs_cols = kernel_x * kernel_y * input_ch;
+//    const int32_t input_offset = conv_params->input_offset;
+//
+//    if (rhs_cols & 0x1)
+//    {
+//        return MURISCV_NN_ARG_ERROR;
+//    }
+//
+//    const int32_t blk_cnt = rhs_cols >> 5;
+//
+//    int32_t *output_mult = quant_params->multiplier;
+//    int32_t *output_shift = quant_params->shift;
+//
+//    int i_batch;
+//
+//    for (i_batch = 0; i_batch < input_batches; i_batch++)
+//    {
+//        /* Generate up to four columns from the input tensor for a GEMM computation */
+//        int8_t *im2col_buf = (int8_t *)buffer_a;
+//        const int32_t rhs_rows = output_dims->c;
+//        int8_t *out = output_data;
+//        int32_t lhs_rows = 0;
+//
+//        /* This part implements the im2col function */
+//        for (int i_out_y = 0; i_out_y < output_y; i_out_y++)
+//        {
+//            for (int i_out_x = 0; i_out_x < output_x; i_out_x++)
+//            {
+//                const int32_t base_idx_x = stride_x * i_out_x - pad_x;
+//                const int32_t base_idx_y = stride_y * i_out_y - pad_y;
+//
+//                for (int32_t i_ker_y = 0; i_ker_y < kernel_y; i_ker_y++)
+//                {
+//                    for (int32_t i_ker_x = 0; i_ker_x < kernel_x; i_ker_x++)
+//                    {
+//                        const int32_t k_y = base_idx_y + dilation_y * i_ker_y;
+//                        const int32_t k_x = base_idx_x + dilation_x * i_ker_x;
+//
+//                        if (k_y < 0 || k_y >= input_y || k_x < 0 || k_x >= input_x)
+//                        {
+//                            muriscv_nn_memset_s8(im2col_buf, (int8_t)-input_offset, sizeof(int8_t) * input_ch);
+//                        }
+//                        else
+//                        {
+//                            muriscv_nn_memcpy_s8(im2col_buf, input_data + (k_y * input_x + k_x) * input_ch, input_ch);
+//                        }
+//                        im2col_buf += input_ch;
+//                    }
+//                }
+//
+//                /* Reformat most of the buffer by interleaving it */
+//                int8_t *im2col_buf_interleaved = (int8_t *)buffer_a + lhs_rows * rhs_cols;
+//                for (int j = blk_cnt; j > 0; --j)
+//                {
+//                    int8x16x2_t x2 = vld2q_s8(im2col_buf_interleaved);
+//
+//                    vstrbq_s8(im2col_buf_interleaved, x2.val[1]);
+//                    im2col_buf_interleaved += 16;
+//
+//                    vstrbq_s8(im2col_buf_interleaved, x2.val[0]);
+//                    im2col_buf_interleaved += 16;
+//                }
+//
+//                lhs_rows++;
+//
+//                /* Computation is filed for every 4 columns */
+//                if (lhs_rows == 4)
+//                {
+//                    muriscv_nn_mat_mult_nt_interleaved_t_even_s4((int8_t *)buffer_a,
+//                                                             packed_filter_data,
+//                                                             bias_data,
+//                                                             out,
+//                                                             output_mult,
+//                                                             output_shift,
+//                                                             lhs_rows,
+//                                                             rhs_rows,
+//                                                             rhs_cols,
+//                                                             input_offset,
+//                                                             out_offset,
+//                                                             out_activation_min,
+//                                                             out_activation_max,
+//                                                             rhs_cols);
+//
+//                    out += lhs_rows * rhs_rows;
+//
+//                    lhs_rows = 0;
+//                    im2col_buf = (int8_t *)buffer_a;
+//                }
+//            }
+//        }
+//
+//        /* Handle left over columns */
+//        if (lhs_rows != 0)
+//        {
+//            muriscv_nn_mat_mult_nt_interleaved_t_even_s4((int8_t *)buffer_a,
+//                                                     packed_filter_data,
+//                                                     bias_data,
+//                                                     out,
+//                                                     output_mult,
+//                                                     output_shift,
+//                                                     lhs_rows,
+//                                                     rhs_rows,
+//                                                     rhs_cols,
+//                                                     input_offset,
+//                                                     out_offset,
+//                                                     out_activation_min,
+//                                                     out_activation_max,
+//                                                     rhs_cols);
+//            out += lhs_rows * rhs_rows;
+//            lhs_rows = 0;
+//            im2col_buf = (int8_t *)buffer_a;
+//        }
+//
+//        /* Advance to the next batch */
+//        input_data += (input_x * input_y * input_ch);
+//        output_data += (output_x * output_y * output_ch);
+//    }
+//#else
+    (void)ctx;
+    (void)conv_params;
+    (void)quant_params;
+    (void)input_dims;
+    (void)input_data;
+    (void)filter_dims;
+    (void)packed_filter_data;
+    (void)bias_data;
+    (void)output_dims;
+    (void)output_data;
+
+    return MURISCV_NN_NO_IMPL_ERROR;
+
+//#endif // #if defined(USE_VEXT)
+//
+//    /* Return to application */
+//    return MURISCV_NN_SUCCESS;
+}
+
+/**
+ * @} end of NNConv group
+ */
diff --git a/Source/NNSupportFunctions/CMakeLists.txt b/Source/NNSupportFunctions/CMakeLists.txt
@@ -41,4 +41,5 @@ target_sources(${MURISCVNN_LIB} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/muriscv_nn_d
                                         ${CMAKE_CURRENT_SOURCE_DIR}/muriscv_nn_vec_mat_mul_result_acc_s16.c
                                         ${CMAKE_CURRENT_SOURCE_DIR}/muriscv_nn_lstm_step_s16.c
                                         ${CMAKE_CURRENT_SOURCE_DIR}/muriscv_nn_mat_mul_core_1x_s4.c
-                                        ${CMAKE_CURRENT_SOURCE_DIR}/muriscv_nn_depthwise_conv_nt_t_s4.c)
+                                        ${CMAKE_CURRENT_SOURCE_DIR}/muriscv_nn_depthwise_conv_nt_t_s4.c
+                                        ${CMAKE_CURRENT_SOURCE_DIR}/muriscv_nn_mat_mult_nt_interleaved_t_even_s4.c)