diff --git a/src/cpu/x64/gemm/f32/jit_avx512_core_gemm_smalln_tn_f32_kern.cpp b/src/cpu/x64/gemm/f32/jit_avx512_core_gemm_smalln_tn_f32_kern.cpp index a79c6eed0d5..b5bd7cf7000 100644 --- a/src/cpu/x64/gemm/f32/jit_avx512_core_gemm_smalln_tn_f32_kern.cpp +++ b/src/cpu/x64/gemm/f32/jit_avx512_core_gemm_smalln_tn_f32_kern.cpp @@ -589,7 +589,7 @@ struct xbyak_gemm_smalln_tn : public jit_generator { int endval = (MROW < 5) ? MROW : 5; for (int ii = 8; ii < 8 + endval; ii++) { // Storing A values in zmm_reg[8-12] - vmovups(zmm_reg[ii] | (krem ? k_rem : k0), ptr[AO2]); + vmovups(zmm_reg[ii] | (krem ? k_rem : k0) | T_z, ptr[AO2]); add(AO2, LDA); } for (int ii = 0; ii < endval; ii++) { @@ -614,7 +614,7 @@ struct xbyak_gemm_smalln_tn : public jit_generator { ? 8 : MROW; // Do not process more than 8 rows here. for (int ii = 0; ii < MROW2; ii++) { - vmovups(zmm_reg[ii] | (krem ? k_rem : k0), ptr[AO2]); + vmovups(zmm_reg[ii] | (krem ? k_rem : k0) | T_z, ptr[AO2]); add(AO2, LDA); } for (int ii = 0; ii < MROW2; ii++) { @@ -627,7 +627,7 @@ struct xbyak_gemm_smalln_tn : public jit_generator { if (MROW > 8) { vmovaps(zmm_reg[0], zmm_reg[15]); for (int ii = 8; ii < MROW; ii++) { - vmovups(zmm_reg[ii] | (krem ? k_rem : k0), ptr[AO2]); + vmovups(zmm_reg[ii] | (krem ? k_rem : k0) | T_z, ptr[AO2]); add(AO2, LDA); } for (int ii = 8; ii < MROW; ii++)