diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 5c311e3b..10c0299b 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -45,6 +45,7 @@ static const std::vector<quant_option> QUANT_OPTIONS = {
     { "IQ3_K",  LLAMA_FTYPE_MOSTLY_IQ3_K,  " 3.44 bpw non-linear quantization", },
     { "IQ4_K",  LLAMA_FTYPE_MOSTLY_IQ4_K,  " 4.5 bpw non-linear quantization", },
     { "IQ5_K",  LLAMA_FTYPE_MOSTLY_IQ5_K,  " 5.5 bpw non-linear quantization", },
+    { "IQ6_K",  LLAMA_FTYPE_MOSTLY_IQ6_K,  " 6.6 bpw non-linear quantization", },
     { "Q4_K",   LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", },
     { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 3.59G, +0.0992 ppl @ LLaMA-v1-7B", },
     { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 3.80G, +0.0532 ppl @ LLaMA-v1-7B", },
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 144e87f5..a0bcc67f 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -393,7 +393,8 @@ extern "C" {
         GGML_TYPE_IQ3_K  = 38,
         GGML_TYPE_IQ4_K  = 39,
         GGML_TYPE_IQ5_K  = 40,
-        GGML_TYPE_IQ2_TN = 41,
+        GGML_TYPE_IQ6_K  = 41,
+        GGML_TYPE_IQ2_TN = 42,
         GGML_TYPE_COUNT,
     };
@@ -444,7 +445,8 @@ extern "C" {
         GGML_FTYPE_MOSTLY_IQ3_K  = 31, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ4_K  = 32, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ5_K  = 33, // except 1d tensors
-        GGML_FTYPE_MOSTLY_IQ2_TN = 34, // except 1d tensors
+        GGML_FTYPE_MOSTLY_IQ6_K  = 34, // except 1d tensors
+        GGML_FTYPE_MOSTLY_IQ2_TN = 35, // except 1d tensors
     };

     // available tensor operations:
diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h
index 5847d903..b58e3e4c 100644
--- a/ggml/src/ggml-common.h
+++ b/ggml/src/ggml-common.h
@@ -142,6 +142,9 @@ typedef sycl::half2 ggml_half2;
 #define QI5_XS (QK_K / (4*QR5_XS))
 #define QR5_XS 2

+#define QI6_XS (QK_K / (4*QR6_XS))
+#define QR6_XS 2
+
 #define QI3_S (QK_K / (4*QR3_S))
 #define QR3_S 4
@@ -493,6 +496,15 @@ typedef struct {
 } block_iq5_k;
 static_assert(sizeof(block_iq5_k) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/2 + QK_K/8 + 3*QK_K/64, "wrong iq5_k block size/padding");

+typedef struct {
+    ggml_half d;
+    uint16_t extra;
+    int8_t  scales[QK_K/16];
+    uint8_t qs[QK_K/2];
+    uint8_t qh[QK_K/4];
+} block_iq6_k;
+static_assert(sizeof(block_iq6_k) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/2 + QK_K/4 + QK_K/16, "wrong iq6_k block size/padding");
+
 #endif // GGML_COMMON_DECL
 #endif // GGML_COMMON_DECL
@@ -1944,6 +1956,16 @@ GGML_TABLE_BEGIN(int8_t, iq5nl_values, 64)
     -124, -112, -101, -90, -81, -72, -63, -55, -48, -41, -34, -28, -22, -16, -10, -4, 1, 7, 13, 19, 25, 31, 38, 45, 53, 61, 70, 79, 89, 99, 111, 123,
 GGML_TABLE_END()
+GGML_TABLE_BEGIN(int8_t, iq6nl_values, 128)
+    -127, -121, -115, -109, -104,  -98,  -93,  -88,  -84,  -79,  -74,  -70,  -66,  -62,  -58,  -54,
+     -51,  -47,  -44,  -40,  -37,  -34,  -31,  -28,  -25,  -22,  -19,  -16,  -13,  -11,   -8,   -5,
+      -2,    0,    3,    6,    9,   12,   14,   17,   20,   23,   27,   30,   33,   36,   40,   44,
+      47,   51,   55,   59,   63,   68,   72,   77,   82,   87,   92,   98,  103,  109,  115,  121,
+    -126, -120, -114, -108, -103,  -97,  -92,  -87,  -83,  -78,  -73,  -69,  -65,  -61,  -57,  -53,
+     -50,  -46,  -43,  -39,  -36,  -33,  -30,  -27,  -24,  -21,  -18,  -15,  -12,  -10,   -7,   -4,
+      -1,    1,    4,    7,   10,   13,   15,   18,   21,   24,   28,   31,   34,   37,   41,   45,
+      48,   52,   56,   60,   64,   69,   73,   78,   83,   88,   93,   99,  104,  110,  116,  122,
+GGML_TABLE_END()
 #endif // GGML_COMMON_IMPL
 #endif // GGML_COMMON_IMPL
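As a reading aid (editor's arithmetic, not part of the patch), the "6.6 bpw" label in quantize.cpp follows directly from the block_iq6_k layout above:

    // Per 256-weight super-block: fp16 super-scale, 16 shift flags,
    // 16 block scales, plus 4 low and 2 high bits per weight.
    #include <cstdio>
    int main() {
        const int QK_K  = 256;
        const int bytes = 2            // ggml_half d
                        + 2            // uint16_t extra: one shift flag per 16-weight block
                        + QK_K/16      // int8_t scales: one per 16 weights
                        + QK_K/2       // qs: low 4 bits of each 6-bit index
                        + QK_K/4;      // qh: high 2 bits of each 6-bit index
        std::printf("%d bytes / %d weights = %.3f bpw\n", bytes, QK_K, 8.0*bytes/QK_K);
        // prints: 212 bytes / 256 weights = 6.625 bpw
        return 0;
    }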
diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index a115a1b4..7641d5b5 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -2757,6 +2757,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
         case GGML_TYPE_IQ3_K:
         case GGML_TYPE_IQ4_K:
         case GGML_TYPE_IQ5_K:
+        case GGML_TYPE_IQ6_K:
         case GGML_TYPE_IQ1_BN:
         case GGML_TYPE_IQ2_BN:
         case GGML_TYPE_IQ2_TN:
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index c18e865a..07a53bcd 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -704,6 +704,13 @@ struct ggml_cuda_type_traits<GGML_TYPE_IQ5_K> {
     static constexpr int qi = QI5_XS;
 };

+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_IQ6_K> {
+    static constexpr int qk = QK_K;
+    static constexpr int qr = QR6_XS;
+    static constexpr int qi = QI6_XS;
+};
+
 template<>
 struct ggml_cuda_type_traits {
     static constexpr int qk = QK_K;
diff --git a/ggml/src/ggml-cuda/convert.cu b/ggml/src/ggml-cuda/convert.cu
index 47ab92f0..f76c80dc 100644
--- a/ggml/src/ggml-cuda/convert.cu
+++ b/ggml/src/ggml-cuda/convert.cu
@@ -591,6 +591,37 @@ static __global__ void dequantize_block_iq5_k(const void * __restrict__ vx, dst_t * __restrict__ yy) {
     }
 }

+template<typename dst_t>
+static __global__ void dequantize_block_iq6_k(const void * __restrict__ vx, dst_t * __restrict__ yy) {
+
+    const int i = blockIdx.x;
+    const block_iq6_k * x = (const block_iq6_k *) vx;
+
+    const int tid = threadIdx.x;
+    int ib64 = tid/8; // 0...3
+    int il   = tid%8; // 0...7
+    dst_t * y = yy + i*QK_K + 64*ib64 + 2*il;
+    const float d = (float)x[i].d;
+    const float dl1 = d * x[i].scales[4*ib64+0];
+    const float dl2 = d * x[i].scales[4*ib64+1];
+    const float dl3 = d * x[i].scales[4*ib64+2];
+    const float dl4 = d * x[i].scales[4*ib64+3];
+    const uint8_t * qs = x[i].qs + 32*ib64 + 2*il;
+    const uint8_t * qh = x[i].qh + 32*(ib64/2) + 2*il;
+    const uint8_t extra = x[i].extra >> 4*(ib64%4);
+    for (int j = 0; j < 2; ++j) {
+        const uint8_t h1 = qh[j] >> 4*(ib64%2), h2 = qh[j+16] >> 4*(ib64%2);
+        uint8_t q1 = (qs[j+ 0] & 0xf) | ((h1 & 0x03) << 4);
+        uint8_t q2 = (qs[j+16] & 0xf) | ((h2 & 0x03) << 4);
+        uint8_t q3 = (qs[j+ 0] >> 4) | ((h1 & 0x0c) << 2);
+        uint8_t q4 = (qs[j+16] >> 4) | ((h2 & 0x0c) << 2);
+        y[j+ 0] = dl1 * (iq6nl_values[q1] + (extra & 1 ? 1 : 0));
+        y[j+16] = dl2 * (iq6nl_values[q2] + (extra & 2 ? 1 : 0));
+        y[j+32] = dl3 * (iq6nl_values[q3] + (extra & 4 ? 1 : 0));
+        y[j+48] = dl4 * (iq6nl_values[q4] + (extra & 8 ? 1 : 0));
+    }
+}
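To make the qs/qh packing explicit, here is a scalar equivalent of the kernel above for a single weight (editor's sketch under the block_iq6_k layout from ggml-common.h; iq6_k_weight is an illustrative name, not a function in the tree):

    // Dequantize weight idx (0..255) of one block_iq6_k: the 6-bit codebook
    // index is a low nibble from qs plus two high bits from qh.
    static float iq6_k_weight(const block_iq6_k & b, int idx) {
        const int ib64 = idx / 64;        // which group of 64 weights
        const int row  = (idx % 64) / 16; // 0..3: 16-weight sub-block inside the group
        const int col  = idx % 16;        // byte column inside qs/qh
        const uint8_t q4 = b.qs[32*ib64 + 16*(row%2) + col];
        const uint8_t hb = b.qh[32*(ib64/2) + 16*(row%2) + col] >> 4*(ib64%2);
        const uint8_t lo = row < 2 ? (q4 & 0xf) : (q4 >> 4);              // 4 low bits
        const uint8_t hi = row < 2 ? (hb & 0x03) << 4 : (hb & 0x0c) << 2; // 2 high bits
        const int sb    = 4*ib64 + row;            // sub-block index, 0..15
        const int shift = (b.extra >> sb) & 1;     // optional +1 on the codebook
        return GGML_FP16_TO_FP32(b.d) * b.scales[sb] * (iq6nl_values[lo | hi] + shift);
    }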
 template<typename dst_t>
 static __global__ void dequantize_block_iq2_k(const void * __restrict__ vx, dst_t * __restrict__ yy) {
@@ -803,6 +834,12 @@ static void dequantize_row_iq5_k_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
     dequantize_block_iq5_k<<<nb, 32, 0, stream>>>(vx, y);
 }

+template<typename dst_t>
+static void dequantize_row_iq6_k_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
+    const int nb = (k + QK_K - 1) / QK_K;
+    dequantize_block_iq6_k<<<nb, 32, 0, stream>>>(vx, y);
+}
+
 template<typename dst_t>
 static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
     const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
@@ -877,6 +914,8 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
             return dequantize_row_iq4_k_cuda;
         case GGML_TYPE_IQ5_K:
             return dequantize_row_iq5_k_cuda;
+        case GGML_TYPE_IQ6_K:
+            return dequantize_row_iq6_k_cuda;
         case GGML_TYPE_IQ3_S:
             return dequantize_row_iq3_s_cuda;
         case GGML_TYPE_F32:
@@ -938,6 +977,8 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
             return dequantize_row_iq4_k_cuda;
         case GGML_TYPE_IQ5_K:
             return dequantize_row_iq5_k_cuda;
+        case GGML_TYPE_IQ6_K:
+            return dequantize_row_iq6_k_cuda;
         case GGML_TYPE_IQ3_S:
             return dequantize_row_iq3_s_cuda;
         case GGML_TYPE_F16:
diff --git a/ggml/src/ggml-cuda/iqk_mmvq.cu b/ggml/src/ggml-cuda/iqk_mmvq.cu
index 8def1547..29721cdd 100644
--- a/ggml/src/ggml-cuda/iqk_mmvq.cu
+++ b/ggml/src/ggml-cuda/iqk_mmvq.cu
@@ -251,6 +251,43 @@ __device__ __forceinline__ float vec_dot_iq5_k_q8_1(
     return d5 * (__low2float(bq8_1[2*(i4/2)+0].ds) * sumi1 * ls1 + __low2float(bq8_1[2*(i4/2)+1].ds) * sumi2 * ls2);
 }

+#define VDR_IQ6_K_Q8_1_MMVQ 4
+#define VDR_IQ6_K_Q8_1_MMQ  4
+
+__device__ __forceinline__ float vec_dot_iq6_k_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
+
+    const block_iq6_k * bq6 = (const block_iq6_k *) vbq + kbx;
+    const uint8_t * all_values = (const uint8_t *)iq6nl_values;
+
+    int i4 = iqs/4;  // 0...7.
+                     // Blocks of 16 index is 4*(i4/2) + (i4%2) + (0 and 2)
+                     // Blocks of 32 index is 2*(i4/2) + 0 or 1
+
+    const int32_t * q8_1 = (const int *)bq8_1[2*(i4/2)+0].qs + 4*(i4%2);
+    const int32_t * q8_2 = (const int *)bq8_1[2*(i4/2)+1].qs + 4*(i4%2);
+    const uint32_t * q4 = (const uint32_t *)bq6->qs + 8*(i4/2) + 4*(i4%2);
+    const uint32_t * qh = (const uint32_t *)bq6->qh + 8*(i4/4) + 4*(i4%2);
+    const uint16_t extra = bq6->extra >> (4*(i4/2) + (i4%2));
+    const uint8_t * values1 = all_values + 64*(extra & 1);
+    const uint8_t * values2 = all_values + 16*(extra & 4);
+    uint32_t aux32[2];
+    const uint8_t * a8 = (const uint8_t *)aux32;
+    int v1, v2;
+    int sumi1 = 0, sumi2 = 0;
+    for (int j = 0; j < 4; ++j) {
+        uint32_t h = qh[j] >> 4*((i4/2)%2);
+        aux32[0] = ((q4[j] >> 0) & 0x0f0f0f0f) | ((h << 4) & 0x30303030);
+        aux32[1] = ((q4[j] >> 4) & 0x0f0f0f0f) | ((h << 2) & 0x30303030);
+        v1 = int_from_table(a8+0, values1);
+        v2 = int_from_table(a8+4, values2);
+        sumi1 = ggml_cuda_dp4a(v1, q8_1[j], sumi1);
+        sumi2 = ggml_cuda_dp4a(v2, q8_2[j], sumi2);
+    }
+    const float d6 = __half2float(bq6->d);
+    return d6 * (__low2float(bq8_1[2*(i4/2)+0].ds) * sumi1 * bq6->scales[4*(i4/2)+(i4%2)] + __low2float(bq8_1[2*(i4/2)+1].ds) * sumi2 * bq6->scales[4*(i4/2)+(i4%2)+2]);
+}
+
 static const __device__ uint32_t iq2k_table[512] = {
     0xe1e1e1e1, 0xe1e1e1f3, 0xe1e1e101, 0xe1e1e111, 0xe1e1f3e1, 0xe1e1f3f3, 0xe1e1f301, 0xe1e1f311, 0xe1e101e1, 0xe1e101f3, 0xe1e10101, 0xe1e10111, 0xe1e111e1, 0xe1e111f3, 0xe1e11101, 0xe1e11111,
@@ -534,10 +571,16 @@ void mul_mat_vec_iq5_k_q8_1_cuda(
     iqk_mul_mat_vec_q_cuda<GGML_TYPE_IQ5_K, VDR_IQ5_K_Q8_1_MMVQ, vec_dot_iq5_k_q8_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
 }

+void mul_mat_vec_iq6_k_q8_1_cuda(
+    const void * vx, const void * vy, float * dst,
+    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
+
+    iqk_mul_mat_vec_q_cuda<GGML_TYPE_IQ6_K, VDR_IQ6_K_Q8_1_MMVQ, vec_dot_iq6_k_q8_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
+}
+
 void mul_mat_vec_iq2_tn_q8_1_cuda(
     const void * vx, const void * vy, float * dst,
     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {

     iqk_mul_mat_vec_q_cuda<GGML_TYPE_IQ2_TN, VDR_IQ2_TN_Q8_1_MMVQ, vec_dot_iq2_tn_q8_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
 }
-
diff --git a/ggml/src/ggml-cuda/iqk_mmvq.cuh b/ggml/src/ggml-cuda/iqk_mmvq.cuh
index 3dc5f41c..7af8e570 100644
--- a/ggml/src/ggml-cuda/iqk_mmvq.cuh
+++ b/ggml/src/ggml-cuda/iqk_mmvq.cuh
@@ -16,6 +16,10 @@ void mul_mat_vec_iq5_k_q8_1_cuda(
     const void * vx, const void * vy, float * dst,
     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream);

+void mul_mat_vec_iq6_k_q8_1_cuda(
+    const void * vx, const void * vy, float * dst,
+    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream);
+
 void mul_mat_vec_iq2_tn_q8_1_cuda(
     const void * vx, const void * vy, float * dst,
     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream);
diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu
index 428d822f..9eb3fa4f 100644
--- a/ggml/src/ggml-cuda/mmvq.cu
+++ b/ggml/src/ggml-cuda/mmvq.cu
@@ -447,6 +447,9 @@ void ggml_cuda_op_mul_mat_vec_q(
         case GGML_TYPE_IQ5_K:
             mul_mat_vec_iq5_k_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
             break;
+        case GGML_TYPE_IQ6_K:
+            mul_mat_vec_iq6_k_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00,
row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); + break; case GGML_TYPE_IQ3_S: mul_mat_vec_iq3_s_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); break; diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m index d54d252c..7d592c22 100644 --- a/ggml/src/ggml-metal.m +++ b/ggml/src/ggml-metal.m @@ -95,6 +95,7 @@ GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_K, GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_K, GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ5_K, + GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ6_K, GGML_METAL_KERNEL_TYPE_GET_ROWS_I32, GGML_METAL_KERNEL_TYPE_RMS_NORM, GGML_METAL_KERNEL_TYPE_GROUP_NORM, @@ -130,6 +131,7 @@ GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_K_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_K_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_IQ5_K_F32, + GGML_METAL_KERNEL_TYPE_MUL_MV_IQ6_K_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32, //GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32, @@ -161,6 +163,7 @@ GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_K_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_K_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ5_K_F32, + GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ6_K_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32, @@ -189,6 +192,7 @@ GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_K_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_K_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_IQ5_K_F32, + GGML_METAL_KERNEL_TYPE_MUL_MM_IQ6_K_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32, @@ -217,6 +221,7 @@ GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_K_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_K_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ5_K_F32, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ6_K_F32, GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32, GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16, GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32, @@ -589,6 +594,7 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_K, get_rows_iq3_k, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_K, get_rows_iq4_k, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ5_K, get_rows_iq5_k, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ6_K, get_rows_iq6_k, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_I32, get_rows_i32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RMS_NORM, rms_norm, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GROUP_NORM, group_norm, ctx->support_simdgroup_reduction); @@ -624,6 +630,7 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_K_F32, mul_mv_iq3_k_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_K_F32, mul_mv_iq4_k_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ5_K_F32, mul_mv_iq5_k_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ6_K_F32, mul_mv_iq6_k_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32, mul_mv_id_f32_f32, ctx->support_simdgroup_reduction); //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16, mul_mv_id_f16_f16, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32, mul_mv_id_f16_f32, 
ctx->support_simdgroup_reduction); @@ -655,6 +662,7 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_K_F32, mul_mv_id_iq3_k_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_K_F32, mul_mv_id_iq4_k_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ5_K_F32, mul_mv_id_iq5_k_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ6_K_F32, mul_mv_id_iq6_k_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32, mul_mm_f32_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32, mul_mm_f16_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32, mul_mm_q4_0_f32, ctx->support_simdgroup_mm); @@ -683,6 +691,7 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_K_F32, mul_mm_iq3_k_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_K_F32, mul_mm_iq4_k_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ5_K_F32, mul_mm_iq5_k_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ6_K_F32, mul_mm_iq6_k_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32, mul_mm_id_f32_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32, mul_mm_id_f16_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32, mul_mm_id_q4_0_f32, ctx->support_simdgroup_mm); @@ -711,6 +720,7 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_K_F32, mul_mm_id_iq3_k_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_K_F32, mul_mm_id_iq4_k_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ5_K_F32, mul_mm_id_iq5_k_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ6_K_F32, mul_mm_id_iq6_k_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32, rope_norm_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16, rope_norm_f16, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32, rope_neox_f32, true); @@ -1745,6 +1755,7 @@ static enum ggml_status ggml_metal_graph_compute( case GGML_TYPE_IQ3_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_K_F32 ].pipeline; break; case GGML_TYPE_IQ4_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_K_F32 ].pipeline; break; case GGML_TYPE_IQ5_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ5_K_F32 ].pipeline; break; + case GGML_TYPE_IQ6_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ6_K_F32 ].pipeline; break; default: GGML_ASSERT(false && "MUL MAT-MAT not implemented"); } @@ -1957,6 +1968,12 @@ static enum ggml_status ggml_metal_graph_compute( nth1 = 16; pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ5_K_F32].pipeline; } break; + case GGML_TYPE_IQ6_K: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ6_K_F32].pipeline; + } break; default: { GGML_METAL_LOG_ERROR("Asserting on type %d\n", 
(int)src0t);
@@ -2002,8 +2019,9 @@ static enum ggml_status ggml_metal_graph_compute(
             [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
             [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
         }
-        else if (src0t == GGML_TYPE_IQ4_NL || src0t == GGML_TYPE_IQ4_XS || src0t == GGML_TYPE_IQ4_K || src0t == GGML_TYPE_IQ5_K) {
-            const int mem_size = src0t == GGML_TYPE_IQ5_K ? 64*sizeof(float) : 32*sizeof(float);
+        else if (src0t == GGML_TYPE_IQ4_NL || src0t == GGML_TYPE_IQ4_XS || src0t == GGML_TYPE_IQ4_K ||
+                 src0t == GGML_TYPE_IQ5_K  || src0t == GGML_TYPE_IQ6_K) {
+            const int mem_size = src0t == GGML_TYPE_IQ6_K ? 128*sizeof(float) : src0t == GGML_TYPE_IQ5_K ? 64*sizeof(float) : 32*sizeof(float);
             [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
             [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
         }
@@ -2098,6 +2116,7 @@ static enum ggml_status ggml_metal_graph_compute(
                     case GGML_TYPE_IQ3_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_K_F32 ].pipeline; break;
                     case GGML_TYPE_IQ4_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_K_F32 ].pipeline; break;
                     case GGML_TYPE_IQ5_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ5_K_F32 ].pipeline; break;
+                    case GGML_TYPE_IQ6_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ6_K_F32 ].pipeline; break;
                     default: GGML_ASSERT(false && "MUL_MAT_ID not implemented");
                 }
@@ -2304,6 +2323,12 @@ static enum ggml_status ggml_metal_graph_compute(
                         nth1 = 16;
                         pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ5_K_F32].pipeline;
                     } break;
+                case GGML_TYPE_IQ6_K:
+                    {
+                        nth0 = 4;
+                        nth1 = 16;
+                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ6_K_F32].pipeline;
+                    } break;
                 default:
                     {
                         GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t);
@@ -2360,8 +2385,9 @@ static enum ggml_status ggml_metal_graph_compute(
             [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
             [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
         }
-        else if (src0t == GGML_TYPE_IQ4_NL || src0t == GGML_TYPE_IQ4_XS || src0t == GGML_TYPE_IQ4_K || src0t == GGML_TYPE_IQ5_K ) {
-            const int mem_size = src0t == GGML_TYPE_IQ5_K ? 64*sizeof(float) : 32*sizeof(float);
+        else if (src0t == GGML_TYPE_IQ4_NL || src0t == GGML_TYPE_IQ4_XS || src0t == GGML_TYPE_IQ4_K ||
+                 src0t == GGML_TYPE_IQ5_K  || src0t == GGML_TYPE_IQ6_K) {
+            const int mem_size = src0t == GGML_TYPE_IQ6_K ? 128*sizeof(float) : src0t == GGML_TYPE_IQ5_K ? 
64*sizeof(float) : 32*sizeof(float); [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } @@ -2415,6 +2441,7 @@ static enum ggml_status ggml_metal_graph_compute( case GGML_TYPE_IQ3_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_K ].pipeline; break; case GGML_TYPE_IQ4_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_K ].pipeline; break; case GGML_TYPE_IQ5_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ5_K ].pipeline; break; + case GGML_TYPE_IQ6_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ6_K ].pipeline; break; case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_I32 ].pipeline; break; default: GGML_ASSERT(false && "not implemented"); } diff --git a/ggml/src/ggml-metal.metal b/ggml/src/ggml-metal.metal index 1366905d..904639a5 100644 --- a/ggml/src/ggml-metal.metal +++ b/ggml/src/ggml-metal.metal @@ -3066,6 +3066,16 @@ constexpr constant static float kvalues_iq5k_f[64] = { -124.f, -112.f, -101.f, -90.f, -81.f, -72.f, -63.f, -55.f, -48.f, -41.f, -34.f, -28.f, -22.f, -16.f, -10.f, -4.f, 1.f, 7.f, 13.f, 19.f, 25.f, 31.f, 38.f, 45.f, 53.f, 61.f, 70.f, 79.f, 89.f, 99.f, 111.f, 123.f, }; +constexpr constant static float kvalues_iq6k_f[128] = { + -127.f, -121.f, -115.f, -109.f, -104.f, -98.f, -93.f, -88.f, -84.f, -79.f, -74.f, -70.f, -66.f, -62.f, -58.f, -54.f, + -51.f, -47.f, -44.f, -40.f, -37.f, -34.f, -31.f, -28.f, -25.f, -22.f, -19.f, -16.f, -13.f, -11.f, -8.f, -5.f, + -2.f, 0.f, 3.f, 6.f, 9.f, 12.f, 14.f, 17.f, 20.f, 23.f, 27.f, 30.f, 33.f, 36.f, 40.f, 44.f, + 47.f, 51.f, 55.f, 59.f, 63.f, 68.f, 72.f, 77.f, 82.f, 87.f, 92.f, 98.f, 103.f, 109.f, 115.f, 121.f, + -126.f, -120.f, -114.f, -108.f, -103.f, -97.f, -92.f, -87.f, -83.f, -78.f, -73.f, -69.f, -65.f, -61.f, -57.f, -53.f, + -50.f, -46.f, -43.f, -39.f, -36.f, -33.f, -30.f, -27.f, -24.f, -21.f, -18.f, -15.f, -12.f, -10.f, -7.f, -4.f, + -1.f, 1.f, 4.f, 7.f, 10.f, 13.f, 15.f, 18.f, 21.f, 24.f, 28.f, 31.f, 34.f, 37.f, 41.f, 45.f, + 48.f, 52.f, 56.f, 60.f, 64.f, 69.f, 73.f, 78.f, 83.f, 88.f, 93.f, 99.f, 104.f, 110.f, 116.f, 122.f, +}; constexpr constant static float kvalues_iq2k_f[8] = { -31.f, -13.f, 1.f, 17.f, -26.f, -8.f, 6.f, 22.f }; @@ -5788,6 +5798,110 @@ void kernel_mul_mv_iq5_k_f32_impl( } } +void kernel_mul_mv_iq6_k_f32_impl( + device const void * src0, + device const float * src1, + device float * dst, + int64_t ne00, + int64_t ne01, + int64_t ne02, + int64_t ne10, + int64_t ne12, + int64_t ne0, + int64_t ne1, + uint r2, + uint r3, + threadgroup int8_t * shared_values_i8, + uint3 tgpig, + uint tiisg, + uint sgitg) { + + threadgroup float * shared_values = (threadgroup float *)shared_values_i8; + const int nb = ne00/QK_K; + const int r0 = tgpig.x; + const int r1 = tgpig.y; + const int im = tgpig.z; + const int first_row = (r0 * 2 + sgitg) * 2; + const int ib_row = first_row * nb; + + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + device const block_iq6_k * x = (device const block_iq6_k *) src0 + ib_row + offset0; + device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; + + const int ix = tiisg/16; // 0 or 1 + const int it = tiisg%16; // 0...15 + const int ib64 = it/4; // 0...3 + const int il64 = it%4; // 0...3 + + shared_values[4*tiisg+0] = kvalues_iq6k_f[4*tiisg+0]; + shared_values[4*tiisg+1] = kvalues_iq6k_f[4*tiisg+1]; + 
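// (Editor's note, not part of the patch:) tiisg runs 0...31 within a simdgroup, so the
// four stores per lane stage the complete 128-entry kvalues_iq6k_f table into
// threadgroup memory; this is what the 128*sizeof(float) threadgroup allocation in
// ggml-metal.m is sized for (IQ5_K needs only 64 floats, the IQ4 variants 32).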
shared_values[4*tiisg+2] = kvalues_iq6k_f[4*tiisg+2]; + shared_values[4*tiisg+3] = kvalues_iq6k_f[4*tiisg+3]; + threadgroup_barrier(mem_flags::mem_threadgroup); + + float4 yl[4]; + float sumf[2]={0.f}, all_sum; + + device const float * yb = y + ix * QK_K + ib64 * 64 + il64 * 8; + + uint32_t aux32[2]; + thread const uint8_t * q8 = (thread const uint8_t *)aux32; + + float4 qf1, qf2; + + for (int ibl = ix; ibl < nb; ibl += 2) { + + device const float4 * y4 = (device const float4 *)yb; + yl[0] = y4[0]; yl[1] = y4[8]; yl[2] = y4[1]; yl[3] = y4[9]; + + for (int row = 0; row < 2; ++row) { + + device const block_iq6_k & xb = x[row*nb + ibl]; + device const uint32_t * q4 = (device const uint32_t *)(xb.qs + 32*ib64 + 8*il64); + device const uint32_t * qh = (device const uint32_t *)(xb.qh + 32*(ib64/2) + 8*il64); + + uint16_t extra = xb.extra >> (4*ib64 + il64/2); + threadgroup const float * values1 = shared_values + 64*(extra & 1); + threadgroup const float * values2 = shared_values + 16*(extra & 4); + + float4 acc1 = {0.f}, acc2 = {0.f}; + + uint32_t h = qh[0] >> 4*(ib64%2); + aux32[0] = ((q4[0] >> 0) & 0x0f0f0f0f) | ((h << 4) & 0x30303030); + aux32[1] = ((q4[0] >> 4) & 0x0f0f0f0f) | ((h << 2) & 0x30303030); + qf1 = {values1[q8[0]], values1[q8[1]], values1[q8[2]], values1[q8[3]]}; + qf2 = {values2[q8[4]], values2[q8[5]], values2[q8[6]], values2[q8[7]]}; + acc1 += yl[0] * qf1; + acc2 += yl[1] * qf2; + + h = qh[1] >> 4*(ib64%2); + aux32[0] = ((q4[1] >> 0) & 0x0f0f0f0f) | ((h << 4) & 0x30303030); + aux32[1] = ((q4[1] >> 4) & 0x0f0f0f0f) | ((h << 2) & 0x30303030); + qf1 = {values1[q8[0]], values1[q8[1]], values1[q8[2]], values1[q8[3]]}; + qf2 = {values2[q8[4]], values2[q8[5]], values2[q8[6]], values2[q8[7]]}; + acc1 += yl[2] * qf1; + acc2 += yl[3] * qf2; + + const int ls1 = xb.scales[4*ib64 + 0 + il64/2]; + const int ls2 = xb.scales[4*ib64 + 2 + il64/2]; + sumf[row] += (float)xb.d * (ls1 * (acc1[0] + acc1[1] + acc1[2] + acc1[3]) + ls2 * (acc2[0] + acc2[1] + acc2[2] + acc2[3])); + + } + + yb += 2 * QK_K; + } + + for (int row = 0; row < 2; ++row) { + all_sum = simd_sum(sumf[row]); + if (tiisg == 0) { + dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum; + } + } +} + [[host_name("kernel_mul_mv_iq1_s_f32")]] kernel void kernel_mul_mv_iq1_s_f32( device const void * src0, @@ -6016,6 +6130,35 @@ kernel void kernel_mul_mv_iq5_k_f32( kernel_mul_mv_iq5_k_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg); } +[[host_name("kernel_mul_mv_iq6_k_f32")]] +kernel void kernel_mul_mv_iq6_k_f32( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, + threadgroup int8_t * shared_values [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + + kernel_mul_mv_iq6_k_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg); +} + //============================= templates and their specializations ============================= // NOTE: this is not dequantizing - we are 
simply fitting the template @@ -6582,6 +6725,27 @@ void dequantize_iq5_k(device const block_iq5_k * xb, short il, thread type4x4 & } } +template +void dequantize_iq6_k(device const block_iq6_k * xb, short il, thread type4x4 & reg) { + // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 + const int ib32 = il/2; + const int l = il%2; + // l = 0 or 1. l = 0 processes the first 16 quants in a block of 32, l = 1 the second 16 + device const uint32_t * q4 = (device const uint32_t *)xb->qs + 8*(ib32/2) + 4*l; + device const uint32_t * qh = (device const uint32_t *)xb->qh + 8*(ib32/4) + 4*l; + const float d = (float)xb->d * xb->scales[2*ib32+l]; + uint32_t aux32; + thread const uint8_t * q8 = (thread const uint8_t *)&aux32; + constant float * values = kvalues_iq6k_f + 64*((xb->extra >> il) & 1); + for (int i = 0; i < 4; ++i) { + aux32 = ((q4[i] >> 4*(ib32%2)) & 0x0f0f0f0f) | (((qh[i] >> 2*(ib32%4)) & 0x03030303) << 4); + reg[i][0] = d * values[q8[0]]; + reg[i][1] = d * values[q8[1]]; + reg[i][2] = d * values[q8[2]]; + reg[i][3] = d * values[q8[3]]; + } +} + template kernel void kernel_get_rows_q( device const void * src0, @@ -7048,6 +7212,7 @@ template [[host_name("kernel_get_rows_iq2_k")]] kernel get_rows_q_t kernel_get template [[host_name("kernel_get_rows_iq3_k")]] kernel get_rows_q_t kernel_get_rows_q; template [[host_name("kernel_get_rows_iq4_k")]] kernel get_rows_q_t kernel_get_rows_q; template [[host_name("kernel_get_rows_iq5_k")]] kernel get_rows_q_t kernel_get_rows_q; +template [[host_name("kernel_get_rows_iq6_k")]] kernel get_rows_q_t kernel_get_rows_q; template [[host_name("kernel_get_rows_iq1_bn")]] kernel get_rows_q_t kernel_get_rows_q; template [[host_name("kernel_get_rows_iq2_bn")]] kernel get_rows_q_t kernel_get_rows_q; @@ -7083,6 +7248,7 @@ template [[host_name("kernel_mul_mm_iq2_k_f32")]] kernel mat_mm_t kernel_mul_m template [[host_name("kernel_mul_mm_iq3_k_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_iq4_k_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_iq5_k_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq6_k_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_iq1_bn_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_iq2_bn_f32")]] kernel mat_mm_t kernel_mul_mm; @@ -7120,6 +7286,7 @@ template [[host_name("kernel_mul_mm_id_iq2_k_f32")]] kernel mat_mm_id_t kernel template [[host_name("kernel_mul_mm_id_iq3_k_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_iq4_k_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_iq5_k_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq6_k_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; // // matrix-vector multiplication @@ -7336,3 +7503,4 @@ template [[host_name("kernel_mul_mv_id_iq2_k_f32")]] kernel kernel_mul_mv_id_t template [[host_name("kernel_mul_mv_id_iq3_k_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>; template [[host_name("kernel_mul_mv_id_iq4_k_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>; template [[host_name("kernel_mul_mv_id_iq5_k_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>; +template [[host_name("kernel_mul_mv_id_iq6_k_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>; diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 9b3fddbc..99bd682f 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -14996,6 +14996,7 @@ bool 
ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte case GGML_TYPE_IQ3_K: break; case GGML_TYPE_IQ4_K: break; case GGML_TYPE_IQ5_K: break; + case GGML_TYPE_IQ6_K: break; case GGML_TYPE_IQ2_TN: break; case GGML_TYPE_Q4_0_4_4: case GGML_TYPE_Q4_0_4_8: diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 5c817030..b5fdb96d 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -1040,6 +1040,18 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_K, .nrows = 1, }, + [GGML_TYPE_IQ6_K] = { + .type_name = "iq6_k", + .blck_size = QK_K, + .type_size = sizeof(block_iq6_k), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_iq6_k, + .from_float = quantize_row_iq6_k, + .from_float_ref = (ggml_from_float_t)quantize_row_iq6_k_ref, + .vec_dot = vec_dot_iq6_k_q8_k, + .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, + }, }; // For internal test use @@ -3394,6 +3406,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { case GGML_FTYPE_MOSTLY_IQ3_K: wtype = GGML_TYPE_IQ3_K; break; case GGML_FTYPE_MOSTLY_IQ4_K: wtype = GGML_TYPE_IQ4_K; break; case GGML_FTYPE_MOSTLY_IQ5_K: wtype = GGML_TYPE_IQ5_K; break; + case GGML_FTYPE_MOSTLY_IQ6_K: wtype = GGML_TYPE_IQ6_K; break; case GGML_FTYPE_MOSTLY_IQ3_S: wtype = GGML_TYPE_IQ3_S; break; case GGML_FTYPE_MOSTLY_IQ2_S: wtype = GGML_TYPE_IQ2_S; break; case GGML_FTYPE_MOSTLY_Q4_0_4_4: wtype = GGML_TYPE_Q4_0_4_4; break; @@ -9648,6 +9661,7 @@ static void ggml_compute_forward_add( case GGML_TYPE_IQ3_K: case GGML_TYPE_IQ4_K: case GGML_TYPE_IQ5_K: + case GGML_TYPE_IQ6_K: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: case GGML_TYPE_Q4_0_4_4: @@ -10033,6 +10047,7 @@ static void ggml_compute_forward_add1( case GGML_TYPE_IQ3_K: case GGML_TYPE_IQ4_K: case GGML_TYPE_IQ5_K: + case GGML_TYPE_IQ6_K: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: case GGML_TYPE_Q4_0_4_4: @@ -10168,6 +10183,7 @@ static void ggml_compute_forward_acc( case GGML_TYPE_IQ3_K: case GGML_TYPE_IQ4_K: case GGML_TYPE_IQ5_K: + case GGML_TYPE_IQ6_K: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: case GGML_TYPE_Q4_0_4_4: @@ -13092,6 +13108,7 @@ static void ggml_compute_forward_out_prod( case GGML_TYPE_IQ3_K: case GGML_TYPE_IQ4_K: case GGML_TYPE_IQ5_K: + case GGML_TYPE_IQ6_K: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: case GGML_TYPE_Q4_0_4_4: @@ -13287,6 +13304,7 @@ static void ggml_compute_forward_set( case GGML_TYPE_IQ3_K: case GGML_TYPE_IQ4_K: case GGML_TYPE_IQ5_K: + case GGML_TYPE_IQ6_K: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: case GGML_TYPE_Q4_0_4_4: @@ -13556,6 +13574,7 @@ static void ggml_compute_forward_get_rows( case GGML_TYPE_IQ3_K: case GGML_TYPE_IQ4_K: case GGML_TYPE_IQ5_K: + case GGML_TYPE_IQ6_K: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: case GGML_TYPE_Q4_0_4_4: @@ -14152,6 +14171,7 @@ static void ggml_compute_forward_clamp( case GGML_TYPE_IQ3_K: case GGML_TYPE_IQ4_K: case GGML_TYPE_IQ5_K: + case GGML_TYPE_IQ6_K: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: case GGML_TYPE_Q8_K: @@ -20892,6 +20912,7 @@ size_t ggml_quantize_chunk( case GGML_TYPE_IQ3_K: result = quantize_iq3_k (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ4_K: result = quantize_iq4_k (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ5_K: result = quantize_iq5_k (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_IQ6_K: result = quantize_iq6_k (src + start, (char *) dst + start_row * 
row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_Q4_0_4_4: result = quantize_q4_0_4x4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_Q4_0_4_8: result = quantize_q4_0_4x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_Q4_0_8_8: result = quantize_q4_0_8x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; diff --git a/ggml/src/iqk/iqk_mul_mat.cpp b/ggml/src/iqk/iqk_mul_mat.cpp index db83b841..32ddb3ff 100644 --- a/ggml/src/iqk/iqk_mul_mat.cpp +++ b/ggml/src/iqk/iqk_mul_mat.cpp @@ -774,6 +774,40 @@ struct IQXKScales { const __m256i shuffle1 = _mm256_set_epi64x(0x0b0b0b0b09090909, 0x0303030301010101, 0x0a0a0a0a08080808, 0x0202020200000000); const __m256i shuffle2 = _mm256_set_epi64x(0x0f0f0f0f0d0d0d0d, 0x0707070705050505, 0x0e0e0e0e0c0c0c0c, 0x0606060604040404); }; +struct IQXKScales2 { + IQXKScales2(uint8_t shift, int8_t min_val) : eshift(_mm256_set1_epi16(shift)), min(_mm256_set1_epi16(min_val)) {} + template + inline void process(int i, float d, uint16_t extra, __m128i scales8, const Q8& q8, __m256 * accm, __m512i * scales) const { + process(i, d, extra, _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales8, scale_shuffle)), q8, accm, scales); + } + template + inline void process(int i, float d, uint16_t extra, __m256i scales16, const Q8& q8, __m256 * accm, __m512i * scales) const { + auto scales_s = _mm256_mullo_epi16(scales16, _mm256_mask_add_epi16(min, extra, min, eshift)); + for (int iy = 0; iy < Q8::nrc_y; ++iy) { + const __m256i prod = _mm256_madd_epi16(scales_s, q8.load_bsums(iy, i)); + accm[iy] = _mm256_fmadd_ps(_mm256_set1_ps(d * q8.scale(iy, i)), _mm256_cvtepi32_ps(prod), accm[iy]); + } + auto aux_1 = MM256_SET_M128I(_mm256_castsi256_si128(scales16), _mm256_castsi256_si128(scales16)); + auto aux_2 = MM256_SET_M128I(_mm256_extracti128_si256(scales16, 1), _mm256_extracti128_si256(scales16, 1)); + auto scales16_1 = _mm512_inserti32x8(_mm512_castsi256_si512(aux_1), aux_1, 1); + auto scales16_2 = _mm512_inserti32x8(_mm512_castsi256_si512(aux_2), aux_2, 1); + scales[0] = _mm512_shuffle_epi8(scales16_1, shuffles[0]); + scales[1] = _mm512_shuffle_epi8(scales16_1, shuffles[1]); + scales[2] = _mm512_shuffle_epi8(scales16_2, shuffles[0]); + scales[3] = _mm512_shuffle_epi8(scales16_2, shuffles[1]); + } + const __m256i eshift; + const __m256i min; + const __m128i scale_shuffle = _mm_set_epi32(0x0f070e06, 0x0d050c04, 0x0b030a02, 0x09010800); + const __m128i emask = _mm_set_epi32(0x80804040, 0x20201010, 0x08080404, 0x02020101); + const __m128i eshuffle = _mm_set_epi32(0x0f0d0b09, 0x07050301, 0x0e0c0a08, 0x06040200); + const __m512i shuffles[2] = { + _mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_setzero_si512(), + _mm_set1_epi16(0x0100), 0), _mm_set1_epi16(0x0302), 1), _mm_set1_epi16(0x0504), 2), _mm_set1_epi16(0x0706), 3), + _mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_setzero_si512(), + _mm_set1_epi16(0x0908), 0), _mm_set1_epi16(0x0b0a), 1), _mm_set1_epi16(0x0d0c), 2), _mm_set1_epi16(0x0f0e), 3) + }; +}; struct DequantizerIQ2K final : public BaseDequantizer { DequantizerIQ2K(const void * vx, size_t bx) : BaseDequantizer(vx, bx), iqxk(IQXKScales(5, -32)), values(load_values()) {} @@ -809,7 +843,7 @@ struct DequantizerIQ2K final : public BaseDequantizer { }; struct DequantizerIQ3K final : public BaseDequantizer { - DequantizerIQ3K(const void * vx, size_t bx) : BaseDequantizer(vx, 
bx) : BaseDequantizer(vx, bx), iqxk(IQXKScales(4, -64)), values(load_values()) {}
+    DequantizerIQ3K(const void * vx, size_t bx) : BaseDequantizer(vx, bx), iqxk(4, -64), values(load_values()) {}
     template <typename Q8>
     inline void new_block(int i, const Q8& q8, __m256 * accm, __m512i * scales) {
         d = GGML_FP16_TO_FP32(x[i].d);
@@ -844,7 +878,7 @@ struct DequantizerIQ3K final : public BaseDequantizer<block_iq3_k> {
         return _mm_sign_epi8(scl, sch);
     }
     Q2Bits bits;
-    const IQXKScales iqxk;
+    const IQXKScales2 iqxk;
     const __m512i values;
     const __m512i hmask = _mm512_set1_epi8(4);
@@ -885,7 +919,7 @@ struct DequantizerIQ4K final : public BaseDequantizer<block_iq4_k> {
     }
     Q4Bits bits;
-    const IQXKScales iqxk;
+    const IQXKScales2 iqxk;
     const __m512i values;
     const __m512i permute1 = _mm512_set_epi64(11, 10, 3, 2, 9, 8, 1, 0);
     const __m512i permute2 = _mm512_set_epi64(15, 14, 7, 6, 13, 12, 5, 4);
@@ -948,7 +982,7 @@ struct DequantizerIQ5K final : public BaseDequantizer<block_iq5_k> {
     }
     Q4Bits bits;
-    const IQXKScales iqxk;
+    const IQXKScales2 iqxk;
     __m512i values[2];
     const __m512i hmask1 = _mm512_set1_epi8(1);
     const __m512i hmask2 = _mm512_set1_epi8(2);
@@ -959,6 +993,64 @@
     const __m128i m32 = _mm_set1_epi8(-32);
 };

+struct DequantizerIQ6K final : public BaseDequantizer<block_iq6_k> {
+    DequantizerIQ6K(const void * vx, size_t bx) : BaseDequantizer(vx, bx), iqxk(1, -128) { load_values(values); }
+    template <typename Q8>
+    inline void new_block(int i, const Q8& q8, __m256 * accm, __m512i * scales) {
+        d = GGML_FP16_TO_FP32(x[i].d);
+        prepare(x[i].qs, x[i].qh);
+        auto scales8 = _mm_loadu_si128((const __m128i*)x[i].scales);
+        iqxk.process(i, d, x[i].extra, _mm256_cvtepi8_epi16(scales8), q8, accm, scales);
+    }
+    inline __m512i make_one(__m512i l, __m512i h) const {
+        auto p = _mm512_shuffle_epi8(values[0], l);
+        p = _mm512_mask_shuffle_epi8(p, _mm512_cmpeq_epi8_mask(_mm512_and_si512(h, masks[0]), masks[0]), values[1], l);
+        p = _mm512_mask_shuffle_epi8(p, _mm512_cmpeq_epi8_mask(_mm512_and_si512(h, masks[1]), masks[1]), values[2], l);
+        p = _mm512_mask_shuffle_epi8(p, _mm512_cmpeq_epi8_mask(_mm512_and_si512(h, masks[2]), masks[2]), values[3], l);
+        return p;
+    }
+    inline void prepare(const uint8_t * q4, const uint8_t * qh) {
+        bits.prepare64(q4);
+        auto h256_1 = _mm256_loadu_si256((const __m256i *)qh + 0);
+        auto h256_2 = _mm256_loadu_si256((const __m256i *)qh + 1);
+        auto h1 = _mm512_inserti32x8(_mm512_castsi256_si512(h256_1), _mm256_srli_epi16(h256_1, 4), 1);
+        auto h2 = _mm512_inserti32x8(_mm512_castsi256_si512(h256_2), _mm256_srli_epi16(h256_2, 4), 1);
+        bits.values[0] = make_one(bits.values[0], h1);
+        bits.values[1] = make_one(bits.values[1], _mm512_srli_epi16(h1, 2));
+        bits.values[2] = make_one(bits.values[2], h2);
+        bits.values[3] = make_one(bits.values[3], _mm512_srli_epi16(h2, 2));
+        // We now have in bits.values[0]: 0...31, 64...95
+        //                bits.values[1]: 32..63, 96..127
+        //                etc.
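        // (Editor's note, not part of the patch:) permute1 selects 64-bit lanes
        // {0,1,2,3, 8,9,10,11} and permute2 lanes {4,5,6,7, 12,13,14,15}, so the two
        // _mm512_permutex2var_epi64 pairs below gather the low and high 256-bit halves
        // of each register pair. Afterwards bits.values[0..3] hold quants 0...63,
        // 64...127, 128...191 and 192...255 in order, matching the layout of the
        // q8.load_quants64(iy, i, 0..3) loads on the activation side.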
+ auto tmp = _mm512_permutex2var_epi64(bits.values[0], permute1, bits.values[1]); + bits.values[1] = _mm512_permutex2var_epi64(bits.values[0], permute2, bits.values[1]); + bits.values[0] = tmp; + tmp = _mm512_permutex2var_epi64(bits.values[2], permute1, bits.values[3]); + bits.values[3] = _mm512_permutex2var_epi64(bits.values[2], permute2, bits.values[3]); + bits.values[2] = tmp; + } + static void load_values(__m512i * values) { + static const uint8_t kvalues_iq6nl[64] = { + 1, 7, 13, 19, 24, 30, 35, 40, 44, 49, 54, 58, 62, 66, 70, 74, + 77, 81, 84, 88, 91, 94, 97, 100, 103, 106, 109, 112, 115, 117, 120, 123, + 126, 128, 131, 134, 137, 140, 142, 145, 148, 151, 155, 158, 161, 164, 168, 172, + 175, 179, 183, 187, 191, 196, 200, 205, 210, 215, 220, 226, 231, 237, 243, 249, + }; + for (int k = 0; k < 4; ++k) { + auto values128 = _mm_loadu_si128((const __m128i *)kvalues_iq6nl + k); + auto values256 = MM256_SET_M128I(values128, values128); + values[k] = _mm512_inserti32x8(_mm512_castsi256_si512(values256), values256, 1); + } + } + + Q4Bits bits; + IQXKScales2 iqxk; + __m512i values[4]; + __m512i masks[3] = { _mm512_set1_epi8(0x01), _mm512_set1_epi8(0x02), _mm512_set1_epi8(0x03) }; + const __m512i permute1 = _mm512_set_epi64(11, 10, 9, 8, 3, 2, 1, 0); + const __m512i permute2 = _mm512_set_epi64(15, 14, 13, 12, 7, 6, 5, 4); +}; + template inline void compute_block(int iy, int i, float d, const Q8& q8, const __m512i * values, const __m512i * scales, __m512 * accd) { const __m512i p1 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), values[0], q8.load_quants64(iy, i, 0)); @@ -1033,6 +1125,50 @@ static void mul_mat_qX_K_q8_K_AVX512(int n, const void * vx, size_t bx, const Da } } +template +static void mul_mat_iqX_k_q8_K_AVX512(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + assert(n % QK_K == 0); + const int nb = n / QK_K; + + Q8 q8(info); + + Dequantizer deq(vx, bx); + + __m256 accm[nrc_y]; + __m512 accd[nrc_y]; + __m512i scales[4]; + + for (int ix = 0; ix < nrc_x; ++ix) { + + for (int iy = 0; iy < nrc_y; ++iy) accd[iy] = _mm512_setzero_ps(); + for (int iy = 0; iy < nrc_y; ++iy) accm[iy] = _mm256_setzero_ps(); + + deq.new_row(ix); + + for (int i = 0; i < nb; ++i) { + + deq.new_block(i, q8, accm, scales); + + for (int iy = 0; iy < nrc_y; ++iy) { + const __m512i p1 = _mm512_maddubs_epi16(deq.bits.values[0], q8.load_quants64(iy, i, 0)); + const __m512i p2 = _mm512_maddubs_epi16(deq.bits.values[1], q8.load_quants64(iy, i, 1)); + const __m512i p3 = _mm512_maddubs_epi16(deq.bits.values[2], q8.load_quants64(iy, i, 2)); + const __m512i p4 = _mm512_maddubs_epi16(deq.bits.values[3], q8.load_quants64(iy, i, 3)); + auto sumi = _mm512_dpwssd_epi32(_mm512_dpwssd_epi32(_mm512_dpwssd_epi32(_mm512_dpwssd_epi32(_mm512_setzero_si512(), + p1, scales[0]), p2, scales[1]), p3, scales[2]), p4, scales[3]); + accd[iy] = _mm512_fmadd_ps(_mm512_set1_ps(deq.d*q8.scale(iy, i)), _mm512_cvtepi32_ps(sumi), accd[iy]); + } + + } + + for (int iy = 0; iy < nrc_y; ++iy) { + auto sum256 = _mm256_add_ps(_mm512_castps512_ps256(accd[iy]), _mm512_extractf32x8_ps(accd[iy], 1)); + info.store(ix, iy, hsum_float_8(_mm256_add_ps(accm[iy], sum256))); + } + + } +} + template static void mul_mat_qX_K_q8_K_AVX512_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { assert(n % QK_K == 0); @@ -1218,6 +1354,20 @@ struct IQXKScales { template inline void process(int i, float d, uint16_t extra, __m128i scales8, const Q8& q8, __m256 * accm, __m256i * scales) const { auto scales16 = 
_mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales8, hshuff)); + process(i, d, extra, scales16, q8, accm, scales); + //auto extra128 = _mm_set1_epi16(extra); + //extra128 = _mm_cmpeq_epi8(_mm_and_si128(extra128, emask), emask); + //extra128 = _mm_and_si128(extra128, eshift); + //extra128 = _mm_shuffle_epi8(extra128, eshuffle); + //auto scales_s = _mm256_mullo_epi16(scales16, _mm256_add_epi16(min, _mm256_cvtepi8_epi16(extra128))); + //for (int iy = 0; iy < Q8::nrc_y; ++iy) { + // const __m256i prod = _mm256_madd_epi16(scales_s, q8.load_bsums(iy, i)); + // accm[iy] = _mm256_fmadd_ps(_mm256_set1_ps(d * q8.scale(iy, i)), _mm256_cvtepi32_ps(prod), accm[iy]); + //} + //prepare_scales_16(scales16, scales); + } + template + inline void process(int i, float d, uint16_t extra, __m256i scales16, const Q8& q8, __m256 * accm, __m256i * scales) const { auto extra128 = _mm_set1_epi16(extra); extra128 = _mm_cmpeq_epi8(_mm_and_si128(extra128, emask), emask); extra128 = _mm_and_si128(extra128, eshift); @@ -1229,6 +1379,7 @@ struct IQXKScales { } prepare_scales_16(scales16, scales); } + const __m256i min; const __m128i eshift; const __m128i hshuff = _mm_set_epi32(0x0f070e06, 0x0d050c04, 0x0b030a02, 0x09010800); @@ -1393,6 +1544,56 @@ struct DequantizerIQ5K final : public BaseDequantizer { const __m256i mh = _mm256_set1_epi8(-128); // to avoid stupid warning about 0x80 overflowing }; +struct DequantizerIQ6K final : public BaseDequantizer { + DequantizerIQ6K(const void * vx, size_t bx) : BaseDequantizer(vx, bx), iqxk(1, -128) { load_values(values); } + template + inline void new_block(int i, const Q8& q8, __m256 * accm, __m256i * scales) { + d = GGML_FP16_TO_FP32(x[i].d); + auto scales8 = _mm_loadu_si128((const __m128i*)x[i].scales); + auto scales16 = _mm256_cvtepi8_epi16(scales8); + iqxk.process(i, d, x[i].extra, scales16, q8, accm, scales); + } + inline void prepare(int i, int j) { + bits.prepare(x[i].qs, j); + auto hbits = _mm256_loadu_si256((const __m256i *)x[i].qh + j); + for (int k = 0; k < 4; ++k) { + bits.values[k] = make_one(bits.values[k], hbits); + hbits = _mm256_srli_epi16(hbits, 2); + } + } + inline __m256i make_one(__m256i l, __m256i hbits) const { + auto mask4 = _mm256_cmpeq_epi8(_mm256_and_si256(hbits, mh3), mh3); + auto h1 = _mm256_andnot_si256(mask4, hbits); + auto mask2 = _mm256_cmpeq_epi8(_mm256_and_si256(h1, mh1), mh1); + auto mask3 = _mm256_cmpeq_epi8(_mm256_and_si256(h1, mh2), mh2); + auto mask1 = _mm256_andnot_si256(_mm256_or_si256(mask4, _mm256_or_si256(mask2, mask3)), _mm256_set1_epi8(0xff)); + return _mm256_or_si256(_mm256_or_si256(_mm256_and_si256(mask1, _mm256_shuffle_epi8(values[0], l)), + _mm256_and_si256(mask2, _mm256_shuffle_epi8(values[1], l))), + _mm256_or_si256(_mm256_and_si256(mask3, _mm256_shuffle_epi8(values[2], l)), + _mm256_and_si256(mask4, _mm256_shuffle_epi8(values[3], l)))); + } + static void load_values(__m256i * values) { + static const uint8_t kvalues_iq6nl[64] = { + 1, 7, 13, 19, 24, 30, 35, 40, 44, 49, 54, 58, 62, 66, 70, 74, + 77, 81, 84, 88, 91, 94, 97, 100, 103, 106, 109, 112, 115, 117, 120, 123, + 126, 128, 131, 134, 137, 140, 142, 145, 148, 151, 155, 158, 161, 164, 168, 172, + 175, 179, 183, 187, 191, 196, 200, 205, 210, 215, 220, 226, 231, 237, 243, 249, + }; + for (int k = 0; k < 4; ++k) { + auto values128 = _mm_loadu_si128((const __m128i *)kvalues_iq6nl + k); + values[k] = MM256_SET_M128I(values128, values128); + } + } + + Q4Bits bits; + const IQXKScales iqxk; + __m256i values[4]; + const __m256i mh1 = _mm256_set1_epi8(1); + const __m256i mh2 = 
_mm256_set1_epi8(2); + const __m256i mh3 = _mm256_set1_epi8(3); + const __m256i mh = _mm256_set1_epi8(-128); // to avoid stupid warning about 0x80 overflowing +}; + struct DequantizerQ5K final : public BaseDequantizer { DequantizerQ5K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {} template @@ -3184,14 +3385,28 @@ template void MulMat::set_functions(MulMat& m) { } else { #ifdef HAVE_FANCY_SIMD - m.funcs[0] = mul_mat_qX_K_q8_K_AVX512_1; - m.funcs[1] = mul_mat_qX_K_q8_K_AVX512; - m.funcs[2] = mul_mat_qX_K_q8_K_AVX512; - m.funcs[3] = mul_mat_qX_K_q8_K_AVX512; - m.funcs[4] = mul_mat_qX_K_q8_K_AVX512; - m.funcs[5] = mul_mat_qX_K_q8_K_AVX512; - m.funcs[6] = mul_mat_qX_K_q8_K_AVX512; - m.funcs[7] = mul_mat_qX_K_q8_K_AVX512; + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v) { + m.funcs[0] = mul_mat_iqX_k_q8_K_AVX512; + m.funcs[1] = mul_mat_iqX_k_q8_K_AVX512; + m.funcs[2] = mul_mat_iqX_k_q8_K_AVX512; + m.funcs[3] = mul_mat_iqX_k_q8_K_AVX512; + m.funcs[4] = mul_mat_iqX_k_q8_K_AVX512; + m.funcs[5] = mul_mat_iqX_k_q8_K_AVX512; + m.funcs[6] = mul_mat_iqX_k_q8_K_AVX512; + m.funcs[7] = mul_mat_iqX_k_q8_K_AVX512; + } else { + m.funcs[0] = mul_mat_qX_K_q8_K_AVX512_1; + m.funcs[1] = mul_mat_qX_K_q8_K_AVX512; + m.funcs[2] = mul_mat_qX_K_q8_K_AVX512; + m.funcs[3] = mul_mat_qX_K_q8_K_AVX512; + m.funcs[4] = mul_mat_qX_K_q8_K_AVX512; + m.funcs[5] = mul_mat_qX_K_q8_K_AVX512; + m.funcs[6] = mul_mat_qX_K_q8_K_AVX512; + m.funcs[7] = mul_mat_qX_K_q8_K_AVX512; + } #else if constexpr (std::is_same_v || std::is_same_v || @@ -3199,7 +3414,8 @@ template void MulMat::set_functions(MulMat& m) { std::is_same_v|| std::is_same_v|| std::is_same_v|| - std::is_same_v) { + std::is_same_v|| + std::is_same_v) { m.funcs[0] = mul_mat_qY_K_q8_K_T; m.funcs[1] = mul_mat_qY_K_q8_K_T; m.funcs[2] = mul_mat_qY_K_q8_K_T; @@ -3317,6 +3533,10 @@ bool MulMat::prepare(int typeA, int typeB, int ne00, MulMat& mm, int Ny) { assert (ne00 % QK_K == 0); MulMat::set_functions(mm); break; + case GGML_TYPE_IQ6_K: + assert (ne00 % QK_K == 0); + MulMat::set_functions(mm); + break; case GGML_TYPE_IQ3_S: assert (ne00 % QK_K == 0); MulMat::set_functions(mm); @@ -3991,6 +4211,41 @@ struct DequantizerIQ5K final : public BaseDequantizer { float d; }; +struct DequantizerIQ6K final : public BaseDequantizer { + DequantizerIQ6K(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc), values(vld1q_s8_x4(iq6nl_values)) {} + + constexpr static int num_blocks() { return 16; } + constexpr static bool should_scale_quants() { return false; } + + template + inline int32x4x4_t new_block(int i, const Q8& q8, float32x4_t * acc) { + d = GGML_FP16_TO_FP32(x[i].d); + return Scale16Extra::new_block(i, d, x[i].extra, 1, vld1q_s8(x[i].scales), q8, acc); + } + inline void prepare(int i, int j) { + bits.prepare(x[i].qs+64*j); + auto hbits = vld1q_u8_x2(x[i].qh + 32*j); + bits.b1.val[0] = vorrq_u8(bits.b1.val[0], vandq_u8(vshlq_n_u8(hbits.val[0], 4), hm)); + bits.b1.val[1] = vorrq_u8(bits.b1.val[1], vandq_u8(vshlq_n_u8(hbits.val[1], 4), hm)); + bits.b1.val[2] = vorrq_u8(bits.b1.val[2], vandq_u8(vshlq_n_u8(hbits.val[0], 2), hm)); + bits.b1.val[3] = vorrq_u8(bits.b1.val[3], vandq_u8(vshlq_n_u8(hbits.val[1], 2), hm)); + bits.b2.val[0] = vorrq_u8(bits.b2.val[0], vandq_u8(hbits.val[0], hm)); + bits.b2.val[1] = vorrq_u8(bits.b2.val[1], vandq_u8(hbits.val[1], hm)); + bits.b2.val[2] = vorrq_u8(bits.b2.val[2], vandq_u8(vshrq_n_u8(hbits.val[0], 2), hm)); + bits.b2.val[3] = vorrq_u8(bits.b2.val[3], vandq_u8(vshrq_n_u8(hbits.val[1], 2), 
hm)); + for (int k = 0; k < 4; ++k) { + bits.b1.val[k] = vqtbl4q_s8(values, bits.b1.val[k]); + bits.b2.val[k] = vqtbl4q_s8(values, bits.b2.val[k]); + } + } + + Q4bits bits; + const int8x16x4_t values; + const uint8x16_t hm = vdupq_n_u8(0x30); + + float d; +}; + struct DequantizerIQ2K final : public BaseDequantizer { DequantizerIQ2K(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {} @@ -5579,6 +5834,9 @@ bool MulMat::prepare(int typeA, int typeB, int ne00, MulMat& m, int /*Ny*/) { case GGML_TYPE_IQ5_K: MulMat::set_functions(m); break; + case GGML_TYPE_IQ6_K: + MulMat::set_functions(m); + break; case GGML_TYPE_IQ2_K: MulMat::set_functions(m); break; diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp index 1cba1532..e43ed999 100644 --- a/ggml/src/iqk/iqk_quantize.cpp +++ b/ggml/src/iqk/iqk_quantize.cpp @@ -1515,6 +1515,370 @@ size_t quantize_iq5_k(const float * src, void * dst, int64_t nrows, int64_t n_pe return nrows * nblock * sizeof(block_iq5_k); } +// +// ============================================== iq6_K +// +#define A_IQ6K -127.f +#define B_IQ6K 6.2568f +#define C_IQ6K 0.11218f +#define D_IQ6K 0.0011972f +#define S_IQ6K 1.f + +void dequantize_row_iq6_k(const block_iq6_k * x, float * y, int64_t k) { + assert(k % QK_K == 0); + const int nb = k / QK_K; + + for (int i = 0; i < nb; i++) { + + const float d = GGML_FP16_TO_FP32(x[i].d); + const uint8_t * qs = x[i].qs; + const uint8_t * qh = x[i].qh; + const int8_t * sl = x[i].scales; + + uint16_t extra = x[i].extra; + + int shift = 0; + for (int ib64 = 0; ib64 < QK_K/64; ++ib64) { + + float dl1 = d * sl[4*ib64 + 0]; + float dl2 = d * sl[4*ib64 + 1]; + float dl3 = d * sl[4*ib64 + 2]; + float dl4 = d * sl[4*ib64 + 3]; + float m1 = extra & 1 ? S_IQ6K : 0; + float m2 = extra & 2 ? S_IQ6K : 0; + float m3 = extra & 4 ? S_IQ6K : 0; + float m4 = extra & 8 ? 
S_IQ6K : 0; + for (int j = 0; j < 16; ++j) { + float q1 = ((qs[j+ 0] & 0xf) | (((qh[j+ 0] >> shift) & 0x03) << 4)); + float q2 = ((qs[j+16] & 0xf) | (((qh[j+16] >> shift) & 0x03) << 4)); + float q3 = ((qs[j+ 0] >> 4) | (((qh[j+ 0] >> shift) & 0x0c) << 2)); + float q4 = ((qs[j+16] >> 4) | (((qh[j+16] >> shift) & 0x0c) << 2)); + y[j+ 0] = dl1 * (A_IQ6K + q1*(B_IQ6K + q1*(-C_IQ6K + q1*D_IQ6K)) + m1); + y[j+16] = dl2 * (A_IQ6K + q2*(B_IQ6K + q2*(-C_IQ6K + q2*D_IQ6K)) + m2); + y[j+32] = dl3 * (A_IQ6K + q3*(B_IQ6K + q3*(-C_IQ6K + q3*D_IQ6K)) + m3); + y[j+48] = dl4 * (A_IQ6K + q4*(B_IQ6K + q4*(-C_IQ6K + q4*D_IQ6K)) + m4); + } + y += 64; + qs += 32; + extra >>= 4; + shift += 4; + if (shift == 8) { qh += 32; shift = 0; } + } + + } +} + +void vec_dot_iq6_k_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + GGML_UNUSED(nrc); + GGML_UNUSED(bx); + GGML_UNUSED(by); + GGML_UNUSED(bs); + + if (iqk_mul_mat(1, 1, n, GGML_TYPE_IQ6_K, vx, 0, GGML_TYPE_Q8_K, vy, 0, s, 0, 0, 1)) { + return; + } + + // TODO + //const int nb = n / QK_K; + + //const block_iq5_k * x = (const block_iq5_k *)vx; + //const block_q8_K * y = (const block_q8_K *)vy; + + //float sumf = 0; + + //for (int i = 0; i < nb; i++) { + + // const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + // const uint8_t * qs = x[i].qs; + // const uint8_t * qh = x[i].qh; + // const uint8_t * sl = x[i].scales_l; + // const uint8_t * sh = x[i].scales_h; + // const int8_t * q8 = y[i].qs; + + // uint16_t extra = x[i].extra; + + // int shift = 0; + // int sumb = 0; + // for (int ib64 = 0; ib64 < QK_K/64; ++ib64) { + + // int dl1 = (((sl[2*ib64+0] & 0xf) | ((sh[ib64] << 4) & 0x30)) - 32); + // int dl2 = (((sl[2*ib64+0] >> 4) | ((sh[ib64] << 2) & 0x30)) - 32); + // int dl3 = (((sl[2*ib64+1] & 0xf) | ((sh[ib64] >> 0) & 0x30)) - 32); + // int dl4 = (((sl[2*ib64+1] >> 4) | ((sh[ib64] >> 2) & 0x30)) - 32); + // const int8_t * values1 = iq5nl_values + ((extra & 1) << 5); + // const int8_t * values2 = iq5nl_values + ((extra & 2) << 4); + // const int8_t * values3 = iq5nl_values + ((extra & 4) << 3); + // const int8_t * values4 = iq5nl_values + ((extra & 8) << 2); + // int sumi1 = 0, sumi2 = 0, sumi3 = 0, sumi4 = 0; + // for (int j = 0; j < 16; ++j) { + // sumi1 += q8[j+ 0] * values1[(qs[j+ 0] & 0xf) | (((qh[j+ 0] >> shift) & 1) << 4)]; + // sumi2 += q8[j+16] * values2[(qs[j+16] & 0xf) | (((qh[j+16] >> shift) & 1) << 4)]; + // sumi3 += q8[j+32] * values3[(qs[j+ 0] >> 4) | (((qh[j+ 0] >> shift) & 2) << 3)]; + // sumi4 += q8[j+48] * values4[(qs[j+16] >> 4) | (((qh[j+16] >> shift) & 2) << 3)]; + // } + // sumb += dl1 * sumi1 + dl2 * sumi2 + dl3 * sumi3 + dl4 * sumi4; + // q8 += 64; + // qs += 32; + // extra >>= 4; + // shift += 2; + // } + // sumf += d * sumb; + + //} + + //*s = sumf; + +} + +namespace { + +inline int best_index(int n, const float * val, float x) { + if (x <= val[0]) return 0; + if (x >= val[n-1]) return n-1; + int ml = 0, mu = n-1; + while (mu-ml > 1) { + int mav = (ml+mu)/2; + if (x < val[mav]) mu = mav; else ml = mav; + } + return x - val[mu-1] < val[mu] - x ? 
+
+namespace {
+
+inline int best_index(int n, const float * val, float x) {
+    if (x <= val[0]) return 0;
+    if (x >= val[n-1]) return n-1;
+    int ml = 0, mu = n-1;
+    while (mu-ml > 1) {
+        int mav = (ml+mu)/2;
+        if (x < val[mav]) mu = mav; else ml = mav;
+    }
+    return x - val[mu-1] < val[mu] - x ? mu-1 : mu;
+}
+
+uint8_t iq6nl_index[249] = {
+      0,   0,   0,  64,   1,   1,   1,   1,   1,  65,   2,   2,   2,   2,   2,  66,   3,   3,   3,   3,  67,  67,   4,   4,   4,   4,  68,   5,   5,   5,   5,  69,
+     69,   6,   6,   6,  70,  70,   7,   7,   7,  71,   8,   8,   8,  72,  72,   9,   9,   9,  73,  73,  10,  10,  10,  74,  11,  11,  11,  75,  12,  12,  12,  76,
+     13,  13,  13,  77,  14,  14,  14,  78,  15,  15,  79,  79,  16,  16,  80,  17,  17,  81,  81,  18,  18,  82,  19,  19,  83,  83,  20,  84,  84,  21,  85,  85,
+     22,  86,  86,  23,  87,  87,  24,  88,  88,  25,  89,  89,  26,  90,  90,  27,  91,  91,  28,  92,  29,  93,  93,  30,  94,  94,  31,  95,  95,  32,  96,  33,
+     97,  97,  34,  98,  98,  35,  99,  99,  36, 100, 100,  37, 101,  38, 102, 102,  39, 103, 103,  40, 104, 104,  41,  41, 105,  42,  42, 106, 106,  43, 107, 107,
+     44, 108, 108,  45,  45, 109,  46,  46,  46, 110,  47,  47, 111, 111,  48,  48, 112,  49,  49,  49, 113,  50,  50,  50, 114,  51,  51,  51, 115,  52,  52,  52,
+    116, 116,  53,  53,  53, 117,  54,  54,  54, 118, 118,  55,  55,  55, 119, 119,  56,  56,  56, 120, 120,  57,  57,  57, 121, 121,  58,  58,  58,  58, 122,  59,
+     59,  59,  59, 123, 123,  60,  60,  60,  60, 124,  61,  61,  61,  61,  61, 125,  62,  62,  62,  62,  62, 126,  63,  63,  63,
+};
+inline int best_index_iq6nl(const float * values, float x) {
+    int ix = (int)(x - values[0]);
+    if (ix < 0 || ix >= 249) return ix < 0 ? 0 : 63;
+    ix = iq6nl_index[ix];
+    return ix < 64 ? ix : x - values[ix-64] < values[ix-63] - x ? ix-64 : ix-63;
+}
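best_index_iq6nl avoids the binary search entirely: because consecutive iq6nl_values differ by at most 6, int(x - values[0]) indexes the 249-entry table, whose entries either name the answer outright (entries below 64) or, when 64 or larger, narrow it to two neighboring indices resolved by distance. A hypothetical harness (not from the patch) to confirm it agrees with the plain binary search, assuming the table and both helpers are visible in the translation unit:

// Sweep the whole input range and count disagreements between the two lookups.
#include <cstdio>

int check_iq6nl_index() {
    float values[64];
    for (int i = 0; i < 64; ++i) values[i] = iq6nl_values[i];
    int nbad = 0;
    for (float x = -130.0f; x <= 130.0f; x += 0.01f) {
        if (best_index(64, values, x) != best_index_iq6nl(values, x)) ++nbad;
    }
    printf("mismatches: %d\n", nbad); // expected: 0
    return nbad;
}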
+
+void quantize_row_iq6_k_impl(const float * x, void * vy, int n_per_row, const float * quant_weights, const float * values, const float * shifted_values) {
+    const int ntry = 5;
+    const float step = 1.f;
+
+    block_iq6_k * y = (block_iq6_k *)vy;
+
+    float scales[QK_K/16];
+    float weight[16];
+
+    for (int ibl = 0; ibl < n_per_row/QK_K; ++ibl) {
+
+        memset(&y[ibl], 0, sizeof(block_iq6_k));
+        y[ibl].d = GGML_FP32_TO_FP16(0.f);
+
+        const float * xbl = x + ibl*QK_K;
+        float sumx2 = 0;
+        for (int j = 0; j < QK_K; ++j) sumx2 += xbl[j]*xbl[j];
+        const float sigma2 = 2*sumx2/QK_K;
+
+        float max_scale = 0, max_abs_scale = 0;
+        uint16_t extra = 0;
+
+        for (int ib = 0; ib < QK_K/16; ++ib) {
+            const float * xb = xbl + 16*ib;
+            if (quant_weights) {
+                const float * qw = quant_weights + ibl*QK_K + ib*16;
+                for (int j = 0; j < 16; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
+            } else {
+                for (int j = 0; j < 16; ++j) weight[j] = 0.25f*sigma2 + xb[j]*xb[j];
+            }
+            float amax = 0, max = 0;
+            for (int j = 0; j < 16; ++j) {
+                float ax = fabsf(xb[j]);
+                if (ax > amax) {
+                    amax = ax; max = xb[j];
+                }
+            }
+            if (!amax) {
+                scales[ib] = 0;
+                continue;
+            }
+            float d = ntry > 0 ? -max/values[0] : max/values[0];
+            float id = 1/d;
+            float sumqx_p = 0, sumq2_p = 0;
+            float sumqx_m = 0, sumq2_m = 0;
+            for (int j = 0; j < 16; ++j) {
+                float w = weight[j];
+                float al = id*xb[j];
+                int l = best_index_iq6nl(values, al);
+                float q = values[l];
+                sumqx_p += w*q*xb[j];
+                sumq2_p += w*q*q;
+                l = best_index_iq6nl(values, -al);
+                q = values[l];
+                sumqx_m += w*q*xb[j];
+                sumq2_m += w*q*q;
+            }
+            d = sumqx_p/sumq2_p;
+            float best = d*sumqx_p;
+            if (sumq2_m > 0 && sumqx_m*sumqx_m > best*sumq2_m) {
+                d = sumqx_m/sumq2_m; best = d*sumqx_m;
+            }
+            bool is_shifted = false;
+            for (int itry = -ntry; itry <= ntry; ++itry) {
+                id = (itry*step + values[0])/max;
+                sumqx_p = sumq2_p = 0;
+                sumqx_m = sumq2_m = 0;
+                for (int j = 0; j < 16; ++j) {
+                    float w = weight[j];
+                    float al = id*xb[j];
+                    int l = best_index_iq6nl(values, al);
+                    float q = values[l];
+                    sumqx_p += w*q*xb[j];
+                    sumq2_p += w*q*q;
+                    l = best_index_iq6nl(values, -al);
+                    q = values[l];
+                    sumqx_m += w*q*xb[j];
+                    sumq2_m += w*q*q;
+                }
+                if (sumq2_p > 0 && sumqx_p*sumqx_p > best*sumq2_p) {
+                    d = sumqx_p/sumq2_p; best = d * sumqx_p; is_shifted = false;
+                }
+                if (sumq2_m > 0 && sumqx_m*sumqx_m > best*sumq2_m) {
+                    d = sumqx_m/sumq2_m; best = d * sumqx_m; is_shifted = false;
+                }
+                id = (itry*step + shifted_values[0])/max;
+                sumqx_p = sumq2_p = 0;
+                sumqx_m = sumq2_m = 0;
+                for (int j = 0; j < 16; ++j) {
+                    float w = weight[j];
+                    float al = id*xb[j];
+                    int l = best_index_iq6nl(shifted_values, al);
+                    float q = shifted_values[l];
+                    sumqx_p += w*q*xb[j];
+                    sumq2_p += w*q*q;
+                    l = best_index_iq6nl(shifted_values, -al);
+                    q = shifted_values[l];
+                    sumqx_m += w*q*xb[j];
+                    sumq2_m += w*q*q;
+                }
+                if (sumq2_p > 0 && sumqx_p*sumqx_p > best*sumq2_p) {
+                    d = sumqx_p/sumq2_p; best = d * sumqx_p; is_shifted = true;
+                }
+                if (sumq2_m > 0 && sumqx_m*sumqx_m > best*sumq2_m) {
+                    d = sumqx_m/sumq2_m; best = d * sumqx_m; is_shifted = true;
+                }
+            }
+            if (d) {
+                const float * block_values = is_shifted ? shifted_values : values;
+                float sumqx = 0, sumq2 = 0;
+                id = 1/d;
+                for (int j = 0; j < 16; ++j) {
+                    float w = weight[j];
+                    float al = id*xb[j];
+                    int l = best_index_iq6nl(block_values, al);
+                    float q = block_values[l];
+                    sumqx += w*q*xb[j];
+                    sumq2 += w*q*q;
+                }
+                if (sumq2 > 0) d = sumqx/sumq2;
+            }
+            scales[ib] = d;
+            if (is_shifted) extra |= (1 << ib);
+
+            float abs_scale = fabsf(scales[ib]);
+            if (abs_scale > max_abs_scale) {
+                max_abs_scale = abs_scale; max_scale = scales[ib];
+            }
+
+        }
+
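+        // Encode: the super-block scale d is chosen so the sub-block scale of
+        // largest magnitude maps to -127, letting all 16 scales round to int8;
+        // d itself is then re-fitted against the actually chosen quants at the
+        // end of the loop below.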
+        if (!max_abs_scale) continue;
+        float d = -max_scale/127;
+        y[ibl].d = GGML_FP32_TO_FP16(d);
+        y[ibl].extra = extra;
+
+        float id = 1/d;
+
+        float sumqx = 0, sumq2 = 0;
+        for (int ib = 0; ib < QK_K/16; ++ib) {
+            int ls = nearest_int(id*scales[ib]);
+            ls = MAX(-127, MIN(127, ls));
+            y[ibl].scales[ib] |= ls;
+            float dl = d * ls;
+            if (dl) {
+                const float * block_values = y[ibl].extra & (1 << ib) ? shifted_values : values;
+                const float * xb = xbl + 16*ib;
+                if (quant_weights) {
+                    const float * qw = quant_weights + ibl*QK_K + ib*16;
+                    for (int j = 0; j < 16; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
+                } else {
+                    for (int j = 0; j < 16; ++j) weight[j] = 0.25f*sigma2 + xb[j]*xb[j];
+                }
+                float idl = 1/dl;
+                int ib32 = ib/2;
+                int offset = 16*(ib%2);
+                uint8_t * qs = y[ibl].qs + 32*(ib32/2) + offset;
+                uint8_t * qh = y[ibl].qh + 32*(ib32/4) + offset;
+                for (int j = 0; j < 16; ++j) {
+                    const float al = idl*xb[j];
+                    int ibest = best_index_iq6nl(block_values, al);
+                    qs[j] |= ((ibest & 0xf) << 4*(ib32%2));
+                    qh[j] |= ((ibest >>  4) << 2*(ib32%4));
+                    float w = weight[j];
+                    float q = block_values[ibest]*ls;
+                    sumqx += w*q*xb[j];
+                    sumq2 += w*q*q;
+                }
+            }
+        }
+        if (sumq2 > 0) y[ibl].d = GGML_FP32_TO_FP16(sumqx/sumq2);
+
+    }
+}
+
+}
+
+void quantize_row_iq6_k_ref(const float * x, block_iq6_k * y, int64_t k) {
+    assert(k % QK_K == 0);
+    quantize_iq6_k(x, (void *)y, 1, k, nullptr);
+}
+
+void quantize_row_iq6_k(const float * x, void * vy, int64_t k) {
+    assert(k % QK_K == 0);
+    block_iq6_k * y = (block_iq6_k *)vy;
+    quantize_row_iq6_k_ref(x, y, k);
+}
+
+size_t quantize_iq6_k(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
+    GGML_ASSERT(n_per_row%QK_K == 0);
+    int nblock = n_per_row/QK_K;
+    char * qrow = (char *)dst;
+    float values[128];
+    for (int i = 0; i < 64; ++i) {
+        values[i]    = iq6nl_values[i];
+        values[i+64] = values[i] + S_IQ6K;
+    }
+    for (int64_t row = 0; row < nrows; ++row) {
+        quantize_row_iq6_k_impl(src, (void *)qrow, n_per_row, imatrix, values, values + 64);
+        src += n_per_row;
+        qrow += nblock*sizeof(block_iq6_k);
+    }
+    return nrows * nblock * sizeof(block_iq6_k);
+}
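Every inner loop above applies the same closed-form fit: for fixed quant values q_j and importance weights w_j, the scale minimizing sum_j w_j*(x_j - d*q_j)^2 is d = sum(w*q*x)/sum(w*q*q), with residual sum(w*x*x) - sum(w*q*x)^2/sum(w*q*q). Candidates are therefore ranked by sumqx^2/sumq2, which is exactly what the sumqx_p*sumqx_p > best*sumq2_p comparisons compute without a division. A tiny standalone illustration with made-up numbers (not from the patch):

// Closed-form weighted least-squares scale fit, as used in the search above.
#include <cstdio>

int main() {
    const float x[4] = {0.9f, -2.1f, 3.2f, -0.4f}; // weights to quantize (made up)
    const float q[4] = {1.f, -2.f, 3.f, 0.f};      // chosen quant values
    const float w[4] = {1.f, 1.f, 2.f, 1.f};       // importance weights (made up)
    float sumqx = 0, sumq2 = 0;
    for (int j = 0; j < 4; ++j) { sumqx += w[j]*q[j]*x[j]; sumq2 += w[j]*q[j]*q[j]; }
    const float d = sumqx/sumq2; // optimal block scale
    // the weighted error at d equals sum(w*x*x) - sumqx*sumqx/sumq2, so
    // maximizing sumqx^2/sumq2 (the "best" bookkeeping) minimizes the error
    printf("d* = %g\n", d);
    return 0;
}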
 
 //
 // ========================== IQ2_TN
 //
@@ -1582,13 +1946,6 @@ void dequantize_row_iq2_tn(const block_iq2_tn * x, float * y, int64_t k) {
 }
 
 void vec_dot_iq2_tn_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) {
-    assert(n % QK_K == 0);
-    assert(nrc == 1);
-    GGML_UNUSED(nrc);
-    GGML_UNUSED(bx);
-    GGML_UNUSED(by);
-    GGML_UNUSED(bs);
-
     if (iqk_mul_mat(1, 1, n, GGML_TYPE_IQ2_TN, vx, 0, GGML_TYPE_Q8_K, vy, 0, s, 0, 0, 1)) {
         return;
     }
diff --git a/ggml/src/iqk/iqk_quantize.h b/ggml/src/iqk/iqk_quantize.h
index 80a9012b..3c5d27a4 100644
--- a/ggml/src/iqk/iqk_quantize.h
+++ b/ggml/src/iqk/iqk_quantize.h
@@ -37,6 +37,12 @@ size_t quantize_iq5_k(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst,
 void dequantize_row_iq5_k(const block_iq5_k * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void vec_dot_iq5_k_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
+void quantize_row_iq6_k_ref(const float * GGML_RESTRICT x, block_iq6_k * GGML_RESTRICT y, int64_t k);
+void quantize_row_iq6_k(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+size_t quantize_iq6_k(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+void dequantize_row_iq6_k(const block_iq6_k * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+void vec_dot_iq6_k_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+
 void quantize_row_iq2_tn_ref(const float * GGML_RESTRICT x, block_iq2_tn * GGML_RESTRICT y, int64_t k);
 void quantize_row_iq2_tn(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 size_t quantize_iq2_tn(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
diff --git a/include/llama.h b/include/llama.h
index a5a2deb1..9ae88060 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -174,7 +174,8 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_IQ3_K         = 39, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ4_K         = 40, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ5_K         = 41, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_IQ2_TN        = 42, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ6_K         = 42, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ2_TN        = 43, // except 1d tensors
 
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
diff --git a/src/llama.cpp b/src/llama.cpp
index 7a28314e..71be05f9 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3766,6 +3766,7 @@ struct llama_model_loader {
             case GGML_TYPE_IQ3_K:   ftype = LLAMA_FTYPE_MOSTLY_IQ3_K;   break;
             case GGML_TYPE_IQ4_K:   ftype = LLAMA_FTYPE_MOSTLY_IQ4_K;   break;
             case GGML_TYPE_IQ5_K:   ftype = LLAMA_FTYPE_MOSTLY_IQ5_K;   break;
+            case GGML_TYPE_IQ6_K:   ftype = LLAMA_FTYPE_MOSTLY_IQ6_K;   break;
             case GGML_TYPE_IQ3_S:   ftype = LLAMA_FTYPE_MOSTLY_IQ3_S;   break;
             case GGML_TYPE_Q4_0_4_4: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_4; break;
             case GGML_TYPE_Q4_0_4_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_8; break;
@@ -4465,6 +4466,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_IQ3_K:   return "IQ3_K - 3.4325 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ4_K:   return "IQ4_K - 4.5 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ5_K:   return "IQ5_K - 5.5 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ6_K:   return "IQ6_K - 6.6 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_S:   return "IQ3_S - 3.4375 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_M:   return "IQ3_S mix - 3.66 bpw";
         case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4";
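The advertised bit rate follows directly from the block layout: 2 bytes (d) + 2 (extra) + 16 (scales) + 128 (qs) + 64 (qh) = 212 bytes per 256 weights, i.e. 212*8/256 = 6.625 bpw, rounded to 6.6 in the user-facing strings. A compile-time sanity check under that layout (illustrative, not part of the patch; mirrors the ggml-common.h struct with QK_K == 256):

#include <cstdint>

struct block_iq6_k_layout {
    uint16_t d;              // ggml_half super-block scale
    uint16_t extra;          // one shift bit per 16-weight sub-block
    int8_t   scales[256/16]; // int8 sub-block scales
    uint8_t  qs[256/2];      // low 4 bits of the 6-bit indices
    uint8_t  qh[256/4];      // high 2 bits of the 6-bit indices
};
static_assert(sizeof(block_iq6_k_layout) == 212, "2 + 2 + 16 + 128 + 64 bytes");
static_assert(212 * 8 == 6 * 256 + 160, "6 + 160/256 = 6.625 bits per weight");

With the hunks below applied as well, the type becomes selectable by name in the quantize example via its QUANT_OPTIONS entry (e.g. passing IQ6_K as the ftype argument; the exact binary name depends on the build).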
@@ -15418,7 +15420,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_K) {
                 new_type = GGML_TYPE_Q5_K;
             }
-            else if (new_type != GGML_TYPE_Q8_0) {
+            else if (new_type != GGML_TYPE_Q8_0 && new_type != GGML_TYPE_IQ6_K) {
                 new_type = GGML_TYPE_Q6_K;
             }
         }
@@ -15645,7 +15647,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         new_type == GGML_TYPE_IQ2_XS  || new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_S ||
         new_type == GGML_TYPE_IQ3_XXS || new_type == GGML_TYPE_IQ1_S   || new_type == GGML_TYPE_IQ3_S ||
         new_type == GGML_TYPE_IQ1_M   || new_type == GGML_TYPE_IQ4_K   || new_type == GGML_TYPE_IQ2_K ||
-        new_type == GGML_TYPE_IQ5_K   || new_type == GGML_TYPE_IQ3_K   || new_type == GGML_TYPE_IQ2_TN) {
+        new_type == GGML_TYPE_IQ5_K   || new_type == GGML_TYPE_IQ3_K   || new_type == GGML_TYPE_IQ2_TN ||
+        new_type == GGML_TYPE_IQ6_K) {
         int nx = tensor->ne[0];
         int ny = tensor->ne[1];
         if (nx % QK_K != 0) {
@@ -15680,6 +15683,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             case GGML_TYPE_Q4_K:   new_type = GGML_TYPE_Q5_0;   break;
             case GGML_TYPE_IQ5_K:
             case GGML_TYPE_Q5_K:   new_type = GGML_TYPE_Q5_1;   break;
+            case GGML_TYPE_IQ6_K:
             case GGML_TYPE_Q6_K:   new_type = GGML_TYPE_Q8_0;   break;
             default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
         }
@@ -15786,6 +15790,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_IQ3_K: default_type = GGML_TYPE_IQ3_K; break;
         case LLAMA_FTYPE_MOSTLY_IQ4_K: default_type = GGML_TYPE_IQ4_K; break;
         case LLAMA_FTYPE_MOSTLY_IQ5_K: default_type = GGML_TYPE_IQ5_K; break;
+        case LLAMA_FTYPE_MOSTLY_IQ6_K: default_type = GGML_TYPE_IQ6_K; break;
         case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break;
         case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = GGML_TYPE_IQ3_S; break;
         case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: default_type = GGML_TYPE_Q4_0_4_4; break;