Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

IQ4_XS_R4 #123

Merged
merged 5 commits into from
Dec 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions examples/quantize/quantize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1764 ppl @ LLaMA-v1-7B", },
{ "IQ4_NL", LLAMA_FTYPE_MOSTLY_IQ4_NL, " 4.50 bpw non-linear quantization", },
{ "IQ4_NL_X4",LLAMA_FTYPE_MOSTLY_IQ4_NL_X4," 4.50 bpw non-linear quantization", },
{ "IQ4_XS_R4",LLAMA_FTYPE_MOSTLY_IQ4_XS_R4," 4.25 bpw non-linear quantization", },
{ "Q4_0_R4", LLAMA_FTYPE_MOSTLY_Q4_0_R4, " 4.50 bpw quantization", },
{ "Q5_0_R4", LLAMA_FTYPE_MOSTLY_Q5_0_R4, " 5.50 bpw quantization", },
{ "Q6_0_R4", LLAMA_FTYPE_MOSTLY_Q6_0_R4, " 6.50 bpw quantization", },
Expand Down
4 changes: 3 additions & 1 deletion ggml/include/ggml.h
Original file line number Diff line number Diff line change
Expand Up @@ -410,7 +410,8 @@ extern "C" {
GGML_TYPE_Q4_0_R4 = 202,
GGML_TYPE_Q5_0_R4 = 206,
GGML_TYPE_Q8_0_R4 = 208,
GGML_TYPE_IQ4_NL_X4 = 220,
GGML_TYPE_IQ4_NL_X4 = 220, // TODO: rename GGML_TYPE_IQ4_NL_X4 to GGML_TYPE_IQ4_NL_R4
GGML_TYPE_IQ4_XS_R4 = 223,
GGML_TYPE_Q6_0_R4 = 233,
GGML_TYPE_COUNT,
};
Expand Down Expand Up @@ -475,6 +476,7 @@ extern "C" {
GGML_FTYPE_MOSTLY_Q8_0_R4 = 207, // except 1d tensors
GGML_FTYPE_MOSTLY_Q5_0_R4 = 208, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ4_NL_X4 = 219, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ4_XS_R4 = 222, // except 1d tensors
GGML_FTYPE_MOSTLY_Q6_0_R4 = 227, // except 1d tensors
};

Expand Down
8 changes: 8 additions & 0 deletions ggml/src/ggml-common.h
Original file line number Diff line number Diff line change
Expand Up @@ -447,6 +447,14 @@ typedef struct {
} block_iq4_xs;
static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");

// Block layout for the IQ4_XS_R4 type.
// Each member is sized as four times the matching component that the
// block_iq4_xs size check lists (ggml_half -> d[4], QK_K/64 -> QK_K/16,
// QK_K/2 -> QK_K*2); scales_h equals 4*sizeof(uint16_t) when QK_K == 256.
// Under that assumption one block_iq4_xs_r4 is exactly as large as four
// block_iq4_xs.
// NOTE(review): the "_r4" suffix presumably means that the data of 4 rows is
// interleaved inside one block - confirm against the quantization code.
typedef struct {
    ggml_half d[4];                // 4 half-precision values (presumably one block scale per row - TODO confirm)
    uint8_t   scales_h[QK_K/32];   // presumably the high bits of the sub-block scales - TODO confirm
    uint8_t   scales_l[QK_K/16];   // presumably the low bits of the sub-block scales - TODO confirm
    uint8_t   qs[QK_K*2];          // QK_K*2 bytes of packed quants (4 * QK_K/2)
} block_iq4_xs_r4;
// Guard against unexpected padding: the struct must be exactly the sum of its members.
static_assert(sizeof(block_iq4_xs_r4) == 4*sizeof(ggml_half) + QK_K/32 + QK_K/16 + QK_K*2, "wrong iq4_xs_r4 block size/padding");

typedef struct {
uint8_t scales[QK_K/32];
uint8_t qs[QK_K/2];
Expand Down
1 change: 1 addition & 0 deletions ggml/src/ggml-quants.c
Original file line number Diff line number Diff line change
Expand Up @@ -15197,6 +15197,7 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
case GGML_TYPE_IQ4_KS: break;
case GGML_TYPE_IQ4_KSS: break;
case GGML_TYPE_IQ4_NL_X4: break;
case GGML_TYPE_IQ4_XS_R4: break;
case GGML_TYPE_Q4_0_R4: break;
case GGML_TYPE_Q5_0_R4: break;
case GGML_TYPE_Q6_0_R4: break;
Expand Down
22 changes: 22 additions & 0 deletions ggml/src/ggml.c
Original file line number Diff line number Diff line change
Expand Up @@ -1262,6 +1262,19 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.nrows = 1,
.row_meta_size = 0,
},
[GGML_TYPE_IQ4_XS_R4] = {
.type_name = "iq4_xs_r4",
.blck_size = QK_K,
.type_size = sizeof(block_iq4_xs),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_iq4_xs_r4,
.from_float = quantize_row_iq4_xs_r4,
.from_float_ref = (ggml_from_float_t)quantize_row_iq4_xs_r4_ref,
.vec_dot = vec_dot_iq4_xs_r4_q8_k,
.vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
.row_meta_size = 0,
},
[GGML_TYPE_Q4_0_R4] = {
.type_name = "q4_0_r4",
.blck_size = QK4_NL,
Expand Down Expand Up @@ -3989,6 +4002,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
case GGML_FTYPE_MOSTLY_IQ2_BN: wtype = GGML_TYPE_IQ2_BN; break;
case GGML_FTYPE_MOSTLY_IQ4_NL: wtype = GGML_TYPE_IQ4_NL; break;
case GGML_FTYPE_MOSTLY_IQ4_NL_X4: wtype = GGML_TYPE_IQ4_NL_X4;break;
case GGML_FTYPE_MOSTLY_IQ4_XS_R4: wtype = GGML_TYPE_IQ4_XS_R4;break;
case GGML_FTYPE_MOSTLY_Q4_0_R4: wtype = GGML_TYPE_Q4_0_R4; break;
case GGML_FTYPE_MOSTLY_Q5_0_R4: wtype = GGML_TYPE_Q5_0_R4; break;
case GGML_FTYPE_MOSTLY_Q6_0_R4: wtype = GGML_TYPE_Q6_0_R4; break;
Expand Down Expand Up @@ -10517,6 +10531,7 @@ static void ggml_compute_forward_add(
case GGML_TYPE_IQ2_BN:
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_IQ4_NL_X4:
case GGML_TYPE_IQ4_XS_R4:
case GGML_TYPE_Q4_0_R4:
case GGML_TYPE_Q5_0_R4:
case GGML_TYPE_Q6_0_R4:
Expand Down Expand Up @@ -10964,6 +10979,7 @@ static void ggml_compute_forward_add1(
case GGML_TYPE_IQ2_BN:
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_IQ4_NL_X4:
case GGML_TYPE_IQ4_XS_R4:
case GGML_TYPE_Q4_0_R4:
case GGML_TYPE_Q5_0_R4:
case GGML_TYPE_Q6_0_R4:
Expand Down Expand Up @@ -11108,6 +11124,7 @@ static void ggml_compute_forward_acc(
case GGML_TYPE_IQ2_BN:
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_IQ4_NL_X4:
case GGML_TYPE_IQ4_XS_R4:
case GGML_TYPE_Q4_0_R4:
case GGML_TYPE_Q5_0_R4:
case GGML_TYPE_Q6_0_R4:
Expand Down Expand Up @@ -14298,6 +14315,7 @@ static void ggml_compute_forward_out_prod(
case GGML_TYPE_IQ2_BN:
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_IQ4_NL_X4:
case GGML_TYPE_IQ4_XS_R4:
case GGML_TYPE_Q4_0_R4:
case GGML_TYPE_Q5_0_R4:
case GGML_TYPE_Q6_0_R4:
Expand Down Expand Up @@ -14682,6 +14700,7 @@ static void ggml_compute_forward_set(
case GGML_TYPE_IQ2_BN:
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_IQ4_NL_X4:
case GGML_TYPE_IQ4_XS_R4:
case GGML_TYPE_Q4_0_R4:
case GGML_TYPE_Q5_0_R4:
case GGML_TYPE_Q6_0_R4:
Expand Down Expand Up @@ -14960,6 +14979,7 @@ static void ggml_compute_forward_get_rows(
case GGML_TYPE_IQ2_BN:
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_IQ4_NL_X4:
case GGML_TYPE_IQ4_XS_R4:
case GGML_TYPE_Q4_0_R4:
case GGML_TYPE_Q5_0_R4:
case GGML_TYPE_Q6_0_R4:
Expand Down Expand Up @@ -15565,6 +15585,7 @@ static void ggml_compute_forward_clamp(
case GGML_TYPE_IQ2_BN:
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_IQ4_NL_X4:
case GGML_TYPE_IQ4_XS_R4:
case GGML_TYPE_Q4_0_R4:
case GGML_TYPE_Q5_0_R4:
case GGML_TYPE_Q6_0_R4:
Expand Down Expand Up @@ -22396,6 +22417,7 @@ size_t ggml_quantize_chunk(
case GGML_TYPE_IQ2_BN: result = quantize_iq2_bn (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ4_NL_X4: result = quantize_iq4_nl_x4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ4_XS_R4: result = quantize_iq4_xs_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q4_0_R4: result = quantize_q4_0_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q5_0_R4: result = quantize_q5_0_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q6_0_R4: result = quantize_q6_0_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
Expand Down
Loading