From bfd2f21fb43525a8757a8c9e44032fd14bac222b Mon Sep 17 00:00:00 2001
From: Francis Couture-Harpin
Date: Fri, 28 Jun 2024 20:38:12 -0400
Subject: [PATCH] bitnet : replace 1.58b with b1.58, as in the paper

---
 convert-hf-to-gguf.py          | 2 +-
 examples/quantize/quantize.cpp | 4 ++--
 ggml/src/ggml-common.h         | 2 +-
 ggml/src/ggml-quants.c         | 2 +-
 src/llama.cpp                  | 4 ++--
 5 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 2bf0967ce4f91..eb5aaebac63af 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -300,7 +300,7 @@ def write_tensors(self):
 
             if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32:
                 # TODO: cleaner model-specific per-tensor types
-                # NOTE: Q1_3 is only relevant for BitNet 1.58b
+                # NOTE: Q1_3 is only relevant for BitNet b1.58
                 if (
                     self.ftype == gguf.LlamaFileType.MOSTLY_Q1_3
                     and gguf.can_quantize_to_q1_3(data)
diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 43241df6087c7..aed39a4d00777 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -26,8 +26,8 @@ static const std::vector<quant_option> QUANT_OPTIONS = {
     { "IQ2_M",   LLAMA_FTYPE_MOSTLY_IQ2_M,   " 2.7  bpw quantization",           },
     { "IQ1_S",   LLAMA_FTYPE_MOSTLY_IQ1_S,   " 1.56 bpw quantization",           },
     { "IQ1_M",   LLAMA_FTYPE_MOSTLY_IQ1_M,   " 1.75 bpw quantization",           },
-    { "Q1_3",    LLAMA_FTYPE_MOSTLY_Q1_3,    " 1.63 bpw for BitNet 1.58b",       },
-    { "Q2_2",    LLAMA_FTYPE_MOSTLY_Q2_2,    " 2.00 bpw for BitNet 1.58b",       },
+    { "Q1_3",    LLAMA_FTYPE_MOSTLY_Q1_3,    " 1.63 bpw for BitNet b1.58",       },
+    { "Q2_2",    LLAMA_FTYPE_MOSTLY_Q2_2,    " 2.00 bpw for BitNet b1.58",       },
     { "Q2_K",    LLAMA_FTYPE_MOSTLY_Q2_K,    " 2.96G, +3.5199 ppl @ Llama-3-8B", },
     { "Q2_K_S",  LLAMA_FTYPE_MOSTLY_Q2_K_S,  " 2.96G, +3.1836 ppl @ Llama-3-8B", },
     { "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS, " 3.06 bpw quantization",           },
diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h
index 9c680e3b1c05f..71901565158fa 100644
--- a/ggml/src/ggml-common.h
+++ b/ggml/src/ggml-common.h
@@ -137,7 +137,7 @@ typedef sycl::half2 ggml_half2;
 
 #endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP
 
-// 1.625 bpw for BitNet 1.58b models
+// 1.625 bpw for BitNet b1.58 models
 #define QK1_3 64
 typedef struct {
     uint8_t q[(QK1_3 - 4*QK1_3/64)/5]; // 5 elements per byte (3^5 = 243 < 256)
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 5dd682b602d56..e1197f4733b51 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -3366,7 +3366,7 @@ size_t quantize_q2_2(const float * restrict src, void * restrict dst, int64_t nr
     return nrow * row_size;
 }
 
-// ====================== 1.625 bpw (de)-quantization (BitNet 1.58b)
+// ====================== 1.625 bpw (de)-quantization (BitNet b1.58)
 
 void quantize_row_q1_3_reference(const float * restrict x, block_q1_3 * restrict y, int64_t k) {
     assert(k % QK1_3 == 0);
diff --git a/src/llama.cpp b/src/llama.cpp
index fa2d97e65d472..750455e33509f 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -4186,8 +4186,8 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_ALL_F32:     return "all F32";
         case LLAMA_FTYPE_MOSTLY_F16:  return "F16";
         case LLAMA_FTYPE_MOSTLY_BF16: return "BF16";
-        case LLAMA_FTYPE_MOSTLY_Q1_3: return "Q1_3 - 1.625 bpw for BitNet 1.58b";
-        case LLAMA_FTYPE_MOSTLY_Q2_2: return "Q2_2 - 2.000 bpw for BitNet 1.58b";
+        case LLAMA_FTYPE_MOSTLY_Q1_3: return "Q1_3 - 1.625 bpw for BitNet b1.58";
+        case LLAMA_FTYPE_MOSTLY_Q2_2: return "Q2_2 - 2.000 bpw for BitNet b1.58";
         case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
         case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
         case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: