Skip to content

Commit

Permalink
add base F08 support
Browse files Browse the repository at this point in the history
same as https://github.com/Djip007/llama.cpp/tree/feature/fp8
No performance work yet — only minimal support.

Add
E5M2 / E4M3 for natively distributed models
E4M3_Q / E3M4_Q for creating GGUF-quantized models
  • Loading branch information
Djip007 committed Oct 27, 2024
1 parent b0efa25 commit cbd3abd
Show file tree
Hide file tree
Showing 10 changed files with 506 additions and 92 deletions.
2 changes: 2 additions & 0 deletions llama.cpp/BUILD.mk
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ o/$(MODE)/llama.cpp/ggml-vector-arm80.o \
o/$(MODE)/llama.cpp/ggml-vector-arm82.o: \
private CCFLAGS += -O3 -mgcc

o/$(MODE)/llama.cpp/ggml-fp8.o: private CCFLAGS += -O3 -mgcc -fopenmp-simd

o/$(MODE)/llama.cpp/ggml-alloc.o \
o/$(MODE)/llama.cpp/ggml-backend.o \
o/$(MODE)/llama.cpp/grammar-parser.o \
Expand Down
35 changes: 35 additions & 0 deletions llama.cpp/ggml-common.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,15 @@ typedef uint32_t ggml_half2;

#define GGML_COMMON_AGGR

#define GGML_COMMON_DECL
#elif defined(GGML_COMMON_DECL_CPP)
#include <cstdint>

typedef uint16_t ggml_half;
typedef uint32_t ggml_half2;

#define GGML_COMMON_AGGR data

#define GGML_COMMON_DECL
#elif defined(GGML_COMMON_DECL_METAL)
#include <metal_stdlib>
Expand Down Expand Up @@ -411,6 +420,25 @@ typedef struct {
} block_iq4_xs;
static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");

// fp8 support
// - fp8 simple types (one byte each): E<n>M<m> = n exponent bits, m mantissa bits
typedef struct { uint8_t bits; } ggml_e5m2_t;
typedef struct { uint8_t bits; } ggml_e4m3_t;
typedef struct { uint8_t bits; } ggml_e3m4_t;

// - fp8 with per-block delta (scale) => 8.125 bpw (8 bits/quant + 32 bits/QK_K block)
typedef struct {
    float d;          // delta (per-block scale)
    uint8_t qs[QK_K]; // E4M3 quants
} block_e4m3_q;
static_assert(sizeof(block_e4m3_q) == sizeof(float) + QK_K, "wrong block_e4m3_q block size/padding");

typedef struct {
    float d;          // delta (per-block scale)
    uint8_t qs[QK_K]; // E3M4 quants
} block_e3m4_q;
static_assert(sizeof(block_e3m4_q) == sizeof(float) + QK_K, "wrong block_e3m4_q block size/padding");

#endif // GGML_COMMON_DECL
#endif // GGML_COMMON_DECL

Expand All @@ -424,6 +452,13 @@ static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_
#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
#define GGML_TABLE_END() };

#define GGML_COMMON_IMPL
#elif defined(GGML_COMMON_IMPL_CPP)
#include <cstdint>

#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
#define GGML_TABLE_END() };

#define GGML_COMMON_IMPL
#elif defined(GGML_COMMON_IMPL_METAL)
#include <metal_stdlib>
Expand Down
268 changes: 268 additions & 0 deletions llama.cpp/ggml-fp8.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,268 @@
#include <cassert>
#include <cmath>
#include <algorithm>

#define GGML_COMMON_DECL_CPP
#include "ggml-common.h"
#include "ggml.h"

#include "ggml-fp8.h"

//#define target_clones(x86,arm) __attribute__((__target_clones__(x86 ",default")))
//#define target_clones(x86,arm) __attribute__((__target_clones__(arm ",default")))
//target_clones("arch=znver4,avx512f,avx2","simd")
#ifdef __x86_64__
#define target_clones() __attribute__((__target_clones__("arch=znver4,avx512f,avx2,default"),flatten))
#elif defined(__aarch64__)
// https://gcc.gnu.org/onlinedocs/gcc/AArch64-Options.html
// result in: undefined reference to `__aarch64_cpu_features'
//#define target_clones() __attribute__((__target_clones__("simd,default"),flatten))
#define target_clones() __attribute__((flatten))
#else
#define target_clones()
#endif

/*
> build:
make clean
make -j16
make -j16 install PREFIX=/home/philou/LLM/usr/
./usr/bin/llamafile-quantize Mistral-Nemo-Instruct-2407.BF16.gguf Mistral-Nemo-Instruct-2407.E3M4_Q.gguf E3M4_Q
./usr/bin/llamafile -m Mistral-Nemo-Instruct-2407.E3M4_Q.gguf -c 128 -n 16 -t 0 -s 42 -p "[INST]bonjour a tu un nom. je ne sais pas comment t'appeler. Si tu n'en as pas je peux t'appeler TINTIN[/INST]"
*/

// Compile-time power of two: returns 2^N as a float (N may be negative).
template<int N> constexpr float EXP2() {
    if constexpr (N > 0) {
        return 2 * EXP2<N-1>();
    } else if constexpr (N < 0) {
        return EXP2<N+1>() / 2;
    } else {
        return 1;
    }
}

// Compile-time integer power of two: returns 2^N as an int, for N >= 0 only.
// The original version had no branch for N < 0, so such an instantiation fell
// off the end of a value-returning function; the static_assert turns that
// silent hole into a clear compile-time diagnostic.
template<int N> constexpr int EXP_I2() {
    static_assert(N >= 0, "EXP_I2<N> requires N >= 0");
    if constexpr (N==0) return 1;
    else return EXP_I2<N-1>()*2;
}

// Generic 8-bit float with _E exponent bits and (7-_E) mantissa bits:
// 1 sign bit, _E biased exponent bits, M = 7-_E mantissa bits.
// NOTE(review): no Inf/NaN encodings are produced on conversion — values
// >= MAX() saturate to the largest finite pattern (0x7E); confirm this
// saturating behavior is intended for the E5M2 (IEEE-like) variant too.
template<int _E> //, int M=7-E> 1.7 bits!
struct FP8 {
    uint8_t bits;                               // raw encoding: S EEE.. MMM..
    using type = FP8<_E>;
    static constexpr int E=_E;                  // exponent bit count
    static constexpr int M=7-_E;                // mantissa bit count
    static constexpr int E_BIAS=EXP2<_E-1>()-1; // exponent bias = 2^(E-1)-1
    // Largest finite value: (2 - 2^(1-M)) * 2^(2^(E-1))
    static constexpr float MAX() { return (2-EXP2<-M+1>())*EXP2<EXP_I2<_E-1>()>(); }
    // Smallest positive (subnormal) value: 2^-M * 2^(2 - 2^(E-1))
    static constexpr float MIN() { return EXP2<-M>()*EXP2<2-EXP_I2<_E-1>()>(); }
    //=============================================

    // float -> FP8 conversion: saturating, rounds by adding half a ulp
    // before truncating the mantissa.
#pragma omp declare simd
    void operator=(float value) {
        union {
            float f;
            uint32_t bits;
        } in = {value};
        // the sign bit: float bit 31 -> fp8 bit 7
        bits = (in.bits >> 24) & 0x80;
        // keep the magnitude, without the sign
        in.bits &= 0x7fffffff;
        //GGML_ASSERT(in.bits < 0x7f800000); // +/- infinity or NaN
        if (in.f >= MAX()) {
            // overflow: saturate to the largest finite encoding
            bits |= 0x7E;
        } else if (in.f<MIN()) { // => 0.
            // underflow: result is S.0000000 (signed zero)
        } else {
            // rescale so the float exponent carries the fp8 bias
            in.f *= EXP2<E_BIAS-127>();
            in.bits += 1<<(22-M); // for rounding (half a ulp; carry may bump the exponent)
            bits |= (in.bits >> (23-M)) & 0x7F;
        }
    }

    // FP8 -> float conversion: widen sign/exponent/mantissa fields into a
    // float, then undo the bias difference with a power-of-two multiply.
#pragma omp declare simd
    operator float () const {
        union {
            float f;
            uint32_t bits;
        } out = {0};
        // the sign bit back to float bit 31
        out.bits = bits & 0x80;
        out.bits <<= 24;
        uint32_t _bits = bits & 0x7F;
        _bits <<= (23-M);
        out.bits |= _bits;
        // rescale from the fp8 bias to the float bias
        out.f *= EXP2<127-E_BIAS>();
        return out.f;
    }
};

// Expand `size` plain FP8 values into 32-bit floats.
template<int E>
target_clones()
static inline void conv(const FP8<E>* x, float* y, int64_t size) {
#pragma omp simd
    for (int64_t idx = 0; idx < size; ++idx) {
        y[idx] = static_cast<float>(x[idx]);
    }
}

// Convert `size` floats to plain FP8 (FP8::operator= performs the rounding).
template<int E>
target_clones()
static inline void conv(const float* x, FP8<E>* y, int64_t size) {
#pragma omp simd
    for (int64_t idx = 0; idx < size; ++idx) {
        y[idx] = x[idx];
    }
}

// Dot product of a plain FP8 row with a float row.
template<int E>
target_clones()
static inline float dot(const FP8<E>* x, const float* y, int64_t size) {
    float acc = 0;
#pragma omp simd reduction(+:acc)
    for (int64_t idx = 0; idx < size; ++idx) {
        acc += static_cast<float>(x[idx]) * y[idx];
    }
    return acc;
}

// One quantization block: a float scale `d` plus QK FP8 quants.
// Layout must stay bit-compatible with block_e4m3_q / block_e3m4_q from
// ggml-common.h: the C API below reinterpret_casts between the two.
template <int E, int QK>
struct bloc_fp8 {
    float d;       // per-block scale (delta)
    FP8<E> qs[QK]; // quantized values
};

// Dequantize block FP8: y[i] = qs[i] * d, one block of QK values at a time.
template <int E, int QK>
target_clones()
static inline void conv(const bloc_fp8<E, QK>* x, float* y, int64_t size) {
    const auto nb_blocks = size / QK;
    for (int64_t b = 0; b < nb_blocks; ++b) {
#pragma omp simd
        for (int64_t j = 0; j < QK; j++) {
            y[b*QK + j] = static_cast<float>(x[b].qs[j]) * x[b].d;
        }
    }
}

// Quantize floats into block FP8: each block of QK values is scaled by its
// absolute maximum so the quants span the full FP8 range; `d` stores the
// inverse scale needed to dequantize.
template <int E, int QK>
target_clones()
static inline void conv(const float* x, bloc_fp8<E, QK>* y, int64_t size) {
    const auto qk_size = size / QK;
    for (int64_t q=0; q<qk_size; ++q) {
        // absolute maximum of the block
        float m = 0;
#pragma omp simd reduction(max:m)
        for (int64_t i=0; i<QK; i++) {
            m = std::max(std::abs(x[q*QK+i]),m);
        }
        // Guard the all-zero block: m == 0 would make D infinite and
        // 0.0f * inf produces NaN quants, which ggml_validate_row_data
        // then rejects as corrupt data.
        const float D = (m > 0) ? FP8<E>::MAX()/m : 0.0f;
        y[q].d = m/FP8<E>::MAX();
#pragma omp simd
        for (int64_t i=0; i<QK; i++) {
            y[q].qs[i] = x[q*QK+i]*D;
        }
    }
}

// Dot product of a block-FP8 row with a float row: accumulate each block's
// partial sum, then apply the block scale once per block.
template <int E, int QK>
target_clones()
static inline float dot(const bloc_fp8<E, QK>* x, const float* y, int64_t size) {
    const auto nb_blocks = size / QK;
    float acc = 0;
    for (int64_t b = 0; b < nb_blocks; ++b) {
        float partial = 0;
#pragma omp simd reduction(+:partial)
        for (int64_t j = 0; j < QK; j++) {
            partial += static_cast<float>(x[b].qs[j]) * y[b*QK + j];
        }
        acc += x[b].d * partial;
    }
    return acc;
}

// the C API: thin wrappers that cast the opaque one-byte C types onto the
// matching FP8<E> template and forward to the SIMD conv() helpers.

// E5M2: 5 exponent bits / 2 mantissa bits.
void ggml_e5m2_to_fp32_row(const ggml_e5m2_t * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
    conv(reinterpret_cast<const FP8<5>*>(x), y, k);
}
void ggml_fp32_to_e5m2_row(const float * GGML_RESTRICT x, ggml_e5m2_t * GGML_RESTRICT y, int64_t k) {
    conv(x, reinterpret_cast<FP8<5>*>(y), k);
}
// Scalar element-by-element variant (no #pragma omp simd loop);
// presumably the reference path for validation — verify against callers.
void ggml_fp32_to_e5m2_row_ref(const float * GGML_RESTRICT x, ggml_e5m2_t * GGML_RESTRICT y, int64_t k) {
    for (int64_t i =0; i<k; ++i) {
        reinterpret_cast<FP8<5>*>(y)[i] = x[i];
    }
}

// E4M3: 4 exponent bits / 3 mantissa bits.
void ggml_e4m3_to_fp32_row(const ggml_e4m3_t * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
    conv(reinterpret_cast<const FP8<4>*>(x), y, k);
}
void ggml_fp32_to_e4m3_row(const float * GGML_RESTRICT x, ggml_e4m3_t * GGML_RESTRICT y, int64_t k) {
    conv(x, reinterpret_cast<FP8<4>*>(y), k);
}
// Scalar element-by-element variant, see ggml_fp32_to_e5m2_row_ref.
void ggml_fp32_to_e4m3_row_ref(const float * GGML_RESTRICT x, ggml_e4m3_t * GGML_RESTRICT y, int64_t k) {
    for (int64_t i =0; i<k; ++i) {
        reinterpret_cast<FP8<4>*>(y)[i] = x[i];
    }
}

// Block-quantized FP8 (de)quantization. `k` must be a multiple of QK_K;
// the casts rely on bloc_fp8<E, QK_K> sharing the layout of block_e4m3_q /
// block_e3m4_q (float d + QK_K one-byte quants).

void dequantize_row_e4m3_q(const block_e4m3_q * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
    assert(k % QK_K == 0);
    conv(reinterpret_cast<const bloc_fp8<4, QK_K>*>(x), y, k);
}
void quantize_row_e4m3_q(const float * GGML_RESTRICT x, block_e4m3_q * GGML_RESTRICT y, int64_t k) {
    assert(k % QK_K == 0);
    conv(x, reinterpret_cast<bloc_fp8<4, QK_K>*>(y), k);
}
// NOTE(review): the _ref variant is currently identical to the non-ref one
// (same conv call) rather than a scalar reference implementation.
void quantize_row_e4m3_q_ref(const float * GGML_RESTRICT x, block_e4m3_q * GGML_RESTRICT y, int64_t k) {
    assert(k % QK_K == 0);
    conv(x, reinterpret_cast<bloc_fp8<4, QK_K>*>(y), k);
}

void dequantize_row_e3m4_q(const block_e3m4_q * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
    assert(k % QK_K == 0);
    conv(reinterpret_cast<const bloc_fp8<3, QK_K>*>(x), y, k);
}
void quantize_row_e3m4_q(const float * GGML_RESTRICT x, block_e3m4_q * GGML_RESTRICT y, int64_t k) {
    assert(k % QK_K == 0);
    conv(x, reinterpret_cast<bloc_fp8<3, QK_K>*>(y), k);
}
// NOTE(review): identical to quantize_row_e3m4_q, see note above on the
// e4m3 _ref variant.
void quantize_row_e3m4_q_ref(const float * GGML_RESTRICT x, block_e3m4_q * GGML_RESTRICT y, int64_t k) {
    assert(k % QK_K == 0);
    conv(x, reinterpret_cast<bloc_fp8<3, QK_K>*>(y), k);
}

// the dot product for FP8 weight
// All variants compute *s = dot(vx, vy) over n elements. Only nrc == 1 is
// supported (single row), and the stride parameters bs/bx/by are unused —
// the rows are assumed contiguous.
void ggml_vec_dot_e5m2(int n, float * GGML_RESTRICT s, size_t bs, const ggml_e5m2_t * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(nrc == 1);
    GGML_UNUSED(nrc);
    GGML_UNUSED(bx);
    GGML_UNUSED(by);
    GGML_UNUSED(bs);
    *s = dot(reinterpret_cast<const FP8<5>*>(vx), vy, n);
}

void ggml_vec_dot_e4m3(int n, float * GGML_RESTRICT s, size_t bs, const ggml_e4m3_t * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(nrc == 1);
    GGML_UNUSED(nrc);
    GGML_UNUSED(bx);
    GGML_UNUSED(by);
    GGML_UNUSED(bs);
    *s = dot(reinterpret_cast<const FP8<4>*>(vx), vy, n);
}

// Block-quantized variants: the per-block scale is applied inside dot().
void ggml_vec_dot_e4m3_q(int n, float * GGML_RESTRICT s, size_t bs, const block_e4m3_q * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(nrc == 1);
    GGML_UNUSED(nrc);
    GGML_UNUSED(bx);
    GGML_UNUSED(by);
    GGML_UNUSED(bs);
    *s = dot(reinterpret_cast<const bloc_fp8<4, QK_K>*>(vx), vy, n);
}

void ggml_vec_dot_e3m4_q(int n, float * GGML_RESTRICT s, size_t bs, const block_e3m4_q * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(nrc == 1);
    GGML_UNUSED(nrc);
    GGML_UNUSED(bx);
    GGML_UNUSED(by);
    GGML_UNUSED(bs);
    *s = dot(reinterpret_cast<const bloc_fp8<3, QK_K>*>(vx), vy, n);
}
33 changes: 33 additions & 0 deletions llama.cpp/ggml-fp8.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
// This is more an .inc than a standalone header: it expects GGML_API,
// GGML_RESTRICT and the fixed-width integer types to already be in scope.
#ifdef __cplusplus
extern "C" {
#endif

// Note: the ggml_e*m*_t and block_e*_q types are defined in ggml-common.h

// Row converters between fp32 and the plain FP8 types (the _ref variants
// are the scalar reference implementations).
GGML_API void ggml_e5m2_to_fp32_row(const ggml_e5m2_t * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void ggml_fp32_to_e5m2_row(const float * GGML_RESTRICT x, ggml_e5m2_t * GGML_RESTRICT y, int64_t k);
GGML_API void ggml_fp32_to_e5m2_row_ref(const float * GGML_RESTRICT x, ggml_e5m2_t * GGML_RESTRICT y, int64_t k);

GGML_API void ggml_e4m3_to_fp32_row(const ggml_e4m3_t * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void ggml_fp32_to_e4m3_row(const float * GGML_RESTRICT x, ggml_e4m3_t * GGML_RESTRICT y, int64_t k);
GGML_API void ggml_fp32_to_e4m3_row_ref(const float * GGML_RESTRICT x, ggml_e4m3_t * GGML_RESTRICT y, int64_t k);

// Block-quantized FP8 (de)quantization; k must be a multiple of QK_K.
GGML_API void dequantize_row_e4m3_q(const block_e4m3_q * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void quantize_row_e4m3_q(const float * GGML_RESTRICT x, block_e4m3_q * GGML_RESTRICT y, int64_t k);
GGML_API void quantize_row_e4m3_q_ref(const float * GGML_RESTRICT x, block_e4m3_q * GGML_RESTRICT y, int64_t k);

GGML_API void dequantize_row_e3m4_q(const block_e3m4_q * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void quantize_row_e3m4_q(const float * GGML_RESTRICT x, block_e3m4_q * GGML_RESTRICT y, int64_t k);
GGML_API void quantize_row_e3m4_q_ref(const float * GGML_RESTRICT x, block_e3m4_q * GGML_RESTRICT y, int64_t k);

// TODO: the best choice depends on the CPU: fp32 / bf16 / fp16
#define GGML_FP8_VECT_DOT_TYPE GGML_TYPE_F32
// Dot products of an FP8 row (plain or block-quantized) with an fp32 row;
// only nrc == 1 is supported.
GGML_API void ggml_vec_dot_e5m2(int n, float * GGML_RESTRICT s, size_t bs, const ggml_e5m2_t * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT vy, size_t by, int nrc);
GGML_API void ggml_vec_dot_e4m3(int n, float * GGML_RESTRICT s, size_t bs, const ggml_e4m3_t * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT vy, size_t by, int nrc);
GGML_API void ggml_vec_dot_e4m3_q(int n, float * GGML_RESTRICT s, size_t bs, const block_e4m3_q * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT vy, size_t by, int nrc);
GGML_API void ggml_vec_dot_e3m4_q(int n, float * GGML_RESTRICT s, size_t bs, const block_e3m4_q * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT vy, size_t by, int nrc);

#ifdef __cplusplus
}
#endif
20 changes: 20 additions & 0 deletions llama.cpp/ggml-quants.inc
Original file line number Diff line number Diff line change
Expand Up @@ -14931,6 +14931,26 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
{
VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_q4_0x8, data, nbytes / sizeof(block_q4_0x8), 8);
} break;
case GGML_TYPE_E4M3_Q:
case GGML_TYPE_E3M4_Q:
{
// Note realy clean, but it is the same test for E4M3.
const block_e3m4_q * q = (const block_e3m4_q *) data;
int nans = 0;
for (size_t i = 0; i < nb; ++i) {
if (!validate_float(q[i].d, i)) {
return false;
}
// NAN?
for (size_t k = 0; k < QK_K; ++k) {
nans += (q[i].qs[k] & 0x7f) == 0x7f;
}
}
if (nans) {
fprintf(stderr, "%s: found %d NaNs in row of %zu FP8 values\n", __func__, nans, nb*QK_K);
return false;
}
} break;

case GGML_TYPE_I8:
case GGML_TYPE_I16:
Expand Down
Loading

0 comments on commit cbd3abd

Please sign in to comment.