Skip to content

Commit

Permalink
add base F08 support
Browse files Browse the repository at this point in the history
same as https://github.com/Djip007/llama.cpp/tree/feature/fp8
No performance work yet — only minimal support.

Add
E5M2 / E4M3 for natively distributed models
E4M3_Q / E3M4_Q for creating GGUF-quantized models
  • Loading branch information
Djip007 committed Oct 27, 2024
1 parent b0efa25 commit cbd3abd
Show file tree
Hide file tree
Showing 10 changed files with 506 additions and 92 deletions.
2 changes: 2 additions & 0 deletions llama.cpp/BUILD.mk
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ o/$(MODE)/llama.cpp/ggml-vector-arm80.o \
o/$(MODE)/llama.cpp/ggml-vector-arm82.o: \
private CCFLAGS += -O3 -mgcc

o/$(MODE)/llama.cpp/ggml-fp8.o: private CCFLAGS += -O3 -mgcc -fopenmp-simd

o/$(MODE)/llama.cpp/ggml-alloc.o \
o/$(MODE)/llama.cpp/ggml-backend.o \
o/$(MODE)/llama.cpp/grammar-parser.o \
Expand Down
35 changes: 35 additions & 0 deletions llama.cpp/ggml-common.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,15 @@ typedef uint32_t ggml_half2;

#define GGML_COMMON_AGGR

#define GGML_COMMON_DECL
#elif defined(GGML_COMMON_DECL_CPP)
#include <cstdint>

typedef uint16_t ggml_half;
typedef uint32_t ggml_half2;

#define GGML_COMMON_AGGR data

#define GGML_COMMON_DECL
#elif defined(GGML_COMMON_DECL_METAL)
#include <metal_stdlib>
Expand Down Expand Up @@ -411,6 +420,25 @@ typedef struct {
} block_iq4_xs;
static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");

// fp8 support
// - fp8 simple types (one byte each): E<n>M<m> = n exponent bits, m mantissa bits
typedef struct { uint8_t bits; } ggml_e5m2_t;
typedef struct { uint8_t bits; } ggml_e4m3_t;
typedef struct { uint8_t bits; } ggml_e3m4_t;

// - fp8 with per-block delta (scale) => 8.125 bpw (8 bits/quant + 32 bits/QK_K block)
typedef struct {
    float d;          // delta (per-block scale)
    uint8_t qs[QK_K]; // E4M3 quants
} block_e4m3_q;
static_assert(sizeof(block_e4m3_q) == sizeof(float) + QK_K, "wrong block_e4m3_q block size/padding");

typedef struct {
    float d;          // delta (per-block scale)
    uint8_t qs[QK_K]; // E3M4 quants
} block_e3m4_q;
static_assert(sizeof(block_e3m4_q) == sizeof(float) + QK_K, "wrong block_e3m4_q block size/padding");

#endif // GGML_COMMON_DECL
#endif // GGML_COMMON_DECL

Expand All @@ -424,6 +452,13 @@ static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_
#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
#define GGML_TABLE_END() };

#define GGML_COMMON_IMPL
#elif defined(GGML_COMMON_IMPL_CPP)
#include <cstdint>

#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
#define GGML_TABLE_END() };

#define GGML_COMMON_IMPL
#elif defined(GGML_COMMON_IMPL_METAL)
#include <metal_stdlib>
Expand Down
268 changes: 268 additions & 0 deletions llama.cpp/ggml-fp8.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,268 @@
#include <cassert>
#include <cmath>
#include <algorithm>

#define GGML_COMMON_DECL_CPP
#include "ggml-common.h"
#include "ggml.h"

#include "ggml-fp8.h"

//#define target_clones(x86,arm) __attribute__((__target_clones__(x86 ",default")))
//#define target_clones(x86,arm) __attribute__((__target_clones__(arm ",default")))
//target_clones("arch=znver4,avx512f,avx2","simd")
#ifdef __x86_64__
#define target_clones() __attribute__((__target_clones__("arch=znver4,avx512f,avx2,default"),flatten))
#elif defined(__aarch64__)
// https://gcc.gnu.org/onlinedocs/gcc/AArch64-Options.html
// result in: undefined reference to `__aarch64_cpu_features'
//#define target_clones() __attribute__((__target_clones__("simd,default"),flatten))
#define target_clones() __attribute__((flatten))
#else
#define target_clones()
#endif

/*
> build:
make clean
make -j16
make -j16 install PREFIX=/home/philou/LLM/usr/
./usr/bin/llamafile-quantize Mistral-Nemo-Instruct-2407.BF16.gguf Mistral-Nemo-Instruct-2407.E3M4_Q.gguf E3M4_Q
./usr/bin/llamafile -m Mistral-Nemo-Instruct-2407.E3M4_Q.gguf -c 128 -n 16 -t 0 -s 42 -p "[INST]bonjour a tu un nom. je ne sais pas comment t'appeler. Si tu n'en as pas je peux t'appeler TINTIN[/INST]"
*/

// Compile-time power of two: returns 2^N as a float (N may be negative).
template<int N> constexpr float EXP2() {
    if constexpr (N > 0) {
        return 2 * EXP2<N-1>();
    } else if constexpr (N < 0) {
        return EXP2<N+1>() / 2;
    } else {
        return 1;
    }
}

// Compile-time integer power of two: returns 2^N as an int, for N >= 0 only.
// The original version had no branch for N < 0, so such an instantiation fell
// off the end of a value-returning function; the static_assert turns that
// silent hole into a clear compile-time diagnostic.
template<int N> constexpr int EXP_I2() {
    static_assert(N >= 0, "EXP_I2<N> requires N >= 0");
    if constexpr (N==0) return 1;
    else return EXP_I2<N-1>()*2;
}

// Generic 8-bit float with _E exponent bits and (7-_E) mantissa bits:
// 1 sign bit, _E biased exponent bits, M = 7-_E mantissa bits.
// NOTE(review): no Inf/NaN encodings are produced on conversion — values
// >= MAX() saturate to the largest finite pattern (0x7E); confirm this
// saturating behavior is intended for the E5M2 (IEEE-like) variant too.
template<int _E> //, int M=7-E> 1.7 bits!
struct FP8 {
    uint8_t bits;                               // raw encoding: S EEE.. MMM..
    using type = FP8<_E>;
    static constexpr int E=_E;                  // exponent bit count
    static constexpr int M=7-_E;                // mantissa bit count
    static constexpr int E_BIAS=EXP2<_E-1>()-1; // exponent bias = 2^(E-1)-1
    // Largest finite value: (2 - 2^(1-M)) * 2^(2^(E-1))
    static constexpr float MAX() { return (2-EXP2<-M+1>())*EXP2<EXP_I2<_E-1>()>(); }
    // Smallest positive (subnormal) value: 2^-M * 2^(2 - 2^(E-1))
    static constexpr float MIN() { return EXP2<-M>()*EXP2<2-EXP_I2<_E-1>()>(); }
    //=============================================

    // float -> FP8 conversion: saturating, rounds by adding half a ulp
    // before truncating the mantissa.
#pragma omp declare simd
    void operator=(float value) {
        union {
            float f;
            uint32_t bits;
        } in = {value};
        // the sign bit: float bit 31 -> fp8 bit 7
        bits = (in.bits >> 24) & 0x80;
        // keep the magnitude, without the sign
        in.bits &= 0x7fffffff;
        //GGML_ASSERT(in.bits < 0x7f800000); // +/- infinity or NaN
        if (in.f >= MAX()) {
            // overflow: saturate to the largest finite encoding
            bits |= 0x7E;
        } else if (in.f<MIN()) { // => 0.
            // underflow: result is S.0000000 (signed zero)
        } else {
            // rescale so the float exponent carries the fp8 bias
            in.f *= EXP2<E_BIAS-127>();
            in.bits += 1<<(22-M); // for rounding (half a ulp; carry may bump the exponent)
            bits |= (in.bits >> (23-M)) & 0x7F;
        }
    }

    // FP8 -> float conversion: widen sign/exponent/mantissa fields into a
    // float, then undo the bias difference with a power-of-two multiply.
#pragma omp declare simd
    operator float () const {
        union {
            float f;
            uint32_t bits;
        } out = {0};
        // the sign bit back to float bit 31
        out.bits = bits & 0x80;
        out.bits <<= 24;
        uint32_t _bits = bits & 0x7F;
        _bits <<= (23-M);
        out.bits |= _bits;
        // rescale from the fp8 bias to the float bias
        out.f *= EXP2<127-E_BIAS>();
        return out.f;
    }
};

// Expand `size` plain FP8 values into 32-bit floats.
template<int E>
target_clones()
static inline void conv(const FP8<E>* x, float* y, int64_t size) {
#pragma omp simd
    for (int64_t idx = 0; idx < size; ++idx) {
        y[idx] = static_cast<float>(x[idx]);
    }
}

// Convert `size` floats to plain FP8 (FP8::operator= performs the rounding).
template<int E>
target_clones()
static inline void conv(const float* x, FP8<E>* y, int64_t size) {
#pragma omp simd
    for (int64_t idx = 0; idx < size; ++idx) {
        y[idx] = x[idx];
    }
}

// Dot product of a plain FP8 row with a float row.
template<int E>
target_clones()
static inline float dot(const FP8<E>* x, const float* y, int64_t size) {
    float acc = 0;
#pragma omp simd reduction(+:acc)
    for (int64_t idx = 0; idx < size; ++idx) {
        acc += static_cast<float>(x[idx]) * y[idx];
    }
    return acc;
}

// One quantization block: a float scale `d` plus QK FP8 quants.
// Layout must stay bit-compatible with block_e4m3_q / block_e3m4_q from
// ggml-common.h: the C API below reinterpret_casts between the two.
template <int E, int QK>
struct bloc_fp8 {
    float d;       // per-block scale (delta)
    FP8<E> qs[QK]; // quantized values
};

// Dequantize block FP8: y[i] = qs[i] * d, one block of QK values at a time.
template <int E, int QK>
target_clones()
static inline void conv(const bloc_fp8<E, QK>* x, float* y, int64_t size) {
    const auto nb_blocks = size / QK;
    for (int64_t b = 0; b < nb_blocks; ++b) {
#pragma omp simd
        for (int64_t j = 0; j < QK; j++) {
            y[b*QK + j] = static_cast<float>(x[b].qs[j]) * x[b].d;
        }
    }
}

// Quantize floats into block FP8: each block of QK values is scaled by its
// absolute maximum so the quants span the full FP8 range; `d` stores the
// inverse scale needed to dequantize.
template <int E, int QK>
target_clones()
static inline void conv(const float* x, bloc_fp8<E, QK>* y, int64_t size) {
    const auto qk_size = size / QK;
    for (int64_t q=0; q<qk_size; ++q) {
        // absolute maximum of the block
        float m = 0;
#pragma omp simd reduction(max:m)
        for (int64_t i=0; i<QK; i++) {
            m = std::max(std::abs(x[q*QK+i]),m);
        }
        // Guard the all-zero block: m == 0 would make D infinite and
        // 0.0f * inf produces NaN quants, which ggml_validate_row_data
        // then rejects as corrupt data.
        const float D = (m > 0) ? FP8<E>::MAX()/m : 0.0f;
        y[q].d = m/FP8<E>::MAX();
#pragma omp simd
        for (int64_t i=0; i<QK; i++) {
            y[q].qs[i] = x[q*QK+i]*D;
        }
    }
}

// Dot product of a block-FP8 row with a float row: accumulate each block's
// partial sum, then apply the block scale once per block.
template <int E, int QK>
target_clones()
static inline float dot(const bloc_fp8<E, QK>* x, const float* y, int64_t size) {
    const auto nb_blocks = size / QK;
    float acc = 0;
    for (int64_t b = 0; b < nb_blocks; ++b) {
        float partial = 0;
#pragma omp simd reduction(+:partial)
        for (int64_t j = 0; j < QK; j++) {
            partial += static_cast<float>(x[b].qs[j]) * y[b*QK + j];
        }
        acc += x[b].d * partial;
    }
    return acc;
}

// the C API: thin wrappers that cast the opaque one-byte C types onto the
// matching FP8<E> template and forward to the SIMD conv() helpers.

// E5M2: 5 exponent bits / 2 mantissa bits.
void ggml_e5m2_to_fp32_row(const ggml_e5m2_t * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
    conv(reinterpret_cast<const FP8<5>*>(x), y, k);
}
void ggml_fp32_to_e5m2_row(const float * GGML_RESTRICT x, ggml_e5m2_t * GGML_RESTRICT y, int64_t k) {
    conv(x, reinterpret_cast<FP8<5>*>(y), k);
}
// Scalar element-by-element variant (no #pragma omp simd loop);
// presumably the reference path for validation — verify against callers.
void ggml_fp32_to_e5m2_row_ref(const float * GGML_RESTRICT x, ggml_e5m2_t * GGML_RESTRICT y, int64_t k) {
    for (int64_t i =0; i<k; ++i) {
        reinterpret_cast<FP8<5>*>(y)[i] = x[i];
    }
}

// E4M3: 4 exponent bits / 3 mantissa bits.
void ggml_e4m3_to_fp32_row(const ggml_e4m3_t * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
    conv(reinterpret_cast<const FP8<4>*>(x), y, k);
}
void ggml_fp32_to_e4m3_row(const float * GGML_RESTRICT x, ggml_e4m3_t * GGML_RESTRICT y, int64_t k) {
    conv(x, reinterpret_cast<FP8<4>*>(y), k);
}
// Scalar element-by-element variant, see ggml_fp32_to_e5m2_row_ref.
void ggml_fp32_to_e4m3_row_ref(const float * GGML_RESTRICT x, ggml_e4m3_t * GGML_RESTRICT y, int64_t k) {
    for (int64_t i =0; i<k; ++i) {
        reinterpret_cast<FP8<4>*>(y)[i] = x[i];
    }
}

// Block-quantized FP8 (de)quantization. `k` must be a multiple of QK_K;
// the casts rely on bloc_fp8<E, QK_K> sharing the layout of block_e4m3_q /
// block_e3m4_q (float d + QK_K one-byte quants).

void dequantize_row_e4m3_q(const block_e4m3_q * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
    assert(k % QK_K == 0);
    conv(reinterpret_cast<const bloc_fp8<4, QK_K>*>(x), y, k);
}
void quantize_row_e4m3_q(const float * GGML_RESTRICT x, block_e4m3_q * GGML_RESTRICT y, int64_t k) {
    assert(k % QK_K == 0);
    conv(x, reinterpret_cast<bloc_fp8<4, QK_K>*>(y), k);
}
// NOTE(review): the _ref variant is currently identical to the non-ref one
// (same conv call) rather than a scalar reference implementation.
void quantize_row_e4m3_q_ref(const float * GGML_RESTRICT x, block_e4m3_q * GGML_RESTRICT y, int64_t k) {
    assert(k % QK_K == 0);
    conv(x, reinterpret_cast<bloc_fp8<4, QK_K>*>(y), k);
}

void dequantize_row_e3m4_q(const block_e3m4_q * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
    assert(k % QK_K == 0);
    conv(reinterpret_cast<const bloc_fp8<3, QK_K>*>(x), y, k);
}
void quantize_row_e3m4_q(const float * GGML_RESTRICT x, block_e3m4_q * GGML_RESTRICT y, int64_t k) {
    assert(k % QK_K == 0);
    conv(x, reinterpret_cast<bloc_fp8<3, QK_K>*>(y), k);
}
// NOTE(review): identical to quantize_row_e3m4_q, see note above on the
// e4m3 _ref variant.
void quantize_row_e3m4_q_ref(const float * GGML_RESTRICT x, block_e3m4_q * GGML_RESTRICT y, int64_t k) {
    assert(k % QK_K == 0);
    conv(x, reinterpret_cast<bloc_fp8<3, QK_K>*>(y), k);
}

// the dot product for FP8 weight
// All variants compute *s = dot(vx, vy) over n elements. Only nrc == 1 is
// supported (single row), and the stride parameters bs/bx/by are unused —
// the rows are assumed contiguous.
void ggml_vec_dot_e5m2(int n, float * GGML_RESTRICT s, size_t bs, const ggml_e5m2_t * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(nrc == 1);
    GGML_UNUSED(nrc);
    GGML_UNUSED(bx);
    GGML_UNUSED(by);
    GGML_UNUSED(bs);
    *s = dot(reinterpret_cast<const FP8<5>*>(vx), vy, n);
}

void ggml_vec_dot_e4m3(int n, float * GGML_RESTRICT s, size_t bs, const ggml_e4m3_t * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(nrc == 1);
    GGML_UNUSED(nrc);
    GGML_UNUSED(bx);
    GGML_UNUSED(by);
    GGML_UNUSED(bs);
    *s = dot(reinterpret_cast<const FP8<4>*>(vx), vy, n);
}

// Block-quantized variants: the per-block scale is applied inside dot().
void ggml_vec_dot_e4m3_q(int n, float * GGML_RESTRICT s, size_t bs, const block_e4m3_q * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(nrc == 1);
    GGML_UNUSED(nrc);
    GGML_UNUSED(bx);
    GGML_UNUSED(by);
    GGML_UNUSED(bs);
    *s = dot(reinterpret_cast<const bloc_fp8<4, QK_K>*>(vx), vy, n);
}

void ggml_vec_dot_e3m4_q(int n, float * GGML_RESTRICT s, size_t bs, const block_e3m4_q * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(nrc == 1);
    GGML_UNUSED(nrc);
    GGML_UNUSED(bx);
    GGML_UNUSED(by);
    GGML_UNUSED(bs);
    *s = dot(reinterpret_cast<const bloc_fp8<3, QK_K>*>(vx), vy, n);
}
33 changes: 33 additions & 0 deletions llama.cpp/ggml-fp8.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
// This is more an .inc than a standalone header: it expects GGML_API,
// GGML_RESTRICT and the fixed-width integer types to already be in scope.
#ifdef __cplusplus
extern "C" {
#endif

// Note: the ggml_e*m*_t and block_e*_q types are defined in ggml-common.h

// Row converters between fp32 and the plain FP8 types (the _ref variants
// are the scalar reference implementations).
GGML_API void ggml_e5m2_to_fp32_row(const ggml_e5m2_t * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void ggml_fp32_to_e5m2_row(const float * GGML_RESTRICT x, ggml_e5m2_t * GGML_RESTRICT y, int64_t k);
GGML_API void ggml_fp32_to_e5m2_row_ref(const float * GGML_RESTRICT x, ggml_e5m2_t * GGML_RESTRICT y, int64_t k);

GGML_API void ggml_e4m3_to_fp32_row(const ggml_e4m3_t * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void ggml_fp32_to_e4m3_row(const float * GGML_RESTRICT x, ggml_e4m3_t * GGML_RESTRICT y, int64_t k);
GGML_API void ggml_fp32_to_e4m3_row_ref(const float * GGML_RESTRICT x, ggml_e4m3_t * GGML_RESTRICT y, int64_t k);

// Block-quantized FP8 (de)quantization; k must be a multiple of QK_K.
GGML_API void dequantize_row_e4m3_q(const block_e4m3_q * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void quantize_row_e4m3_q(const float * GGML_RESTRICT x, block_e4m3_q * GGML_RESTRICT y, int64_t k);
GGML_API void quantize_row_e4m3_q_ref(const float * GGML_RESTRICT x, block_e4m3_q * GGML_RESTRICT y, int64_t k);

GGML_API void dequantize_row_e3m4_q(const block_e3m4_q * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void quantize_row_e3m4_q(const float * GGML_RESTRICT x, block_e3m4_q * GGML_RESTRICT y, int64_t k);
GGML_API void quantize_row_e3m4_q_ref(const float * GGML_RESTRICT x, block_e3m4_q * GGML_RESTRICT y, int64_t k);

// TODO: the best choice depends on the CPU: fp32 / bf16 / fp16
#define GGML_FP8_VECT_DOT_TYPE GGML_TYPE_F32
// Dot products of an FP8 row (plain or block-quantized) with an fp32 row;
// only nrc == 1 is supported.
GGML_API void ggml_vec_dot_e5m2(int n, float * GGML_RESTRICT s, size_t bs, const ggml_e5m2_t * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT vy, size_t by, int nrc);
GGML_API void ggml_vec_dot_e4m3(int n, float * GGML_RESTRICT s, size_t bs, const ggml_e4m3_t * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT vy, size_t by, int nrc);
GGML_API void ggml_vec_dot_e4m3_q(int n, float * GGML_RESTRICT s, size_t bs, const block_e4m3_q * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT vy, size_t by, int nrc);
GGML_API void ggml_vec_dot_e3m4_q(int n, float * GGML_RESTRICT s, size_t bs, const block_e3m4_q * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT vy, size_t by, int nrc);

#ifdef __cplusplus
}
#endif
20 changes: 20 additions & 0 deletions llama.cpp/ggml-quants.inc
Original file line number Diff line number Diff line change
Expand Up @@ -14931,6 +14931,26 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
{
VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_q4_0x8, data, nbytes / sizeof(block_q4_0x8), 8);
} break;
case GGML_TYPE_E4M3_Q:
case GGML_TYPE_E3M4_Q:
{
// Note realy clean, but it is the same test for E4M3.
const block_e3m4_q * q = (const block_e3m4_q *) data;
int nans = 0;
for (size_t i = 0; i < nb; ++i) {
if (!validate_float(q[i].d, i)) {
return false;
}
// NAN?
for (size_t k = 0; k < QK_K; ++k) {
nans += (q[i].qs[k] & 0x7f) == 0x7f;
}
}
if (nans) {
fprintf(stderr, "%s: found %d NaNs in row of %zu FP8 values\n", __func__, nans, nb*QK_K);
return false;
}
} break;

case GGML_TYPE_I8:
case GGML_TYPE_I16:
Expand Down
Loading

0 comments on commit cbd3abd

Please sign in to comment.