Merge branch 'ggerganov:master' into master
sealad886 authored Jul 2, 2024
2 parents c375690 + a9f3b10 commit b0fdd11
Showing 25 changed files with 1,418 additions and 1,354 deletions.
17 changes: 7 additions & 10 deletions .devops/nix/package.nix
@@ -17,19 +17,18 @@
rocmPackages,
vulkan-headers,
vulkan-loader,
clblast,
curl,
useBlas ? builtins.all (x: !x) [
useCuda
useMetalKit
useOpenCL
useRocm
useVulkan
] && blas.meta.available,
useCuda ? config.cudaSupport,
useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin && !useOpenCL,
useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin,
useMpi ? false, # Increases the runtime closure size by ~700M
useOpenCL ? false,
useRocm ? config.rocmSupport,
enableCurl ? true,
useVulkan ? false,
llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake

@@ -56,7 +55,6 @@ let
++ lib.optionals useCuda [ "CUDA" ]
++ lib.optionals useMetalKit [ "MetalKit" ]
++ lib.optionals useMpi [ "MPI" ]
++ lib.optionals useOpenCL [ "OpenCL" ]
++ lib.optionals useRocm [ "ROCm" ]
++ lib.optionals useVulkan [ "Vulkan" ];

@@ -198,19 +196,19 @@ effectiveStdenv.mkDerivation (
optionals effectiveStdenv.isDarwin darwinBuildInputs
++ optionals useCuda cudaBuildInputs
++ optionals useMpi [ mpi ]
++ optionals useOpenCL [ clblast ]
++ optionals useRocm rocmBuildInputs
++ optionals useBlas [ blas ]
++ optionals useVulkan vulkanBuildInputs;
++ optionals useVulkan vulkanBuildInputs
++ optionals enableCurl [ curl ];

cmakeFlags =
[
(cmakeBool "LLAMA_BUILD_SERVER" true)
(cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
(cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
(cmakeBool "LLAMA_CURL" enableCurl)
(cmakeBool "GGML_NATIVE" false)
(cmakeBool "GGML_BLAS" useBlas)
(cmakeBool "GGML_CLBLAST" useOpenCL)
(cmakeBool "GGML_CUDA" useCuda)
(cmakeBool "GGML_HIPBLAS" useRocm)
(cmakeBool "GGML_METAL" useMetalKit)
@@ -254,7 +252,6 @@ effectiveStdenv.mkDerivation (
useCuda
useMetalKit
useMpi
useOpenCL
useRocm
useVulkan
;
@@ -281,7 +278,7 @@ effectiveStdenv.mkDerivation (
# Configurations we don't want even the CI to evaluate. Results in the
# "unsupported platform" messages. This is mostly a no-op, because
# cudaPackages would've refused to evaluate anyway.
badPlatforms = optionals (useCuda || useOpenCL) lib.platforms.darwin;
badPlatforms = optionals useCuda lib.platforms.darwin;

# Configurations that are known to result in build failures. Can be
# overridden by importing Nixpkgs with `allowBroken = true`.
1 change: 1 addition & 0 deletions CMakePresets.json
@@ -19,6 +19,7 @@
"cacheVariables": {
"CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
"CMAKE_CXX_COMPILER": "icx",
"CMAKE_C_COMPILER": "cl",
"GGML_SYCL": "ON",
"CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
}
6 changes: 6 additions & 0 deletions README.md
@@ -108,6 +108,7 @@ Typically finetunes of the base models below are supported as well.
- [X] [Falcon](https://huggingface.co/models?search=tiiuae/falcon)
- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2)
- [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
- [X] [BERT](https://github.com/ggerganov/llama.cpp/pull/5423)
- [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
- [X] [Baichuan 1 & 2](https://huggingface.co/models?search=baichuan-inc/Baichuan) + [derivations](https://huggingface.co/hiyouga/baichuan-7b-sft)
- [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila)
@@ -217,6 +218,11 @@ Unless otherwise noted these projects are open-source with permissive licensing:
**Tools:**

- [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML
- [crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption

**Infrastructure:**

- [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp

---

43 changes: 33 additions & 10 deletions convert-hf-to-gguf.py
@@ -576,7 +576,19 @@ def _set_vocab_qwen(self):
        special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
        special_vocab.add_to_gguf(self.gguf_writer)

    def _set_vocab_sentencepiece(self):
    def _set_vocab_sentencepiece(self, add_to_gguf=True):
        tokens, scores, toktypes = self._create_vocab_sentencepiece()

        self.gguf_writer.add_tokenizer_model("llama")
        self.gguf_writer.add_tokenizer_pre("default")
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_scores(scores)
        self.gguf_writer.add_token_types(toktypes)

        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
        special_vocab.add_to_gguf(self.gguf_writer)

    def _create_vocab_sentencepiece(self):
        from sentencepiece import SentencePieceProcessor

        tokenizer_path = self.dir_model / 'tokenizer.model'
@@ -638,14 +650,7 @@ def _set_vocab_sentencepiece(self):
                scores.append(-1000.0)
                toktypes.append(SentencePieceTokenTypes.UNUSED)

        self.gguf_writer.add_tokenizer_model("llama")
        self.gguf_writer.add_tokenizer_pre("default")
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_scores(scores)
        self.gguf_writer.add_token_types(toktypes)

        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
        special_vocab.add_to_gguf(self.gguf_writer)
        return tokens, scores, toktypes

    def _set_vocab_llama_hf(self):
        vocab = gguf.LlamaHfVocab(self.dir_model)
@@ -2345,7 +2350,19 @@ class Gemma2Model(Model):
    model_arch = gguf.MODEL_ARCH.GEMMA2

    def set_vocab(self):
        self._set_vocab_llama_hf()
        tokens, scores, toktypes = self._create_vocab_sentencepiece()
        # hack: This is required so that we can properly use start/end-of-turn for chat template
        for i in range(108):
            # including <unusedX>, <start_of_turn>, <end_of_turn>
            toktypes[i] = SentencePieceTokenTypes.CONTROL
        self.gguf_writer.add_tokenizer_model("llama")
        self.gguf_writer.add_tokenizer_pre("default")
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_scores(scores)
        self.gguf_writer.add_token_types(toktypes)

        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
        special_vocab.add_to_gguf(self.gguf_writer)
        self.gguf_writer.add_add_space_prefix(False)

    def set_gguf_parameters(self):
@@ -2369,6 +2386,12 @@ def set_gguf_parameters(self):
        self.gguf_writer.add_final_logit_softcapping(
            self.hparams["final_logit_softcapping"]
        )
        self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])

        # sanity check
        attn_scalar = self.hparams["query_pre_attn_scalar"]
        if attn_scalar != hparams["hidden_size"] / hparams["num_attention_heads"]:
            raise ValueError("query_pre_attn_scalar must be equal to n_embd / n_head")

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        del bid  # unused
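
To see what the new Gemma2 sanity check enforces, here is a minimal standalone sketch of the same arithmetic. The hyperparameter values are illustrative assumptions (roughly Gemma-2-27B sized), not values taken from the commit or from any config file:

```python
# Minimal sketch of the query_pre_attn_scalar sanity check added above.
# The hparams values are illustrative assumptions, not read from any model file.
hparams = {
    "hidden_size": 4608,           # assumed embedding width
    "num_attention_heads": 32,     # assumed head count
    "query_pre_attn_scalar": 144,  # must equal hidden_size / num_attention_heads
}

attn_scalar = hparams["query_pre_attn_scalar"]
if attn_scalar != hparams["hidden_size"] / hparams["num_attention_heads"]:
    raise ValueError("query_pre_attn_scalar must be equal to n_embd / n_head")
print("sanity check passed:", attn_scalar)
```
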
8 changes: 6 additions & 2 deletions ggml/src/CMakeLists.txt
@@ -486,9 +486,11 @@ if (GGML_SYCL)
add_compile_options(-I./) #include DPCT

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
if (GGML_SYCL_TARGET STREQUAL "NVIDIA")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
else()
add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
endif()

file(GLOB GGML_HEADERS_SYCL "ggml-sycl/*.hpp")
@@ -1166,7 +1168,9 @@ target_link_libraries(ggml PRIVATE Threads::Threads ${GGML_EXTRA_LIBS})

find_library(MATH_LIBRARY m)
if (MATH_LIBRARY)
target_link_libraries(ggml PRIVATE ${MATH_LIBRARY})
if (NOT WIN32 OR NOT GGML_SYCL)
target_link_libraries(ggml PRIVATE ${MATH_LIBRARY})
endif()
endif()

if (BUILD_SHARED_LIBS)
14 changes: 7 additions & 7 deletions ggml/src/ggml-common.h
@@ -106,19 +106,19 @@ typedef sycl::half2 ggml_half2;
#define QR6_K 2

#define QI2_XXS (QK_K / (4*QR2_XXS))
#define QR2_XXS 8
#define QR2_XXS 4

#define QI2_XS (QK_K / (4*QR2_XS))
#define QR2_XS 8
#define QR2_XS 4

#define QI2_S (QK_K / (4*QR2_S))
#define QR2_S 8
#define QR2_S 4

#define QI3_XXS (QK_K / (4*QR3_XXS))
#define QR3_XXS 8
#define QR3_XXS 4

#define QI3_XS (QK_K / (4*QR3_XS))
#define QR3_XS 8
#define QR3_XS 4

#define QI1_S (QK_K / (4*QR1_S))
#define QR1_S 8
@@ -130,10 +130,10 @@ typedef sycl::half2 ggml_half2;
#define QR4_NL 2

#define QI4_XS (QK_K / (4*QR4_XS))
#define QR4_XS 8
#define QR4_XS 2

#define QI3_S (QK_K / (4*QR3_S))
#define QR3_S 8
#define QR3_S 4

#endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP

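These QR_x constants feed the QI_x macros directly above them (QI_x = QK_K / (4 * QR_x)), so the new values change the derived QI_x counts. A quick sketch of the resulting values, assuming the usual super-block size QK_K = 256 (QK_K itself is defined elsewhere in ggml-common.h, so treat that as an assumption):

```python
# Recompute the QI_x values implied by the QR_x constants above.
# Assumption: QK_K = 256 (QK_K itself is defined elsewhere in ggml-common.h).
QK_K = 256

new_qr = {
    "QR2_XXS": 4, "QR2_XS": 4, "QR2_S": 4,
    "QR3_XXS": 4, "QR3_XS": 4, "QR3_S": 4,
    "QR4_XS": 2,
}

for name, qr in new_qr.items():
    qi = QK_K // (4 * qr)
    print(f"QI{name[2:]} = QK_K / (4 * {name}) = {qi}")
```
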
12 changes: 5 additions & 7 deletions ggml/src/ggml-cuda.cu
@@ -1882,6 +1882,11 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
bool use_mul_mat_q = ggml_is_quantized(src0->type)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;

// if mmvq is available it's a better choice than dmmv:
#ifndef GGML_CUDA_FORCE_DMMV
use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
#endif // GGML_CUDA_FORCE_DMMV

bool any_gpus_with_slow_fp16 = false;

if (split) {
@@ -1894,22 +1899,15 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
}

const int cc = ggml_cuda_info().devices[id].cc;
use_mul_mat_vec_q = use_mul_mat_vec_q && cc >= MIN_CC_DP4A;
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
}
} else {
const int cc = ggml_cuda_info().devices[ctx.device].cc;
use_mul_mat_vec_q = use_mul_mat_vec_q && cc >= MIN_CC_DP4A;
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
}

// if mmvq is available it's a better choice than dmmv:
#ifndef GGML_CUDA_FORCE_DMMV
use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
#endif // GGML_CUDA_FORCE_DMMV

// debug helpers
//printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
//printf(" %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
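
This hunk moves the "prefer MMVQ over DMMV" decision ahead of the per-device loop and drops the per-device MIN_CC_DP4A gate on MMVQ, presumably because the new ggml_cuda_dp4a fallback below makes MMVQ usable on older architectures too. A small Python restatement of the resulting selection logic, written as an illustration rather than a copy of the CUDA code:

```python
# Illustrative restatement (not llama.cpp code) of the reordered choice above:
# unless the build forces DMMV, DMMV is only used when MMVQ is not available.
def choose_mul_mat_vec(use_dmmv: bool, use_mmvq: bool, force_dmmv: bool = False) -> str:
    if not force_dmmv:                       # mirrors "#ifndef GGML_CUDA_FORCE_DMMV"
        use_dmmv = use_dmmv and not use_mmvq
    if use_dmmv:
        return "dequantize_mul_mat_vec"
    if use_mmvq:
        return "mul_mat_vec_q"
    return "other path (mmq / cuBLAS)"

print(choose_mul_mat_vec(use_dmmv=True, use_mmvq=True))                   # mul_mat_vec_q
print(choose_mul_mat_vec(use_dmmv=True, use_mmvq=True, force_dmmv=True))  # dequantize_mul_mat_vec
```
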
76 changes: 51 additions & 25 deletions ggml/src/ggml-cuda/common.cuh
@@ -3,6 +3,7 @@
#include "ggml.h"
#include "ggml-cuda.h"

#include <cstdint>
#include <memory>

#if defined(GGML_USE_HIPBLAS)
@@ -268,30 +269,15 @@ static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigne
return c;
}

static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
c = __builtin_amdgcn_sdot4(a, b, c, false);
#elif defined(RDNA3)
c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
#elif defined(__gfx1010__) || defined(__gfx900__)
int tmp1;
int tmp2;
asm("\n \
v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
v_add3_u32 %0, %1, %2, %0 \n \
v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
v_add3_u32 %0, %1, %2, %0 \n \
"
: "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
: "v"(a), "v"(b)
);
#else
const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
#endif
static __device__ __forceinline__ unsigned int __vcmpne4(unsigned int a, unsigned int b) {
const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
unsigned int c;
uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
#pragma unroll
for (int i = 0; i < 4; ++i) {
vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
}
return c;
}

@@ -467,8 +453,48 @@ static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half
}
#endif // CUDART_VERSION < 12000

static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
c = __builtin_amdgcn_sdot4(a, b, c, false);
#elif defined(RDNA3)
c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
#elif defined(__gfx1010__) || defined(__gfx900__)
int tmp1;
int tmp2;
asm("\n \
v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
v_add3_u32 %0, %1, %2, %0 \n \
v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
v_add3_u32 %0, %1, %2, %0 \n \
"
: "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
: "v"(a), "v"(b)
);
#else
const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
#endif
return c;

#else // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)

#if __CUDA_ARCH__ >= MIN_CC_DP4A
return __dp4a(a, b, c);
#else // __CUDA_ARCH__ >= MIN_CC_DP4A
const int8_t * a8 = (const int8_t *) &a;
const int8_t * b8 = (const int8_t *) &b;
return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A

#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
}

// TODO: move to ggml-common.h
static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
static constexpr __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};

typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);

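
The new ggml_cuda_dp4a helper selects between the AMD dot-product builtins, the CUDA __dp4a intrinsic, and a plain per-byte fallback, but every path computes the same thing: a four-way signed 8-bit dot product accumulated into a 32-bit integer. As a reference for that operation, here is a small Python model (illustrative only, not part of llama.cpp):

```python
import struct

def dp4a_ref(a: int, b: int, c: int) -> int:
    """Reference model of ggml_cuda_dp4a: treat a and b as four packed signed
    8-bit lanes, multiply lane-wise, and accumulate the sum into c."""
    a8 = struct.unpack("<4b", struct.pack("<i", a))  # four signed bytes of a
    b8 = struct.unpack("<4b", struct.pack("<i", b))  # four signed bytes of b
    return c + sum(x * y for x, y in zip(a8, b8))

# Example: (1, 2, 3, 4) . (5, 6, 7, 8) + 10 = 70 + 10 = 80
a = int.from_bytes(bytes([1, 2, 3, 4]), "little", signed=True)
b = int.from_bytes(bytes([5, 6, 7, 8]), "little", signed=True)
print(dp4a_ref(a, b, 10))  # prints 80
```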
