From 9ef07800622e4c371605f9419864d15667c3558f Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 30 Jun 2024 20:27:13 +0200 Subject: [PATCH 01/12] Fix new line issue with chat template, disable template when in-prefix/suffix is set (#8203) * preserve new line llama_chat_format_single * disable chat template if in-prefix/suffix is set * remove redundant change --- common/common.cpp | 16 +++++++++++++--- common/common.h | 1 + examples/main/main.cpp | 11 +++++++---- tests/test-chat-template.cpp | 4 ++-- 4 files changed, 23 insertions(+), 9 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 6a00d25be1316a..5a0d0ee0381239 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1014,16 +1014,19 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa } if (arg == "--in-prefix-bos") { params.input_prefix_bos = true; + params.enable_chat_template = false; return true; } if (arg == "--in-prefix") { CHECK_ARG params.input_prefix = argv[i]; + params.enable_chat_template = false; return true; } if (arg == "--in-suffix") { CHECK_ARG params.input_suffix = argv[i]; + params.enable_chat_template = false; return true; } if (arg == "--spm-infill") { @@ -1406,7 +1409,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param "halt generation at PROMPT, return control in interactive mode\n" "can be specified more than once for multiple prompts" }); options.push_back({ "main", "-sp, --special", "special tokens output enabled (default: %s)", params.special ? "true" : "false" }); - options.push_back({ "main", "-cnv, --conversation", "run in conversation mode (does not print special tokens and suffix/prefix) (default: %s)", params.conversation ? "true" : "false" }); + options.push_back({ "main", "-cnv, --conversation", "run in conversation mode (does not print special tokens and suffix/prefix, use default chat template) (default: %s)", params.conversation ? "true" : "false" }); options.push_back({ "main infill", "-i, --interactive", "run in interactive mode (default: %s)", params.interactive ? "true" : "false" }); options.push_back({ "main infill", "-if, --interactive-first", "run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? 
"true" : "false" }); options.push_back({ "main infill", "-mli, --multiline-input", "allows you to write or paste multiple lines without ending each in '\\'" }); @@ -2668,12 +2671,19 @@ std::string llama_chat_format_single(const struct llama_model * model, const std::vector & past_msg, const llama_chat_msg & new_msg, bool add_ass) { + std::ostringstream ss; auto fmt_past_msg = llama_chat_apply_template(model, tmpl, past_msg, false); std::vector chat_new(past_msg); + // if the past_msg ends with a newline, we must preserve it in the formatted version + if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') { + ss << "\n"; + }; + // format chat with new_msg chat_new.push_back(new_msg); auto fmt_new_msg = llama_chat_apply_template(model, tmpl, chat_new, add_ass); - auto formatted = fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size()); - return formatted; + // get the diff part + ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size()); + return ss.str(); } std::string llama_chat_format_example(const struct llama_model * model, diff --git a/common/common.h b/common/common.h index d6cb814b990e9b..627b7ed854757e 100644 --- a/common/common.h +++ b/common/common.h @@ -200,6 +200,7 @@ struct gpt_params { std::string public_path = ""; std::string chat_template = ""; std::string system_prompt = ""; + bool enable_chat_template = true; std::vector api_keys; diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 1114073b84370d..d512953b9635c8 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -261,7 +261,7 @@ int main(int argc, char ** argv) { std::vector embd_inp; { - auto prompt = params.conversation + auto prompt = (params.conversation && params.enable_chat_template) ? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode : params.prompt; if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) { @@ -810,7 +810,9 @@ int main(int argc, char ** argv) { is_antiprompt = true; } - chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str()); + if (params.enable_chat_template) { + chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str()); + } is_interacting = true; printf("\n"); } @@ -872,12 +874,13 @@ int main(int argc, char ** argv) { string_process_escapes(buffer); } - std::string user_inp = params.conversation + bool format_chat = params.conversation && params.enable_chat_template; + std::string user_inp = format_chat ? 
chat_add_and_format(model, chat_msgs, "user", std::move(buffer)) : std::move(buffer); // TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix) const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true); - const auto line_inp = ::llama_tokenize(ctx, user_inp, false, params.conversation); + const auto line_inp = ::llama_tokenize(ctx, user_inp, false, format_chat); const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true); LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str()); diff --git a/tests/test-chat-template.cpp b/tests/test-chat-template.cpp index b154038b2d5c03..03f5369109716f 100644 --- a/tests/test-chat-template.cpp +++ b/tests/test-chat-template.cpp @@ -142,9 +142,9 @@ int main(void) { std::cout << "fmt_single(" << tmpl << ")\n" << output << "\n-------------------------\n"; return output; }; - assert(fmt_single("chatml") == "<|im_start|>user\nHow are you<|im_end|>\n<|im_start|>assistant\n"); + assert(fmt_single("chatml") == "\n<|im_start|>user\nHow are you<|im_end|>\n<|im_start|>assistant\n"); assert(fmt_single("llama2") == "[INST] How are you [/INST]"); - assert(fmt_single("gemma") == "user\nHow are you\nmodel\n"); + assert(fmt_single("gemma") == "\nuser\nHow are you\nmodel\n"); assert(fmt_single("llama3") == "<|start_header_id|>user<|end_header_id|>\n\nHow are you<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"); return 0; From d09ecb84c8300504bb76794298c0ef47d541a733 Mon Sep 17 00:00:00 2001 From: ngxson Date: Sun, 30 Jun 2024 23:40:25 +0200 Subject: [PATCH 02/12] replace list with single tensor --- src/llama.cpp | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index d8852cfe494af4..71b7ef622019e3 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2081,7 +2081,7 @@ struct llama_hparams { bool use_par_res; uint32_t n_vocab; - uint32_t n_ctx_train; // context size the model was trained on + uint32_t n_ctx_train; // context size the model was trained on uint32_t n_embd; uint32_t n_head; uint32_t n_head_kv; @@ -2665,7 +2665,7 @@ struct llama_context { struct ggml_tensor * inp_s_seq; // I32 [n_kv, n_batch] // KQ mask per layer, used by sliding window attention (gemma 2) - std::vector inp_KQ_mask_l; + struct ggml_tensor * inp_KQ_mask_SWA; // control vectors struct llama_control_vector cvec; @@ -7794,6 +7794,7 @@ struct llm_build_context { lctx.inp_s_copy = nullptr; lctx.inp_s_mask = nullptr; lctx.inp_s_seq = nullptr; + lctx.inp_KQ_mask_SWA = nullptr; } void free() { @@ -7946,15 +7947,18 @@ struct llm_build_context { return lctx.inp_out_ids; } - struct ggml_tensor * build_inp_KQ_mask(bool causal = true) { - if (causal) { - lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); + struct ggml_tensor * build_inp_KQ_mask(bool causal = true, bool sliding_window = false) { + struct ggml_tensor * KQ_mask = causal + ? 
ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)) + : ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); + cb(KQ_mask, "KQ_mask", -1); + ggml_set_input(KQ_mask); + if (sliding_window) { + lctx.inp_KQ_mask_SWA = KQ_mask; } else { - lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); + lctx.inp_KQ_mask = KQ_mask; } - cb(lctx.inp_KQ_mask, "KQ_mask", -1); - ggml_set_input(lctx.inp_KQ_mask); - return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask, GGML_TYPE_F16) : lctx.inp_KQ_mask; + return flash_attn ? ggml_cast(ctx0, KQ_mask, GGML_TYPE_F16) : KQ_mask; } struct ggml_tensor * build_inp_mean() { @@ -11038,14 +11042,12 @@ struct llm_build_context { // KQ_mask (mask for 1 head, it will be broadcasted to all heads) // gemma 2 requires different mask for layers using sliding window (SWA) - struct ggml_tensor * KQ_mask_full = build_inp_KQ_mask(); - struct ggml_tensor * KQ_mask_SWA = build_inp_KQ_mask(); - lctx.inp_KQ_mask_l.clear(); + struct ggml_tensor * KQ_mask_full = build_inp_KQ_mask(true, false); + struct ggml_tensor * KQ_mask_SWA = build_inp_KQ_mask(true, true); for (int il = 0; il < n_layer; ++il) { // (il % 2) layers use SWA struct ggml_tensor * KQ_mask = (il % 2 == 0) ? KQ_mask_SWA : KQ_mask_full; - lctx.inp_KQ_mask_l.push_back(KQ_mask); // norm cur = llm_build_norm(ctx0, inpL, hparams, @@ -12685,15 +12687,15 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer)); - float * data = (float *) lctx.inp_KQ_mask->data; + float * data = (float *) lctx.inp_KQ_mask->data; float * data_swa = nullptr; const llama_pos n_keep_swa = hparams.n_sliding - batch.n_tokens; if (lctx.model.arch == LLM_ARCH_GEMMA2) { - GGML_ASSERT(!lctx.inp_KQ_mask_l.empty() && "gemma 2 requires different KQ mask per layer"); + GGML_ASSERT(lctx.inp_KQ_mask_SWA); GGML_ASSERT(hparams.n_sliding > 0); - data_swa = (float *) lctx.inp_KQ_mask_l[0]->data; - data = (float *) lctx.inp_KQ_mask_l[1]->data; + data = (float *) lctx.inp_KQ_mask->data; + data_swa = (float *) lctx.inp_KQ_mask_SWA->data; // because layer masks are alternate for gemma 2, we only need to take first 2 layers } From d0a7145ba99ed3a8bc3145aa785b5c86ffe65020 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 1 Jul 2024 02:09:34 +0300 Subject: [PATCH 03/12] flake.lock: Update (#8218) --- flake.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/flake.lock b/flake.lock index 79bb3f63fdc6d0..973ff4e5675c77 100644 --- a/flake.lock +++ b/flake.lock @@ -20,11 +20,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1718895438, - "narHash": "sha256-k3JqJrkdoYwE3fHE6xGDY676AYmyh4U2Zw+0Bwe5DLU=", + "lastModified": 1719506693, + "narHash": "sha256-C8e9S7RzshSdHB7L+v9I51af1gDM5unhJ2xO1ywxNH8=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "d603719ec6e294f034936c0d0dc06f689d91b6c3", + "rev": "b2852eb9365c6de48ffb0dc2c9562591f652242a", "type": "github" }, "original": { From ed5496fb32e9888abeaa7672aaba4d4251671457 Mon Sep 17 00:00:00 2001 From: ngxson Date: Mon, 1 Jul 2024 12:35:47 +0200 Subject: [PATCH 04/12] update --- src/llama.cpp | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 71b7ef622019e3..1f676357333ffa 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2101,7 +2101,7 @@ struct llama_hparams { uint32_t n_ff_shexp = 0; uint32_t n_expert_shared = 
0; float expert_weights_scale = 0.0; - uint32_t n_sliding = 0; // sliding window attention (SWA) + uint32_t n_swa = 0; // sliding window attention (SWA) float f_norm_eps; float f_norm_rms_eps; @@ -2665,7 +2665,7 @@ struct llama_context { struct ggml_tensor * inp_s_seq; // I32 [n_kv, n_batch] // KQ mask per layer, used by sliding window attention (gemma 2) - struct ggml_tensor * inp_KQ_mask_SWA; + struct ggml_tensor * inp_KQ_mask_swa; // control vectors struct llama_control_vector cvec; @@ -4715,8 +4715,8 @@ static void llm_load_hparams( } break; case LLM_ARCH_GEMMA2: { - hparams.n_sliding = 4096; // default value of gemma 2 - ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_sliding, false); + hparams.n_swa = 4096; // default value of gemma 2 + ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false); ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false); @@ -7794,7 +7794,7 @@ struct llm_build_context { lctx.inp_s_copy = nullptr; lctx.inp_s_mask = nullptr; lctx.inp_s_seq = nullptr; - lctx.inp_KQ_mask_SWA = nullptr; + lctx.inp_KQ_mask_swa = nullptr; } void free() { @@ -7954,7 +7954,7 @@ struct llm_build_context { cb(KQ_mask, "KQ_mask", -1); ggml_set_input(KQ_mask); if (sliding_window) { - lctx.inp_KQ_mask_SWA = KQ_mask; + lctx.inp_KQ_mask_swa = KQ_mask; } else { lctx.inp_KQ_mask = KQ_mask; } @@ -12689,14 +12689,9 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { float * data = (float *) lctx.inp_KQ_mask->data; float * data_swa = nullptr; - const llama_pos n_keep_swa = hparams.n_sliding - batch.n_tokens; - if (lctx.model.arch == LLM_ARCH_GEMMA2) { - GGML_ASSERT(lctx.inp_KQ_mask_SWA); - GGML_ASSERT(hparams.n_sliding > 0); - data = (float *) lctx.inp_KQ_mask->data; - data_swa = (float *) lctx.inp_KQ_mask_SWA->data; - // because layer masks are alternate for gemma 2, we only need to take first 2 layers + if (lctx.inp_KQ_mask_swa) { + data_swa = (float *) lctx.inp_KQ_mask_swa->data; } // For causal attention, use only the previous KV cells @@ -12722,7 +12717,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { // may need to cut off old tokens for sliding window if (data_swa) { - if (pos - lctx.kv_self.cells[i].pos > n_keep_swa) { + if (pos - lctx.kv_self.cells[i].pos >= (int32_t)hparams.n_swa) { f = -INFINITY; } data_swa[h*(n_kv*n_tokens) + j*n_kv + i] = f; From 197fe6c1d7bec6718ce901f0141b2725240f298c Mon Sep 17 00:00:00 2001 From: zhentaoyu Date: Mon, 1 Jul 2024 19:39:06 +0800 Subject: [PATCH 05/12] [SYCL] Update SYCL-Rope op and Refactor (#8157) * align with rope.cu and move sycl-op to a single file --- ggml/src/ggml-sycl.cpp | 305 +-------------------------------- ggml/src/ggml-sycl/backend.hpp | 1 + ggml/src/ggml-sycl/rope.cpp | 275 +++++++++++++++++++++++++++++ ggml/src/ggml-sycl/rope.hpp | 22 +++ 4 files changed, 300 insertions(+), 303 deletions(-) create mode 100644 ggml/src/ggml-sycl/rope.cpp create mode 100644 ggml/src/ggml-sycl/rope.hpp diff --git a/ggml/src/ggml-sycl.cpp b/ggml/src/ggml-sycl.cpp index 4a668a2c34d3ea..30d8a5b33b6133 100644 --- a/ggml/src/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl.cpp @@ -978,114 +978,6 @@ static void cpy_f32_q(const char * cx, char * cdst, const int ne, cpy_blck(cx + x_offset, cdst + dst_offset); } -static float rope_yarn_ramp(const float low, const float high, const int i0) { - const float y = 
(i0 / 2 - low) / sycl::max(0.001f, high - low); - return 1.0f - sycl::min(1.0f, sycl::max(0.0f, y)); -} - -struct rope_corr_dims { - float v[4]; -}; - -// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn -// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng. -static void rope_yarn( - float theta_extrap, float freq_scale, rope_corr_dims corr_dims, int64_t i0, float ext_factor, float mscale, - float * cos_theta, float * sin_theta -) { - // Get n-d rotational scaling corrected for extrapolation - float theta_interp = freq_scale * theta_extrap; - float theta = theta_interp; - if (ext_factor != 0.0f) { - float ramp_mix = rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor; - theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; - - // Get n-d magnitude scaling corrected for interpolation - mscale *= 1.0f + 0.1f * sycl::log(1.0f / freq_scale); - } - *cos_theta = sycl::cos(theta) * mscale; - *sin_theta = sycl::sin(theta) * mscale; -} - -// rope == RoPE == rotary positional embedding -template -static void rope( - const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base, - float ext_factor, float attn_factor, rope_corr_dims corr_dims -, - const sycl::nd_item<3> &item_ct1) { - const int col = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) + - item_ct1.get_local_id(1)); - - if (col >= ncols) { - return; - } - - const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - const int i = row*ncols + col; - const int i2 = row/p_delta_rows; - - const int p = has_pos ? pos[i2] : 0; - const float theta_base = p * dpct::pow(freq_base, -float(col) / ncols); - - float cos_theta, sin_theta; - rope_yarn(theta_base, freq_scale, corr_dims, col, ext_factor, attn_factor, &cos_theta, &sin_theta); - - const float x0 = x[i + 0]; - const float x1 = x[i + 1]; - - dst[i + 0] = x0*cos_theta - x1*sin_theta; - dst[i + 1] = x0*sin_theta + x1*cos_theta; -} - -template -static void rope_neox( - const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows, - float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims, - const float * freq_factors, const sycl::nd_item<3> &item_ct1) { - const int col = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) + - item_ct1.get_local_id(1)); - - if (col >= ncols) { - return; - } - - const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - const int ib = col / n_dims; - const int ic = col % n_dims; - - if (ib > 0) { - const int i = row*ncols + ib*n_dims + ic; - - dst[i + 0] = x[i + 0]; - dst[i + 1] = x[i + 1]; - - return; - } - - const int i = row*ncols + ib*n_dims + ic/2; - const int i2 = row/p_delta_rows; - - float cur_rot = inv_ndims * ic - ib; - - const int p = has_pos ? pos[i2] : 0; - const float freq_factor = has_freq_facs ? 
freq_factors[ic/2] : 1.0f; - - const float theta_base = - p * freq_scale * dpct::pow(theta_scale, col / 2.0f)/freq_factor; - - float cos_theta, sin_theta; - rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta); - - const float x0 = x[i + 0]; - const float x1 = x[i + n_dims/2]; - - dst[i + 0] = x0*cos_theta - x1*sin_theta; - dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta; -} - static void k_sum_rows_f32(const float * x, float * dst, const int ncols, const sycl::nd_item<3> &item_ct1) { const int row = item_ct1.get_group(1); @@ -2241,110 +2133,6 @@ static void clamp_f32_sycl(const float *x, float *dst, const float min, }); } -template -static void rope_sycl(const T *x, T *dst, int ncols, int nrows, - const int32_t *pos, float freq_scale, int p_delta_rows, - float freq_base, float ext_factor, float attn_factor, - rope_corr_dims corr_dims, queue_ptr stream) { - GGML_ASSERT(ncols % 2 == 0); - const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1); - const int num_blocks_x = (ncols + 2*SYCL_ROPE_BLOCK_SIZE - 1) / (2*SYCL_ROPE_BLOCK_SIZE); - const sycl::range<3> block_nums(1, num_blocks_x, nrows); - if (pos == nullptr) { - /* - DPCT1049:40: The work-group size passed to the SYCL kernel may exceed - the limit. To get the device limit, query - info::device::max_work_group_size. Adjust the work-group size if needed. - */ - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - rope(x, dst, ncols, pos, freq_scale, p_delta_rows, - freq_base, ext_factor, attn_factor, corr_dims, - item_ct1); - }); - } else { - /* - DPCT1049:41: The work-group size passed to the SYCL kernel may exceed - the limit. To get the device limit, query - info::device::max_work_group_size. Adjust the work-group size if needed. 
- */ - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - rope(x, dst, ncols, pos, freq_scale, p_delta_rows, - freq_base, ext_factor, attn_factor, corr_dims, - item_ct1); - }); - } -} - -template -static void rope_neox_sycl(const T *x, T *dst, int ncols, int n_dims, int nrows, - const int32_t *pos, float freq_scale, - int p_delta_rows, float freq_base, float ext_factor, - float attn_factor, rope_corr_dims corr_dims, - const float * freq_factors, queue_ptr stream) { - GGML_ASSERT(ncols % 2 == 0); - const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1); - const int num_blocks_x = (ncols + 2*SYCL_ROPE_BLOCK_SIZE - 1) / (2*SYCL_ROPE_BLOCK_SIZE); - const sycl::range<3> block_nums(1, num_blocks_x, nrows); - - const float theta_scale = powf(freq_base, -2.0f/n_dims); - const float inv_ndims = -1.0f / n_dims; - - if (pos == nullptr) { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - if (freq_factors == nullptr) { - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - rope_neox(x, dst, ncols, n_dims, pos, freq_scale, - p_delta_rows, ext_factor, attn_factor, - corr_dims, theta_scale, inv_ndims, freq_factors, - item_ct1); - }); - } else { - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - rope_neox(x, dst, ncols, n_dims, pos, freq_scale, - p_delta_rows, ext_factor, attn_factor, - corr_dims, theta_scale, inv_ndims, freq_factors, - item_ct1); - }); - } - } else { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - if (freq_factors == nullptr) { - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - rope_neox(x, dst, ncols, n_dims, pos, freq_scale, - p_delta_rows, ext_factor, attn_factor, - corr_dims, theta_scale, inv_ndims, freq_factors, item_ct1); - }); - } else { - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - rope_neox(x, dst, ncols, n_dims, pos, freq_scale, - p_delta_rows, ext_factor, attn_factor, - corr_dims, theta_scale, inv_ndims, freq_factors, item_ct1); - }); - } - } -} - static void sum_rows_f32_sycl(const float *x, float *dst, const int ncols, const int nrows, queue_ptr stream) { const sycl::range<3> block_dims(1, 1, WARP_SIZE); @@ -3461,97 +3249,6 @@ catch (sycl::exception const &exc) { std::exit(1); } -inline void ggml_sycl_op_rope(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst, const float *src0_dd, - const float *src1_dd, float *dst_dd, - const queue_ptr &main_stream) { - const ggml_tensor * src2 = dst->src[2]; - - GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); - GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); - GGML_ASSERT(src0->type == dst->type); - - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t nrows = ggml_nrows(src0); - - //const int n_past = ((int32_t *) dst->op_params)[0]; - const int n_dims = ((int32_t *) dst->op_params)[1]; - const int mode = ((int32_t *) dst->op_params)[2]; - //const int n_ctx = ((int32_t *) dst->op_params)[3]; - const int n_ctx_orig = ((int32_t *) dst->op_params)[4]; - - // RoPE alteration for extended context - 
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; - memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); - memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); - memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); - memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); - memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); - memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); - - const float * freq_factors = nullptr; - const int32_t * pos = nullptr; - if ((mode & 1) == 0) { - GGML_ASSERT(src1->type == GGML_TYPE_I32); - GGML_ASSERT(src1->ne[0] == ne2); - pos = (const int32_t *) src1_dd; - } - - const bool is_neox = mode & 2; - -#pragma message("TODO: update rope NORM mode to match NEOX mode") -#pragma message(" https://github.com/ggerganov/llama.cpp/pull/7634") - - if (is_neox) { - pos = (const int32_t *) src1_dd; - - if (src2 != nullptr) { - freq_factors = (const float *) src2->data; - } - } else { - GGML_ASSERT(src2 == nullptr && "TODO: freq_factors not implemented for !is_neox"); - } - - rope_corr_dims corr_dims; - ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims.v); - - // compute - if (is_neox) { - if (src0->type == GGML_TYPE_F32) { - rope_neox_sycl( - (const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor, - attn_factor, corr_dims, freq_factors, main_stream - ); - } else if (src0->type == GGML_TYPE_F16) { - rope_neox_sycl((const sycl::half *)src0_dd, (sycl::half *)dst_dd, - ne00, n_dims, nrows, pos, freq_scale, ne01, - freq_base, ext_factor, attn_factor, corr_dims, - freq_factors, main_stream); - } else { - GGML_ASSERT(false); - } - } else { - if (src0->type == GGML_TYPE_F32) { - rope_sycl( - (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor, - attn_factor, corr_dims, main_stream - ); - } else if (src0->type == GGML_TYPE_F16) { - rope_sycl((const sycl::half *)src0_dd, (sycl::half *)dst_dd, ne00, - nrows, pos, freq_scale, ne01, freq_base, ext_factor, - attn_factor, corr_dims, main_stream); - } else { - GGML_ASSERT(false); - } - } - - (void) src1; - (void) dst; - (void) src1_dd; -} - static void ggml_sycl_op_pool2d(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, const float *src0_dd, const float *src1_dd, @@ -6241,7 +5938,9 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons case GGML_OP_CONT: case GGML_OP_DIAG_MASK_INF: case GGML_OP_SOFT_MAX: + return true; case GGML_OP_ROPE: + return ggml_is_contiguous(op->src[0]); case GGML_OP_IM2COL: case GGML_OP_POOL_2D: case GGML_OP_SUM_ROWS: diff --git a/ggml/src/ggml-sycl/backend.hpp b/ggml/src/ggml-sycl/backend.hpp index 2d37e271f90508..d5a63cd710cc34 100644 --- a/ggml/src/ggml-sycl/backend.hpp +++ b/ggml/src/ggml-sycl/backend.hpp @@ -19,5 +19,6 @@ #include "dmmv.hpp" #include "mmq.hpp" #include "mmvq.hpp" +#include "rope.hpp" #endif // GGML_SYCL_BACKEND_HPP diff --git a/ggml/src/ggml-sycl/rope.cpp b/ggml/src/ggml-sycl/rope.cpp new file mode 100644 index 00000000000000..eabf1693e2d2f4 --- /dev/null +++ b/ggml/src/ggml-sycl/rope.cpp @@ -0,0 +1,275 @@ +#include "rope.hpp" + +struct rope_corr_dims { + float v[2]; +}; + +static float rope_yarn_ramp(const float low, const float high, const int i0) { + const float y = (i0 / 2 - low) / sycl::max(0.001f, high - low); + return 1.0f - sycl::min(1.0f, sycl::max(0.0f, 
y)); +} + +// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn +// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng. +static void rope_yarn( + float theta_extrap, float freq_scale, rope_corr_dims corr_dims, int64_t i0, float ext_factor, float mscale, + float * cos_theta, float * sin_theta) { + // Get n-d rotational scaling corrected for extrapolation + float theta_interp = freq_scale * theta_extrap; + float theta = theta_interp; + if (ext_factor != 0.0f) { + float ramp_mix = rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor; + theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; + + // Get n-d magnitude scaling corrected for interpolation + mscale *= 1.0f + 0.1f * sycl::log(1.0f / freq_scale); + } + *cos_theta = sycl::cos(theta) * mscale; + *sin_theta = sycl::sin(theta) * mscale; +} + +template +static void rope_norm( + const T * x, T * dst, int ne0, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows, + float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors, + const sycl::nd_item<3> &item_ct1) { + const int i0 = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) + + item_ct1.get_local_id(1)); + + if (i0 >= ne0) { + return; + } + + const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + + if (i0 >= n_dims) { + const int i = row*ne0 + i0; + + dst[i + 0] = x[i + 0]; + dst[i + 1] = x[i + 1]; + + return; + } + + const int i = row*ne0 + i0; + const int i2 = row/p_delta_rows; + + const float theta_base = pos[i2]*powf(theta_scale, i0/2.0f); + + const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f; + + float cos_theta; + float sin_theta; + + rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta); + + const float x0 = x[i + 0]; + const float x1 = x[i + 1]; + + dst[i + 0] = x0*cos_theta - x1*sin_theta; + dst[i + 1] = x0*sin_theta + x1*cos_theta; +} + +template +static void rope_neox( + const T * x, T * dst, int ne0, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows, + float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors, + const sycl::nd_item<3> &item_ct1) { + const int i0 = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) + + item_ct1.get_local_id(1)); + + if (i0 >= ne0) { + return; + } + + const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + + if (i0 >= n_dims) { + const int i = row*ne0 + i0; + + dst[i + 0] = x[i + 0]; + dst[i + 1] = x[i + 1]; + + return; + } + + const int i = row*ne0 + i0/2; + const int i2 = row/p_delta_rows; + + const float theta_base = pos[i2]*powf(theta_scale, i0/2.0f); + + const float freq_factor = has_ff ? 
freq_factors[i0/2] : 1.0f; + + float cos_theta; + float sin_theta; + + rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta); + + const float x0 = x[i + 0]; + const float x1 = x[i + n_dims/2]; + + dst[i + 0] = x0*cos_theta - x1*sin_theta; + dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta; +} + +template +static void rope_norm_sycl( + const T *x, T *dst, int ne0, int n_dims, int nr, const int32_t *pos, float freq_scale, int p_delta_rows, + float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, queue_ptr stream) { + GGML_ASSERT(ne0 % 2 == 0); + const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1); + const int num_blocks_x = (ne0 + 2*SYCL_ROPE_BLOCK_SIZE - 1) / (2*SYCL_ROPE_BLOCK_SIZE); + const sycl::range<3> block_nums(1, num_blocks_x, nr); + + const float theta_scale = powf(freq_base, -2.0f/n_dims); + + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + if (freq_factors == nullptr) { + /* + DPCT1049:40: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + rope_norm(x, dst, ne0, n_dims, pos, freq_scale, p_delta_rows, + ext_factor, attn_factor, corr_dims, theta_scale, freq_factors, + item_ct1); + }); + } else { + /* + DPCT1049:41: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + rope_norm(x, dst, ne0, n_dims, pos, freq_scale, p_delta_rows, + ext_factor, attn_factor, corr_dims, theta_scale, freq_factors, + item_ct1); + }); + } +} + +template +static void rope_neox_sycl( + const T *x, T *dst, int ne0, int n_dims, int nr, const int32_t *pos, float freq_scale, int p_delta_rows, + float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, queue_ptr stream) { + GGML_ASSERT(ne0 % 2 == 0); + const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1); + const int num_blocks_x = (ne0 + 2*SYCL_ROPE_BLOCK_SIZE - 1) / (2*SYCL_ROPE_BLOCK_SIZE); + const sycl::range<3> block_nums(1, num_blocks_x, nr); + + const float theta_scale = powf(freq_base, -2.0f/n_dims); + + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + if (freq_factors == nullptr) { + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + rope_neox(x, dst, ne0, n_dims, pos, freq_scale, + p_delta_rows, ext_factor, attn_factor, + corr_dims, theta_scale, freq_factors, + item_ct1); + }); + } else { + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + rope_neox(x, dst, ne0, n_dims, pos, freq_scale, + p_delta_rows, ext_factor, attn_factor, + corr_dims, theta_scale, freq_factors, + item_ct1); + }); + } +} + +void ggml_sycl_op_rope( + ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, + const float *src0_dd, const float *src1_dd, float *dst_dd, const queue_ptr &main_stream) { + const ggml_tensor * src2 = dst->src[2]; + + GGML_ASSERT(src0->type == GGML_TYPE_F32 || 
src0->type == GGML_TYPE_F16); + GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); + GGML_ASSERT(src0->type == dst->type); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t nr = ggml_nrows(src0); + + //const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + //const int n_ctx = ((int32_t *) dst->op_params)[3]; + const int n_ctx_orig = ((int32_t *) dst->op_params)[4]; + + // RoPE alteration for extended context + float freq_base; + float freq_scale; + float ext_factor; + float attn_factor; + float beta_fast; + float beta_slow; + + memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); + memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); + memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); + + const bool is_neox = mode & 2; + + const int32_t * pos = (const int32_t *) src1_dd; + + const float * freq_factors = nullptr; + if (src2 != nullptr) { + freq_factors = (const float *) src2->data; + } + + rope_corr_dims corr_dims; + ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims.v); + + // compute + if (is_neox) { + if (src0->type == GGML_TYPE_F32) { + rope_neox_sycl( + (const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor, + attn_factor, corr_dims, freq_factors, main_stream + ); + } else if (src0->type == GGML_TYPE_F16) { + rope_neox_sycl( + (const sycl::half *)src0_dd, (sycl::half *)dst_dd, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor, + attn_factor, corr_dims, freq_factors, main_stream + ); + } else { + GGML_ASSERT(false); + } + } else { + if (src0->type == GGML_TYPE_F32) { + rope_norm_sycl( + (const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor, + attn_factor, corr_dims, freq_factors, main_stream + ); + } else if (src0->type == GGML_TYPE_F16) { + rope_norm_sycl( + (const sycl::half *)src0_dd, (sycl::half *)dst_dd, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor, + attn_factor, corr_dims, freq_factors, main_stream + ); + } else { + GGML_ASSERT(false); + } + } + + (void) src1; + (void) dst; + (void) src1_dd; +} diff --git a/ggml/src/ggml-sycl/rope.hpp b/ggml/src/ggml-sycl/rope.hpp new file mode 100644 index 00000000000000..00354c3131bd77 --- /dev/null +++ b/ggml/src/ggml-sycl/rope.hpp @@ -0,0 +1,22 @@ +// +// MIT license +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: MIT +// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +#ifndef GGML_SYCL_ROPE_HPP +#define GGML_SYCL_ROPE_HPP + +#include "common.hpp" + +void ggml_sycl_op_rope( + ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, + const float *src0_dd, const float *src1_dd, float *dst_dd, const queue_ptr &main_stream); + +#endif // GGML_SYCL_ROPE_HPP From 694c59cb42d1ebd6a7d912ca65d3d7363e0f14c9 Mon Sep 17 00:00:00 2001 From: iacore <74560659+iacore@users.noreply.github.com> Date: Mon, 1 Jul 2024 11:40:58 +0000 Subject: [PATCH 06/12] Document BERT support. (#8205) * Update README.md document BERT support * Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 99b16f6e29e329..153d837e374eaf 100644 --- a/README.md +++ b/README.md @@ -108,6 +108,7 @@ Typically finetunes of the base models below are supported as well. - [X] [Falcon](https://huggingface.co/models?search=tiiuae/falcon) - [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2) - [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne) +- [X] [BERT](https://github.com/ggerganov/llama.cpp/pull/5423) - [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/) - [X] [Baichuan 1 & 2](https://huggingface.co/models?search=baichuan-inc/Baichuan) + [derivations](https://huggingface.co/hiyouga/baichuan-7b-sft) - [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila) From 257f8e41e24b5bbfc27d9e907189a3e0cdb650d4 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 1 Jul 2024 14:46:18 +0300 Subject: [PATCH 07/12] nix : remove OpenCL remnants (#8235) * nix : remove OpenCL remnants * minor : remove parentheses --- .devops/nix/package.nix | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/.devops/nix/package.nix b/.devops/nix/package.nix index 4ee0d62cb49d9a..b75d7ff9e5bab5 100644 --- a/.devops/nix/package.nix +++ b/.devops/nix/package.nix @@ -17,18 +17,15 @@ rocmPackages, vulkan-headers, vulkan-loader, - clblast, useBlas ? builtins.all (x: !x) [ useCuda useMetalKit - useOpenCL useRocm useVulkan ] && blas.meta.available, useCuda ? config.cudaSupport, - useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin && !useOpenCL, + useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin, useMpi ? false, # Increases the runtime closure size by ~700M - useOpenCL ? false, useRocm ? config.rocmSupport, useVulkan ? false, llamaVersion ? 
"0.0.0", # Arbitrary version, substituted by the flake @@ -56,7 +53,6 @@ let ++ lib.optionals useCuda [ "CUDA" ] ++ lib.optionals useMetalKit [ "MetalKit" ] ++ lib.optionals useMpi [ "MPI" ] - ++ lib.optionals useOpenCL [ "OpenCL" ] ++ lib.optionals useRocm [ "ROCm" ] ++ lib.optionals useVulkan [ "Vulkan" ]; @@ -198,7 +194,6 @@ effectiveStdenv.mkDerivation ( optionals effectiveStdenv.isDarwin darwinBuildInputs ++ optionals useCuda cudaBuildInputs ++ optionals useMpi [ mpi ] - ++ optionals useOpenCL [ clblast ] ++ optionals useRocm rocmBuildInputs ++ optionals useBlas [ blas ] ++ optionals useVulkan vulkanBuildInputs; @@ -210,7 +205,6 @@ effectiveStdenv.mkDerivation ( (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true) (cmakeBool "GGML_NATIVE" false) (cmakeBool "GGML_BLAS" useBlas) - (cmakeBool "GGML_CLBLAST" useOpenCL) (cmakeBool "GGML_CUDA" useCuda) (cmakeBool "GGML_HIPBLAS" useRocm) (cmakeBool "GGML_METAL" useMetalKit) @@ -254,7 +248,6 @@ effectiveStdenv.mkDerivation ( useCuda useMetalKit useMpi - useOpenCL useRocm useVulkan ; @@ -281,7 +274,7 @@ effectiveStdenv.mkDerivation ( # Configurations we don't want even the CI to evaluate. Results in the # "unsupported platform" messages. This is mostly a no-op, because # cudaPackages would've refused to evaluate anyway. - badPlatforms = optionals (useCuda || useOpenCL) lib.platforms.darwin; + badPlatforms = optionals useCuda lib.platforms.darwin; # Configurations that are known to result in build failures. Can be # overridden by importing Nixpkgs with `allowBroken = true`. From 3840b6f593751a0ba636bfda73b630cd6c29d7b5 Mon Sep 17 00:00:00 2001 From: Michael Francis Date: Mon, 1 Jul 2024 07:47:04 -0400 Subject: [PATCH 08/12] nix : enable curl (#8043) Co-authored-by: Georgi Gerganov --- .devops/nix/package.nix | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.devops/nix/package.nix b/.devops/nix/package.nix index b75d7ff9e5bab5..49e9b75287b338 100644 --- a/.devops/nix/package.nix +++ b/.devops/nix/package.nix @@ -17,6 +17,7 @@ rocmPackages, vulkan-headers, vulkan-loader, + curl, useBlas ? builtins.all (x: !x) [ useCuda useMetalKit @@ -27,6 +28,7 @@ useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin, useMpi ? false, # Increases the runtime closure size by ~700M useRocm ? config.rocmSupport, + enableCurl ? true, useVulkan ? false, llamaVersion ? 
"0.0.0", # Arbitrary version, substituted by the flake @@ -196,13 +198,15 @@ effectiveStdenv.mkDerivation ( ++ optionals useMpi [ mpi ] ++ optionals useRocm rocmBuildInputs ++ optionals useBlas [ blas ] - ++ optionals useVulkan vulkanBuildInputs; + ++ optionals useVulkan vulkanBuildInputs + ++ optionals enableCurl [ curl ]; cmakeFlags = [ (cmakeBool "LLAMA_BUILD_SERVER" true) (cmakeBool "BUILD_SHARED_LIBS" (!enableStatic)) (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true) + (cmakeBool "LLAMA_CURL" enableCurl) (cmakeBool "GGML_NATIVE" false) (cmakeBool "GGML_BLAS" useBlas) (cmakeBool "GGML_CUDA" useCuda) From 0ddeff10230b88f1fa9866bbe5fe0d71ba2323a0 Mon Sep 17 00:00:00 2001 From: Roni Date: Mon, 1 Jul 2024 14:48:16 +0200 Subject: [PATCH 09/12] readme : update tool list (#8209) * Added gppm to Tool list in README * Update README.md --------- Co-authored-by: Georgi Gerganov --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 153d837e374eaf..c136d4a5cb9c95 100644 --- a/README.md +++ b/README.md @@ -218,6 +218,7 @@ Unless otherwise noted these projects are open-source with permissive licensing: **Tools:** - [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML +[crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption --- From ce711f6eae1079592e060e47ec23b6a795387ce6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 1 Jul 2024 18:26:24 +0300 Subject: [PATCH 10/12] llama : minor styling --- src/llama.cpp | 89 +++++++++++++++++++++++++++------------------------ 1 file changed, 48 insertions(+), 41 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 1f676357333ffa..eea532f6ac2ff3 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2087,6 +2087,7 @@ struct llama_hparams { uint32_t n_head_kv; uint32_t n_layer; uint32_t n_rot; + uint32_t n_swa = 0; // sliding window attention (SWA) uint32_t n_embd_head_k; // dimension of keys (d_k). 
d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head uint32_t n_ff; @@ -2101,7 +2102,6 @@ struct llama_hparams { uint32_t n_ff_shexp = 0; uint32_t n_expert_shared = 0; float expert_weights_scale = 0.0; - uint32_t n_swa = 0; // sliding window attention (SWA) float f_norm_eps; float f_norm_rms_eps; @@ -2142,6 +2142,7 @@ struct llama_hparams { if (this->n_head_kv != other.n_head_kv) return true; if (this->n_layer != other.n_layer) return true; if (this->n_rot != other.n_rot) return true; + if (this->n_swa != other.n_swa) return true; if (this->n_embd_head_k != other.n_embd_head_k) return true; if (this->n_embd_head_v != other.n_embd_head_v) return true; if (this->n_ff != other.n_ff) return true; @@ -2652,20 +2653,18 @@ struct llama_context { void * abort_callback_data = nullptr; // input tensors - struct ggml_tensor * inp_tokens; // I32 [n_batch] - struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch] - struct ggml_tensor * inp_pos; // I32 [n_batch] - struct ggml_tensor * inp_out_ids; // I32 [n_outputs] - struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch] - struct ggml_tensor * inp_K_shift; // I32 [kv_size] - struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch] - struct ggml_tensor * inp_cls; // I32 [n_batch] - struct ggml_tensor * inp_s_copy; // I32 [kv_size] - struct ggml_tensor * inp_s_mask; // F32 [1, n_kv] - struct ggml_tensor * inp_s_seq; // I32 [n_kv, n_batch] - - // KQ mask per layer, used by sliding window attention (gemma 2) - struct ggml_tensor * inp_KQ_mask_swa; + struct ggml_tensor * inp_tokens; // I32 [n_batch] + struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch] + struct ggml_tensor * inp_pos; // I32 [n_batch] + struct ggml_tensor * inp_out_ids; // I32 [n_outputs] + struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch] + struct ggml_tensor * inp_KQ_mask_swa; // F32 [kv_size, n_batch] + struct ggml_tensor * inp_K_shift; // I32 [kv_size] + struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch] + struct ggml_tensor * inp_cls; // I32 [n_batch] + struct ggml_tensor * inp_s_copy; // I32 [kv_size] + struct ggml_tensor * inp_s_mask; // F32 [1, n_kv] + struct ggml_tensor * inp_s_seq; // I32 [n_kv, n_batch] // control vectors struct llama_control_vector cvec; @@ -5427,6 +5426,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { LLAMA_LOG_INFO("%s: n_head_kv = %u\n", __func__, hparams.n_head_kv); LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer); LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot); + LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa); LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k); LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v); LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa()); @@ -7783,18 +7783,18 @@ struct llm_build_context { ctx0 = ggml_init(params); - lctx.inp_tokens = nullptr; - lctx.inp_embd = nullptr; - lctx.inp_pos = nullptr; - lctx.inp_out_ids = nullptr; - lctx.inp_KQ_mask = nullptr; - lctx.inp_K_shift = nullptr; - lctx.inp_mean = nullptr; - lctx.inp_cls = nullptr; - lctx.inp_s_copy = nullptr; - lctx.inp_s_mask = nullptr; - lctx.inp_s_seq = nullptr; + lctx.inp_tokens = nullptr; + lctx.inp_embd = nullptr; + lctx.inp_pos = nullptr; + lctx.inp_out_ids = nullptr; + lctx.inp_KQ_mask = nullptr; lctx.inp_KQ_mask_swa = nullptr; + lctx.inp_K_shift = nullptr; + lctx.inp_mean = nullptr; + lctx.inp_cls = 
nullptr; + lctx.inp_s_copy = nullptr; + lctx.inp_s_mask = nullptr; + lctx.inp_s_seq = nullptr; } void free() { @@ -7813,7 +7813,6 @@ struct llm_build_context { cb(lctx.inp_K_shift, "K_shift", -1); ggml_set_input(lctx.inp_K_shift); - for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * rope_factors = build_rope_factors(il); struct ggml_tensor * tmp = @@ -7947,18 +7946,26 @@ struct llm_build_context { return lctx.inp_out_ids; } - struct ggml_tensor * build_inp_KQ_mask(bool causal = true, bool sliding_window = false) { - struct ggml_tensor * KQ_mask = causal + struct ggml_tensor * build_inp_KQ_mask(bool causal = true) { + lctx.inp_KQ_mask = causal ? ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)) : ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); - cb(KQ_mask, "KQ_mask", -1); - ggml_set_input(KQ_mask); - if (sliding_window) { - lctx.inp_KQ_mask_swa = KQ_mask; - } else { - lctx.inp_KQ_mask = KQ_mask; - } - return flash_attn ? ggml_cast(ctx0, KQ_mask, GGML_TYPE_F16) : KQ_mask; + cb(lctx.inp_KQ_mask, "KQ_mask", -1); + ggml_set_input(lctx.inp_KQ_mask); + + return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask, GGML_TYPE_F16) : lctx.inp_KQ_mask; + } + + struct ggml_tensor * build_inp_KQ_mask_swa(bool causal = true) { + GGML_ASSERT(hparams.n_swa > 0); + + lctx.inp_KQ_mask_swa = causal + ? ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)) + : ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); + cb(lctx.inp_KQ_mask_swa, "KQ_mask_swa", -1); + ggml_set_input(lctx.inp_KQ_mask_swa); + + return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask_swa, GGML_TYPE_F16) : lctx.inp_KQ_mask_swa; } struct ggml_tensor * build_inp_mean() { @@ -11042,12 +11049,12 @@ struct llm_build_context { // KQ_mask (mask for 1 head, it will be broadcasted to all heads) // gemma 2 requires different mask for layers using sliding window (SWA) - struct ggml_tensor * KQ_mask_full = build_inp_KQ_mask(true, false); - struct ggml_tensor * KQ_mask_SWA = build_inp_KQ_mask(true, true); + struct ggml_tensor * KQ_mask = build_inp_KQ_mask(true); + struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa(true); for (int il = 0; il < n_layer; ++il) { // (il % 2) layers use SWA - struct ggml_tensor * KQ_mask = (il % 2 == 0) ? KQ_mask_SWA : KQ_mask_full; + struct ggml_tensor * KQ_mask_l = (il % 2 == 0) ? 
KQ_mask_swa : KQ_mask; // norm cur = llm_build_norm(ctx0, inpL, hparams, @@ -11084,7 +11091,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); + Kcur, Vcur, Qcur, KQ_mask_l, n_tokens, kv_head, n_kv, 1.0f, cb, il); } cur = llm_build_norm(ctx0, cur, hparams, From 7dc9cbf03fd7d3b3a977872f7bae0fe6cbfad5db Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 1 Jul 2024 18:38:24 +0300 Subject: [PATCH 11/12] convert : add sanity check for query_pre_attn_scalar --- convert-hf-to-gguf.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 27fc9eea6d69b6..4a7f500ff7d5ca 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -2371,6 +2371,11 @@ def set_gguf_parameters(self): ) self.gguf_writer.add_sliding_window(self.hparams["sliding_window"]) + # sanity check + attn_scalar = self.hparams["query_pre_attn_scalar"] + if attn_scalar != hparams["hidden_size"] / hparams["num_attention_heads"]: + raise ValueError("query_pre_attn_scalar must be equal to n_embd / n_head") + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unusem From e24328ea1ada2d298ea8a64a141d13d240a87319 Mon Sep 17 00:00:00 2001 From: ngxson Date: Mon, 1 Jul 2024 17:54:34 +0200 Subject: [PATCH 12/12] fix small typo in README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c136d4a5cb9c95..daba70717312e2 100644 --- a/README.md +++ b/README.md @@ -218,7 +218,7 @@ Unless otherwise noted these projects are open-source with permissive licensing: **Tools:** - [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML -[crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption +- [crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption ---
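For reference, below is a standalone C++ sketch (not taken from the patches) of the two attention masks that patches 02, 04 and 10 wire up for gemma 2: a full causal mask plus a sliding-window (SWA) variant that also drops keys more than n_swa positions behind the query, with even layers (il % 2 == 0) using the SWA mask. The sizes are toy assumptions for illustration only — gemma 2's default window is 4096 — and the fill loop mirrors the `pos - cells[i].pos >= n_swa` check in llama_set_inputs rather than reproducing it verbatim.

#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    const int n_tokens = 8; // toy sequence length, illustration only
    const int n_swa    = 4; // toy window size; gemma 2 defaults to 4096

    // 0.0f = attended, -INFINITY = masked, matching the convention in llama_set_inputs
    std::vector<float> mask_full(n_tokens * n_tokens, 0.0f);
    std::vector<float> mask_swa (n_tokens * n_tokens, 0.0f);

    for (int q = 0; q < n_tokens; ++q) {     // query (current token) position
        for (int k = 0; k < n_tokens; ++k) { // key (cached token) position
            float f = 0.0f;
            if (k > q) {
                f = -INFINITY;               // causal: never attend to future positions
            }
            mask_full[q*n_tokens + k] = f;

            // SWA additionally cuts off keys outside the window,
            // analogous to `pos - cells[i].pos >= n_swa` in the patch
            if (q - k >= n_swa) {
                f = -INFINITY;
            }
            mask_swa[q*n_tokens + k] = f;
        }
    }

    // print the SWA mask: '1' = attended, '.' = masked
    for (int q = 0; q < n_tokens; ++q) {
        for (int k = 0; k < n_tokens; ++k) {
            printf("%c", std::isinf(mask_swa[q*n_tokens + k]) ? '.' : '1');
        }
        printf("\n");
    }
    return 0;
}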