Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix llama minitron #307

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion CMakePresets.json
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
{ "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } },
{ "name": "reldbg", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
{ "name": "static", "hidden": true, "cacheVariables": { "GGML_STATIC": "ON" } },
{ "name": "sycl_f16", "hidden": true, "cacheVariables": { "GGML_SYCL_F16": "ON" } },

{
"name": "arm64-windows-msvc", "hidden": true,
Expand Down Expand Up @@ -60,6 +61,8 @@
{ "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] },

{ "name": "x64-windows-sycl-debug" , "inherits": [ "sycl-base", "debug" ] },
{ "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] }
{ "name": "x64-windows-sycl-debug-f16", "inherits": [ "sycl-base", "debug", "sycl_f16" ] },
{ "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] },
{ "name": "x64-windows-sycl-release-f16", "inherits": [ "sycl-base", "release", "sycl_f16" ] }
]
}
64 changes: 57 additions & 7 deletions common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,41 @@

using json = nlohmann::ordered_json;

//
// Environment variable utils
//

template<typename T>
static typename std::enable_if<std::is_same<T, std::string>::value, void>::type
get_env(std::string name, T & target) {
char * value = std::getenv(name.c_str());
target = value ? std::string(value) : target;
}

template<typename T>
static typename std::enable_if<!std::is_same<T, bool>::value && std::is_integral<T>::value, void>::type
get_env(std::string name, T & target) {
char * value = std::getenv(name.c_str());
target = value ? std::stoi(value) : target;
}

template<typename T>
static typename std::enable_if<std::is_floating_point<T>::value, void>::type
get_env(std::string name, T & target) {
char * value = std::getenv(name.c_str());
target = value ? std::stof(value) : target;
}

template<typename T>
static typename std::enable_if<std::is_same<T, bool>::value, void>::type
get_env(std::string name, T & target) {
char * value = std::getenv(name.c_str());
if (value) {
std::string val(value);
target = val == "1" || val == "true";
}
}

//
// CPU utils
//
Expand Down Expand Up @@ -220,12 +255,6 @@ int32_t cpu_get_num_math() {
// CLI argument parsing
//

void gpt_params_handle_hf_token(gpt_params & params) {
if (params.hf_token.empty() && std::getenv("HF_TOKEN")) {
params.hf_token = std::getenv("HF_TOKEN");
}
}

void gpt_params_handle_model_default(gpt_params & params) {
if (!params.hf_repo.empty()) {
// short-hand to avoid specifying --hf-file -> default it to --model
Expand Down Expand Up @@ -273,7 +302,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {

gpt_params_handle_model_default(params);

gpt_params_handle_hf_token(params);
if (params.hf_token.empty()) {
get_env("HF_TOKEN", params.hf_token);
}

if (params.escape) {
string_process_escapes(params.prompt);
Expand All @@ -293,6 +324,25 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
return true;
}

void gpt_params_parse_from_env(gpt_params & params) {
// we only care about server-related params for now
get_env("LLAMA_ARG_MODEL", params.model);
get_env("LLAMA_ARG_THREADS", params.n_threads);
get_env("LLAMA_ARG_CTX_SIZE", params.n_ctx);
get_env("LLAMA_ARG_N_PARALLEL", params.n_parallel);
get_env("LLAMA_ARG_BATCH", params.n_batch);
get_env("LLAMA_ARG_UBATCH", params.n_ubatch);
get_env("LLAMA_ARG_N_GPU_LAYERS", params.n_gpu_layers);
get_env("LLAMA_ARG_THREADS_HTTP", params.n_threads_http);
get_env("LLAMA_ARG_CHAT_TEMPLATE", params.chat_template);
get_env("LLAMA_ARG_N_PREDICT", params.n_predict);
get_env("LLAMA_ARG_ENDPOINT_METRICS", params.endpoint_metrics);
get_env("LLAMA_ARG_ENDPOINT_SLOTS", params.endpoint_slots);
get_env("LLAMA_ARG_EMBEDDINGS", params.embedding);
get_env("LLAMA_ARG_FLASH_ATTN", params.flash_attn);
get_env("LLAMA_ARG_DEFRAG_THOLD", params.defrag_thold);
}

bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
const auto params_org = params; // the example can modify the default params

Expand Down
2 changes: 1 addition & 1 deletion common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,7 @@ struct gpt_params {
std::string lora_outfile = "ggml-lora-merged-f16.gguf";
};

void gpt_params_handle_hf_token(gpt_params & params);
void gpt_params_parse_from_env(gpt_params & params);
void gpt_params_handle_model_default(gpt_params & params);

bool gpt_params_parse_ex (int argc, char ** argv, gpt_params & params);
Expand Down
4 changes: 2 additions & 2 deletions convert_hf_to_gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -1570,7 +1570,7 @@ def prepare_tensors(self):
if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
if rope_scaling.get("rope_type", '').lower() == "llama3":
base = self.hparams.get("rope_theta", 10000.0)
dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))

factor = rope_scaling.get("factor", 8.0)
Expand Down Expand Up @@ -3816,7 +3816,7 @@ def prepare_tensors(self):
if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
if rope_scaling.get("rope_type", '').lower() == "llama3":
base = self.hparams.get("rope_theta", 10000.0)
dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))

factor = rope_scaling.get("factor", 8.0)
Expand Down
14 changes: 6 additions & 8 deletions docs/backend/SYCL.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,14 @@
**oneAPI** is an open ecosystem and a standard-based specification, supporting multiple architectures including but not limited to intel CPUs, GPUs and FPGAs. The key components of the oneAPI ecosystem include:

- **DPCPP** *(Data Parallel C++)*: The primary oneAPI SYCL implementation, which includes the icpx/icx Compilers.
- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. oneMKL - Math Kernel Library)*.
- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. oneMKL and oneDNN)*.
- **oneAPI LevelZero**: A high performance low level interface for fine-grained control over intel iGPUs and dGPUs.
- **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets.

### Llama.cpp + SYCL

The llama.cpp SYCL backend is designed to support **Intel GPU** firstly. Based on the cross-platform feature of SYCL, it could support other vendor GPUs: Nvidia GPU (*AMD GPU coming*).

When targeting **Intel CPU**, it is recommended to use llama.cpp for [Intel oneMKL](README.md#intel-onemkl) backend.

It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, etc..*. In beginning work, the oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose.

## Recommended Release

The SYCL backend would be broken by some PRs due to no online CI.
Expand All @@ -45,6 +41,10 @@ The following release is verified with good quality:

## News


- 2024.8
- Use oneDNN as the default GEMM library, improve the compatibility for new Intel GPUs.

- 2024.5
- Performance is increased: 34 -> 37 tokens/s of llama-2-7b.Q4_0 on Arc770.
- Arch Linux is verified successfully.
Expand Down Expand Up @@ -196,7 +196,7 @@ Please follow the instructions for downloading and installing the Toolkit for Li

Following guidelines/code snippets assume the default installation values. Otherwise, please make sure the necessary changes are reflected where applicable.

Upon a successful installation, SYCL is enabled for the available intel devices, along with relevant libraries such as oneAPI MKL for intel GPUs.
Upon a successful installation, SYCL is enabled for the available intel devices, along with relevant libraries such as oneAPI oneDNN for Intel GPUs.

- **Adding support to Nvidia GPUs**

Expand Down Expand Up @@ -255,8 +255,6 @@ or
# Export relevant ENV variables
source /opt/intel/oneapi/setvars.sh

# Build LLAMA with MKL BLAS acceleration for intel GPU

# Option 1: Use FP32 (recommended for better performance in most cases)
cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx

Expand Down
19 changes: 19 additions & 0 deletions examples/server/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,25 @@ logging:
--log-append Don't truncate the old log file.
```

Available environment variables (if specified, these variables will override parameters specified in arguments):

- `LLAMA_CACHE` (cache directory, used by `--hf-repo`)
- `HF_TOKEN` (Hugging Face access token, used when accessing a gated model with `--hf-repo`)
- `LLAMA_ARG_MODEL`
- `LLAMA_ARG_THREADS`
- `LLAMA_ARG_CTX_SIZE`
- `LLAMA_ARG_N_PARALLEL`
- `LLAMA_ARG_BATCH`
- `LLAMA_ARG_UBATCH`
- `LLAMA_ARG_N_GPU_LAYERS`
- `LLAMA_ARG_THREADS_HTTP`
- `LLAMA_ARG_CHAT_TEMPLATE`
- `LLAMA_ARG_N_PREDICT`
- `LLAMA_ARG_ENDPOINT_METRICS`
- `LLAMA_ARG_ENDPOINT_SLOTS`
- `LLAMA_ARG_EMBEDDINGS`
- `LLAMA_ARG_FLASH_ATTN`
- `LLAMA_ARG_DEFRAG_THOLD`

## Build

Expand Down
3 changes: 3 additions & 0 deletions examples/server/server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2507,6 +2507,9 @@ int main(int argc, char ** argv) {
return 1;
}

// parse arguments from environment variables
gpt_params_parse_from_env(params);

// TODO: not great to use extern vars
server_log_json = params.log_json;
server_verbose = params.verbosity > 0;
Expand Down
9 changes: 3 additions & 6 deletions ggml/include/ggml.h
Original file line number Diff line number Diff line change
Expand Up @@ -1777,10 +1777,8 @@ extern "C" {

GGML_API struct ggml_tensor * ggml_ssm_conv(
struct ggml_context * ctx,
struct ggml_tensor * s,
struct ggml_tensor * x,
struct ggml_tensor * c,
struct ggml_tensor * sq);
struct ggml_tensor * sx,
struct ggml_tensor * c);

GGML_API struct ggml_tensor * ggml_ssm_scan(
struct ggml_context * ctx,
Expand All @@ -1789,8 +1787,7 @@ extern "C" {
struct ggml_tensor * dt,
struct ggml_tensor * A,
struct ggml_tensor * B,
struct ggml_tensor * C,
struct ggml_tensor * sq);
struct ggml_tensor * C);

// partition into non-overlapping windows with padding if needed
// example:
Expand Down
10 changes: 10 additions & 0 deletions ggml/src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -549,6 +549,13 @@ if (GGML_SYCL)
file(GLOB GGML_SOURCES_SYCL "ggml-sycl/*.cpp")
list(APPEND GGML_SOURCES_SYCL "ggml-sycl.cpp")

find_package(DNNL)
message("-- DNNL found:" ${DNNL_FOUND})
if (GGML_SYCL_TARGET STREQUAL "INTEL")
add_compile_definitions(GGML_SYCL_DNNL=${DNNL_FOUND})
else()
add_compile_definitions(GGML_SYCL_DNNL=0)
endif()
if (WIN32)
find_package(IntelSYCL REQUIRED)
find_package(MKL REQUIRED)
Expand All @@ -561,6 +568,9 @@ if (GGML_SYCL)
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} -fsycl pthread m dl onemkl)
endif()
endif()
if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL")
list(APPEND GGML_EXTRA_LIBS DNNL::dnnl)
endif()
endif()

if (GGML_RPC)
Expand Down
16 changes: 15 additions & 1 deletion ggml/src/ggml-sycl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@

#include "ggml-sycl/backend.hpp"
#include "ggml-sycl/presets.hpp"
#include "ggml-sycl/gemm.hpp"

bool ggml_sycl_loaded(void);
void ggml_sycl_free_data(struct ggml_tensor * tensor);
Expand Down Expand Up @@ -2482,6 +2483,7 @@ inline void ggml_sycl_op_mul_mat_sycl(

const sycl::half alpha_f16 = 1.0f;
const sycl::half beta_f16 = 0.0f;
#if !GGML_SYCL_DNNL
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm(
*stream, oneapi::mkl::transpose::trans,
oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10,
Expand All @@ -2491,6 +2493,13 @@ inline void ggml_sycl_op_mul_mat_sycl(
dpct::library_data_t::real_half)));
const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
#else
auto dnnl_stream = ctx.stream_dnnl(stream);
DnnlGemmWrapper::row_gemm(dnnl_stream, false, true, src1_ncols, row_diff, ne10, src1_ptr, DnnlGemmWrapper::to_dt<sycl::half>(),
src0_ptr, DnnlGemmWrapper::to_dt<sycl::half>(), dst_f16.get(), DnnlGemmWrapper::to_dt<sycl::half>());
const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff* src1_ncols, stream);
#endif
}
else {
// GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat_sycl - fp32 path\n");
Expand All @@ -2513,13 +2522,18 @@ inline void ggml_sycl_op_mul_mat_sycl(

const float alpha = 1.0f;
const float beta = 0.0f;

#if !GGML_SYCL_DNNL
SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm(
*stream, oneapi::mkl::transpose::trans,
oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10,
dpct::get_value(&alpha, *stream), src0_ddf_i, ne00,
src1_ddf1_i, ne10, dpct::get_value(&beta, *stream),
dst_dd_i, ldc)));
#else
auto dnnl_stream = ctx.stream_dnnl(stream);
DnnlGemmWrapper::row_gemm(dnnl_stream, false, true, src1_ncols, row_diff, ne10, src1_ddf1_i, DnnlGemmWrapper::to_dt<float>(),
src0_ddf_i, DnnlGemmWrapper::to_dt<float>(), dst_dd_i, DnnlGemmWrapper::to_dt<float>());
#endif
}
(void) dst;
(void) src1_ddq_i;
Expand Down
50 changes: 50 additions & 0 deletions ggml/src/ggml-sycl/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@
#include "dpct/helper.hpp"
#include "ggml-sycl.h"
#include "presets.hpp"
#if GGML_SYCL_DNNL
#include "dnnl.hpp"
#include "dnnl_sycl.hpp"
#endif

#define GGML_COMMON_DECL_SYCL
#define GGML_COMMON_IMPL_SYCL
Expand Down Expand Up @@ -277,6 +281,52 @@ struct ggml_backend_sycl_context {
return stream(device, 0);
}

#if GGML_SYCL_DNNL
dnnl::engine make_engine(sycl::queue* q) {
// Get the device associated with the queue
sycl::device dev = q->get_device();
// Get the context associated with the queue
sycl::context ctx = q->get_context();
const dnnl::engine eng = dnnl::sycl_interop::make_engine(dev, ctx);
return eng;
}

std::unordered_map<sycl::queue*, dnnl::stream> stream_map;
std::unordered_map<sycl::queue*, dnnl::engine> engine_map;
dnnl::stream stream_dnnl(int device, int _stream) {
auto q = stream(device, _stream);
return stream_dnnl(q);
}
dnnl::engine engine_dnnl(sycl::queue* qptr) {
auto it = engine_map.find(qptr);
if (it == engine_map.end()) {
auto eng = make_engine(qptr);
engine_map[qptr] = eng;
return eng;
}
else
{
return it->second;
}
}
dnnl::stream stream_dnnl(sycl::queue* qptr) {
auto it = stream_map.find(qptr);
if (it == stream_map.end()) {
auto eng = engine_dnnl(qptr);
auto stream = dnnl::sycl_interop::make_stream(eng, *qptr);
stream_map[qptr] = stream;
return stream;
}
else
{
return it->second;
}
}
dnnl::stream stream_dnnl() {
return stream_dnnl(device, 0);
}
#endif

// pool
std::unique_ptr<ggml_sycl_pool> pools[GGML_SYCL_MAX_DEVICES];

Expand Down
Loading
Loading