add mul_mat_q parameter
This also fixes a crash when loading the 70b llama2 model.

This parameter was introduced in ggml-org/llama.cpp#2453 (`0728c5a8`)
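
The crash comes down to struct layout: llama.cpp copies llama_context_params by value, so when the Python-side ctypes Structure lacks a field the C struct now has, every field after the gap is read from the wrong offset. A minimal, self-contained sketch of that failure mode (the struct names and field mix below are hypothetical stand-ins, not the real llama.cpp layout):

import ctypes
from ctypes import Structure, c_bool, c_int32

class CSideParams(Structure):
    # What the C library now expects (stand-in only).
    _fields_ = [
        ("low_vram", c_bool),
        ("mul_mat_q", c_bool),  # the field this commit adds on the Python side
        ("f16_kv", c_bool),
        ("n_batch", c_int32),
    ]

class StaleParams(Structure):
    # An out-of-date Python mirror that never learned about mul_mat_q.
    _fields_ = [
        ("low_vram", c_bool),
        ("f16_kv", c_bool),
        ("n_batch", c_int32),
    ]

# Every field after the missing bool lands at a different offset, so a
# by-value copy into the library reads garbage from that point on.
print(CSideParams.f16_kv.offset, StaleParams.f16_kv.offset)   # 2 vs 1
# Padding can keep the total sizes equal, which makes the bug easy to miss.
print(ctypes.sizeof(CSideParams), ctypes.sizeof(StaleParams))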
bretello committed Aug 3, 2023
1 parent 91bf8fa commit ad2ea65
Showing 1 changed file with 2 additions and 0 deletions.
llama_cpp/llama_cpp.py: 2 additions & 0 deletions
@@ -181,6 +181,7 @@ class llama_token_data_array(Structure):
 
 # // Keep the booleans together to avoid misalignment during copy-by-value.
 # bool low_vram; // if true, reduce VRAM usage at the cost of performance
+# bool mul_mat_q; // if true, use experimental mul_mat_q kernels
 # bool f16_kv; // use fp16 for KV cache
 # bool logits_all; // the llama_eval() call computes all logits, not just the last one
 # bool vocab_only; // only load the vocabulary, no weights
@@ -203,6 +204,7 @@ class llama_context_params(Structure):
         ("progress_callback", llama_progress_callback),
         ("progress_callback_user_data", c_void_p),
         ("low_vram", c_bool),
+        ("mul_mat_q", c_bool),
         ("f16_kv", c_bool),
         ("logits_all", c_bool),
         ("vocab_only", c_bool),
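
With the field in place, the experimental kernels can be toggled on the low-level params struct before a context is created. A hedged usage sketch, assuming the llama_backend_init / llama_load_model_from_file / llama_new_context_with_model entry points the bindings expose around this version, with a placeholder model path:

import llama_cpp

llama_cpp.llama_backend_init(False)              # one-time backend setup

# Defaults come from the C library, so the two struct layouts must agree.
params = llama_cpp.llama_context_default_params()
params.mul_mat_q = True                          # opt in to the experimental mul_mat_q kernels
params.low_vram = False

model = llama_cpp.llama_load_model_from_file(
    b"/path/to/llama-2-70b.ggmlv3.q4_0.bin",     # placeholder path, bytes as the C API expects
    params,
)
ctx = llama_cpp.llama_new_context_with_model(model, params)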
