From d4dc3d26fcbaf6be282e20169b5ade10c2d49a76 Mon Sep 17 00:00:00 2001 From: MaggotHATE Date: Fri, 1 Dec 2023 20:35:02 +0500 Subject: [PATCH 01/11] Samplers sequence order w parameter --- common/common.cpp | 6 +++ common/sampling.cpp | 106 +++++++++++++++++++++++++++++++++-------- common/sampling.h | 36 +++++++------- examples/main/main.cpp | 1 + 4 files changed, 113 insertions(+), 36 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 1dcc235eac0e6..b148e3a7823a5 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -280,6 +280,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { params.yarn_beta_slow = std::stof(argv[i]); } else if (arg == "--memory-f32") { params.memory_f16 = false; + } else if (arg == "--sampling-seq") { + if (++i >= argc) { + invalid_param = true; + break; + } + sparams.samplers_sequence = argv[i]; } else if (arg == "--top-p") { if (++i >= argc) { invalid_param = true; diff --git a/common/sampling.cpp b/common/sampling.cpp index 1317024c2c11c..891d057361943 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -99,6 +99,42 @@ std::string llama_sampling_print(const llama_sampling_params & params) { return std::string(result); } +std::string llama_sampling_order_print(const llama_sampling_params & params) { + std::string result = "CFG -> Penalties "; + if (params.mirostat == 0){ + for (auto s : params.samplers_sequence){ + switch (s){ + case 'k':{ + result += "-> top_k "; + break; + } + case 'f':{ + result += "-> tfs_z "; + break; + } + case 'y':{ + result += "-> typical_p "; + break; + } + case 'p':{ + result += "-> top_p "; + break; + } + case 'm':{ + result += "-> min_p "; + break; + } + case 't':{ + result += "-> temp "; + break; + } + } + } + } else result += "-> mirostat "; + + return result; +} + llama_token llama_sampling_sample( struct llama_sampling_context * ctx_sampling, struct llama_context * ctx_main, @@ -108,20 +144,21 @@ llama_token llama_sampling_sample( const int n_vocab = llama_n_vocab(llama_get_model(ctx_main)); - const float temp = params.temp; - const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k; - const float top_p = params.top_p; - const float min_p = params.min_p; - const float tfs_z = params.tfs_z; - const float typical_p = params.typical_p; - const int32_t penalty_last_n = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n; - const float penalty_repeat = params.penalty_repeat; - const float penalty_freq = params.penalty_freq; - const float penalty_present = params.penalty_present; - const int mirostat = params.mirostat; - const float mirostat_tau = params.mirostat_tau; - const float mirostat_eta = params.mirostat_eta; - const bool penalize_nl = params.penalize_nl; + const float temp = params.temp; + const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k; + const float top_p = params.top_p; + const float min_p = params.min_p; + const float tfs_z = params.tfs_z; + const float typical_p = params.typical_p; + const int32_t penalty_last_n = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n; + const float penalty_repeat = params.penalty_repeat; + const float penalty_freq = params.penalty_freq; + const float penalty_present = params.penalty_present; + const int mirostat = params.mirostat; + const float mirostat_tau = params.mirostat_tau; + const float mirostat_eta = params.mirostat_eta; + const bool penalize_nl = params.penalize_nl; + const std::string samplers_sequence = params.samplers_sequence; auto & prev = ctx_sampling->prev; auto & cur = ctx_sampling->cur; @@ -188,12 +225,41 @@ llama_token llama_sampling_sample( // temperature sampling size_t min_keep = std::max(1, params.n_probs); - llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep); - llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep); - llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); - llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); - llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); - llama_sample_temp (ctx_main, &cur_p, temp); + // llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep); + // llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep); + // llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); + // llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); + // llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); + // llama_sample_temp (ctx_main, &cur_p, temp); + + for (auto s : samplers_sequence){ + switch (s){ + case 'k':{ + llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep); + break; + } + case 'f':{ + llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep); + break; + } + case 'y':{ + llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); + break; + } + case 'p':{ + llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); + break; + } + case 'm':{ + llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); + break; + } + case 't':{ + llama_sample_temp (ctx_main, &cur_p, temp); + break; + } + } + } id = llama_sample_token(ctx_main, &cur_p); diff --git a/common/sampling.h b/common/sampling.h index 7c9b8dcf23bcb..c4191a6e5b8cd 100644 --- a/common/sampling.h +++ b/common/sampling.h @@ -10,22 +10,23 @@ // sampling parameters typedef struct llama_sampling_params { - int32_t n_prev = 64; // number of previous tokens to remember - int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. - int32_t top_k = 40; // <= 0 to use vocab size - float top_p = 0.95f; // 1.0 = disabled - float min_p = 0.05f; // 0.0 = disabled - float tfs_z = 1.00f; // 1.0 = disabled - float typical_p = 1.00f; // 1.0 = disabled - float temp = 0.80f; // 1.0 = disabled - int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) - float penalty_repeat = 1.10f; // 1.0 = disabled - float penalty_freq = 0.00f; // 0.0 = disabled - float penalty_present = 0.00f; // 0.0 = disabled - int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 - float mirostat_tau = 5.00f; // target entropy - float mirostat_eta = 0.10f; // learning rate - bool penalize_nl = true; // consider newlines as a repeatable token + int32_t n_prev = 64; // number of previous tokens to remember + int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. + int32_t top_k = 40; // <= 0 to use vocab size + float top_p = 0.95f; // 1.0 = disabled + float min_p = 0.05f; // 0.0 = disabled + float tfs_z = 1.00f; // 1.0 = disabled + float typical_p = 1.00f; // 1.0 = disabled + float temp = 0.80f; // 1.0 = disabled + int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) + float penalty_repeat = 1.10f; // 1.0 = disabled + float penalty_freq = 0.00f; // 0.0 = disabled + float penalty_present = 0.00f; // 0.0 = disabled + int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 + float mirostat_tau = 5.00f; // target entropy + float mirostat_eta = 0.10f; // learning rate + bool penalize_nl = true; // consider newlines as a repeatable token + std::string samplers_sequence = "kfypmt"; // top_k, tail_free, typical_p, top_p, min_p, temp std::string grammar; // optional BNF-like grammar to constrain sampling @@ -80,6 +81,9 @@ std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama // Print sampling parameters into a string std::string llama_sampling_print(const llama_sampling_params & params); +// Print sampling order into a string +std::string llama_sampling_order_print(const llama_sampling_params & params); + // this is a common sampling function used across the examples for convenience // it can serve as a starting point for implementing your own sampling function // Note: When using multiple sequences, it is the caller's responsibility to call diff --git a/examples/main/main.cpp b/examples/main/main.cpp index c5cdfbf21b954..c096f110b32c5 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -437,6 +437,7 @@ int main(int argc, char ** argv) { } } LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str()); + LOG_TEE("sampling order: \n%s\n", llama_sampling_order_print(sparams).c_str()); LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); LOG_TEE("\n\n"); From 7601a49130bc18540edb86ca9550647d6283fbec Mon Sep 17 00:00:00 2001 From: MaggotHATE Date: Fri, 1 Dec 2023 20:45:43 +0500 Subject: [PATCH 02/11] Cleaned commented code --- common/sampling.cpp | 7 ------- 1 file changed, 7 deletions(-) diff --git a/common/sampling.cpp b/common/sampling.cpp index 891d057361943..71f92aa65fc5e 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -225,13 +225,6 @@ llama_token llama_sampling_sample( // temperature sampling size_t min_keep = std::max(1, params.n_probs); - // llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep); - // llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep); - // llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); - // llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); - // llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); - // llama_sample_temp (ctx_main, &cur_p, temp); - for (auto s : samplers_sequence){ switch (s){ case 'k':{ From d363df3444b866df6410229b331fc11de83c59b3 Mon Sep 17 00:00:00 2001 From: MaggotHATE Date: Fri, 1 Dec 2023 21:43:21 +0500 Subject: [PATCH 03/11] Fixed formatting --- common/sampling.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common/sampling.h b/common/sampling.h index c4191a6e5b8cd..9872b8b389b66 100644 --- a/common/sampling.h +++ b/common/sampling.h @@ -10,8 +10,8 @@ // sampling parameters typedef struct llama_sampling_params { - int32_t n_prev = 64; // number of previous tokens to remember - int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. + int32_t n_prev = 64; // number of previous tokens to remember + int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. int32_t top_k = 40; // <= 0 to use vocab size float top_p = 0.95f; // 1.0 = disabled float min_p = 0.05f; // 0.0 = disabled From bd08c8fab350cc4f991a7ab333f45d110e39cb08 Mon Sep 17 00:00:00 2001 From: MaggotHATE Date: Sat, 2 Dec 2023 16:20:27 +0500 Subject: [PATCH 04/11] Rewrote with unordered_map --- common/sampling.cpp | 142 ++++++++++++++++++++++++++------------------ common/sampling.h | 36 +++++++++++ 2 files changed, 121 insertions(+), 57 deletions(-) diff --git a/common/sampling.cpp b/common/sampling.cpp index 71f92aa65fc5e..03c70fe950cff 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -1,5 +1,7 @@ #include "sampling.h" +#include + struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params) { struct llama_sampling_context * result = new llama_sampling_context(); @@ -101,40 +103,96 @@ std::string llama_sampling_print(const llama_sampling_params & params) { std::string llama_sampling_order_print(const llama_sampling_params & params) { std::string result = "CFG -> Penalties "; + + std::unordered_map samplers_map_display { + {'k', "-> top_k "}, + {'f', "-> tfs_z "}, + {'y', "-> typical_p "}, + {'p', "-> top_p "}, + {'m', "-> min_p "}, + {'t', "-> temp "} + }; + if (params.mirostat == 0){ for (auto s : params.samplers_sequence){ - switch (s){ - case 'k':{ - result += "-> top_k "; - break; - } - case 'f':{ - result += "-> tfs_z "; - break; - } - case 'y':{ - result += "-> typical_p "; - break; - } - case 'p':{ - result += "-> top_p "; - break; - } - case 'm':{ - result += "-> min_p "; - break; - } - case 't':{ - result += "-> temp "; - break; - } - } + result += samplers_map_display[s]; } } else result += "-> mirostat "; return result; } +void sample_top_k( + const llama_sampling_params & params, + struct llama_context * ctx_main, + llama_token_data_array & cur_p, + size_t & min_keep){ + + const int n_vocab = llama_n_vocab(llama_get_model(ctx_main)); + const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k; + llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep); +} + +void sample_top_p( + const llama_sampling_params & params, + struct llama_context * ctx_main, + llama_token_data_array & cur_p, + size_t & min_keep){ + + const float top_p = params.top_p; + llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); +} + +void sample_tfs_z( + const llama_sampling_params & params, + struct llama_context * ctx_main, + llama_token_data_array & cur_p, + size_t & min_keep){ + + const float tfs_z = params.tfs_z; + llama_sample_tail_free (ctx_main, &cur_p, tfs_z, min_keep); +} + +void sample_typical_p( + const llama_sampling_params & params, + struct llama_context * ctx_main, + llama_token_data_array & cur_p, + size_t & min_keep){ + + const float typical_p = params.typical_p; + llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); +} + +void sample_min_p( + const llama_sampling_params & params, + struct llama_context * ctx_main, + llama_token_data_array & cur_p, + size_t & min_keep){ + + const float min_p = params.min_p; + llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); +} + +void sample_temp( + const llama_sampling_params & params, + struct llama_context * ctx_main, + llama_token_data_array & cur_p, + size_t & min_keep){ + + const float temp = params.temp; + llama_sample_temp (ctx_main, &cur_p, temp); +} + +std::unordered_map> samplers_map +{ + {'k', sample_top_k}, + {'f', sample_tfs_z}, + {'y', sample_typical_p}, + {'p', sample_top_p}, + {'m', sample_min_p}, + {'t', sample_temp} +}; + llama_token llama_sampling_sample( struct llama_sampling_context * ctx_sampling, struct llama_context * ctx_main, @@ -145,11 +203,6 @@ llama_token llama_sampling_sample( const int n_vocab = llama_n_vocab(llama_get_model(ctx_main)); const float temp = params.temp; - const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k; - const float top_p = params.top_p; - const float min_p = params.min_p; - const float tfs_z = params.tfs_z; - const float typical_p = params.typical_p; const int32_t penalty_last_n = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n; const float penalty_repeat = params.penalty_repeat; const float penalty_freq = params.penalty_freq; @@ -226,32 +279,7 @@ llama_token llama_sampling_sample( size_t min_keep = std::max(1, params.n_probs); for (auto s : samplers_sequence){ - switch (s){ - case 'k':{ - llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep); - break; - } - case 'f':{ - llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep); - break; - } - case 'y':{ - llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); - break; - } - case 'p':{ - llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); - break; - } - case 'm':{ - llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); - break; - } - case 't':{ - llama_sample_temp (ctx_main, &cur_p, temp); - break; - } - } + samplers_map[s](params, ctx_main, cur_p, min_keep); } id = llama_sample_token(ctx_main, &cur_p); diff --git a/common/sampling.h b/common/sampling.h index 9872b8b389b66..0ce10b232233b 100644 --- a/common/sampling.h +++ b/common/sampling.h @@ -84,6 +84,42 @@ std::string llama_sampling_print(const llama_sampling_params & params); // Print sampling order into a string std::string llama_sampling_order_print(const llama_sampling_params & params); +void sample_top_k( + const llama_sampling_params & params, + struct llama_context * ctx_main, + llama_token_data_array & cur_p, + size_t & min_keep); + +void sample_top_p( + const llama_sampling_params & params, + struct llama_context * ctx_main, + llama_token_data_array & cur_p, + size_t & min_keep); + +void sample_tfs_z( + const llama_sampling_params & params, + struct llama_context * ctx_main, + llama_token_data_array & cur_p, + size_t & min_keep); + +void sample_typical_p( + const llama_sampling_params & params, + struct llama_context * ctx_main, + llama_token_data_array & cur_p, + size_t & min_keep); + +void sample_min_p( + const llama_sampling_params & params, + struct llama_context * ctx_main, + llama_token_data_array & cur_p, + size_t & min_keep); + +void sample_temp( + const llama_sampling_params & params, + struct llama_context * ctx_main, + llama_token_data_array & cur_p, + size_t & min_keep); + // this is a common sampling function used across the examples for convenience // it can serve as a starting point for implementing your own sampling function // Note: When using multiple sequences, it is the caller's responsibility to call From 8d2b4603d7ea8f64431c66d0345dfc468376031e Mon Sep 17 00:00:00 2001 From: MaggotHATE Date: Sat, 2 Dec 2023 18:16:56 +0500 Subject: [PATCH 05/11] Revert and rewrite, too many problems and safeguards would be needed --- common/sampling.cpp | 156 ++++++++++++++++++++------------------------ common/sampling.h | 36 ---------- 2 files changed, 72 insertions(+), 120 deletions(-) diff --git a/common/sampling.cpp b/common/sampling.cpp index 03c70fe950cff..e47dd1f795d7e 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -1,7 +1,5 @@ #include "sampling.h" -#include - struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params) { struct llama_sampling_context * result = new llama_sampling_context(); @@ -103,96 +101,89 @@ std::string llama_sampling_print(const llama_sampling_params & params) { std::string llama_sampling_order_print(const llama_sampling_params & params) { std::string result = "CFG -> Penalties "; - - std::unordered_map samplers_map_display { - {'k', "-> top_k "}, - {'f', "-> tfs_z "}, - {'y', "-> typical_p "}, - {'p', "-> top_p "}, - {'m', "-> min_p "}, - {'t', "-> temp "} - }; - if (params.mirostat == 0){ for (auto s : params.samplers_sequence){ - result += samplers_map_display[s]; + switch (s){ + case 'k':{ + result += "-> top_k "; + break; + } + case 'f':{ + result += "-> tfs_z "; + break; + } + case 'y':{ + result += "-> typical_p "; + break; + } + case 'p':{ + result += "-> top_p "; + break; + } + case 'm':{ + result += "-> min_p "; + break; + } + case 't':{ + result += "-> temp "; + break; + } + default: break; + } } } else result += "-> mirostat "; return result; } -void sample_top_k( - const llama_sampling_params & params, - struct llama_context * ctx_main, - llama_token_data_array & cur_p, - size_t & min_keep){ - - const int n_vocab = llama_n_vocab(llama_get_model(ctx_main)); - const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k; - llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep); -} - -void sample_top_p( - const llama_sampling_params & params, - struct llama_context * ctx_main, - llama_token_data_array & cur_p, - size_t & min_keep){ - - const float top_p = params.top_p; - llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); -} - -void sample_tfs_z( - const llama_sampling_params & params, - struct llama_context * ctx_main, - llama_token_data_array & cur_p, - size_t & min_keep){ - - const float tfs_z = params.tfs_z; - llama_sample_tail_free (ctx_main, &cur_p, tfs_z, min_keep); -} +// no reasons to expose this function in header +void sampler_queue( + struct llama_context * ctx_main, + const llama_sampling_params & params, + llama_token_data_array & cur_p, + size_t & min_keep) { + const int n_vocab = llama_n_vocab(llama_get_model(ctx_main)); -void sample_typical_p( - const llama_sampling_params & params, - struct llama_context * ctx_main, - llama_token_data_array & cur_p, - size_t & min_keep){ + const float temp = params.temp; + const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k; + const float top_p = params.top_p; + const float min_p = params.min_p; + const float tfs_z = params.tfs_z; + const float typical_p = params.typical_p; + const std::string samplers_sequence = params.samplers_sequence; + + for (auto s : samplers_sequence){ + switch (s){ + case 'k':{ + llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep); + break; + } + case 'f':{ + llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep); + break; + } + case 'y':{ + llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); + break; + } + case 'p':{ + llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); + break; + } + case 'm':{ + llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); + break; + } + case 't':{ + llama_sample_temp (ctx_main, &cur_p, temp); + break; + } + default: break; + } + } - const float typical_p = params.typical_p; - llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); -} - -void sample_min_p( - const llama_sampling_params & params, - struct llama_context * ctx_main, - llama_token_data_array & cur_p, - size_t & min_keep){ - - const float min_p = params.min_p; - llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); } -void sample_temp( - const llama_sampling_params & params, - struct llama_context * ctx_main, - llama_token_data_array & cur_p, - size_t & min_keep){ - - const float temp = params.temp; - llama_sample_temp (ctx_main, &cur_p, temp); -} - -std::unordered_map> samplers_map -{ - {'k', sample_top_k}, - {'f', sample_tfs_z}, - {'y', sample_typical_p}, - {'p', sample_top_p}, - {'m', sample_min_p}, - {'t', sample_temp} -}; - llama_token llama_sampling_sample( struct llama_sampling_context * ctx_sampling, struct llama_context * ctx_main, @@ -211,7 +202,6 @@ llama_token llama_sampling_sample( const float mirostat_tau = params.mirostat_tau; const float mirostat_eta = params.mirostat_eta; const bool penalize_nl = params.penalize_nl; - const std::string samplers_sequence = params.samplers_sequence; auto & prev = ctx_sampling->prev; auto & cur = ctx_sampling->cur; @@ -278,9 +268,7 @@ llama_token llama_sampling_sample( // temperature sampling size_t min_keep = std::max(1, params.n_probs); - for (auto s : samplers_sequence){ - samplers_map[s](params, ctx_main, cur_p, min_keep); - } + sampler_queue(ctx_main, params, cur_p, min_keep); id = llama_sample_token(ctx_main, &cur_p); diff --git a/common/sampling.h b/common/sampling.h index 0ce10b232233b..9872b8b389b66 100644 --- a/common/sampling.h +++ b/common/sampling.h @@ -84,42 +84,6 @@ std::string llama_sampling_print(const llama_sampling_params & params); // Print sampling order into a string std::string llama_sampling_order_print(const llama_sampling_params & params); -void sample_top_k( - const llama_sampling_params & params, - struct llama_context * ctx_main, - llama_token_data_array & cur_p, - size_t & min_keep); - -void sample_top_p( - const llama_sampling_params & params, - struct llama_context * ctx_main, - llama_token_data_array & cur_p, - size_t & min_keep); - -void sample_tfs_z( - const llama_sampling_params & params, - struct llama_context * ctx_main, - llama_token_data_array & cur_p, - size_t & min_keep); - -void sample_typical_p( - const llama_sampling_params & params, - struct llama_context * ctx_main, - llama_token_data_array & cur_p, - size_t & min_keep); - -void sample_min_p( - const llama_sampling_params & params, - struct llama_context * ctx_main, - llama_token_data_array & cur_p, - size_t & min_keep); - -void sample_temp( - const llama_sampling_params & params, - struct llama_context * ctx_main, - llama_token_data_array & cur_p, - size_t & min_keep); - // this is a common sampling function used across the examples for convenience // it can serve as a starting point for implementing your own sampling function // Note: When using multiple sequences, it is the caller's responsibility to call From ff8adc1196cc38e78046ce946096f0a3a16aeecc Mon Sep 17 00:00:00 2001 From: MaggotHATE Date: Sat, 2 Dec 2023 20:52:56 +0500 Subject: [PATCH 06/11] Fixed code style --- common/sampling.cpp | 65 ++++++++++----------------------------------- 1 file changed, 14 insertions(+), 51 deletions(-) diff --git a/common/sampling.cpp b/common/sampling.cpp index e47dd1f795d7e..51647bc8d3d26 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -104,31 +104,13 @@ std::string llama_sampling_order_print(const llama_sampling_params & params) { if (params.mirostat == 0){ for (auto s : params.samplers_sequence){ switch (s){ - case 'k':{ - result += "-> top_k "; - break; - } - case 'f':{ - result += "-> tfs_z "; - break; - } - case 'y':{ - result += "-> typical_p "; - break; - } - case 'p':{ - result += "-> top_p "; - break; - } - case 'm':{ - result += "-> min_p "; - break; - } - case 't':{ - result += "-> temp "; - break; - } - default: break; + case 'k': result += "-> top_k "; break; + case 'f': result += "-> tfs_z "; break; + case 'y': result += "-> typical_p "; break; + case 'p': result += "-> top_p "; break; + case 'm': result += "-> min_p "; break; + case 't': result += "-> temp "; break; + default : break; } } } else result += "-> mirostat "; @@ -154,34 +136,15 @@ void sampler_queue( for (auto s : samplers_sequence){ switch (s){ - case 'k':{ - llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep); - break; - } - case 'f':{ - llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep); - break; - } - case 'y':{ - llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); - break; - } - case 'p':{ - llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); - break; - } - case 'm':{ - llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); - break; - } - case 't':{ - llama_sample_temp (ctx_main, &cur_p, temp); - break; - } - default: break; + case 'k': llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep); break; + case 'f': llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep); break; + case 'y': llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); break; + case 'p': llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break; + case 'm': llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break; + case 't': llama_sample_temp (ctx_main, &cur_p, temp); break; + default : break; } } - } llama_token llama_sampling_sample( From e6dc166566351381be678e57c3cb88192efb8fd7 Mon Sep 17 00:00:00 2001 From: MaggotHATE Date: Sat, 2 Dec 2023 21:06:21 +0500 Subject: [PATCH 07/11] Code style fixes according to review --- common/sampling.cpp | 18 +++++++++--------- common/sampling.h | 34 +++++++++++++++++----------------- 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/common/sampling.cpp b/common/sampling.cpp index 51647bc8d3d26..a2c6454a99e44 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -156,15 +156,15 @@ llama_token llama_sampling_sample( const int n_vocab = llama_n_vocab(llama_get_model(ctx_main)); - const float temp = params.temp; - const int32_t penalty_last_n = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n; - const float penalty_repeat = params.penalty_repeat; - const float penalty_freq = params.penalty_freq; - const float penalty_present = params.penalty_present; - const int mirostat = params.mirostat; - const float mirostat_tau = params.mirostat_tau; - const float mirostat_eta = params.mirostat_eta; - const bool penalize_nl = params.penalize_nl; + const float temp = params.temp; + const int32_t penalty_last_n = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n; + const float penalty_repeat = params.penalty_repeat; + const float penalty_freq = params.penalty_freq; + const float penalty_present = params.penalty_present; + const int mirostat = params.mirostat; + const float mirostat_tau = params.mirostat_tau; + const float mirostat_eta = params.mirostat_eta; + const bool penalize_nl = params.penalize_nl; auto & prev = ctx_sampling->prev; auto & cur = ctx_sampling->cur; diff --git a/common/sampling.h b/common/sampling.h index 9872b8b389b66..fdfa9eed1467b 100644 --- a/common/sampling.h +++ b/common/sampling.h @@ -10,23 +10,23 @@ // sampling parameters typedef struct llama_sampling_params { - int32_t n_prev = 64; // number of previous tokens to remember - int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. - int32_t top_k = 40; // <= 0 to use vocab size - float top_p = 0.95f; // 1.0 = disabled - float min_p = 0.05f; // 0.0 = disabled - float tfs_z = 1.00f; // 1.0 = disabled - float typical_p = 1.00f; // 1.0 = disabled - float temp = 0.80f; // 1.0 = disabled - int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) - float penalty_repeat = 1.10f; // 1.0 = disabled - float penalty_freq = 0.00f; // 0.0 = disabled - float penalty_present = 0.00f; // 0.0 = disabled - int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 - float mirostat_tau = 5.00f; // target entropy - float mirostat_eta = 0.10f; // learning rate - bool penalize_nl = true; // consider newlines as a repeatable token - std::string samplers_sequence = "kfypmt"; // top_k, tail_free, typical_p, top_p, min_p, temp + int32_t n_prev = 64; // number of previous tokens to remember + int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. + int32_t top_k = 40; // <= 0 to use vocab size + float top_p = 0.95f; // 1.0 = disabled + float min_p = 0.05f; // 0.0 = disabled + float tfs_z = 1.00f; // 1.0 = disabled + float typical_p = 1.00f; // 1.0 = disabled + float temp = 0.80f; // 1.0 = disabled + int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) + float penalty_repeat = 1.10f; // 1.0 = disabled + float penalty_freq = 0.00f; // 0.0 = disabled + float penalty_present = 0.00f; // 0.0 = disabled + int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 + float mirostat_tau = 5.00f; // target entropy + float mirostat_eta = 0.10f; // learning rate + bool penalize_nl = true; // consider newlines as a repeatable token + std::string samplers_sequence = "kfypmt"; // top_k, tail_free, typical_p, top_p, min_p, temp std::string grammar; // optional BNF-like grammar to constrain sampling From 3fa6726351fc267ebe7e0524220d064b3d68f1d9 Mon Sep 17 00:00:00 2001 From: MaggotHATE Date: Sun, 3 Dec 2023 12:46:09 +0500 Subject: [PATCH 08/11] More readable samplers input string, fixed help --- common/common.cpp | 50 +++++++++++++++++++++++++++++++++++++++++++++++ common/common.h | 6 ++++++ 2 files changed, 56 insertions(+) diff --git a/common/common.cpp b/common/common.cpp index b148e3a7823a5..75588e156b4c2 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -280,6 +280,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { params.yarn_beta_slow = std::stof(argv[i]); } else if (arg == "--memory-f32") { params.memory_f16 = false; + } else if (arg == "--samplers") { + if (++i >= argc) { + invalid_param = true; + break; + } + sparams.samplers_sequence = parse_samplers_input(argv[i]); } else if (arg == "--sampling-seq") { if (++i >= argc) { invalid_param = true; @@ -767,6 +773,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict); printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx); printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); + printf(" --samplers samplers that will be used for generation in the order, separated by \';\', for example: \"top_k;tfs;typical;top_p;min_p;temp\"\n"); + printf(" --sampling-seq simplified sequence for samplers that will be used (default: %s)\n", sparams.samplers_sequence.c_str()); printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k); printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p); printf(" --min-p N min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p); @@ -892,6 +900,48 @@ std::string gpt_random_prompt(std::mt19937 & rng) { GGML_UNREACHABLE(); } +// +// String parsing +// + +std::string parse_samplers_input(std::string input){ + std::string output = ""; + // since samplers names are written multiple ways + // make it ready for both system names and input names + std::unordered_map samplers_symbols{ + {"top_k", 'k'}, + {"top-k", 'k'}, + {"top_p", 'p'}, + {"top-p", 'p'}, + {"nucleus", 'p'}, + {"typical_p", 'y'}, + {"typical-p", 'y'}, + {"typical", 'y'}, + {"min_p", 'm'}, + {"min-p", 'm'}, + {"tfs_z", 'f'}, + {"tfs-z", 'f'}, + {"tfs", 'f'}, + {"temp", 't'}, + {"temperature",'t'} + }; + // expected format example: "temp;top_k;tfs_z;typical_p;top_p;min_p" + size_t separator = input.find(';'); + while (separator != input.npos){ + std::string name = input.substr(0,separator); + input = input.substr(separator+1); + separator = input.find(';'); + + if (samplers_symbols.find(name) != samplers_symbols.end()){ + output += samplers_symbols[name]; + } + } + if (samplers_symbols.find(input) != samplers_symbols.end()){ + output += samplers_symbols[input]; + } + return output; +} + // // Model utils // diff --git a/common/common.h b/common/common.h index 2f6fe48ab53d3..534f7b1322da2 100644 --- a/common/common.h +++ b/common/common.h @@ -141,6 +141,12 @@ std::string gpt_random_prompt(std::mt19937 & rng); void process_escapes(std::string& input); +// +// String parsing +// + +std::string parse_samplers_input(std::string input); + // // Model utils // From ddda6ddb37fd87bb8a357a248c896bedadcdaa75 Mon Sep 17 00:00:00 2001 From: MaggotHATE Date: Sun, 3 Dec 2023 13:27:22 +0500 Subject: [PATCH 09/11] Style fix in sampler_queue --- common/sampling.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/common/sampling.cpp b/common/sampling.cpp index a2c6454a99e44..57ead6607ea03 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -126,12 +126,12 @@ void sampler_queue( size_t & min_keep) { const int n_vocab = llama_n_vocab(llama_get_model(ctx_main)); - const float temp = params.temp; - const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k; - const float top_p = params.top_p; - const float min_p = params.min_p; - const float tfs_z = params.tfs_z; - const float typical_p = params.typical_p; + const float temp = params.temp; + const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k; + const float top_p = params.top_p; + const float min_p = params.min_p; + const float tfs_z = params.tfs_z; + const float typical_p = params.typical_p; const std::string samplers_sequence = params.samplers_sequence; for (auto s : samplers_sequence){ From a6c327884532822144a204430d2e721d047577e5 Mon Sep 17 00:00:00 2001 From: MaggotHATE Date: Tue, 5 Dec 2023 14:14:39 +0500 Subject: [PATCH 10/11] Formatting fixes --- common/common.cpp | 10 +++++----- common/sampling.cpp | 22 +++++++++++----------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 75588e156b4c2..b184fea099e05 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -904,11 +904,11 @@ std::string gpt_random_prompt(std::mt19937 & rng) { // String parsing // -std::string parse_samplers_input(std::string input){ +std::string parse_samplers_input(std::string input) { std::string output = ""; // since samplers names are written multiple ways // make it ready for both system names and input names - std::unordered_map samplers_symbols{ + std::unordered_map samplers_symbols { {"top_k", 'k'}, {"top-k", 'k'}, {"top_p", 'p'}, @@ -927,16 +927,16 @@ std::string parse_samplers_input(std::string input){ }; // expected format example: "temp;top_k;tfs_z;typical_p;top_p;min_p" size_t separator = input.find(';'); - while (separator != input.npos){ + while (separator != input.npos) { std::string name = input.substr(0,separator); input = input.substr(separator+1); separator = input.find(';'); - if (samplers_symbols.find(name) != samplers_symbols.end()){ + if (samplers_symbols.find(name) != samplers_symbols.end()) { output += samplers_symbols[name]; } } - if (samplers_symbols.find(input) != samplers_symbols.end()){ + if (samplers_symbols.find(input) != samplers_symbols.end()) { output += samplers_symbols[input]; } return output; diff --git a/common/sampling.cpp b/common/sampling.cpp index 57ead6607ea03..7761ee94abbd0 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -101,9 +101,9 @@ std::string llama_sampling_print(const llama_sampling_params & params) { std::string llama_sampling_order_print(const llama_sampling_params & params) { std::string result = "CFG -> Penalties "; - if (params.mirostat == 0){ - for (auto s : params.samplers_sequence){ - switch (s){ + if (params.mirostat == 0) { + for (auto s : params.samplers_sequence) { + switch (s) { case 'k': result += "-> top_k "; break; case 'f': result += "-> tfs_z "; break; case 'y': result += "-> typical_p "; break; @@ -126,15 +126,15 @@ void sampler_queue( size_t & min_keep) { const int n_vocab = llama_n_vocab(llama_get_model(ctx_main)); - const float temp = params.temp; - const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k; - const float top_p = params.top_p; - const float min_p = params.min_p; - const float tfs_z = params.tfs_z; - const float typical_p = params.typical_p; - const std::string samplers_sequence = params.samplers_sequence; + const float temp = params.temp; + const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k; + const float top_p = params.top_p; + const float min_p = params.min_p; + const float tfs_z = params.tfs_z; + const float typical_p = params.typical_p; + const std::string & samplers_sequence = params.samplers_sequence; - for (auto s : samplers_sequence){ + for (auto s : samplers_sequence) { switch (s){ case 'k': llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep); break; case 'f': llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep); break; From 0b87ef4faef0507852ffbd71c1ca09c09b9afd1a Mon Sep 17 00:00:00 2001 From: MaggotHATE Date: Tue, 5 Dec 2023 14:59:54 +0500 Subject: [PATCH 11/11] Fixing whitespaces --- common/common.cpp | 2 +- common/sampling.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index b184fea099e05..8e6d74d0d704a 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -931,7 +931,7 @@ std::string parse_samplers_input(std::string input) { std::string name = input.substr(0,separator); input = input.substr(separator+1); separator = input.find(';'); - + if (samplers_symbols.find(name) != samplers_symbols.end()) { output += samplers_symbols[name]; } diff --git a/common/sampling.cpp b/common/sampling.cpp index 7761ee94abbd0..b6bb886c6c7d7 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -133,7 +133,7 @@ void sampler_queue( const float tfs_z = params.tfs_z; const float typical_p = params.typical_p; const std::string & samplers_sequence = params.samplers_sequence; - + for (auto s : samplers_sequence) { switch (s){ case 'k': llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep); break;