diff --git a/ci/run.sh b/ci/run.sh index e067782193b9b..dc26d94eed1fd 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -53,7 +53,7 @@ if [ ! -z ${GG_BUILD_SYCL} ]; then exit 1 fi - CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON" + CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON" fi if [ ! -z ${GG_BUILD_VULKAN} ]; then diff --git a/common/arg.cpp b/common/arg.cpp index cd9d315dc78ff..58cab702039e7 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -128,13 +128,13 @@ static void common_params_handle_model_default(common_params & params) { } params.hf_file = params.model; } else if (params.model.empty()) { - params.model = fs_get_cache_file(string_split(params.hf_file, '/').back()); + params.model = fs_get_cache_file(string_split(params.hf_file, '/').back()); } } else if (!params.model_url.empty()) { if (params.model.empty()) { - auto f = string_split(params.model_url, '#').front(); - f = string_split(f, '?').front(); - params.model = fs_get_cache_file(string_split(f, '/').back()); + auto f = string_split(params.model_url, '#').front(); + f = string_split(f, '?').front(); + params.model = fs_get_cache_file(string_split(f, '/').back()); } } else if (params.model.empty()) { params.model = DEFAULT_MODEL_PATH; @@ -251,6 +251,9 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context for (auto & antiprompt : params.antiprompt) { string_process_escapes(antiprompt); } + for (auto & seq_breaker : params.sparams.dry_sequence_breakers) { + string_process_escapes(seq_breaker); + } } if (!params.kv_overrides.empty()) { @@ -879,7 +882,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"--samplers"}, "SAMPLERS", string_format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()), [](common_params & params, const std::string & value) { - 
const auto sampler_names = string_split(value, ';'); + const auto sampler_names = string_split(value, ';'); params.sparams.samplers = common_sampler_types_from_names(sampler_names, true); } ).set_sparam()); @@ -919,6 +922,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.sparams.temp = std::max(params.sparams.temp, 0.0f); } ).set_sparam()); + add_opt(common_arg( + {"--k-shift"}, "N", + string_format("k-shift sampling (default: %d, 0 = disabled)", params.sparams.k_shift), + [](common_params & params, int value) { + params.sparams.k_shift = value; + } + ).set_sparam()); add_opt(common_arg( {"--top-k"}, "N", string_format("top-k sampling (default: %d, 0 = disabled)", params.sparams.top_k), @@ -997,6 +1007,64 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.sparams.penalty_freq = std::stof(value); } ).set_sparam()); + add_opt(common_arg( + {"--dry-multiplier"}, "N", + string_format("set DRY sampling multiplier (default: %.1f, 0.0 = disabled)", (double)params.sparams.dry_multiplier), + [](common_params & params, const std::string & value) { + params.sparams.dry_multiplier = std::stof(value); + } + ).set_sparam()); + add_opt(common_arg( + {"--dry-base"}, "N", + string_format("set DRY sampling base value (default: %.2f)", (double)params.sparams.dry_base), + [](common_params & params, const std::string & value) { + float potential_base = std::stof(value); + if (potential_base >= 1.0f) + { + params.sparams.dry_base = potential_base; + } + } + ).set_sparam()); + add_opt(common_arg( + {"--dry-allowed-length"}, "N", + string_format("set allowed length for DRY sampling (default: %d)", params.sparams.dry_allowed_length), + [](common_params & params, int value) { + params.sparams.dry_allowed_length = value; + } + ).set_sparam()); + add_opt(common_arg( + {"--dry-penalty-last-n"}, "N", + string_format("set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size)", 
params.sparams.dry_penalty_last_n), + [](common_params & params, int value) { + params.sparams.dry_penalty_last_n = value; + } + ).set_sparam()); + add_opt(common_arg( + {"--dry-sequence-breaker"}, "STRING", + string_format("add sequence breaker for DRY sampling, clearing out default breakers (%s) in the process; use \"none\" to not use any sequence breakers\n", + params.sparams.dry_sequence_breakers.empty() ? "none" : + std::accumulate(std::next(params.sparams.dry_sequence_breakers.begin()), + params.sparams.dry_sequence_breakers.end(), + std::string("'") + (params.sparams.dry_sequence_breakers[0] == "\n" ? "\\n" : params.sparams.dry_sequence_breakers[0]) + "'", + [](const std::string& a, const std::string& b) { + std::string formatted_b = (b == "\n") ? "\\n" : b; + return a + ", '" + formatted_b + "'"; + }).c_str()), + [](common_params & params, const std::string & value) { + static bool defaults_cleared = false; + + if (!defaults_cleared) { + params.sparams.dry_sequence_breakers.clear(); + defaults_cleared = true; + } + + if (value == "none") { + params.sparams.dry_sequence_breakers.clear(); + } else { + params.sparams.dry_sequence_breakers.emplace_back(value); + } + } + ).set_sparam()); add_opt(common_arg( {"--dynatemp-range"}, "N", string_format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sparams.dynatemp_range), diff --git a/common/common.cpp b/common/common.cpp index a8eebb68b5351..490c089de21cf 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -416,19 +416,6 @@ std::string string_format(const char * fmt, ...) 
{ return std::string(buf.data(), size); } -std::vector string_split(std::string input, char separator) { - std::vector parts; - size_t separator_pos = input.find(separator); - while (separator_pos != std::string::npos) { - std::string part = input.substr(0, separator_pos); - parts.emplace_back(part); - input = input.substr(separator_pos + 1); - separator_pos = input.find(separator); - } - parts.emplace_back(input); - return parts; -} - std::string string_strip(const std::string & str) { size_t start = 0; size_t end = str.size(); @@ -2019,6 +2006,10 @@ void yaml_dump_non_result_info(FILE * stream, const common_params & params, cons fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks); fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false"); fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx); + fprintf(stream, "dry_allowed_length: %d # default: 2\n", sparams.dry_allowed_length); + fprintf(stream, "dry_base: %.2f # default: 1.75\n", sparams.dry_base); + fprintf(stream, "dry_multiplier: %.1f # default: 0.0\n", sparams.dry_multiplier); + fprintf(stream, "dry_penalty_last_n: %d # default: -1 (0 = disable, -1 = context size)\n", sparams.dry_penalty_last_n); fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false"); fprintf(stream, "file: # never logged, see prompt instead. 
Can still be specified for input.\n"); fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq); @@ -2101,6 +2092,7 @@ void yaml_dump_non_result_info(FILE * stream, const common_params & params, cons fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z); fprintf(stream, "threads: %d # default: %u\n", params.cpuparams.n_threads, std::thread::hardware_concurrency()); + fprintf(stream, "k_shift: %d # default: 0\n", sparams.k_shift); fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k); fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p); fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p); diff --git a/common/common.h b/common/common.h index 19d928777ccd5..97e35fb4cf4dd 100644 --- a/common/common.h +++ b/common/common.h @@ -84,14 +84,16 @@ enum llama_example { enum common_sampler_type { COMMON_SAMPLER_TYPE_NONE = 0, - COMMON_SAMPLER_TYPE_TOP_K = 1, - COMMON_SAMPLER_TYPE_TOP_P = 2, - COMMON_SAMPLER_TYPE_MIN_P = 3, - COMMON_SAMPLER_TYPE_TFS_Z = 4, - COMMON_SAMPLER_TYPE_TYPICAL_P = 5, - COMMON_SAMPLER_TYPE_TEMPERATURE = 6, - COMMON_SAMPLER_TYPE_XTC = 7, - COMMON_SAMPLER_TYPE_INFILL = 8, + COMMON_SAMPLER_TYPE_DRY = 1, + COMMON_SAMPLER_TYPE_K_SHIFT = 2, + COMMON_SAMPLER_TYPE_TOP_K = 3, + COMMON_SAMPLER_TYPE_TOP_P = 4, + COMMON_SAMPLER_TYPE_MIN_P = 5, + COMMON_SAMPLER_TYPE_TFS_Z = 6, + COMMON_SAMPLER_TYPE_TYPICAL_P = 7, + COMMON_SAMPLER_TYPE_TEMPERATURE = 8, + COMMON_SAMPLER_TYPE_XTC = 9, + COMMON_SAMPLER_TYPE_INFILL = 10, }; // dimensionality reduction methods, used by cvector-generator @@ -104,32 +106,41 @@ enum dimre_method { struct common_sampler_params { uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler - int32_t n_prev = 64; // number of previous tokens to remember - int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. 
- int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens - int32_t top_k = 40; // <= 0 to use vocab size - float top_p = 0.95f; // 1.0 = disabled - float min_p = 0.05f; // 0.0 = disabled - float xtc_probability = 0.00f; // 0.0 = disabled - float xtc_threshold = 0.10f; // > 0.5 disables XTC - float tfs_z = 1.00f; // 1.0 = disabled - float typ_p = 1.00f; // typical_p, 1.0 = disabled - float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities - float dynatemp_range = 0.00f; // 0.0 = disabled - float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler - int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) - float penalty_repeat = 1.00f; // 1.0 = disabled - float penalty_freq = 0.00f; // 0.0 = disabled - float penalty_present = 0.00f; // 0.0 = disabled - int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 - float mirostat_tau = 5.00f; // target entropy - float mirostat_eta = 0.10f; // learning rate - bool penalize_nl = false; // consider newlines as a repeatable token - bool ignore_eos = false; - bool no_perf = false; // disable performance metrics + int32_t n_prev = 64; // number of previous tokens to remember + int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. 
+ int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens + int32_t k_shift = 0; // 0 = disabled + int32_t top_k = 40; // <= 0 to use vocab size + float top_p = 0.95f; // 1.0 = disabled + float min_p = 0.05f; // 0.0 = disabled + float xtc_probability = 0.00f; // 0.0 = disabled + float xtc_threshold = 0.10f; // > 0.5 disables XTC + float tfs_z = 1.00f; // 1.0 = disabled + float typ_p = 1.00f; // typical_p, 1.0 = disabled + float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities + float dynatemp_range = 0.00f; // 0.0 = disabled + float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler + int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) + float penalty_repeat = 1.00f; // 1.0 = disabled + float penalty_freq = 0.00f; // 0.0 = disabled + float penalty_present = 0.00f; // 0.0 = disabled + float dry_multiplier = 0.0f; // 0.0 = disabled; DRY repetition penalty for tokens extending repetition: + float dry_base = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length) + int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty + int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size) + int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 + float mirostat_tau = 5.00f; // target entropy + float mirostat_eta = 0.10f; // learning rate + bool penalize_nl = false; // consider newlines as a repeatable token + bool ignore_eos = false; + bool no_perf = false; // disable performance metrics + + std::vector dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY std::vector samplers = { + COMMON_SAMPLER_TYPE_DRY, + COMMON_SAMPLER_TYPE_K_SHIFT, COMMON_SAMPLER_TYPE_TOP_K, COMMON_SAMPLER_TYPE_TFS_Z, COMMON_SAMPLER_TYPE_TYPICAL_P, @@ -380,8 +391,6 
@@ bool set_process_priority(enum ggml_sched_priority prio); LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2) std::string string_format(const char * fmt, ...); -std::vector string_split(std::string input, char separator); - std::string string_strip(const std::string & str); std::string string_get_sortable_timestamp(); @@ -389,6 +398,7 @@ void string_replace_all(std::string & s, const std::string & search, const std:: template static std::vector string_split(const std::string & str, char delim) { + static_assert(!std::is_same::value, "Please use the specialized version for std::string"); std::vector values; std::istringstream str_stream(str); std::string token; @@ -401,6 +411,22 @@ static std::vector string_split(const std::string & str, char delim) { return values; } +template<> +std::vector string_split(const std::string & input, char separator) +{ + std::vector parts; + size_t begin_pos = 0; + size_t separator_pos = input.find(separator); + while (separator_pos != std::string::npos) { + std::string part = input.substr(begin_pos, separator_pos - begin_pos); + parts.emplace_back(part); + begin_pos = separator_pos + 1; + separator_pos = input.find(separator, begin_pos); + } + parts.emplace_back(input.substr(begin_pos, separator_pos - begin_pos)); + return parts; +} + bool string_parse_kv_override(const char * data, std::vector & overrides); void string_process_escapes(std::string & input); diff --git a/common/sampling.cpp b/common/sampling.cpp index 4ab3eface3384..82827a616d908 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -130,10 +130,12 @@ std::string common_sampler_params::print() const { snprintf(result, sizeof(result), "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n" - "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, temp = %.3f\n" + "\tdry_multiplier = %.3f, dry_base = %.3f, dry_allowed_length = %d, dry_penalty_last_n = %d\n" + 
"\tk_shift = %d, top_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, temp = %.3f\n" "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f", penalty_last_n, penalty_repeat, penalty_freq, penalty_present, - top_k, tfs_z, top_p, min_p, xtc_probability, xtc_threshold, typ_p, temp, + dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n, + k_shift, top_k, tfs_z, top_p, min_p, xtc_probability, xtc_threshold, typ_p, temp, mirostat, mirostat_eta, mirostat_tau); return std::string(result); @@ -174,6 +176,20 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co if (params.mirostat == 0) { for (const auto & cnstr : params.samplers) { switch (cnstr) { + case COMMON_SAMPLER_TYPE_DRY: + { + std::vector c_breakers; + c_breakers.reserve(params.dry_sequence_breakers.size()); + for (const auto& str : params.dry_sequence_breakers) { + c_breakers.push_back(str.c_str()); + } + + llama_sampler_chain_add(result->chain, llama_sampler_init_dry (model, params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size())); + } + break; + case COMMON_SAMPLER_TYPE_K_SHIFT: + llama_sampler_chain_add(result->chain, llama_sampler_init_k_shift (params.k_shift)); + break; case COMMON_SAMPLER_TYPE_TOP_K: llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k)); break; @@ -358,6 +374,8 @@ std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx_ char common_sampler_type_to_chr(enum common_sampler_type cnstr) { switch (cnstr) { + case COMMON_SAMPLER_TYPE_DRY: return 'd'; + case COMMON_SAMPLER_TYPE_K_SHIFT: return 's'; case COMMON_SAMPLER_TYPE_TOP_K: return 'k'; case COMMON_SAMPLER_TYPE_TFS_Z: return 'f'; case COMMON_SAMPLER_TYPE_TYPICAL_P: return 'y'; @@ -372,6 +390,8 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) { std::string common_sampler_type_to_str(enum 
common_sampler_type cnstr) { switch (cnstr) { + case COMMON_SAMPLER_TYPE_DRY: return "dry"; + case COMMON_SAMPLER_TYPE_K_SHIFT: return "k_shift"; case COMMON_SAMPLER_TYPE_TOP_K: return "top_k"; case COMMON_SAMPLER_TYPE_TFS_Z: return "tfs_z"; case COMMON_SAMPLER_TYPE_TYPICAL_P: return "typ_p"; @@ -386,7 +406,9 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) { std::vector common_sampler_types_from_names(const std::vector & names, bool allow_alt_names) { std::unordered_map sampler_canonical_name_map { + { "dry", COMMON_SAMPLER_TYPE_DRY }, { "top_k", COMMON_SAMPLER_TYPE_TOP_K }, + { "k_shift", COMMON_SAMPLER_TYPE_K_SHIFT }, { "top_p", COMMON_SAMPLER_TYPE_TOP_P }, { "typ_p", COMMON_SAMPLER_TYPE_TYPICAL_P }, { "min_p", COMMON_SAMPLER_TYPE_MIN_P }, @@ -400,6 +422,7 @@ std::vector common_sampler_types_from_names(const std::vect // make it ready for both system names and input names std::unordered_map sampler_alt_name_map { { "top-k", COMMON_SAMPLER_TYPE_TOP_K }, + { "k-shift", COMMON_SAMPLER_TYPE_K_SHIFT }, { "top-p", COMMON_SAMPLER_TYPE_TOP_P }, { "nucleus", COMMON_SAMPLER_TYPE_TOP_P }, { "typical-p", COMMON_SAMPLER_TYPE_TYPICAL_P }, @@ -434,6 +457,8 @@ std::vector common_sampler_types_from_names(const std::vect std::vector common_sampler_types_from_chars(const std::string & chars) { std::unordered_map sampler_name_map = { + { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_DRY), COMMON_SAMPLER_TYPE_DRY }, + { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_K_SHIFT), COMMON_SAMPLER_TYPE_K_SHIFT }, { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_K), COMMON_SAMPLER_TYPE_TOP_K }, { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TFS_Z), COMMON_SAMPLER_TYPE_TFS_Z }, { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P), COMMON_SAMPLER_TYPE_TYPICAL_P }, diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 7e552a71b5c7c..a34dabe235a34 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -573,6 +573,9 @@ def 
get_vocab_base_pre(self, tokenizer) -> str: if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f": # ref: https://huggingface.co/BAAI/bge-small-en-v1.5 res = "bert-bge" + if chkhsh == "8e62295832751ca1e8f92f2226f403dea30dc5165e448b5bfa05af5340c64ec7": + # ref: https://huggingface.co/BAAI/bge-large-zh-v1.5 + res = "bert-bge-large" if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166": # ref: https://huggingface.co/mosaicml/mpt-7b res = "mpt" diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index 022354a3b624e..28cd02e5a7f66 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -72,6 +72,7 @@ class TOKENIZER_TYPE(IntEnum): {"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", }, {"name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", }, {"name": "bert-bge", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", }, + {"name": "bert-bge-large", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/BAAI/bge-large-zh-v1.5", }, {"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", }, {"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", }, {"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", }, diff --git a/examples/main/README.md b/examples/main/README.md index 7e192b9f2837c..2c7d45e0c7158 100644 --- a/examples/main/README.md +++ b/examples/main/README.md @@ -187,6 +187,38 @@ Use the `--no-penalize-nl` option to disable newline penalization when applying Example usage: `--repeat-penalty 1.15 --repeat-last-n 128 --no-penalize-nl` +### DRY Repetition Penalty + +DRY (Don't Repeat Yourself) sampling is an effective technique for reducing repetition in generated text even across long 
contexts by penalizing tokens based on their recent usage patterns (original [PR link](https://github.com/oobabooga/text-generation-webui/pull/5677)). + +- `--dry-multiplier N`: Set the DRY sampling multiplier (default: 0.0, 0.0 = disabled). +- `--dry-base N`: Set the DRY sampling base value (default: 1.75). +- `--dry-allowed-length N`: Set the allowed length for DRY sampling (default: 2). +- `--dry-penalty-last-n N`: Set DRY penalty for the last n tokens (default: -1, 0 = disable, -1 = context size). +- `--dry-sequence-breaker STRING`: Add a sequence breaker for DRY sampling. Can be used more than once to add multiple sequence breakers. Using this clears out the default breakers, which consist of: `['\n', ':', '"', '*']`. If the string `"none"` is supplied, no sequence breakers are used. + +The `dry-multiplier` option controls the strength of the DRY sampling effect. A value of 0.0 disables DRY sampling, while higher values increase its influence. A typical recommended value is 0.8. + +The `dry-base` option sets the base value for the exponential penalty calculation in DRY sampling. Higher values lead to more aggressive penalization of repetitions. + +The `dry-allowed-length` option sets the maximum length of repeated sequences that will not be penalized. Repetitions shorter than or equal to this length are not penalized, allowing for natural repetitions of short phrases or common words. + +The `dry-penalty-last-n` option controls how many recent tokens to consider when applying the DRY penalty. A value of -1 considers the entire context. Use a positive value to limit the consideration to a specific number of recent tokens. + +The `dry-sequence-breaker` option adds a single sequence breaker and can be used more than once to specify multiple sequence breakers. Sequence breakers interrupt sequence matching and break the input into parts where matching can be applied. 
+ +DRY sampling provides more nuanced control over text generation, particularly for reducing long-range repetitions and maintaining global coherence. + +Example usage: `--dry-multiplier 0.8 --dry-base 1.75 --dry-allowed-length 2 --dry-penalty-last-n -1 --dry-sequence-breaker "—" --dry-sequence-breaker "##"` + +### K-Shift Sampling + +- `--k-shift N`: Shift the first token selection by cutting out N tokens from the top once (default: 0). + +K-Shift is a sampling method that guides models away from the most obvious output, eliciting reasoning and analysis. It cuts out k top tokens once at the beginning of inference, making sure that the dialog will start from a less obvious path without guiding the model too much. The method was mentioned in the paper [Chain-of-Thought Reasoning without Prompting](https://arxiv.org/pdf/2402.10200) as a simple trick for guiding a model towards reasoning. In practice, K-Shift can improve the quality of reasoning, help bypass bias or censorship in certain cases, and may also be used as a diagnostics tool. K-Shift is intended to be used with greedy sampling (`--k-shift 10 --top-k 1`), but can help with creative writing too - albeit, not as much as XTC. The default value is 0. + +Example usage: `--k-shift 10` + ### Top-K Sampling - `--top-k N`: Limit the next token selection to the K most probable tokens (default: 40). 
diff --git a/examples/server/README.md b/examples/server/README.md index 09f1aa249ab1f..bc737237eb018 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -114,6 +114,11 @@ The project is under active development, and we are [looking for feedback and co | `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled) | | `--presence-penalty N` | repeat alpha presence penalty (default: 0.0, 0.0 = disabled) | | `--frequency-penalty N` | repeat alpha frequency penalty (default: 0.0, 0.0 = disabled) | +| `--dry-multiplier N` | DRY sampling multiplier (default: 0.0, 0.0 = disabled) | +| `--dry-base N` | DRY sampling base value (default: 1.75) | +| `--dry-allowed-length N` | allowed length for DRY sampling (default: 2) | +| `--dry-penalty-last-n N` | DRY penalty for the last n tokens (default: -1, 0 = disable, -1 = context size) | +| `--dry-sequence-breaker STRING` | add sequence breaker for DRY sampling, clearing out default breakers (`['\n', ':', '"', '*']`) in the process; use `"none"` to not use any sequence breakers | | `--dynatemp-range N` | dynamic temperature range (default: 0.0, 0.0 = disabled) | | `--dynatemp-exp N` | dynamic temperature exponent (default: 1.0) | | `--mirostat N` | use Mirostat sampling.
Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.
(default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) | @@ -319,6 +324,18 @@ node index.js - The prompt is a string or an array with the first element given as a string - The model's `tokenizer.ggml.add_bos_token` metadata is `true` + These input shapes and data types are allowed for `prompt`: + + - Single string: `"string"` + - Single sequence of tokens: `[12, 34, 56]` + - Mixed tokens and strings: `[12, 34, "string", 56, 78]` + + Multiple prompts are also supported. In this case, the completion result will be an array. + + - Only strings: `["string1", "string2"]` + - Strings and sequences of tokens: `["string1", [12, 34, 56]]` + - Mixed types: `[[12, 34, "string", 56, 78], [12, 34, 56], "string"]` + `temperature`: Adjust the randomness of the generated text. Default: `0.8` `dynatemp_range`: Dynamic temperature range. The final temperature will be in the range of `[temperature - dynatemp_range; temperature + dynatemp_range]` Default: `0.0`, which is disabled. @@ -357,6 +374,16 @@ node index.js `frequency_penalty`: Repeat alpha frequency penalty. Default: `0.0`, which is disabled. + `dry_multiplier`: Set the DRY (Don't Repeat Yourself) repetition penalty multiplier. Default: `0.0`, which is disabled. + + `dry_base`: Set the DRY repetition penalty base value. Default: `1.75` + + `dry_allowed_length`: Tokens that extend repetition beyond this receive exponentially increasing penalty: multiplier * base ^ (length of repeating sequence before token - allowed length). Default: `2` + + `dry_penalty_last_n`: How many tokens to scan for repetitions. Default: `-1`, where `0` is disabled and `-1` is context size. + + `dry_sequence_breakers`: Specify an array of sequence breakers for DRY sampling. Only a JSON array of strings is accepted. Default: `['\n', ':', '"', '*']` + `mirostat`: Enable Mirostat sampling, controlling perplexity during text generation. Default: `0`, where `0` is disabled, `1` is Mirostat, and `2` is Mirostat 2.0. 
`mirostat_tau`: Set the Mirostat target entropy, parameter tau. Default: `5.0` diff --git a/examples/server/public/index-new.html b/examples/server/public/index-new.html index ad4183cd928f7..f7a38bfea7cf0 100644 --- a/examples/server/public/index-new.html +++ b/examples/server/public/index-new.html @@ -40,7 +40,12 @@ repeat_last_n: 0, // 0 = disable penalty, -1 = context size repeat_penalty: 1.0, // 1.0 = disabled penalize_nl: false, // true only useful for infinite completion - top_k: 0, // <= 0 to use vocab size + dry_multiplier: 0.0, // 0.0 = disabled, 0.8 works well + dry_base: 1.75, // 0.0 = disabled + dry_allowed_length: 2, // tokens extending repetitions beyond this receive penalty, 2 works well + dry_penalty_last_n: -1, // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size) + k_shift: 0, // 0 = disabled + top_k: 0, // <= 0 to use vocab size top_p: 1.0, // 1.0 = disabled min_p: 0.05, // 0 = disabled; recommended for non-english: ~ 0.4 xtc_probability: 0.0, // 0 = disabled; @@ -831,15 +836,20 @@
Further Options
+ ${IntField({ label: "K-Shift", title: "Cuts out first k tokens once at the start of sampling. Intended to use with greedy sampling.", max: 100, min: 0, step: 1, name: "k_shift", value: params.value.k_shift })} ${IntField({ label: "Top-K", title: "Limits the selection of the next token to the K most probable tokens. 1 means no randomness = greedy sampling. If set to 0, it means the entire vocabulary size is considered.", max: 100, min: 0, step: 1, name: "top_k", value: params.value.top_k })} ${IntField({ label: "Penalize Last N", title: "The last n tokens that are taken into account to penalise repetitions. A value of 0 means that this function is deactivated and -1 means that the entire size of the context is taken into account.", max: 2048, min: 0, step: 16, name: "repeat_last_n", value: params.value.repeat_last_n })} - ${FloatField({ label: "Top-P", title: "Limits the selection of the next token to a subset of tokens whose combined probability reaches a threshold value P = top-P. If set to 1, it means the entire vocabulary size is considered.", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p })} ${FloatField({ label: "Presence Penalty", title: "A penalty that is applied if certain tokens appear repeatedly in the generated text. A higher value leads to fewer repetitions.", max: 1.0, min: 0.0, name: "presence_penalty", step: 0.01, value: params.value.presence_penalty })} - ${FloatField({ label: "TFS-Z", title: "Activates tail-free sampling, a method used to limit the prediction of tokens that are too frequent. The parameter z controls the strength of this limitation. A value of 1.0 means that this function is deactivated.", max: 1.0, min: 0.0, name: "tfs_z", step: 0.01, value: params.value.tfs_z })} ${FloatField({ label: "Frequency Penalty", title: "A penalty that is applied based on the frequency with which certain tokens occur in the training data set. 
A higher value results in rare tokens being favoured.", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty })} + ${FloatField({ label: "Top-P", title: "Limits the selection of the next token to a subset of tokens whose combined probability reaches a threshold value P = top-P. If set to 1, it means the entire vocabulary size is considered.", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p })} ${FloatField({ label: "Typical-P", title: "Activates local typical sampling, a method used to limit the prediction of tokens that are atypical in the current context. The parameter p controls the strength of this limitation. A value of 1.0 means that this function is deactivated.", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p })} ${FloatField({ label: "XTC probability", title: "Sets the chance for token removal (checked once on sampler start)", max: 1.0, min: 0.0, name: "xtc_probability", step: 0.01, value: params.value.xtc_probability })} ${FloatField({ label: "XTC threshold", title: "Sets a minimum probability threshold for tokens to be removed", max: 0.5, min: 0.0, name: "xtc_threshold", step: 0.01, value: params.value.xtc_threshold })} + ${FloatField({ label: "DRY Penalty Multiplier", title: "Set the DRY repetition penalty multiplier. Default is 0.0, which disables DRY.", max: 5.0, min: 0.0, name: "dry_multiplier", step: 0.01, value: params.value.dry_multiplier })} + ${FloatField({ label: "DRY Base", title: "Set the DRY repetition penalty base value. Default is 1.75", max: 3.0, min: 1.0, name: "dry_base", step: 0.01, value: params.value.dry_base })} + ${IntField({ label: "DRY Allowed Length", title: "Tokens that extend repetition beyond this receive exponentially increasing penalty. 
Default is 2", max: 10, min: 1, step: 1, name: "dry_allowed_length", value: params.value.dry_allowed_length })} + ${IntField({ label: "DRY Penalty Last N", title: "How many tokens to scan for repetitions. Default is -1, where 0 is disabled and -1 is context size", max: 2048, min: -1, step: 16, name: "dry_penalty_last_n", value: params.value.dry_penalty_last_n })} + ${FloatField({ label: "TFS-Z", title: "Activates tail-free sampling, a method used to limit the prediction of tokens that are too frequent. The parameter z controls the strength of this limitation. A value of 1.0 means that this function is deactivated.", max: 1.0, min: 0.0, name: "tfs_z", step: 0.01, value: params.value.tfs_z })} ${IntField({ label: "Min Keep", title: "If greater than 0, samplers are forced to return N possible tokens at minimum. Default is 0", max: 10, min: 0, name: "min_keep", value: params.value.min_keep })}
@@ -1144,6 +1154,8 @@

llama.cpp

repeat_penalty: { snapValue: 1.0, snapRangeMultiplier: 4 }, presence_penalty: { snapValue: 0.0, snapRangeMultiplier: 4 }, frequency_penalty: { snapValue: 0.0, snapRangeMultiplier: 4 }, + dry_multiplier: { snapValue: 0.0, snapRangeMultiplier: 4 }, + dry_base: { snapValue: 1.75, snapRangeMultiplier: 4 }, }; // add an event listener for each slider Object.keys(snapSettings).forEach(sliderName => { diff --git a/examples/server/public/index.html b/examples/server/public/index.html index 88065705fb669..5d391f11afa92 100644 --- a/examples/server/public/index.html +++ b/examples/server/public/index.html @@ -304,6 +304,11 @@ repeat_last_n: 256, // 0 = disable penalty, -1 = context size repeat_penalty: 1.18, // 1.0 = disabled penalize_nl: false, + dry_multiplier: 0.0, // 0.0 = disabled, 0.8 works well + dry_base: 1.75, // 0.0 = disabled + dry_allowed_length: 2, // tokens extending repetitions beyond this receive penalty, 2 works well + dry_penalty_last_n: -1, // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size) + k_shift: 0, // 0 = disabled top_k: 40, // <= 0 to use vocab size top_p: 0.95, // 1.0 = disabled min_p: 0.05, // 0 = disabled @@ -1004,6 +1009,7 @@ ${FloatField({ label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty })} ${IntField({ label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n })} ${BoolField({ label: "Penalize repetition of newlines", name: "penalize_nl", value: params.value.penalize_nl })} + ${IntField({ label: "K-shift", max: 100, min: -1, name: "k_shift", value: params.value.k_shift })} ${IntField({ label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k })} ${FloatField({ label: "Top-P sampling", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p })} ${FloatField({ label: "Min-P sampling", max: 1.0, min: 0.0, name: "min_p", step: 0.01, 
value: params.value.min_p })} @@ -1015,6 +1021,10 @@ ${FloatField({ label: "Typical P", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p })} ${FloatField({ label: "Presence penalty", max: 1.0, min: 0.0, name: "presence_penalty", step: 0.01, value: params.value.presence_penalty })} ${FloatField({ label: "Frequency penalty", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty })} + ${FloatField({ label: "DRY Penalty Multiplier", max: 5.0, min: 0.0, name: "dry_multiplier", step: 0.01, value: params.value.dry_multiplier })} + ${FloatField({ label: "DRY Base", max: 3.0, min: 1.0, name: "dry_base", step: 0.01, value: params.value.dry_base })} + ${IntField({ label: "DRY Allowed Length", max: 10, min: 2, step: 1, name: "dry_allowed_length", value: params.value.dry_allowed_length })} + ${IntField({ label: "DRY Penalty Last N", max: 2048, min: -1, step: 16, name: "dry_penalty_last_n", value: params.value.dry_penalty_last_n })} ${FloatField({ label: "XTC probability", max: 1.0, min: 0.0, name: "xtc_probability", step: 0.01, value: params.value.xtc_probability })} ${FloatField({ label: "XTC threshold", max: 0.5, min: 0.0, name: "xtc_threshold", step: 0.01, value: params.value.xtc_threshold })} diff --git a/examples/server/public/style.css b/examples/server/public/style.css old mode 100755 new mode 100644 diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 51f30ffeab980..a8fa34b197661 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -43,21 +43,6 @@ #include #include -#define SLT_INF(slot, fmt, ...) LOG_INF("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__) -#define SLT_WRN(slot, fmt, ...) LOG_WRN("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__) -#define SLT_ERR(slot, fmt, ...) 
LOG_ERR("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__) -#define SLT_DBG(slot, fmt, ...) LOG_DBG("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__) - -#define SRV_INF(fmt, ...) LOG_INF("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) -#define SRV_WRN(fmt, ...) LOG_WRN("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) -#define SRV_ERR(fmt, ...) LOG_ERR("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) -#define SRV_DBG(fmt, ...) LOG_DBG("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) - -#define QUE_INF(fmt, ...) LOG_INF("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) -#define QUE_WRN(fmt, ...) LOG_WRN("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) -#define QUE_ERR(fmt, ...) LOG_ERR("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) -#define QUE_DBG(fmt, ...) LOG_DBG("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) - using json = nlohmann::ordered_json; enum stop_type { @@ -68,6 +53,7 @@ enum stop_type { // state diagram: https://github.com/ggerganov/llama.cpp/pull/9283 enum slot_state { SLOT_STATE_IDLE, + SLOT_STATE_STARTED, // TODO: this state is only used for setting up the initial prompt processing; maybe merge it with launch_slot_with_task in the future SLOT_STATE_PROCESSING_PROMPT, SLOT_STATE_DONE_PROMPT, SLOT_STATE_GENERATING, @@ -79,7 +65,7 @@ enum server_state { }; enum server_task_type { - SERVER_TASK_TYPE_COMPLETION, + SERVER_TASK_TYPE_INFERENCE, SERVER_TASK_TYPE_CANCEL, SERVER_TASK_TYPE_NEXT_RESPONSE, SERVER_TASK_TYPE_METRICS, @@ -89,21 +75,22 @@ enum server_task_type { SERVER_TASK_TYPE_SET_LORA, }; -enum server_task_cmpl_type { - SERVER_TASK_CMPL_TYPE_NORMAL, - SERVER_TASK_CMPL_TYPE_EMBEDDING, - SERVER_TASK_CMPL_TYPE_RERANK, - SERVER_TASK_CMPL_TYPE_INFILL, +enum server_task_inf_type { + SERVER_TASK_INF_TYPE_COMPLETION, + SERVER_TASK_INF_TYPE_EMBEDDING, + SERVER_TASK_INF_TYPE_RERANK, + SERVER_TASK_INF_TYPE_INFILL, }; struct server_task { int id = -1; // to be filled by server_queue 
int id_target = -1; // used by SERVER_TASK_TYPE_CANCEL + llama_tokens prompt_tokens; server_task_type type; json data; - server_task_cmpl_type cmpl_type = SERVER_TASK_CMPL_TYPE_NORMAL; + server_task_inf_type inf_type = SERVER_TASK_INF_TYPE_COMPLETION; // utility function static std::unordered_set get_list_id(const std::vector & tasks) { @@ -161,26 +148,20 @@ struct server_slot { int32_t i_batch = -1; int32_t n_predict = -1; // TODO: disambiguate from params.n_predict + // n_prompt_tokens may not be equal to prompt_tokens.size(), because prompt maybe truncated int32_t n_prompt_tokens = 0; int32_t n_prompt_tokens_processed = 0; - json prompt; // can be either a string, array of strings or array of token ids - - json input_prefix; - json input_suffix; - json input_extra; - - // when a task is submitted, we first tokenize the prompt and store it here - std::vector prompt_tokens; - std::vector extra_tokens; + // input prompt tokens + llama_tokens prompt_tokens; size_t last_nl_pos = 0; std::string generated_text; - std::vector cache_tokens; + llama_tokens cache_tokens; std::vector generated_token_probs; - server_task_cmpl_type cmpl_type = SERVER_TASK_CMPL_TYPE_NORMAL; + server_task_inf_type inf_type = SERVER_TASK_INF_TYPE_COMPLETION; bool has_next_token = true; bool has_new_line = false; @@ -229,7 +210,7 @@ struct server_slot { n_past = 0; n_sent_text = 0; n_sent_token_probs = 0; - cmpl_type = SERVER_TASK_CMPL_TYPE_NORMAL; + inf_type = SERVER_TASK_INF_TYPE_COMPLETION; generated_token_probs.clear(); } @@ -734,42 +715,6 @@ struct server_context { metrics.init(); } - std::vector tokenize(const json & json_prompt, bool add_special, bool parse_special) const { - // If `add_bos` is true, we only add BOS, when json_prompt is a string, - // or the first element of the json_prompt array is a string. 
- std::vector prompt_tokens; - - if (json_prompt.is_array()) { - bool first = true; - for (const auto & p : json_prompt) { - if (p.is_string()) { - auto s = p.template get(); - - std::vector p; - if (first) { - p = common_tokenize(ctx, s, add_special, parse_special); - first = false; - } else { - p = common_tokenize(ctx, s, false, parse_special); - } - - prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end()); - } else { - if (first) { - first = false; - } - - prompt_tokens.push_back(p.template get()); - } - } - } else { - auto s = json_prompt.template get(); - prompt_tokens = common_tokenize(ctx, s, add_special, parse_special); - } - - return prompt_tokens; - } - server_slot * get_slot_by_id(int id) { for (server_slot & slot : slots) { if (slot.id == id) { @@ -794,22 +739,16 @@ struct server_context { continue; } - // skip the slot if it does not contains prompt - if (!slot.prompt.is_string()) { + // skip the slot if it does not contains cached tokens + if (slot.prompt_tokens.empty()) { continue; } - // current slot's prompt - std::string slot_prompt = slot.prompt.get(); - - // length of the current slot's prompt - int slot_prompt_len = slot_prompt.size(); - // length of the Longest Common Prefix between the current slot's prompt and the input prompt - int lcp_len = longest_common_prefix(slot_prompt, prompt); + int lcp_len = longest_common_prefix(slot.cache_tokens, slot.prompt_tokens); // fraction of the common substring length compared to the current slot's prompt length - similarity = static_cast(lcp_len) / slot_prompt_len; + similarity = static_cast(lcp_len) / static_cast(slot.prompt_tokens.size()); // select the current slot if the criteria match if (lcp_len > max_lcp_len && similarity > slot_prompt_similarity) { @@ -861,35 +800,59 @@ struct server_context { slot.oaicompat_model = ""; } - slot.params.stream = json_value(data, "stream", false); - slot.params.cache_prompt = json_value(data, "cache_prompt", false); - slot.params.n_predict = json_value(data, 
"n_predict", json_value(data, "max_tokens", default_params.n_predict)); - slot.params.n_indent = json_value(data, "n_indent", default_params.n_indent); - slot.sparams.top_k = json_value(data, "top_k", default_sparams.top_k); - slot.sparams.top_p = json_value(data, "top_p", default_sparams.top_p); - slot.sparams.min_p = json_value(data, "min_p", default_sparams.min_p); - slot.sparams.xtc_probability = json_value(data, "xtc_probability", default_sparams.xtc_probability); - slot.sparams.xtc_threshold = json_value(data, "xtc_threshold", default_sparams.xtc_threshold); - slot.sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z); - slot.sparams.typ_p = json_value(data, "typical_p", default_sparams.typ_p); - slot.sparams.temp = json_value(data, "temperature", default_sparams.temp); - slot.sparams.dynatemp_range = json_value(data, "dynatemp_range", default_sparams.dynatemp_range); - slot.sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent); - slot.sparams.penalty_last_n = json_value(data, "repeat_last_n", default_sparams.penalty_last_n); - slot.sparams.penalty_repeat = json_value(data, "repeat_penalty", default_sparams.penalty_repeat); - slot.sparams.penalty_freq = json_value(data, "frequency_penalty", default_sparams.penalty_freq); - slot.sparams.penalty_present = json_value(data, "presence_penalty", default_sparams.penalty_present); - slot.sparams.mirostat = json_value(data, "mirostat", default_sparams.mirostat); - slot.sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau); - slot.sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta); - slot.sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl); - slot.params.n_keep = json_value(data, "n_keep", default_params.n_keep); - slot.params.n_discard = json_value(data, "n_discard", default_params.n_discard); - slot.sparams.seed = json_value(data, "seed", default_sparams.seed); - 
slot.sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs); - slot.sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep); - //slot.params.t_max_prompt_ms = json_value(data, "t_max_prompt_ms", default_params.t_max_prompt_ms); // TODO: implement - slot.params.t_max_predict_ms = json_value(data, "t_max_predict_ms", default_params.t_max_predict_ms); + slot.params.stream = json_value(data, "stream", false); + slot.params.cache_prompt = json_value(data, "cache_prompt", false); + slot.params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", default_params.n_predict)); + slot.params.n_indent = json_value(data, "n_indent", default_params.n_indent); + slot.sparams.k_shift = json_value(data, "k_shift", default_sparams.k_shift); + slot.sparams.top_k = json_value(data, "top_k", default_sparams.top_k); + slot.sparams.top_p = json_value(data, "top_p", default_sparams.top_p); + slot.sparams.min_p = json_value(data, "min_p", default_sparams.min_p); + slot.sparams.xtc_probability = json_value(data, "xtc_probability", default_sparams.xtc_probability); + slot.sparams.xtc_threshold = json_value(data, "xtc_threshold", default_sparams.xtc_threshold); + slot.sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z); + slot.sparams.typ_p = json_value(data, "typical_p", default_sparams.typ_p); + slot.sparams.temp = json_value(data, "temperature", default_sparams.temp); + slot.sparams.dynatemp_range = json_value(data, "dynatemp_range", default_sparams.dynatemp_range); + slot.sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent); + slot.sparams.penalty_last_n = json_value(data, "repeat_last_n", default_sparams.penalty_last_n); + slot.sparams.penalty_repeat = json_value(data, "repeat_penalty", default_sparams.penalty_repeat); + slot.sparams.penalty_freq = json_value(data, "frequency_penalty", default_sparams.penalty_freq); + slot.sparams.penalty_present = json_value(data, 
"presence_penalty", default_sparams.penalty_present); + slot.sparams.dry_multiplier = json_value(data, "dry_multiplier", default_sparams.dry_multiplier); + slot.sparams.dry_base = json_value(data, "dry_base", default_sparams.dry_base); + slot.sparams.dry_allowed_length = json_value(data, "dry_allowed_length", default_sparams.dry_allowed_length); + slot.sparams.dry_penalty_last_n = json_value(data, "dry_penalty_last_n", default_sparams.dry_penalty_last_n); + slot.sparams.mirostat = json_value(data, "mirostat", default_sparams.mirostat); + slot.sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau); + slot.sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta); + slot.sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl); + slot.params.n_keep = json_value(data, "n_keep", default_params.n_keep); + slot.params.n_discard = json_value(data, "n_discard", default_params.n_discard); + slot.sparams.seed = json_value(data, "seed", default_sparams.seed); + slot.sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs); + slot.sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep); + //slot.params.t_max_prompt_ms = json_value(data, "t_max_prompt_ms", default_params.t_max_prompt_ms); // TODO: implement + slot.params.t_max_predict_ms = json_value(data, "t_max_predict_ms", default_params.t_max_predict_ms); + + if (slot.sparams.dry_base < 1.0f) + { + slot.sparams.dry_base = default_sparams.dry_base; + } + + // sequence breakers for DRY + { + // Currently, this is not compatible with TextGen WebUI, Koboldcpp and SillyTavern format + // Ref: https://github.com/oobabooga/text-generation-webui/blob/d1af7a41ade7bd3c3a463bfa640725edb818ebaf/extensions/openai/typing.py#L39 + + if (data.contains("dry_sequence_breakers")) { + slot.sparams.dry_sequence_breakers = json_value(data, "dry_sequence_breakers", std::vector()); + if (slot.sparams.dry_sequence_breakers.empty()) { 
+ send_error(task, "Error: dry_sequence_breakers must be a non-empty array of strings", ERROR_TYPE_INVALID_REQUEST); + return false; + } + } + } // process "json_schema" and "grammar" if (data.contains("json_schema") && !data.at("json_schema").is_null() && data.contains("grammar") && !data.at("grammar").is_null()) { @@ -914,57 +877,6 @@ struct server_context { SLT_WRN(slot, "n_predict = %d exceeds server configuration, setting to %d", slot.n_predict, slot.n_predict); } - // infill - slot.input_prefix = json_value(data, "input_prefix", json()); - slot.input_suffix = json_value(data, "input_suffix", json()); - slot.input_extra = json_value(data, "input_extra", json()); - - SLT_DBG(slot, "extra_context chunks: %d\n", (int) slot.input_extra.size()); - for (const auto & chunk : slot.input_extra) { - // { "text": string, "filename": string } - if (!chunk.contains("text") || !chunk["text"].is_string()) { - send_error(task, "extra_context chunk must contain a \"text\" field with a string value", ERROR_TYPE_INVALID_REQUEST); - return false; - } - - // filename is optional - if (chunk.contains("filename") && !chunk["filename"].is_string()) { - send_error(task, "extra_context chunk's \"filename\" field must be a string", ERROR_TYPE_INVALID_REQUEST); - return false; - } - - SLT_DBG(slot, "extra_context chunk in file '%s':\n%s\n", chunk.value("filename", "").c_str(), chunk.value("text", "").c_str()); - } - - // get prompt - { - const auto & prompt = data.find("prompt"); - if (prompt == data.end()) { - send_error(task, "\"prompt\" must be provided", ERROR_TYPE_INVALID_REQUEST); - return false; - } - - if ((prompt->is_string()) || - (prompt->is_array() && prompt->size() == 1 && prompt->at(0).is_string()) || - (prompt->is_array() && !prompt->empty() && prompt->at(0).is_number_integer())) { - slot.prompt = *prompt; - } else if (prompt->is_array() && prompt->size() == 1 && prompt->at(0).is_array()) { - slot.prompt = prompt->at(0); - } else if (prompt->is_array() && prompt->size() > 
1) { - // array of strings - for (const auto & el : *prompt) { - if (!el.is_string()) { - send_error(task, "\"prompt\" must be a string, an array of strings or an array of integers", ERROR_TYPE_INVALID_REQUEST); - return false; - } - } - slot.prompt = *prompt; - } else { - send_error(task, "\"prompt\" must be a string, an array of strings or an array of integers", ERROR_TYPE_INVALID_REQUEST); - return false; - } - } - { slot.sparams.logit_bias.clear(); @@ -1044,8 +956,7 @@ struct server_context { } } - slot.state = SLOT_STATE_PROCESSING_PROMPT; - slot.prompt_tokens.clear(); + slot.state = SLOT_STATE_STARTED; SLT_INF(slot, "%s", "processing task\n"); @@ -1234,6 +1145,7 @@ struct server_context { {"temperature", slot.sparams.temp}, {"dynatemp_range", slot.sparams.dynatemp_range}, {"dynatemp_exponent", slot.sparams.dynatemp_exponent}, + {"k_shift", slot.sparams.k_shift}, {"top_k", slot.sparams.top_k}, {"top_p", slot.sparams.top_p}, {"min_p", slot.sparams.min_p}, @@ -1245,6 +1157,11 @@ struct server_context { {"repeat_penalty", slot.sparams.penalty_repeat}, {"presence_penalty", slot.sparams.penalty_present}, {"frequency_penalty", slot.sparams.penalty_freq}, + {"dry_multiplier", slot.sparams.dry_multiplier}, + {"dry_base", slot.sparams.dry_base}, + {"dry_allowed_length", slot.sparams.dry_allowed_length}, + {"dry_penalty_last_n", slot.sparams.dry_penalty_last_n}, + {"dry_sequence_breakers", slot.sparams.dry_sequence_breakers}, {"mirostat", slot.sparams.mirostat}, {"mirostat_tau", slot.sparams.mirostat_tau}, {"mirostat_eta", slot.sparams.mirostat_eta}, @@ -1297,7 +1214,7 @@ struct server_context { }; if (slot.sparams.n_probs > 0) { - const std::vector to_send_toks = common_tokenize(ctx, tkn.text_to_send, false); + const llama_tokens to_send_toks = common_tokenize(ctx, tkn.text_to_send, false); const size_t probs_pos = std::min(slot.n_sent_token_probs, slot.generated_token_probs.size()); const size_t probs_stop_pos = std::min(slot.n_sent_token_probs + to_send_toks.size(), 
slot.generated_token_probs.size()); @@ -1333,7 +1250,7 @@ struct server_context { {"tokens_predicted", slot.n_decoded}, {"tokens_evaluated", slot.n_prompt_tokens}, {"generation_settings", get_formated_generation(slot)}, - {"prompt", slot.prompt}, + {"prompt", common_detokenize(ctx, slot.prompt_tokens)}, {"has_new_line", slot.has_new_line}, {"truncated", slot.truncated}, {"stopped_eos", slot.stopped_eos}, @@ -1348,7 +1265,7 @@ struct server_context { if (slot.sparams.n_probs > 0) { std::vector probs; if (!slot.params.stream && slot.stopped_word) { - const std::vector stop_word_toks = common_tokenize(ctx, slot.stopping_word, false); + const llama_tokens stop_word_toks = common_tokenize(ctx, slot.stopping_word, false); size_t safe_offset = std::min(slot.generated_token_probs.size(), stop_word_toks.size()); probs = std::vector( @@ -1457,19 +1374,17 @@ struct server_context { // Functions to create new task(s) and receive result(s) // - std::vector create_tasks_cmpl(json data, server_task_cmpl_type cmpl_type) { + // break the input "prompt" into multiple tasks if needed, then format and tokenize the input prompt(s) + std::vector create_tasks_inference(json data, server_task_inf_type inf_type) { std::vector tasks; - auto create_task = [&](json & task_data, bool replace_prompt, json prompt) { + auto create_task = [&](json & task_data, llama_tokens & prompt_tokens) { + SRV_DBG("create task, n_tokens = %d\n", (int) prompt_tokens.size()); server_task task; - task.id = queue_tasks.get_new_id(); - task.cmpl_type = cmpl_type; - task.type = SERVER_TASK_TYPE_COMPLETION; - if (replace_prompt) { - task.data = task_data; - task.data["prompt"] = std::move(prompt); - } else { - task.data = std::move(task_data); - } + task.id = queue_tasks.get_new_id(); + task.inf_type = inf_type; + task.type = SERVER_TASK_TYPE_INFERENCE; + task.data = task_data; + task.prompt_tokens = std::move(prompt_tokens); tasks.push_back(std::move(task)); }; @@ -1478,41 +1393,49 @@ struct server_context { throw 
std::runtime_error(error_msg); } - json prompt = data.at("prompt"); - - // if the prompt is a singleton (i.e. a string or a list of tokens), we only need to create single task - if (prompt.is_string() || json_is_array_of_numbers(prompt)) { - data["index"] = 0; - create_task(data, false, nullptr); - } else if (prompt.is_array()) { - // otherwise, it's a multiple-prompt task, we break it into smaller tasks - std::vector prompts = prompt; - if (cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) { - // prompts[0] is the question - // the rest are the answers/documents - SRV_DBG("creating rerank tasks, n_prompts = %d\n", (int) prompts.size() - 1); - for (size_t i = 1; i < prompts.size(); i++) { - json qd; - qd.push_back(prompts[0]); - qd.push_back(prompts[i]); - data["index"] = i - 1; - create_task(data, true, qd); - } - } else { - SRV_DBG("creating multi-prompt tasks, n_prompts = %d\n", (int) prompts.size()); - for (size_t i = 0; i < prompts.size(); i++) { - const auto & e = prompts[i]; - if (e.is_string() || json_is_array_of_numbers(e)) { + // because llama_tokenize api is thread-safe, we can tokenize the prompt from HTTP thread + bool add_special = inf_type != SERVER_TASK_INF_TYPE_RERANK && inf_type != SERVER_TASK_INF_TYPE_INFILL; + std::vector tokenized_prompts = tokenize_input_prompts(ctx, data.at("prompt"), add_special, true); + switch (inf_type) { + case SERVER_TASK_INF_TYPE_RERANK: + { + // prompts[0] is the question + // the rest are the answers/documents + GGML_ASSERT(tokenized_prompts.size() > 1); + SRV_DBG("creating rerank tasks, n_prompts = %d\n", (int) tokenized_prompts.size() - 1); + for (size_t i = 1; i < tokenized_prompts.size(); i++) { + data["index"] = i - 1; + auto tokens = format_rerank(model, tokenized_prompts[0], tokenized_prompts[i]); + create_task(data, tokens); + } + } break; + case SERVER_TASK_INF_TYPE_INFILL: + { + SRV_DBG("creating infill tasks, n_prompts = %d\n", (int) tokenized_prompts.size()); + for (size_t i = 0; i < tokenized_prompts.size(); 
i++) { data["index"] = i; - create_task(data, true, e); - } else { - throw std::runtime_error(error_msg); + auto tokens = format_infill( + ctx, + data.at("input_prefix"), + data.at("input_suffix"), + data.at("input_extra"), + params.n_batch, + params.n_predict, + slots[0].n_ctx, // TODO: there should be a better way + params.spm_infill, + tokenized_prompts[i] + ); + create_task(data, tokens); + } + } break; + default: + { + SRV_DBG("creating multi-prompt tasks, n_prompts = %d\n", (int) tokenized_prompts.size()); + for (size_t i = 0; i < tokenized_prompts.size(); i++) { + data["index"] = i; + create_task(data, tokenized_prompts[i]); } } - } - } else { - // invalid case - throw std::runtime_error(error_msg); } return tasks; @@ -1534,7 +1457,7 @@ struct server_context { queue_tasks.post(cancel_tasks, true); } - // receive the results from task(s) created by create_tasks_cmpl + // receive the results from task(s) created by create_tasks_inference void receive_cmpl_results( const std::unordered_set & id_tasks, const std::function&)> & result_handler, @@ -1558,7 +1481,7 @@ struct server_context { result_handler(results); } - // receive the results from task(s) created by create_tasks_cmpl, in stream mode + // receive the results from task(s) created by create_tasks_inference, in stream mode void receive_cmpl_results_stream( const std::unordered_set & id_tasks, const std::function & result_handler, const @@ -1591,7 +1514,7 @@ struct server_context { void process_single_task(const server_task & task) { switch (task.type) { - case SERVER_TASK_TYPE_COMPLETION: + case SERVER_TASK_TYPE_INFERENCE: { const int id_slot = json_value(task.data, "id_slot", -1); @@ -1623,9 +1546,10 @@ struct server_context { slot->reset(); - slot->id_task = task.id; - slot->cmpl_type = task.cmpl_type; - slot->index = json_value(task.data, "index", 0); + slot->id_task = task.id; + slot->inf_type = task.inf_type; + slot->index = json_value(task.data, "index", 0); + slot->prompt_tokens = 
std::move(task.prompt_tokens); if (!launch_slot_with_task(*slot, task)) { SRV_ERR("failed to launch slot with task, id_task = %d\n", task.id); @@ -1658,7 +1582,7 @@ struct server_context { slot_data["id"] = slot.id; slot_data["id_task"] = slot.id_task; slot_data["state"] = slot.state; - slot_data["prompt"] = slot.prompt; + slot_data["prompt"] = common_detokenize(ctx, slot.prompt_tokens); slot_data["next_token"] = { {"has_next_token", slot.has_next_token}, {"has_new_line", slot.has_new_line}, @@ -1785,9 +1709,6 @@ struct server_context { } slot->cache_tokens.resize(token_count); - // TODO: maybe detokenize the slot->cache_tokens instead? - slot->prompt = string_format("[restored %d tokens from file]", (int) token_count); - const int64_t t_end = ggml_time_us(); const double t_restore_ms = (t_end - t_start) / 1000.0; @@ -1954,142 +1875,18 @@ struct server_context { if (params.cont_batching || batch.n_tokens == 0) { for (auto & slot : slots) { // this slot still has a prompt to be processed - if (slot.state == SLOT_STATE_PROCESSING_PROMPT) { + if (slot.state == SLOT_STATE_PROCESSING_PROMPT || slot.state == SLOT_STATE_STARTED) { auto & prompt_tokens = slot.prompt_tokens; - // we haven't tokenized the prompt yet - do it now: - if (prompt_tokens.empty()) { - SLT_INF(slot, "tokenizing prompt, len = %d\n", (int) slot.prompt.size()); - + // TODO: maybe move branch to outside of this loop in the future + if (slot.state == SLOT_STATE_STARTED) { slot.t_start_process_prompt = ggml_time_us(); slot.t_start_generation = 0; - - switch (slot.cmpl_type) { - case SERVER_TASK_CMPL_TYPE_NORMAL: - case SERVER_TASK_CMPL_TYPE_EMBEDDING: - { - prompt_tokens = tokenize(slot.prompt, llama_add_bos_token(model), true); - } break; - case SERVER_TASK_CMPL_TYPE_RERANK: - { - // require slot.prompt to be array of 2 strings - if (!slot.prompt.is_array() || slot.prompt.size() != 2) { - SLT_ERR(slot, "%s", "invalid prompt for rerank task\n"); - slot.release(); - send_error(slot, "invalid prompt for 
rerank task", ERROR_TYPE_INVALID_REQUEST); - continue; - } - - // prompt: [BOS]query[EOS][SEP]doc[EOS] - prompt_tokens.clear(); - prompt_tokens.push_back(llama_token_bos(model)); - { - const auto part = tokenize(slot.prompt[0], false, false); - prompt_tokens.insert(prompt_tokens.end(), part.begin(), part.end()); - } - prompt_tokens.push_back(llama_token_eos(model)); - prompt_tokens.push_back(llama_token_sep(model)); - { - const auto part = tokenize(slot.prompt[1], false, false); - prompt_tokens.insert(prompt_tokens.end(), part.begin(), part.end()); - } - prompt_tokens.push_back(llama_token_eos(model)); - } break; - case SERVER_TASK_CMPL_TYPE_INFILL: - { - // TODO: optimize this block by reducing memory allocations and movement - - // use FIM repo-level pattern: - // ref: https://arxiv.org/pdf/2409.12186 - // - // [FIM_REP]myproject - // [FIM_SEP]filename0 - // extra chunk 0 - // [FIM_SEP]filename1 - // extra chunk 1 - // ... - // [FIM_SEP]filename - // [FIM_PRE]prefix[FIM_SUF]suffix[FIM_MID]prompt - // - auto tokens_prefix = tokenize(slot.input_prefix, false, false); - auto tokens_suffix = tokenize(slot.input_suffix, false, false); - auto tokens_prompt = tokenize(slot.prompt, false, false); - - slot.extra_tokens.clear(); - if (llama_token_fim_rep(model) != LLAMA_TOKEN_NULL) { - static const auto k_fim_repo = tokenize("myproject\n", false, false); - - slot.extra_tokens.push_back(llama_token_fim_rep(model)); - slot.extra_tokens.insert(slot.extra_tokens.end(), k_fim_repo.begin(), k_fim_repo.end()); - } - - for (const auto & chunk : slot.input_extra) { - // { "text": string, "filename": string } - const std::string text = chunk.value("text", ""); - const std::string filename = chunk.value("filename", "tmp"); - - if (llama_token_fim_sep(model) != LLAMA_TOKEN_NULL) { - const auto k_fim_file = tokenize(filename + "\n", false, false); - - slot.extra_tokens.insert(slot.extra_tokens.end(), llama_token_fim_sep(model)); - slot.extra_tokens.insert(slot.extra_tokens.end(), 
k_fim_file.begin(), k_fim_file.end()); - } else { - // chunk separator in binary form to avoid confusing the AI - static const char k_chunk_prefix_str[] = {0x0a, 0x0a, 0x2d, 0x2d, 0x2d, 0x20, 0x73, 0x6e, 0x69, 0x70, 0x70, 0x65, 0x74, 0x20, 0x2d, 0x2d, 0x2d, 0x0a, 0x0a, 0x00}; - static const auto k_chunk_prefix_tokens = tokenize(k_chunk_prefix_str, false, false); - - slot.extra_tokens.insert(slot.extra_tokens.end(), k_chunk_prefix_tokens.begin(), k_chunk_prefix_tokens.end()); - } - - const auto chunk_tokens = tokenize(text, false, false); - slot.extra_tokens.insert(slot.extra_tokens.end(), chunk_tokens.begin(), chunk_tokens.end()); - } - - if (llama_token_fim_sep(model) != LLAMA_TOKEN_NULL) { - // TODO: current filename - static const auto k_fim_file = tokenize("filename\n", false, false); - - slot.extra_tokens.insert(slot.extra_tokens.end(), llama_token_fim_sep(model)); - slot.extra_tokens.insert(slot.extra_tokens.end(), k_fim_file.begin(), k_fim_file.end()); - } - - // for now pick FIM context to fit in a batch (ratio prefix:suffix = 3:1, TODO: configurable?) - const int n_suffix_take = std::min(tokens_suffix.size(), (n_batch/4)); - const int n_prefix_take = std::min(tokens_prefix.size(), 3*(n_batch/4) - 3); - - // fill the rest of the context with extra chunks - const int n_extra_take = std::min(std::max(0, slot.n_ctx - (n_batch) - 2*slot.n_predict), slot.extra_tokens.size()); - - tokens_prefix.erase(tokens_prefix.begin(), tokens_prefix.begin() + tokens_prefix.size() - n_prefix_take); - tokens_suffix.resize(n_suffix_take); - - tokens_prefix.insert(tokens_prefix.begin(), llama_token_fim_pre(model)); - tokens_prefix.insert(tokens_prefix.end(), tokens_prompt.begin(), tokens_prompt.end()); - tokens_suffix.insert(tokens_suffix.begin(), llama_token_fim_suf(model)); - - auto embd_inp = params.spm_infill ? tokens_suffix : tokens_prefix; - auto embd_end = params.spm_infill ? 
tokens_prefix : tokens_suffix; - - if (llama_add_bos_token(model)) { - embd_inp.insert(embd_inp.begin(), llama_token_bos(model)); - } - - SLT_DBG(slot, "extra: n_ctx = %d, n_extra_take = %d, n_extra = %d\n", slot.n_ctx, n_extra_take, (int) slot.extra_tokens.size()); - - // put the extra context before the FIM prefix - embd_inp.insert(embd_inp.begin(), slot.extra_tokens.end() - n_extra_take, slot.extra_tokens.end()); - - embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end()); - embd_inp.push_back(llama_token_fim_mid(model)); - - prompt_tokens = std::move(embd_inp); - } break; - } - slot.n_past = 0; slot.n_prompt_tokens = prompt_tokens.size(); + slot.state = SLOT_STATE_PROCESSING_PROMPT; - SLT_INF(slot, "prompt tokenized, n_ctx_slot = %d, n_keep = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, slot.n_prompt_tokens); + SLT_INF(slot, "new prompt, n_ctx_slot = %d, n_keep = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, slot.n_prompt_tokens); // print prompt tokens (for debugging) if (1) { @@ -2114,13 +1911,18 @@ struct server_context { continue; } - if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING || slot.cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) { - // this prompt is too large to process - discard it + if (slot.inf_type == SERVER_TASK_INF_TYPE_EMBEDDING || slot.inf_type == SERVER_TASK_INF_TYPE_RERANK) { if (slot.n_prompt_tokens > n_ubatch) { slot.release(); send_error(slot, "input is too large to process. increase the physical batch size", ERROR_TYPE_SERVER); continue; } + + if (slot.n_prompt_tokens > slot.n_ctx) { + slot.release(); + send_error(slot, "input is larger than the max context size. 
skipping", ERROR_TYPE_SERVER); + continue; + } } else { if (!params.ctx_shift) { // if context shift is disabled, we make sure prompt size is smaller than KV size @@ -2144,7 +1946,7 @@ struct server_context { const int n_block_size = n_left / 2; const int erased_blocks = (slot.n_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size; - std::vector new_tokens( + llama_tokens new_tokens( prompt_tokens.begin(), prompt_tokens.begin() + slot.params.n_keep); @@ -2198,7 +2000,6 @@ struct server_context { for (size_t i = 0; i < n_match; i++) { slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i]; - slot.n_past++; } @@ -2225,7 +2026,7 @@ struct server_context { } // non-causal tasks require to fit the entire prompt in the physical batch - if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING || slot.cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) { + if (slot.inf_type == SERVER_TASK_INF_TYPE_EMBEDDING || slot.inf_type == SERVER_TASK_INF_TYPE_RERANK) { // cannot fit the prompt in the current batch - will try next iter if (batch.n_tokens + slot.n_prompt_tokens > n_batch) { continue; @@ -2234,8 +2035,8 @@ struct server_context { // check that we are in the right batch_type, if not defer the slot const bool slot_type = - slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING || - slot.cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK ? 1 : 0; + slot.inf_type == SERVER_TASK_INF_TYPE_EMBEDDING || + slot.inf_type == SERVER_TASK_INF_TYPE_RERANK ? 
1 : 0; if (batch_type == -1) { batch_type = slot_type; @@ -2353,7 +2154,7 @@ struct server_context { } if (slot.state == SLOT_STATE_DONE_PROMPT) { - if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING) { + if (slot.inf_type == SERVER_TASK_INF_TYPE_EMBEDDING) { // prompt evaluated for embedding send_embedding(slot, batch_view); slot.release(); @@ -2361,7 +2162,7 @@ struct server_context { continue; // continue loop of slots } - if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) { + if (slot.inf_type == SERVER_TASK_INF_TYPE_RERANK) { send_rerank(slot, batch_view); slot.release(); slot.i_batch = -1; @@ -2608,7 +2409,7 @@ int main(int argc, char ** argv) { auto middleware_server_state = [&res_error, &state](const httplib::Request & req, httplib::Response & res) { server_state current_state = state.load(); if (current_state == SERVER_STATE_LOADING_MODEL) { - auto tmp = string_split(req.path, '.'); + auto tmp = string_split(req.path, '.'); if (req.path == "/" || tmp.back() == "html") { res.set_content(reinterpret_cast(loading_html), loading_html_len, "text/html; charset=utf-8"); res.status = 503; @@ -2915,13 +2716,13 @@ int main(int argc, char ** argv) { res_ok(res, {{ "success", true }}); }; - const auto handle_completions_generic = [&ctx_server, &res_error, &res_ok](server_task_cmpl_type cmpl_type, json & data, httplib::Response & res) { + const auto handle_completions_generic = [&ctx_server, &res_error, &res_ok](server_task_inf_type inf_type, json & data, httplib::Response & res) { if (ctx_server.params.embedding || ctx_server.params.reranking) { res_error(res, format_error_response("This server does not support completions. 
Start it without `--embeddings` or `--reranking`", ERROR_TYPE_NOT_SUPPORTED)); return; } - std::vector tasks = ctx_server.create_tasks_cmpl(data, cmpl_type); + std::vector tasks = ctx_server.create_tasks_inference(data, inf_type); ctx_server.queue_results.add_waiting_tasks(tasks); ctx_server.queue_tasks.post(tasks); @@ -2967,10 +2768,11 @@ int main(int argc, char ** argv) { const auto handle_completions = [&handle_completions_generic](const httplib::Request & req, httplib::Response & res) { json data = json::parse(req.body); - return handle_completions_generic(SERVER_TASK_CMPL_TYPE_NORMAL, data, res); + return handle_completions_generic(SERVER_TASK_INF_TYPE_COMPLETION, data, res); }; const auto handle_infill = [&ctx_server, &res_error, &handle_completions_generic](const httplib::Request & req, httplib::Response & res) { + // check model compatibility std::string err; if (llama_token_fim_pre(ctx_server.model) == LLAMA_TOKEN_NULL) { err += "prefix token is missing. "; @@ -2981,14 +2783,42 @@ int main(int argc, char ** argv) { if (llama_token_fim_mid(ctx_server.model) == LLAMA_TOKEN_NULL) { err += "middle token is missing. 
"; } - if (!err.empty()) { res_error(res, format_error_response(string_format("Infill is not supported by this model: %s", err.c_str()), ERROR_TYPE_NOT_SUPPORTED)); return; } json data = json::parse(req.body); - return handle_completions_generic(SERVER_TASK_CMPL_TYPE_INFILL, data, res); + + // validate input + if (!data.contains("input_prefix")) { + res_error(res, format_error_response("\"input_prefix\" is required", ERROR_TYPE_INVALID_REQUEST)); + } + + if (!data.contains("input_suffix")) { + res_error(res, format_error_response("\"input_suffix\" is required", ERROR_TYPE_INVALID_REQUEST)); + } + + if (data.contains("input_extra") && !data.at("input_extra").is_array()) { + res_error(res, format_error_response("\"input_extra\" must be an array of {\"filename\": string, \"text\": string}", ERROR_TYPE_INVALID_REQUEST)); + return; + } + json input_extra = json_value(data, "input_extra", json::array()); + for (const auto & chunk : input_extra) { + // { "text": string, "filename": string } + if (!chunk.contains("text") || !chunk.at("text").is_string()) { + res_error(res, format_error_response("extra_context chunk must contain a \"text\" field with a string value", ERROR_TYPE_INVALID_REQUEST)); + return; + } + // filename is optional + if (chunk.contains("filename") && !chunk.at("filename").is_string()) { + res_error(res, format_error_response("extra_context chunk's \"filename\" field must be a string", ERROR_TYPE_INVALID_REQUEST)); + return; + } + } + data["input_extra"] = input_extra; // default to empty array if it's not exist + + return handle_completions_generic(SERVER_TASK_INF_TYPE_INFILL, data, res); }; // TODO: maybe merge this function with "handle_completions_generic" @@ -3000,7 +2830,7 @@ int main(int argc, char ** argv) { json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template); - std::vector tasks = ctx_server.create_tasks_cmpl(data, SERVER_TASK_CMPL_TYPE_NORMAL); + std::vector tasks = 
ctx_server.create_tasks_inference(data, SERVER_TASK_INF_TYPE_COMPLETION); ctx_server.queue_results.add_waiting_tasks(tasks); ctx_server.queue_tasks.post(tasks); @@ -3073,7 +2903,7 @@ int main(int argc, char ** argv) { const bool add_special = json_value(body, "add_special", false); const bool with_pieces = json_value(body, "with_pieces", false); - std::vector tokens = ctx_server.tokenize(body.at("content"), add_special, true); + llama_tokens tokens = tokenize_mixed(ctx_server.ctx, body.at("content"), add_special, true); if (with_pieces) { for (const auto& token : tokens) { @@ -3110,7 +2940,7 @@ int main(int argc, char ** argv) { std::string content; if (body.count("tokens") != 0) { - const std::vector tokens = body.at("tokens"); + const llama_tokens tokens = body.at("tokens"); content = tokens_to_str(ctx_server.ctx, tokens.cbegin(), tokens.cend()); } @@ -3144,7 +2974,7 @@ int main(int argc, char ** argv) { json responses = json::array(); bool error = false; { - std::vector tasks = ctx_server.create_tasks_cmpl({{"prompt", prompt}}, SERVER_TASK_CMPL_TYPE_EMBEDDING); + std::vector tasks = ctx_server.create_tasks_inference({{"prompt", prompt}}, SERVER_TASK_INF_TYPE_EMBEDDING); ctx_server.queue_results.add_waiting_tasks(tasks); ctx_server.queue_tasks.post(tasks); @@ -3221,7 +3051,7 @@ int main(int argc, char ** argv) { json responses = json::array(); bool error = false; { - std::vector tasks = ctx_server.create_tasks_cmpl({{"prompt", prompt}}, SERVER_TASK_CMPL_TYPE_RERANK); + std::vector tasks = ctx_server.create_tasks_inference({{"prompt", prompt}}, SERVER_TASK_INF_TYPE_RERANK); ctx_server.queue_results.add_waiting_tasks(tasks); ctx_server.queue_tasks.post(tasks); diff --git a/examples/server/tests/features/infill.feature b/examples/server/tests/features/infill.feature new file mode 100644 index 0000000000000..a0bbfef77707b --- /dev/null +++ b/examples/server/tests/features/infill.feature @@ -0,0 +1,36 @@ +@llama.cpp +@infill +Feature: llama.cpp server + + # The 
current model is made by adding FIM tokens to the existing stories260K + # We may want to use a better model in the future, maybe something like SmolLM 360M + + Background: Server startup + Given a server listening on localhost:8080 + And a model file tinyllamas/stories260K-infill.gguf from HF repo ggml-org/models + And a model file test-model-infill.gguf + And a model alias tinyllama-infill + And 42 as server seed + And 1024 as batch size + And 1024 as ubatch size + And 2048 KV cache size + And 64 max tokens to predict + And 0.0 temperature + Then the server is starting + Then the server is healthy + + Scenario: Infill without input_extra + Given a prompt "Complete this" + And an infill input extra none none + And an infill input prefix "#include \n#include \"llama.h\"\n\nint main() {\n int n_threads = llama_" + And an infill input suffix "}\n" + And an infill request with no api error + Then 64 tokens are predicted matching One|day|she|saw|big|scary|bird + + Scenario: Infill with input_extra + Given a prompt "Complete this" + And an infill input extra "llama.h" "LLAMA_API int32_t llama_n_threads();\n" + And an infill input prefix "#include \n#include \"llama.h\"\n\nint main() {\n int n_threads = llama_" + And an infill input suffix "}\n" + And an infill request with no api error + Then 64 tokens are predicted matching cuts|Jimmy|mom|came|into|the|room" diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py index 540a2ecd56374..2e418d8aa571b 100644 --- a/examples/server/tests/features/steps/steps.py +++ b/examples/server/tests/features/steps/steps.py @@ -80,6 +80,11 @@ def step_server_config(context, server_fqdn: str, server_port: str): context.lora_file = None context.disable_ctx_shift = False + # infill + context.infill_input_extra = None + context.infill_input_suffix = '' + context.infill_input_prefix = '' + context.tasks_result = [] context.concurrent_tasks = [] context.prompts = [] @@ -291,6 +296,28 @@ async 
def step_request_completion(context, api_error: Literal['raised'] | str): assert completion == api_error_code, f"completion must be an {api_error_code} status code: {completion}" +@step('an infill request with {api_error} api error') +@async_run_until_complete +async def step_request_completion(context, api_error: Literal['raised'] | str): + if api_error != 'no': + raise ValueError(f'api_error={api_error} is not yet implemented') + payload = { + "prompt": context.prompts[0], + "input_suffix": context.infill_input_suffix, + "input_prefix": context.infill_input_prefix, + "n_predict": context.n_predict, + "seed": context.seed, + "temperature": context.temperature, + } + if context.infill_input_extra is not None: + payload['input_extra'] = context.infill_input_extra + async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session: + async with session.post(f'{context.base_url}/infill', + json=payload) as response: + assert response.status == 200 + context.tasks_result = [await response.json()] + + @step('{predicted_n:d} tokens are predicted matching {re_content}') def step_n_tokens_predicted_with_content(context, predicted_n, re_content): context.completion = context.tasks_result.pop() @@ -539,6 +566,25 @@ def step_a_prompt_prompt(context, prompt): context.n_prompts = len(context.prompts) +# TODO: allow this to be repeated +@step('an infill input extra {filename} {text}') +def step_infill_input_extra(context, filename, text): + if filename == 'none': + context.infill_input_extra = None + else: + context.infill_input_extra = [{'filename': filename, 'text': text}] + + +@step('an infill input suffix {text}') +def step_infill_input_suffix(context, text): + context.infill_input_suffix = text + + +@step('an infill input prefix {text}') +def step_infill_input_prefix(context, text): + context.infill_input_prefix = text + + @step('{num_prompts:d} prompts {prompt} with seed {seed:d}') def step_many_prompts(context, num_prompts, prompt, seed): if context.seed is 
None: diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 69519ef95b2d9..8112420624185 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -24,6 +24,22 @@ #define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613" using json = nlohmann::ordered_json; +using llama_tokens = std::vector; + +#define SLT_INF(slot, fmt, ...) LOG_INF("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__) +#define SLT_WRN(slot, fmt, ...) LOG_WRN("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__) +#define SLT_ERR(slot, fmt, ...) LOG_ERR("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__) +#define SLT_DBG(slot, fmt, ...) LOG_DBG("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__) + +#define SRV_INF(fmt, ...) LOG_INF("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) +#define SRV_WRN(fmt, ...) LOG_WRN("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) +#define SRV_ERR(fmt, ...) LOG_ERR("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) +#define SRV_DBG(fmt, ...) LOG_DBG("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) + +#define QUE_INF(fmt, ...) LOG_INF("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) +#define QUE_WRN(fmt, ...) LOG_WRN("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) +#define QUE_ERR(fmt, ...) LOG_ERR("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) +#define QUE_DBG(fmt, ...) 
LOG_DBG("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) // https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11 enum error_type { @@ -52,9 +68,235 @@ static T json_value(const json & body, const std::string & key, const T & defaul } // -// chat template utils +// tokenizer and input processing utils // +static bool json_is_array_of_numbers(const json & data) { + if (data.is_array()) { + for (const auto & e : data) { + if (!e.is_number_integer()) { + return false; + } + } + return true; + } + return false; +} + +// is array having BOTH numbers & strings? +static bool json_is_array_of_mixed_numbers_strings(const json & data) { + bool seen_string = false; + bool seen_number = false; + if (data.is_array()) { + for (const auto & e : data) { + seen_string |= e.is_string(); + seen_number |= e.is_number_integer(); + if (seen_number && seen_string) { + return true; + } + } + } + return false; +} + +/** + * this handles 2 cases: + * - only string, example: "string" + * - mixed string and tokens, example: [12, 34, "string", 56, 78] + */ +static llama_tokens tokenize_mixed(const llama_context * ctx, const json & json_prompt, bool add_special, bool parse_special) { + // If `add_bos` is true, we only add BOS, when json_prompt is a string, + // or the first element of the json_prompt array is a string. 
+ llama_tokens prompt_tokens; + + if (json_prompt.is_array()) { + bool first = true; + for (const auto & p : json_prompt) { + if (p.is_string()) { + auto s = p.template get(); + + llama_tokens p; + if (first) { + p = common_tokenize(ctx, s, add_special, parse_special); + first = false; + } else { + p = common_tokenize(ctx, s, false, parse_special); + } + + prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end()); + } else { + if (first) { + first = false; + } + + prompt_tokens.push_back(p.template get()); + } + } + } else { + auto s = json_prompt.template get(); + prompt_tokens = common_tokenize(ctx, s, add_special, parse_special); + } + + return prompt_tokens; +} + +/** + * break the input "prompt" object into multiple prompt if needed, then tokenize them + * this supports these cases: + * - "prompt": "string" + * - "prompt": [12, 34, 56] + * - "prompt": [12, 34, "string", 56, 78] + * and multiple prompts (multi-tasks): + * - "prompt": ["string1", "string2"] + * - "prompt": ["string1", [12, 34, 56]] + * - "prompt": [[12, 34, "string", 56, 78], [12, 34, 56]] + */ +static std::vector tokenize_input_prompts(llama_context * ctx, const json & json_prompt, bool add_special, bool parse_special) { + std::vector result; + if (json_prompt.is_string() || json_is_array_of_mixed_numbers_strings(json_prompt)) { + // string or mixed + result.push_back(tokenize_mixed(ctx, json_prompt, add_special, parse_special)); + } else if (json_is_array_of_numbers(json_prompt)) { + // array of tokens + result.push_back(json_prompt.get()); + } else if (json_prompt.is_array()) { + // array of prompts + result.reserve(json_prompt.size()); + for (const auto & p : json_prompt) { + if (p.is_string() || json_is_array_of_mixed_numbers_strings(p)) { + result.push_back(tokenize_mixed(ctx, p, add_special, parse_special)); + } else if (json_is_array_of_numbers(p)) { + // array of tokens + result.push_back(p.get()); + } else { + throw std::runtime_error("element of \"prompt\" must be a string, an 
list of tokens, or a list of mixed strings & tokens"); + } + } + } else { + throw std::runtime_error("\"prompt\" must be a string, an list of tokens, a list of mixed strings & tokens, or a list of prompts"); + } + return result; +} + +// +// template utils +// + +// format rerank task: [BOS]query[EOS][SEP]doc[EOS] +static llama_tokens format_rerank(const struct llama_model * model, const llama_tokens & query, const llama_tokens & doc) { + llama_tokens result; + result.reserve(doc.size() + query.size() + 4); + result.push_back(llama_token_bos(model)); + result.insert(result.end(), query.begin(), query.end()); + result.push_back(llama_token_eos(model)); + result.push_back(llama_token_sep(model)); + result.insert(result.end(), doc.begin(), doc.end()); + result.push_back(llama_token_eos(model)); + return result; +} + +// format infill task +static llama_tokens format_infill( + const llama_context * ctx, + const json & input_prefix, + const json & input_suffix, + const json & input_extra, + const int n_batch, + const int n_predict, + const int n_ctx, + const bool spm_infill, + const llama_tokens & tokens_prompt + ) { + // TODO: optimize this block by reducing memory allocations and movement + + // use FIM repo-level pattern: + // ref: https://arxiv.org/pdf/2409.12186 + // + // [FIM_REP]myproject + // [FIM_SEP]filename0 + // extra chunk 0 + // [FIM_SEP]filename1 + // extra chunk 1 + // ... 
+ // [FIM_SEP]filename + // [FIM_PRE]prefix[FIM_SUF]suffix[FIM_MID]prompt + // + llama_tokens extra_tokens; + extra_tokens.reserve(n_ctx); + + auto model = llama_get_model(ctx); + auto tokens_prefix = tokenize_mixed(ctx, input_prefix, false, false); + auto tokens_suffix = tokenize_mixed(ctx, input_suffix, false, false); + + if (llama_token_fim_rep(model) != LLAMA_TOKEN_NULL) { + // TODO: make project name an input + static const auto k_fim_repo = common_tokenize(ctx, "myproject\n", false, false); + + extra_tokens.push_back(llama_token_fim_rep(model)); + extra_tokens.insert(extra_tokens.end(), k_fim_repo.begin(), k_fim_repo.end()); + } + for (const auto & chunk : input_extra) { + // { "text": string, "filename": string } + const std::string text = json_value(chunk, "text", std::string()); + const std::string filename = json_value(chunk, "filename", std::string("tmp")); + + if (llama_token_fim_sep(model) != LLAMA_TOKEN_NULL) { + const auto k_fim_file = common_tokenize(ctx, filename + "\n", false, false); + + extra_tokens.insert(extra_tokens.end(), llama_token_fim_sep(model)); + extra_tokens.insert(extra_tokens.end(), k_fim_file.begin(), k_fim_file.end()); + } else { + // chunk separator in binary form to avoid confusing the AI + static const char k_chunk_prefix_str[] = {0x0a, 0x0a, 0x2d, 0x2d, 0x2d, 0x20, 0x73, 0x6e, 0x69, 0x70, 0x70, 0x65, 0x74, 0x20, 0x2d, 0x2d, 0x2d, 0x0a, 0x0a, 0x00}; + static const auto k_chunk_prefix_tokens = common_tokenize(ctx, k_chunk_prefix_str, false, false); + + extra_tokens.insert(extra_tokens.end(), k_chunk_prefix_tokens.begin(), k_chunk_prefix_tokens.end()); + } + + const auto chunk_tokens = common_tokenize(ctx, text, false, false); + extra_tokens.insert(extra_tokens.end(), chunk_tokens.begin(), chunk_tokens.end()); + } + + if (llama_token_fim_sep(model) != LLAMA_TOKEN_NULL) { + // TODO: current filename + static const auto k_fim_file = common_tokenize(ctx, "filename\n", false, false); + + extra_tokens.insert(extra_tokens.end(), 
llama_token_fim_sep(model)); + extra_tokens.insert(extra_tokens.end(), k_fim_file.begin(), k_fim_file.end()); + } + + // for now pick FIM context to fit in a batch (ratio prefix:suffix = 3:1, TODO: configurable?) + const int n_suffix_take = std::min(tokens_suffix.size(), (n_batch/4)); + const int n_prefix_take = std::min(tokens_prefix.size(), 3*(n_batch/4) - 3); + + // fill the rest of the context with extra chunks + const int n_extra_take = std::min(std::max(0, n_ctx - (n_batch) - 2*n_predict), extra_tokens.size()); + + tokens_prefix.erase(tokens_prefix.begin(), tokens_prefix.begin() + tokens_prefix.size() - n_prefix_take); + tokens_suffix.resize(n_suffix_take); + + tokens_prefix.insert(tokens_prefix.begin(), llama_token_fim_pre(model)); + tokens_prefix.insert(tokens_prefix.end(), tokens_prompt.begin(), tokens_prompt.end()); + tokens_suffix.insert(tokens_suffix.begin(), llama_token_fim_suf(model)); + + auto embd_inp = spm_infill ? tokens_suffix : tokens_prefix; + auto embd_end = spm_infill ? tokens_prefix : tokens_suffix; + + if (llama_add_bos_token(model)) { + embd_inp.insert(embd_inp.begin(), llama_token_bos(model)); + } + + SRV_DBG("extra: n_ctx = %d, n_extra_take = %d, n_extra = %d\n", n_ctx, n_extra_take, (int) extra_tokens.size()); + + // put the extra context before the FIM prefix + embd_inp.insert(embd_inp.begin(), extra_tokens.end() - n_extra_take, extra_tokens.end()); + + embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end()); + embd_inp.push_back(llama_token_fim_mid(model)); + + return embd_inp; +} + // Format given chat. 
If tmpl is empty, we take the template from model metadata inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector & messages) { std::vector chat; @@ -229,18 +471,6 @@ static size_t find_partial_stop_string(const std::string &stop, const std::strin return std::string::npos; } -static bool json_is_array_of_numbers(const json & data) { - if (data.is_array()) { - for (const auto & e : data) { - if (!e.is_number()) { - return false; - } - } - return true; - } - return false; -} - // TODO: reuse llama_detokenize template static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) { diff --git a/ggml/src/ggml-cuda/cpy.cuh b/ggml/src/ggml-cuda/cpy.cuh index 7961674266ee1..28b06cddaa87b 100644 --- a/ggml/src/ggml-cuda/cpy.cuh +++ b/ggml/src/ggml-cuda/cpy.cuh @@ -1,6 +1,6 @@ #include "common.cuh" -#define CUDA_CPY_BLOCK_SIZE 32 +#define CUDA_CPY_BLOCK_SIZE 64 void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1); diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m index e9541441c8f54..80c08f15b2999 100644 --- a/ggml/src/ggml-metal.m +++ b/ggml/src/ggml-metal.m @@ -1015,19 +1015,21 @@ static void ggml_metal_encode_node( id id_src2 = src2 ? ggml_metal_get_buffer(src2, &offs_src2) : nil; id id_dst = dst ? 
ggml_metal_get_buffer(dst, &offs_dst) : nil; - //GGML_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op)); - //if (src0) { - // GGML_LOG_INFO("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02, - // ggml_is_contiguous(src0), src0->name); - //} - //if (src1) { - // GGML_LOG_INFO("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12, - // ggml_is_contiguous(src1), src1->name); - //} - //if (dst) { - // GGML_LOG_INFO("%s: dst - %4s [%5lld, %5lld, %5lld], 1, %s\n", __func__, ggml_type_name(dstt), ne0, ne1, ne2, - // dst->name); - //} +#if 0 + GGML_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op)); + if (src0) { + GGML_LOG_INFO("%s: src0 - %4s [%5lld, %5lld, %5lld, %5lld] [%5lld, %5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03, + ggml_is_contiguous(src0), src0->name); + } + if (src1) { + GGML_LOG_INFO("%s: src1 - %4s [%5lld, %5lld, %5lld, %5lld] [%5lld, %5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12, ne13, nb10, nb11, nb12, nb13, + ggml_is_contiguous(src1), src1->name); + } + if (dst) { + GGML_LOG_INFO("%s: dst - %4s [%5lld, %5lld, %5lld, %5lld] [%5lld, %5lld, %5lld, %5lld], 1, %s\n", __func__, ggml_type_name(dstt), ne0, ne1, ne2, ne3, nb0, nb1, nb2, nb3, + dst->name); + } +#endif id device = ctx_dev->mtl_device; @@ -1810,14 +1812,16 @@ static void ggml_metal_encode_node( [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:5]; [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:6]; - [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:7]; - [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:8]; - [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:9]; - [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:10]; - [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:11]; - [encoder setBytes:&ne1 
length:sizeof(ne1) atIndex:12]; - [encoder setBytes:&r2 length:sizeof(r2) atIndex:13]; - [encoder setBytes:&r3 length:sizeof(r3) atIndex:14]; + [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:7]; + [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:8]; + [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:9]; + [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:10]; + [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:11]; + [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:12]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14]; + [encoder setBytes:&r2 length:sizeof(r2) atIndex:15]; + [encoder setBytes:&r3 length:sizeof(r3) atIndex:16]; [encoder setThreadgroupMemoryLength:8192 atIndex:0]; [encoder dispatchThreadgroups:MTLSizeMake( (ne11 + 31)/32, (ne01 + 63)/64, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)]; } else { @@ -1986,20 +1990,22 @@ static void ggml_metal_encode_node( [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6]; [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7]; [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8]; - [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:9]; - [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:10]; - [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:11]; - [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:12]; - [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:13]; - [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:14]; - [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:15]; - [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:16]; - [encoder setBytes:&r2 length:sizeof(r2) atIndex:17]; - [encoder setBytes:&r3 length:sizeof(r3) atIndex:18]; + [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9]; + [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:10]; + [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:11]; + [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:12]; + [encoder setBytes:&nb10 length:sizeof(nb10) 
atIndex:13]; + [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:14]; + [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:15]; + [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:16]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:17]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:18]; + [encoder setBytes:&r2 length:sizeof(r2) atIndex:19]; + [encoder setBytes:&r3 length:sizeof(r3) atIndex:20]; if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q5_0 || - src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 || src0t == GGML_TYPE_Q2_K || - src0t == GGML_TYPE_IQ1_S || src0t == GGML_TYPE_IQ1_M || src0t == GGML_TYPE_IQ2_S) { + src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 || src0t == GGML_TYPE_Q2_K || + src0t == GGML_TYPE_IQ1_S || src0t == GGML_TYPE_IQ1_M || src0t == GGML_TYPE_IQ2_S) { [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) { @@ -2048,6 +2054,9 @@ static void ggml_metal_encode_node( GGML_ASSERT(src1t == GGML_TYPE_F32); + GGML_ASSERT(ne03 == 1); + GGML_ASSERT(ne13 == 1); + // find the break-even point where the matrix-matrix kernel becomes more efficient compared // to the matrix-vector kernel // ne20 = n_used_experts diff --git a/ggml/src/ggml-metal.metal b/ggml/src/ggml-metal.metal index 71b58be1fd8a4..defde6246f129 100644 --- a/ggml/src/ggml-metal.metal +++ b/ggml/src/ggml-metal.metal @@ -777,10 +777,10 @@ kernel void kernel_ssm_conv_f32( const int64_t i3 = tgpig.z; const int64_t nc = ne10; - const int64_t ncs = ne00; - const int64_t nr = ne01; - const int64_t n_t = ne1; - const int64_t n_s = ne2; + //const int64_t ncs = ne00; + //const int64_t nr = ne01; + //const int64_t n_t = ne1; + //const int64_t n_s = ne2; device const float * s = (device const float *) ((device const char *) src0 + ir*nb01 + i2*nb00 + i3*nb02); device const float * c = (device const float *) ((device 
const char *) src1 + ir*nb11); @@ -834,9 +834,9 @@ kernel void kernel_ssm_scan_f32( const int64_t i3 = tgpig.y; const int64_t nc = d_state; - const int64_t nr = d_inner; + //const int64_t nr = d_inner; const int64_t n_t = n_seq_tokens; - const int64_t n_s = n_seqs; + //const int64_t n_s = n_seqs; for (int64_t i2 = 0; i2 < n_t; ++i2) { device const float * s0 = (device const float *) ((device const char *) src0 + ir*nb01 + i3*nb02); @@ -1064,17 +1064,18 @@ kernel void kernel_group_norm( inline float block_q_n_dot_y(device const block_q4_0 * qb_curr, float sumy, thread float * yl, int il) { float d = qb_curr->d; - float2 acc = 0.f; + float acc[4] = { 0.0f, 0.0f, 0.0f, 0.0f }; - device const uint16_t * qs = ((device const uint16_t *)qb_curr + 1 + il/2); + device const uint16_t * qs = ((device const uint16_t *) qb_curr + 1 + il/2); - for (int i = 0; i < 8; i+=2) { - acc[0] += yl[i + 0] * (qs[i / 2] & 0x000F) - + yl[i + 1] * (qs[i / 2] & 0x0F00); - acc[1] += yl[i + 8] * (qs[i / 2] & 0x00F0) - + yl[i + 9] * (qs[i / 2] & 0xF000); + for (int i = 0; i < 8; i += 2) { + acc[0] += yl[i + 0] * (qs[i / 2] & 0x000F); + acc[1] += yl[i + 1] * (qs[i / 2] & 0x0F00); + acc[2] += yl[i + 8] * (qs[i / 2] & 0x00F0); + acc[3] += yl[i + 9] * (qs[i / 2] & 0xF000); } - return d * (sumy * -8.f + acc[0] + acc[1]); + + return d * (sumy * -8.f + acc[0] + acc[1] + acc[2] + acc[3]); } // function for calculate inner product between half a q4_1 block and 16 floats (yl), sumy is SUM(yl[i]) @@ -1085,17 +1086,18 @@ inline float block_q_n_dot_y(device const block_q4_1 * qb_curr, float sumy, thre float d = qb_curr->d; float m = qb_curr->m; - float2 acc = 0.f; + float acc[4] = { 0.0f, 0.0f, 0.0f, 0.0f }; - device const uint16_t * qs = ((device const uint16_t *)qb_curr + 2 + il/2); + device const uint16_t * qs = ((device const uint16_t *) qb_curr + 2 + il/2); for (int i = 0; i < 8; i+=2) { - acc[0] += yl[i + 0] * (qs[i / 2] & 0x000F) - + yl[i + 1] * (qs[i / 2] & 0x0F00); - acc[1] += yl[i + 8] * (qs[i / 2] 
& 0x00F0) - + yl[i + 9] * (qs[i / 2] & 0xF000); + acc[0] += yl[i + 0] * (qs[i / 2] & 0x000F); + acc[1] += yl[i + 1] * (qs[i / 2] & 0x0F00); + acc[2] += yl[i + 8] * (qs[i / 2] & 0x00F0); + acc[3] += yl[i + 9] * (qs[i / 2] & 0xF000); } - return d * (acc[0] + acc[1]) + sumy * m; + + return d * (acc[0] + acc[1] + acc[2] + acc[3]) + sumy * m; } // function for calculate inner product between half a q5_0 block and 16 floats (yl), sumy is SUM(yl[i]) @@ -1105,18 +1107,19 @@ inline float block_q_n_dot_y(device const block_q4_1 * qb_curr, float sumy, thre inline float block_q_n_dot_y(device const block_q5_0 * qb_curr, float sumy, thread float * yl, int il) { float d = qb_curr->d; - float2 acc = 0.f; + float acc[4] = { 0.0f, 0.0f, 0.0f, 0.0f }; device const uint16_t * qs = ((device const uint16_t *)qb_curr + 3 + il/2); const uint32_t qh = *((device const uint32_t *)qb_curr->qh); for (int i = 0; i < 8; i+=2) { - acc[0] += yl[i + 0] * ((qs[i / 2] & 0x000F) | ((qh >> (i+0+il ) << 4 ) & 0x00010)) - + yl[i + 1] * ((qs[i / 2] & 0x0F00) | ((qh >> (i+1+il ) << 12) & 0x01000)); - acc[1] += yl[i + 8] * ((qs[i / 2] & 0x00F0) | ((qh >> (i+0+il+QK5_0/2) << 8 ) & 0x00100)) - + yl[i + 9] * ((qs[i / 2] & 0xF000) | ((qh >> (i+1+il+QK5_0/2) << 16) & 0x10000)); + acc[0] += yl[i + 0] * ((qs[i / 2] & 0x000F) | ((qh >> (i+0+il ) << 4 ) & 0x00010)); + acc[1] += yl[i + 1] * ((qs[i / 2] & 0x0F00) | ((qh >> (i+1+il ) << 12) & 0x01000)); + acc[2] += yl[i + 8] * ((qs[i / 2] & 0x00F0) | ((qh >> (i+0+il+QK5_0/2) << 8 ) & 0x00100)); + acc[3] += yl[i + 9] * ((qs[i / 2] & 0xF000) | ((qh >> (i+1+il+QK5_0/2) << 16) & 0x10000)); } - return d * (sumy * -16.f + acc[0] + acc[1]); + + return d * (sumy * -16.f + acc[0] + acc[1] + acc[2] + acc[3]); } // function for calculate inner product between half a q5_1 block and 16 floats (yl), sumy is SUM(yl[i]) @@ -1127,18 +1130,19 @@ inline float block_q_n_dot_y(device const block_q5_1 * qb_curr, float sumy, thre float d = qb_curr->d; float m = qb_curr->m; - float2 acc = 
0.f; + float acc[4] = { 0.0f, 0.0f, 0.0f, 0.0f }; device const uint16_t * qs = ((device const uint16_t *)qb_curr + 4 + il/2); const uint32_t qh = *((device const uint32_t *)qb_curr->qh); for (int i = 0; i < 8; i+=2) { - acc[0] += yl[i + 0] * ((qs[i / 2] & 0x000F) | ((qh >> (i+0+il ) << 4 ) & 0x00010)) - + yl[i + 1] * ((qs[i / 2] & 0x0F00) | ((qh >> (i+1+il ) << 12) & 0x01000)); - acc[1] += yl[i + 8] * ((qs[i / 2] & 0x00F0) | ((qh >> (i+0+il+QK5_0/2) << 8 ) & 0x00100)) - + yl[i + 9] * ((qs[i / 2] & 0xF000) | ((qh >> (i+1+il+QK5_0/2) << 16) & 0x10000)); + acc[0] += yl[i + 0] * ((qs[i / 2] & 0x000F) | ((qh >> (i+0+il ) << 4 ) & 0x00010)); + acc[1] += yl[i + 1] * ((qs[i / 2] & 0x0F00) | ((qh >> (i+1+il ) << 12) & 0x01000)); + acc[2] += yl[i + 8] * ((qs[i / 2] & 0x00F0) | ((qh >> (i+0+il+QK5_0/2) << 8 ) & 0x00100)); + acc[3] += yl[i + 9] * ((qs[i / 2] & 0xF000) | ((qh >> (i+1+il+QK5_0/2) << 16) & 0x10000)); } - return d * (acc[0] + acc[1]) + sumy * m; + + return d * (acc[0] + acc[1] + acc[2] + acc[3]) + sumy * m; } // putting them in the kernel cause a significant performance penalty @@ -1156,14 +1160,22 @@ void mul_vec_q_n_f32_impl( int64_t ne00, int64_t ne01, int64_t ne02, + uint64_t nb01, + uint64_t nb02, + uint64_t nb03, int64_t ne10, int64_t ne12, + uint64_t nb11, + uint64_t nb12, + uint64_t nb13, int64_t ne0, int64_t ne1, uint r2, uint r3, threadgroup int8_t * shared_values, - uint3 tgpig, uint tiisg, uint sgitg) { + uint3 tgpig, + uint tiisg, + uint sgitg) { const int nb = ne00/QK4_0; const int r0 = tgpig.x; @@ -1175,10 +1187,19 @@ void mul_vec_q_n_f32_impl( const uint i12 = im%ne12; const uint i13 = im/ne12; - const uint offset0 = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + //const uint offset0 = first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03; + const uint offset1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13; - device const block_q_type * x = (device const block_q_type *) src0 + offset0; - device const float * y = (device const float *) src1 + 
r1*ne10 + im*ne00*ne1; + //device const block_q_type * x = (device const block_q_type *) ((device char *) src0 + offset0); + device const float * y = (device const float *) ((device char *) src1 + offset1); + + // pointers to src0 rows + device const block_q_type * ax[nr]; + for (int row = 0; row < nr; ++row) { + const uint offset0 = (first_row + row)*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03; + + ax[row] = (device const block_q_type *) ((device char *) src0 + offset0); + } float yl[16]; // src1 vector cache float sumf[nr] = {0.f}; @@ -1190,19 +1211,22 @@ void mul_vec_q_n_f32_impl( // each thread in a SIMD group deals with half a block. for (int ib = ix; ib < nb; ib += nw/2) { - float sumy = 0; + float sumy[2] = { 0.f, 0.f }; + +#pragma unroll for (int i = 0; i < 8; i += 2) { - sumy += yb[i] + yb[i+1]; - yl[i+0] = yb[i+ 0]; - yl[i+1] = yb[i+ 1]/256.f; + sumy[0] += yb[i + 0] + yb[i + 1]; + yl[i + 0] = yb[i + 0]; + yl[i + 1] = yb[i + 1]/256.f; - sumy += yb[i+16] + yb[i+17]; - yl[i+8] = yb[i+16]/16.f; - yl[i+9] = yb[i+17]/4096.f; + sumy[1] += yb[i + 16] + yb[i + 17]; + yl[i + 8] = yb[i + 16]/16.f; + yl[i + 9] = yb[i + 17]/4096.f; } +#pragma unroll for (int row = 0; row < nr; row++) { - sumf[row] += block_q_n_dot_y(x+ib+row*nb, sumy, yl, il); + sumf[row] += block_q_n_dot_y(ax[row] + ib, sumy[0] + sumy[1], yl, il); } yb += QK4_0 * 16; @@ -1226,12 +1250,14 @@ kernel void kernel_mul_mv_q4_0_f32( constant uint64_t & nb00, constant uint64_t & nb01, constant uint64_t & nb02, + constant uint64_t & nb03, constant int64_t & ne10, constant int64_t & ne11, constant int64_t & ne12, constant uint64_t & nb10, constant uint64_t & nb11, constant uint64_t & nb12, + constant uint64_t & nb13, constant int64_t & ne0, constant int64_t & ne1, constant uint & r2, @@ -1239,7 +1265,7 @@ kernel void kernel_mul_mv_q4_0_f32( uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { - 
mul_vec_q_n_f32_impl(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,nullptr,tgpig,tiisg,sgitg); + mul_vec_q_n_f32_impl(src0,src1,dst,ne00,ne01,ne02,nb01,nb02,nb03,ne10,ne12,nb11,nb12,nb13,ne0,ne1,r2,r3,nullptr,tgpig,tiisg,sgitg); } kernel void kernel_mul_mv_q4_1_f32( @@ -1252,12 +1278,14 @@ kernel void kernel_mul_mv_q4_1_f32( constant uint64_t & nb00, constant uint64_t & nb01, constant uint64_t & nb02, + constant uint64_t & nb03, constant int64_t & ne10, constant int64_t & ne11, constant int64_t & ne12, constant uint64_t & nb10, constant uint64_t & nb11, constant uint64_t & nb12, + constant uint64_t & nb13, constant int64_t & ne0, constant int64_t & ne1, constant uint & r2, @@ -1265,7 +1293,7 @@ kernel void kernel_mul_mv_q4_1_f32( uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { - mul_vec_q_n_f32_impl(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,nullptr,tgpig,tiisg,sgitg); + mul_vec_q_n_f32_impl(src0,src1,dst,ne00,ne01,ne02,nb01,nb02,nb03,ne10,ne12,nb11,nb12,nb13,ne0,ne1,r2,r3,nullptr,tgpig,tiisg,sgitg); } kernel void kernel_mul_mv_q5_0_f32( @@ -1278,12 +1306,14 @@ kernel void kernel_mul_mv_q5_0_f32( constant uint64_t & nb00, constant uint64_t & nb01, constant uint64_t & nb02, + constant uint64_t & nb03, constant int64_t & ne10, constant int64_t & ne11, constant int64_t & ne12, constant uint64_t & nb10, constant uint64_t & nb11, constant uint64_t & nb12, + constant uint64_t & nb13, constant int64_t & ne0, constant int64_t & ne1, constant uint & r2, @@ -1291,7 +1321,7 @@ kernel void kernel_mul_mv_q5_0_f32( uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { - mul_vec_q_n_f32_impl(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,nullptr,tgpig,tiisg,sgitg); + 
mul_vec_q_n_f32_impl(src0,src1,dst,ne00,ne01,ne02,nb01,nb02,nb03,ne10,ne12,nb11,nb12,nb13,ne0,ne1,r2,r3,nullptr,tgpig,tiisg,sgitg); } kernel void kernel_mul_mv_q5_1_f32( @@ -1304,12 +1334,14 @@ kernel void kernel_mul_mv_q5_1_f32( constant uint64_t & nb00, constant uint64_t & nb01, constant uint64_t & nb02, + constant uint64_t & nb03, constant int64_t & ne10, constant int64_t & ne11, constant int64_t & ne12, constant uint64_t & nb10, constant uint64_t & nb11, constant uint64_t & nb12, + constant uint64_t & nb13, constant int64_t & ne0, constant int64_t & ne1, constant uint & r2, @@ -1317,7 +1349,7 @@ kernel void kernel_mul_mv_q5_1_f32( uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { - mul_vec_q_n_f32_impl(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,nullptr,tgpig,tiisg,sgitg); + mul_vec_q_n_f32_impl(src0,src1,dst,ne00,ne01,ne02,nb01,nb02,nb03,ne10,ne12,nb11,nb12,nb13,ne0,ne1,r2,r3,nullptr,tgpig,tiisg,sgitg); } @@ -1330,8 +1362,14 @@ void kernel_mul_mv_q8_0_f32_impl( int64_t ne00, int64_t ne01, int64_t ne02, + uint64_t nb01, + uint64_t nb02, + uint64_t nb03, int64_t ne10, int64_t ne12, + uint64_t nb11, + uint64_t nb12, + uint64_t nb13, int64_t ne0, int64_t ne1, uint r2, @@ -1354,10 +1392,19 @@ void kernel_mul_mv_q8_0_f32_impl( const uint i12 = im%ne12; const uint i13 = im/ne12; - const uint offset0 = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + //const uint offset0 = first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03; + const uint offset1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13; - device const block_q8_0 * x = (device const block_q8_0 *) src0 + offset0; - device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; + //device const block_q8_0 * x = (device const block_q8_0 *) ((device char *) src0 + offset0); + device const float * y = (device const float *) ((device char *) src1 + offset1); + + // pointers to src0 rows + device const 
block_q8_0 * ax[nr]; + for (int row = 0; row < nr; ++row) { + const uint offset0 = (first_row + row)*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03; + + ax[row] = (device const block_q8_0 *) ((device char *) src0 + offset0); + } float yl[NB_Q8_0]; float sumf[nr]={0.f}; @@ -1374,12 +1421,12 @@ void kernel_mul_mv_q8_0_f32_impl( } for (int row = 0; row < nr; row++) { - device const int8_t * qs = x[ib+row*nb].qs + NB_Q8_0*il; + device const int8_t * qs = ax[row][ib].qs + NB_Q8_0*il; float sumq = 0.f; for (int iq = 0; iq < NB_Q8_0; ++iq) { sumq += qs[iq] * yl[iq]; } - sumf[row] += sumq*x[ib+row*nb].d; + sumf[row] += sumq*ax[row][ib].d; } yb += NB_Q8_0 * nw; @@ -1404,12 +1451,14 @@ kernel void kernel_mul_mv_q8_0_f32( constant uint64_t & nb00, constant uint64_t & nb01, constant uint64_t & nb02, + constant uint64_t & nb03, constant int64_t & ne10, constant int64_t & ne11, constant int64_t & ne12, constant uint64_t & nb10, constant uint64_t & nb11, constant uint64_t & nb12, + constant uint64_t & nb13, constant int64_t & ne0, constant int64_t & ne1, constant uint & r2, @@ -1417,7 +1466,7 @@ kernel void kernel_mul_mv_q8_0_f32( uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { - kernel_mul_mv_q8_0_f32_impl(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,nullptr,tgpig,tiisg,sgitg); + kernel_mul_mv_q8_0_f32_impl(src0,src1,dst,ne00,ne01,ne02,nb01,nb02,nb03,ne10,ne12,nb11,nb12,nb13,ne0,ne1,r2,r3,nullptr,tgpig,tiisg,sgitg); } #define N_MV_T_T 4 @@ -1433,12 +1482,14 @@ void kernel_mul_mv_impl( uint64_t nb00, uint64_t nb01, uint64_t nb02, + uint64_t nb03, int64_t ne10, int64_t ne11, int64_t ne12, uint64_t nb10, uint64_t nb11, uint64_t nb12, + uint64_t nb13, int64_t ne0, int64_t ne1, uint r2, @@ -1452,7 +1503,7 @@ void kernel_mul_mv_impl( const uint i12 = im%ne12; const uint i13 = im/ne12; - const uint offset0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb02*ne02; + const uint offset0 = r0*nb01 + (i12/r2)*nb02 + 
(i13/r3)*nb03; device const T0 * x = (device const T0 *) (src0 + offset0); @@ -1463,7 +1514,9 @@ void kernel_mul_mv_impl( break; } - device const T1 * y = (device const T1 *) (src1 + r1*nb11 + im*nb12); + const uint offset1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13; + + device const T1 * y = (device const T1 *) (src1 + offset1); float sumf = 0; for (int i = tiisg; i < ne00; i += 32) { @@ -1483,7 +1536,9 @@ void kernel_mul_mv_impl( break; } - device const T1 * y = (device const T1 *) (src1 + r1*nb11 + im*nb12); + const uint offset1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13; + + device const T1 * y = (device const T1 *) (src1 + offset1); device const T14 * y4 = (device const T14 *) y; float sumf = 0; @@ -1511,12 +1566,14 @@ kernel void kernel_mul_mv( constant uint64_t & nb00, constant uint64_t & nb01, constant uint64_t & nb02, + constant uint64_t & nb03, constant int64_t & ne10, constant int64_t & ne11, constant int64_t & ne12, constant uint64_t & nb10, constant uint64_t & nb11, constant uint64_t & nb12, + constant uint64_t & nb13, constant int64_t & ne0, constant int64_t & ne1, constant uint & r2, @@ -1533,12 +1590,14 @@ kernel void kernel_mul_mv( nb00, nb01, nb02, + nb03, ne10, ne11, ne12, nb10, nb11, nb12, + nb13, ne0, ne1, r2, @@ -1564,12 +1623,14 @@ kernel void kernel_mul_mv_1row( constant uint64_t & nb00, constant uint64_t & nb01, constant uint64_t & nb02, + constant uint64_t & nb03, constant int64_t & ne10, constant int64_t & ne11, constant int64_t & ne12, constant uint64_t & nb10, constant uint64_t & nb11, constant uint64_t & nb12, + constant uint64_t & nb13, constant int64_t & ne0, constant int64_t & ne1, constant uint & r2, @@ -1584,10 +1645,11 @@ kernel void kernel_mul_mv_1row( const uint i12 = im%ne12; const uint i13 = im/ne12; - const uint offset0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb02*ne02; + const uint offset0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03; + const uint offset1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13; device const T * x = (device const T *) (src0 
+ offset0); - device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12); + device const float * y = (device const float *) (src1 + offset1); float sumf = 0; if (ne00 < 128) { @@ -1631,12 +1693,14 @@ kernel void kernel_mul_mv_l4( constant uint64_t & nb00, constant uint64_t & nb01, constant uint64_t & nb02, + constant uint64_t & nb03, constant int64_t & ne10, constant int64_t & ne11, constant int64_t & ne12, constant uint64_t & nb10, constant uint64_t & nb11, constant uint64_t & nb12, + constant uint64_t & nb13, constant int64_t & ne0, constant int64_t & ne1, constant uint & r2, @@ -1651,12 +1715,14 @@ kernel void kernel_mul_mv_l4( const uint i12 = im%ne12; const uint i13 = im/ne12; - const uint offset0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb02*ne02; + const uint offset0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03; device const T4 * x4 = (device const T4 *) (src0 + offset0); for (int r1 = 0; r1 < nrows; ++r1) { - device const float4 * y4 = (device const float4 *) (src1 + r1*nb11 + im*nb12); + const uint offset1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13; + + device const float4 * y4 = (device const float4 *) (src1 + offset1); float sumf = 0; for (int i = tiisg; i < ne00/4; i += 32) { @@ -3416,8 +3482,14 @@ void kernel_mul_mv_q2_K_f32_impl( int64_t ne00, int64_t ne01, int64_t ne02, + uint64_t nb01, + uint64_t nb02, + uint64_t nb03, int64_t ne10, int64_t ne12, + uint64_t nb11, + uint64_t nb12, + uint64_t nb13, int64_t ne0, int64_t ne1, uint r2, @@ -3433,21 +3505,19 @@ void kernel_mul_mv_q2_K_f32_impl( const int im = tgpig.z; const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST; - const int ib_row = first_row * nb; const uint i12 = im%ne12; const uint i13 = im/ne12; - const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + const uint offset0 = first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03; + const uint offset1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13; - device const block_q2_K * x = (device const block_q2_K *) src0 + ib_row + offset0; - device 
const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; + device const block_q2_K * x = (device const block_q2_K *) ((device char *) src0 + offset0); + device const float * y = (device const float *) ((device char *) src1 + offset1); float yl[32]; float sumf[N_DST]={0.f}, all_sum; - const int step = sizeof(block_q2_K) * nb; - const int ix = tiisg/8; // 0...3 const int it = tiisg%8; // 0...7 const int iq = it/4; // 0 or 1 @@ -3492,9 +3562,9 @@ void kernel_mul_mv_q2_K_f32_impl( (acc1[3] + 1.f/256.f * acc2[3]) * (sc[6] & 0xF) * 1.f/64.f) - dmin * (sumy[0] * (sc[0] & 0xF0) + sumy[1] * (sc[2] & 0xF0) + sumy[2] * (sc[4] & 0xF0) + sumy[3] * (sc[6] & 0xF0)); - qs += step/2; - sc += step; - dh += step/2; + qs += nb01/2; + sc += nb01; + dh += nb01/2; } y4 += 4 * QK_K; @@ -3519,12 +3589,14 @@ kernel void kernel_mul_mv_q2_K_f32( constant uint64_t & nb00, constant uint64_t & nb01, constant uint64_t & nb02, + constant uint64_t & nb03, constant int64_t & ne10, constant int64_t & ne11, constant int64_t & ne12, constant uint64_t & nb10, constant uint64_t & nb11, constant uint64_t & nb12, + constant uint64_t & nb13, constant int64_t & ne0, constant int64_t & ne1, constant uint & r2, @@ -3533,7 +3605,7 @@ kernel void kernel_mul_mv_q2_K_f32( uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { - kernel_mul_mv_q2_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg); + kernel_mul_mv_q2_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb01, nb02, nb03, ne10, ne12, nb11, nb12, nb13, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg); } void kernel_mul_mv_q3_K_f32_impl( @@ -3543,8 +3615,14 @@ void kernel_mul_mv_q3_K_f32_impl( int64_t ne00, int64_t ne01, int64_t ne02, + uint64_t nb01, + uint64_t nb02, + uint64_t nb03, int64_t ne10, int64_t ne12, + uint64_t nb11, + uint64_t nb12, + uint64_t nb13, int64_t ne0, int64_t ne1, uint r2, @@ -3565,10 +3643,11 @@ void kernel_mul_mv_q3_K_f32_impl( const 
uint i12 = im%ne12; const uint i13 = im/ne12; - const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + const uint offset0 = first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03; + const uint offset1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13; - device const block_q3_K * x = (device const block_q3_K *) src0 + first_row*nb + offset0; - device const float * yy = (device const float *) src1 + r1*ne10 + im*ne00*ne1; + device const block_q3_K * x = (device const block_q3_K *) ((device char *) src0 + offset0); + device const float * yy = (device const float *) ((device char *) src1 + offset1); float yl[32]; @@ -3608,8 +3687,6 @@ void kernel_mul_mv_q3_K_f32_impl( const int q_offset = 32*ip + l0; const int y_offset = 128*ip + 32*il + l0; - const int step = sizeof(block_q3_K) * nb / 2; - device const float * y1 = yy + ix*QK_K + y_offset; uint32_t scales32, aux32; @@ -3619,7 +3696,6 @@ void kernel_mul_mv_q3_K_f32_impl( float sumf1[2] = {0.f}; float sumf2[2] = {0.f}; for (int i = ix; i < nb; i += 4) { - for (int l = 0; l < 8; ++l) { yl[l+ 0] = y1[l+ 0]; yl[l+ 8] = y1[l+16]; @@ -3633,7 +3709,6 @@ void kernel_mul_mv_q3_K_f32_impl( device const half * dh = &x[i].d; for (int row = 0; row < 2; ++row) { - const float d_all = (float)dh[0]; scales16[0] = a[4]; @@ -3673,15 +3748,13 @@ void kernel_mul_mv_q3_K_f32_impl( sumf1[row] += d1 * (scales[1] - 32); sumf2[row] += d2 * (scales[3] - 32); - q += step; - h += step; - a += step; - dh += step; - + q += nb01/2; + h += nb01/2; + a += nb01/2; + dh += nb01/2; } y1 += 4 * QK_K; - } for (int row = 0; row < 2; ++row) { @@ -3706,12 +3779,14 @@ kernel void kernel_mul_mv_q3_K_f32( constant uint64_t & nb00, constant uint64_t & nb01, constant uint64_t & nb02, + constant uint64_t & nb03, constant int64_t & ne10, constant int64_t & ne11, constant int64_t & ne12, constant uint64_t & nb10, constant uint64_t & nb11, constant uint64_t & nb12, + constant uint64_t & nb13, constant int64_t & ne0, constant int64_t & ne1, constant uint & r2, @@ -3720,7 
+3795,7 @@ kernel void kernel_mul_mv_q3_K_f32( uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { - kernel_mul_mv_q3_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg); + kernel_mul_mv_q3_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb01, nb02, nb03, ne10, ne12, nb11, nb12, nb13, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg); } void kernel_mul_mv_q4_K_f32_impl( @@ -3730,8 +3805,14 @@ void kernel_mul_mv_q4_K_f32_impl( int64_t ne00, int64_t ne01, int64_t ne02, + uint64_t nb01, + uint64_t nb02, + uint64_t nb03, int64_t ne10, int64_t ne12, + uint64_t nb11, + uint64_t nb12, + uint64_t nb13, int64_t ne0, int64_t ne1, uint r2, @@ -3756,29 +3837,26 @@ void kernel_mul_mv_q4_K_f32_impl( const int im = tgpig.z; //const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST; const int first_row = r0 * N_DST; - const int ib_row = first_row * nb; const uint i12 = im%ne12; const uint i13 = im/ne12; - const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + const uint offset0 = first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03; + const uint offset1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13; - device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row + offset0; - device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; + device const block_q4_K * x = (device const block_q4_K *) ((device char *) src0 + offset0); + device const float * y = (device const float *) ((device char *) src1 + offset1); float yl[16]; float yh[16]; float sumf[N_DST]={0.f}, all_sum; - const int step = sizeof(block_q4_K) * nb / 2; - device const float * y4 = y + ix * QK_K + 64 * iq + 8 * ir; uint16_t sc16[4]; thread const uint8_t * sc8 = (thread const uint8_t *)sc16; for (int ib = ix; ib < nb; ib += 4) { - float4 sumy = {0.f, 0.f, 0.f, 0.f}; for (int i = 0; i < 8; ++i) { yl[i+0] = y4[i+ 0]; sumy[0] += yl[i+0]; @@ -3792,7 +3870,6 @@ void kernel_mul_mv_q4_K_f32_impl( device const 
half * dh = &x[ib].d; for (int row = 0; row < N_DST; row++) { - sc16[0] = sc[0] & kmask1; sc16[1] = sc[2] & kmask1; sc16[2] = ((sc[4] >> 0) & kmask2) | ((sc[0] & kmask3) >> 2); @@ -3821,9 +3898,9 @@ void kernel_mul_mv_q4_K_f32_impl( (acc2[2] + 1.f/256.f * acc2[3]) * sc8[5] * 1.f/16.f) - dmin * (sumy[0] * sc8[2] + sumy[1] * sc8[3] + sumy[2] * sc8[6] + sumy[3] * sc8[7]); - q1 += step; - sc += step; - dh += step; + q1 += nb01/2; + sc += nb01/2; + dh += nb01/2; } y4 += 4 * QK_K; @@ -3848,12 +3925,14 @@ kernel void kernel_mul_mv_q4_K_f32( constant uint64_t & nb00, constant uint64_t & nb01, constant uint64_t & nb02, + constant uint64_t & nb03, constant int64_t & ne10, constant int64_t & ne11, constant int64_t & ne12, constant uint64_t & nb10, constant uint64_t & nb11, constant uint64_t & nb12, + constant uint64_t & nb13, constant int64_t & ne0, constant int64_t & ne1, constant uint & r2, @@ -3862,7 +3941,7 @@ kernel void kernel_mul_mv_q4_K_f32( uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { - kernel_mul_mv_q4_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg); + kernel_mul_mv_q4_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb01, nb02, nb03, ne10, ne12, nb11, nb12, nb13, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg); } void kernel_mul_mv_q5_K_f32_impl( @@ -3872,8 +3951,14 @@ void kernel_mul_mv_q5_K_f32_impl( int64_t ne00, int64_t ne01, int64_t ne02, + uint64_t nb01, + uint64_t nb02, + uint64_t nb03, int64_t ne10, int64_t ne12, + uint64_t nb11, + uint64_t nb12, + uint64_t nb13, int64_t ne0, int64_t ne1, uint r2, @@ -3894,15 +3979,14 @@ void kernel_mul_mv_q5_K_f32_impl( const uint i12 = im%ne12; const uint i13 = im/ne12; - const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + const uint offset0 = first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03; + const uint offset1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13; - device const block_q5_K * x = (device const block_q5_K *) 
src0 + first_row*nb + offset0; - device const float * yy = (device const float *) src1 + r1*ne10 + im*ne00*ne1; + device const block_q5_K * x = (device const block_q5_K *) ((device char *) src0 + offset0); + device const float * yy = (device const float *) ((device char *) src1 + offset1); float sumf[2]={0.f}; - const int step = sizeof(block_q5_K) * nb; - float yl[16], yh[16]; const uint16_t kmask1 = 0x3f3f; @@ -3930,7 +4014,6 @@ void kernel_mul_mv_q5_K_f32_impl( device const float * y1 = yy + ix*QK_K + y_offset; for (int i = ix; i < nb; i += 4) { - device const uint8_t * q1 = x[i].qs + q_offset; device const uint8_t * qh = x[i].qh + l0; device const half * dh = &x[i].d; @@ -3946,7 +4029,6 @@ void kernel_mul_mv_q5_K_f32_impl( } for (int row = 0; row < 2; ++row) { - device const uint8_t * q2 = q1 + 64; sc16[0] = a[0] & kmask1; @@ -3975,15 +4057,13 @@ void kernel_mul_mv_q5_K_f32_impl( sc8[5] * (acc1[3]/16.f + 16.f*acc2[3])) - dmin * (sumy[0] * sc8[2] + sumy[1] * sc8[3] + sumy[2] * sc8[6] + sumy[3] * sc8[7]); - q1 += step; - qh += step; - dh += step/2; - a += step/2; - + q1 += nb01; + qh += nb01; + dh += nb01/2; + a += nb01/2; } y1 += 4 * QK_K; - } for (int row = 0; row < 2; ++row) { @@ -4005,12 +4085,14 @@ kernel void kernel_mul_mv_q5_K_f32( constant uint64_t & nb00, constant uint64_t & nb01, constant uint64_t & nb02, + constant uint64_t & nb03, constant int64_t & ne10, constant int64_t & ne11, constant int64_t & ne12, constant uint64_t & nb10, constant uint64_t & nb11, constant uint64_t & nb12, + constant uint64_t & nb13, constant int64_t & ne0, constant int64_t & ne1, constant uint & r2, @@ -4019,7 +4101,7 @@ kernel void kernel_mul_mv_q5_K_f32( uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { - kernel_mul_mv_q5_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg); + kernel_mul_mv_q5_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb01, nb02, nb03, ne10, ne12, nb11, nb12, 
nb13, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg); } void kernel_mul_mv_q6_K_f32_impl( @@ -4029,8 +4111,14 @@ void kernel_mul_mv_q6_K_f32_impl( int64_t ne00, int64_t ne01, int64_t ne02, + uint64_t nb01, + uint64_t nb02, + uint64_t nb03, int64_t ne10, int64_t ne12, + uint64_t nb11, + uint64_t nb12, + uint64_t nb13, int64_t ne0, int64_t ne1, uint r2, @@ -4056,10 +4144,11 @@ void kernel_mul_mv_q6_K_f32_impl( const uint i12 = im%ne12; const uint i13 = im/ne12; - const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + const uint offset0 = row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03; + const uint offset1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13; - device const block_q6_K * x = (device const block_q6_K *) src0 + row * nb + offset0; - device const float * yy = (device const float *) src1 + r1*ne10 + im*ne00*ne1; + device const block_q6_K * x = (device const block_q6_K *) ((device char *) src0 + offset0); + device const float * yy = (device const float *) ((device char *) src1 + offset1); float sumf = 0; @@ -4115,12 +4204,14 @@ kernel void kernel_mul_mv_q6_K_f32( constant uint64_t & nb00, constant uint64_t & nb01, constant uint64_t & nb02, + constant uint64_t & nb03, constant int64_t & ne10, constant int64_t & ne11, constant int64_t & ne12, constant uint64_t & nb10, constant uint64_t & nb11, constant uint64_t & nb12, + constant uint64_t & nb13, constant int64_t & ne0, constant int64_t & ne1, constant uint & r2, @@ -4129,7 +4220,7 @@ kernel void kernel_mul_mv_q6_K_f32( uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { - kernel_mul_mv_q6_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg); + kernel_mul_mv_q6_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb01, nb02, nb03, ne10, ne12, nb11, nb12, nb13, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg); } // ======================= "True" 2-bit @@ -4141,8 +4232,14 @@ void kernel_mul_mv_iq2_xxs_f32_impl( int64_t ne00, int64_t 
ne01, int64_t ne02, + uint64_t nb01, + uint64_t nb02, + uint64_t nb03, int64_t ne10, int64_t ne12, + uint64_t nb11, + uint64_t nb12, + uint64_t nb13, int64_t ne0, int64_t ne1, uint r2, @@ -4158,15 +4255,15 @@ void kernel_mul_mv_iq2_xxs_f32_impl( const int im = tgpig.z; const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST; - const int ib_row = first_row * nb; const uint i12 = im%ne12; const uint i13 = im/ne12; - const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + const uint offset0 = first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03; + const uint offset1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13; - device const block_iq2_xxs * x = (device const block_iq2_xxs *) src0 + ib_row + offset0; - device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; + device const block_iq2_xxs * x = (device const block_iq2_xxs *) ((device char *) src0 + offset0); + device const float * y = (device const float *) ((device char *) src1 + offset1); float yl[32]; float sumf[N_DST]={0.f}, all_sum; @@ -4219,8 +4316,8 @@ void kernel_mul_mv_iq2_xxs_f32_impl( } sumf[row] += d * sum; - dh += nb*sizeof(block_iq2_xxs)/2; - q2 += nb*sizeof(block_iq2_xxs)/2; + dh += nb01/2; + q2 += nb01/2; } y4 += 32 * 32; @@ -4245,12 +4342,14 @@ kernel void kernel_mul_mv_iq2_xxs_f32( constant uint64_t & nb00, constant uint64_t & nb01, constant uint64_t & nb02, + constant uint64_t & nb03, constant int64_t & ne10, constant int64_t & ne11, constant int64_t & ne12, constant uint64_t & nb10, constant uint64_t & nb11, constant uint64_t & nb12, + constant uint64_t & nb13, constant int64_t & ne0, constant int64_t & ne1, constant uint & r2, @@ -4260,7 +4359,7 @@ kernel void kernel_mul_mv_iq2_xxs_f32( uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { - kernel_mul_mv_iq2_xxs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg); + kernel_mul_mv_iq2_xxs_f32_impl(src0, src1, dst, ne00, ne01, ne02, 
nb01, nb02, nb03, ne10, ne12, nb11, nb12, nb13, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg); } void kernel_mul_mv_iq2_xs_f32_impl( @@ -4270,8 +4369,14 @@ void kernel_mul_mv_iq2_xs_f32_impl( int64_t ne00, int64_t ne01, int64_t ne02, + uint64_t nb01, + uint64_t nb02, + uint64_t nb03, int64_t ne10, int64_t ne12, + uint64_t nb11, + uint64_t nb12, + uint64_t nb13, int64_t ne0, int64_t ne1, uint r2, @@ -4287,15 +4392,15 @@ void kernel_mul_mv_iq2_xs_f32_impl( const int im = tgpig.z; const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST; - const int ib_row = first_row * nb; const uint i12 = im%ne12; const uint i13 = im/ne12; - const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + const uint offset0 = first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03; + const uint offset1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13; - device const block_iq2_xs * x = (device const block_iq2_xs *) src0 + ib_row + offset0; - device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; + device const block_iq2_xs * x = (device const block_iq2_xs *) ((device char *) src0 + offset0); + device const float * y = (device const float *) ((device char *) src1 + offset1); float yl[32]; float sumf[N_DST]={0.f}, all_sum; @@ -4357,9 +4462,9 @@ void kernel_mul_mv_iq2_xs_f32_impl( } sumf[row] += d1 * sum1 + d2 * sum2; - dh += nb*sizeof(block_iq2_xs)/2; - q2 += nb*sizeof(block_iq2_xs)/2; - sc += nb*sizeof(block_iq2_xs); + dh += nb01/2; + q2 += nb01/2; + sc += nb01; } y4 += 32 * 32; @@ -4384,12 +4489,14 @@ kernel void kernel_mul_mv_iq2_xs_f32( constant uint64_t & nb00, constant uint64_t & nb01, constant uint64_t & nb02, + constant uint64_t & nb03, constant int64_t & ne10, constant int64_t & ne11, constant int64_t & ne12, constant uint64_t & nb10, constant uint64_t & nb11, constant uint64_t & nb12, + constant uint64_t & nb13, constant int64_t & ne0, constant int64_t & ne1, constant uint & r2, @@ -4399,7 +4506,7 @@ kernel void kernel_mul_mv_iq2_xs_f32( uint 
tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { - kernel_mul_mv_iq2_xs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg); + kernel_mul_mv_iq2_xs_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb01, nb02, nb03, ne10, ne12, nb11, nb12, nb13, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg); } void kernel_mul_mv_iq3_xxs_f32_impl( @@ -4409,8 +4516,14 @@ void kernel_mul_mv_iq3_xxs_f32_impl( int64_t ne00, int64_t ne01, int64_t ne02, + uint64_t nb01, + uint64_t nb02, + uint64_t nb03, int64_t ne10, int64_t ne12, + uint64_t nb11, + uint64_t nb12, + uint64_t nb13, int64_t ne0, int64_t ne1, uint r2, @@ -4426,15 +4539,15 @@ void kernel_mul_mv_iq3_xxs_f32_impl( const int im = tgpig.z; const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST; - const int ib_row = first_row * nb; const uint i12 = im%ne12; const uint i13 = im/ne12; - const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + const uint offset0 = first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03; + const uint offset1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13; - device const block_iq3_xxs * x = (device const block_iq3_xxs *) src0 + ib_row + offset0; - device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; + device const block_iq3_xxs * x = (device const block_iq3_xxs *) ((device char *) src0 + offset0); + device const float * y = (device const float *) ((device char *) src1 + offset1); float yl[32]; float sumf[N_DST]={0.f}, all_sum; @@ -4489,9 +4602,9 @@ void kernel_mul_mv_iq3_xxs_f32_impl( } sumf[row] += d * (sum[0] + sum[1]); - dh += nb*sizeof(block_iq3_xxs)/2; - q3 += nb*sizeof(block_iq3_xxs); - gas += nb*sizeof(block_iq3_xxs)/2; + dh += nb01/2; + q3 += nb01; + gas += nb01/2; } y4 += 32 * 32; @@ -4516,12 +4629,14 @@ kernel void kernel_mul_mv_iq3_xxs_f32( constant uint64_t & nb00, constant uint64_t & nb01, constant uint64_t & nb02, + constant uint64_t & nb03, constant int64_t & ne10, constant 
int64_t & ne11, constant int64_t & ne12, constant uint64_t & nb10, constant uint64_t & nb11, constant uint64_t & nb12, + constant uint64_t & nb13, constant int64_t & ne0, constant int64_t & ne1, constant uint & r2, @@ -4531,7 +4646,7 @@ kernel void kernel_mul_mv_iq3_xxs_f32( uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { - kernel_mul_mv_iq3_xxs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg); + kernel_mul_mv_iq3_xxs_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb01, nb02, nb03, ne10, ne12, nb11, nb12, nb13, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg); } void kernel_mul_mv_iq3_s_f32_impl( @@ -4541,8 +4656,14 @@ void kernel_mul_mv_iq3_s_f32_impl( int64_t ne00, int64_t ne01, int64_t ne02, + uint64_t nb01, + uint64_t nb02, + uint64_t nb03, int64_t ne10, int64_t ne12, + uint64_t nb11, + uint64_t nb12, + uint64_t nb13, int64_t ne0, int64_t ne1, uint r2, @@ -4558,15 +4679,15 @@ void kernel_mul_mv_iq3_s_f32_impl( const int im = tgpig.z; const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST; - const int ib_row = first_row * nb; const uint i12 = im%ne12; const uint i13 = im/ne12; - const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + const uint offset0 = first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03; + const uint offset1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13; - device const block_iq3_s * x = (device const block_iq3_s *) src0 + ib_row + offset0; - device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; + device const block_iq3_s * x = (device const block_iq3_s *) ((device char *) src0 + offset0); + device const float * y = (device const float *) ((device char *) src1 + offset1); float yl[32]; float sumf[N_DST]={0.f}, all_sum; @@ -4619,11 +4740,11 @@ void kernel_mul_mv_iq3_s_f32_impl( } sumf[row] += d * (sum[0] + sum[1]); - dh += nb*sizeof(block_iq3_s)/2; - qs += nb*sizeof(block_iq3_s); - qh += nb*sizeof(block_iq3_s); - sc 
+= nb*sizeof(block_iq3_s); - signs += nb*sizeof(block_iq3_s); + dh += nb01/2; + qs += nb01; + qh += nb01; + sc += nb01; + signs += nb01; } y4 += 32 * 32; @@ -4648,12 +4769,14 @@ kernel void kernel_mul_mv_iq3_s_f32( constant uint64_t & nb00, constant uint64_t & nb01, constant uint64_t & nb02, + constant uint64_t & nb03, constant int64_t & ne10, constant int64_t & ne11, constant int64_t & ne12, constant uint64_t & nb10, constant uint64_t & nb11, constant uint64_t & nb12, + constant uint64_t & nb13, constant int64_t & ne0, constant int64_t & ne1, constant uint & r2, @@ -4663,7 +4786,7 @@ kernel void kernel_mul_mv_iq3_s_f32( uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { - kernel_mul_mv_iq3_s_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg); + kernel_mul_mv_iq3_s_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb01, nb02, nb03, ne10, ne12, nb11, nb12, nb13, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg); } void kernel_mul_mv_iq2_s_f32_impl( @@ -4673,8 +4796,14 @@ void kernel_mul_mv_iq2_s_f32_impl( int64_t ne00, int64_t ne01, int64_t ne02, + uint64_t nb01, + uint64_t nb02, + uint64_t nb03, int64_t ne10, int64_t ne12, + uint64_t nb11, + uint64_t nb12, + uint64_t nb13, int64_t ne0, int64_t ne1, uint r2, @@ -4690,15 +4819,15 @@ void kernel_mul_mv_iq2_s_f32_impl( const int im = tgpig.z; const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST; - const int ib_row = first_row * nb; const uint i12 = im%ne12; const uint i13 = im/ne12; - const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + const uint offset0 = first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03; + const uint offset1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13; - device const block_iq2_s * x = (device const block_iq2_s *) src0 + ib_row + offset0; - device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; + device const block_iq2_s * x = (device const block_iq2_s *) ((device char *) 
src0 + offset0); + device const float * y = (device const float *) ((device char *) src1 + offset1); float yl[32]; float sumf[N_DST]={0.f}, all_sum; @@ -4752,11 +4881,11 @@ void kernel_mul_mv_iq2_s_f32_impl( } sumf[row] += d1 * sum[0] + d2 * sum[1]; - dh += nb*sizeof(block_iq2_s)/2; - qs += nb*sizeof(block_iq2_s); - qh += nb*sizeof(block_iq2_s); - sc += nb*sizeof(block_iq2_s); - signs += nb*sizeof(block_iq2_s); + dh += nb01/2; + qs += nb01; + qh += nb01; + sc += nb01; + signs += nb01; } y4 += 32 * 32; @@ -4781,12 +4910,14 @@ kernel void kernel_mul_mv_iq2_s_f32( constant uint64_t & nb00, constant uint64_t & nb01, constant uint64_t & nb02, + constant uint64_t & nb03, constant int64_t & ne10, constant int64_t & ne11, constant int64_t & ne12, constant uint64_t & nb10, constant uint64_t & nb11, constant uint64_t & nb12, + constant uint64_t & nb13, constant int64_t & ne0, constant int64_t & ne1, constant uint & r2, @@ -4796,7 +4927,7 @@ kernel void kernel_mul_mv_iq2_s_f32( uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { - kernel_mul_mv_iq2_s_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg); + kernel_mul_mv_iq2_s_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb01, nb02, nb03, ne10, ne12, nb11, nb12, nb13, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg); } void kernel_mul_mv_iq1_s_f32_impl( @@ -4806,8 +4937,14 @@ void kernel_mul_mv_iq1_s_f32_impl( int64_t ne00, int64_t ne01, int64_t ne02, + uint64_t nb01, + uint64_t nb02, + uint64_t nb03, int64_t ne10, int64_t ne12, + uint64_t nb11, + uint64_t nb12, + uint64_t nb13, int64_t ne0, int64_t ne1, uint r2, @@ -4823,14 +4960,15 @@ void kernel_mul_mv_iq1_s_f32_impl( const int im = tgpig.z; const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST; - const int ib_row = first_row * nb; const uint i12 = im%ne12; const uint i13 = im/ne12; - const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); - device const 
block_iq1_s * x = (device const block_iq1_s *) src0 + ib_row + offset0; - device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; + const uint offset0 = first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03; + const uint offset1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13; + + device const block_iq1_s * x = (device const block_iq1_s *) ((device char *) src0 + offset0); + device const float * y = (device const float *) ((device char *) src1 + offset1); float yl[32]; float sumf[N_DST]={0.f}, all_sum; @@ -4873,9 +5011,9 @@ void kernel_mul_mv_iq1_s_f32_impl( } sumf[row] += (float)dh[0] * (sum + sumy * (qh[0] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA)) * (2*((qh[0] >> 12) & 7) + 1); - dh += nb*sizeof(block_iq1_s)/2; - qs += nb*sizeof(block_iq1_s); - qh += nb*sizeof(block_iq1_s)/2; + dh += nb01/2; + qs += nb01; + qh += nb01/2; } y4 += 32 * 32; @@ -4896,8 +5034,14 @@ void kernel_mul_mv_iq1_m_f32_impl( int64_t ne00, int64_t ne01, int64_t ne02, + uint64_t nb01, + uint64_t nb02, + uint64_t nb03, int64_t ne10, int64_t ne12, + uint64_t nb11, + uint64_t nb12, + uint64_t nb13, int64_t ne0, int64_t ne1, uint r2, @@ -4913,14 +5057,15 @@ void kernel_mul_mv_iq1_m_f32_impl( const int im = tgpig.z; const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST; - const int ib_row = first_row * nb; const uint i12 = im%ne12; const uint i13 = im/ne12; - const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); - device const block_iq1_m * x = (device const block_iq1_m *) src0 + ib_row + offset0; - device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; + const uint offset0 = first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03; + const uint offset1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13; + + device const block_iq1_m * x = (device const block_iq1_m *) ((device char *) src0 + offset0); + device const float * y = (device const float *) ((device char *) src1 + offset1); float yl[32]; float sumf[N_DST]={0.f}, all_sum; @@ -4972,9 +5117,9 @@ void 
kernel_mul_mv_iq1_m_f32_impl( sumf[row] += (float)scale.f16 * ((sum[0] + delta1) * (2*((sc[ib/2] >> (6*(ib%2)+0)) & 7) + 1) + (sum[1] + delta2) * (2*((sc[ib/2] >> (6*(ib%2)+3)) & 7) + 1)); - sc += nb*sizeof(block_iq1_m)/2; - qs += nb*sizeof(block_iq1_m); - qh += nb*sizeof(block_iq1_m); + sc += nb01/2; + qs += nb01; + qh += nb01; } y4 += 32 * 32; @@ -4995,8 +5140,14 @@ void kernel_mul_mv_iq4_nl_f32_impl( int64_t ne00, int64_t ne01, int64_t ne02, + uint64_t nb01, + uint64_t nb02, + uint64_t nb03, int64_t ne10, int64_t ne12, + uint64_t nb11, + uint64_t nb12, + uint64_t nb13, int64_t ne0, int64_t ne1, uint r2, @@ -5012,14 +5163,15 @@ void kernel_mul_mv_iq4_nl_f32_impl( const int r1 = tgpig.y; const int im = tgpig.z; const int first_row = (r0 * 2 + sgitg) * 2; - const int ib_row = first_row * nb; const uint i12 = im%ne12; const uint i13 = im/ne12; - const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); - device const block_iq4_nl * x = (device const block_iq4_nl *) src0 + ib_row + offset0; - device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; + const uint offset0 = first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03; + const uint offset1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13; + + device const block_iq4_nl * x = (device const block_iq4_nl *) ((device char *) src0 + offset0); + device const float * y = (device const float *) ((device char *) src1 + offset1); const int ix = tiisg/2; // 0...15 const int it = tiisg%2; // 0 or 1 @@ -5089,8 +5241,14 @@ void kernel_mul_mv_iq4_xs_f32_impl( int64_t ne00, int64_t ne01, int64_t ne02, + uint64_t nb01, + uint64_t nb02, + uint64_t nb03, int64_t ne10, int64_t ne12, + uint64_t nb11, + uint64_t nb12, + uint64_t nb13, int64_t ne0, int64_t ne1, uint r2, @@ -5106,14 +5264,15 @@ void kernel_mul_mv_iq4_xs_f32_impl( const int r1 = tgpig.y; const int im = tgpig.z; const int first_row = (r0 * 2 + sgitg) * 2; - const int ib_row = first_row * nb; const uint i12 = im%ne12; const uint i13 = im/ne12; - const uint 
offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); - device const block_iq4_xs * x = (device const block_iq4_xs *) src0 + ib_row + offset0; - device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; + const uint offset0 = first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03; + const uint offset1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13; + + device const block_iq4_xs * x = (device const block_iq4_xs *) ((device char *) src0 + offset0); + device const float * y = (device const float *) ((device char *) src1 + offset1); const int ix = tiisg/16; // 0 or 1 const int it = tiisg%16; // 0...15 @@ -5188,12 +5347,14 @@ kernel void kernel_mul_mv_iq1_s_f32( constant uint64_t & nb00, constant uint64_t & nb01, constant uint64_t & nb02, + constant uint64_t & nb03, constant int64_t & ne10, constant int64_t & ne11, constant int64_t & ne12, constant uint64_t & nb10, constant uint64_t & nb11, constant uint64_t & nb12, + constant uint64_t & nb13, constant int64_t & ne0, constant int64_t & ne1, constant uint & r2, @@ -5202,7 +5363,7 @@ kernel void kernel_mul_mv_iq1_s_f32( uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { - kernel_mul_mv_iq1_s_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg); + kernel_mul_mv_iq1_s_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb01, nb02, nb03, ne10, ne12, nb11, nb12, nb13, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg); } [[host_name("kernel_mul_mv_iq1_m_f32")]] @@ -5216,12 +5377,14 @@ kernel void kernel_mul_mv_iq1_m_f32( constant uint64_t & nb00, constant uint64_t & nb01, constant uint64_t & nb02, + constant uint64_t & nb03, constant int64_t & ne10, constant int64_t & ne11, constant int64_t & ne12, constant uint64_t & nb10, constant uint64_t & nb11, constant uint64_t & nb12, + constant uint64_t & nb13, constant int64_t & ne0, constant int64_t & ne1, constant uint & r2, @@ -5230,7 +5393,7 @@ kernel void kernel_mul_mv_iq1_m_f32( uint 
tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { - kernel_mul_mv_iq1_m_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg); + kernel_mul_mv_iq1_m_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb01, nb02, nb03, ne10, ne12, nb11, nb12, nb13, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg); } [[host_name("kernel_mul_mv_iq4_nl_f32")]] @@ -5244,12 +5407,14 @@ kernel void kernel_mul_mv_iq4_nl_f32( constant uint64_t & nb00, constant uint64_t & nb01, constant uint64_t & nb02, + constant uint64_t & nb03, constant int64_t & ne10, constant int64_t & ne11, constant int64_t & ne12, constant uint64_t & nb10, constant uint64_t & nb11, constant uint64_t & nb12, + constant uint64_t & nb13, constant int64_t & ne0, constant int64_t & ne1, constant uint & r2, @@ -5259,7 +5424,7 @@ kernel void kernel_mul_mv_iq4_nl_f32( uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { - kernel_mul_mv_iq4_nl_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg); + kernel_mul_mv_iq4_nl_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb01, nb02, nb03, ne10, ne12, nb11, nb12, nb13, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg); } [[host_name("kernel_mul_mv_iq4_xs_f32")]] @@ -5273,12 +5438,14 @@ kernel void kernel_mul_mv_iq4_xs_f32( constant uint64_t & nb00, constant uint64_t & nb01, constant uint64_t & nb02, + constant uint64_t & nb03, constant int64_t & ne10, constant int64_t & ne11, constant int64_t & ne12, constant uint64_t & nb10, constant uint64_t & nb11, constant uint64_t & nb12, + constant uint64_t & nb13, constant int64_t & ne0, constant int64_t & ne1, constant uint & r2, @@ -5288,7 +5455,7 @@ kernel void kernel_mul_mv_iq4_xs_f32( uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { - kernel_mul_mv_iq4_xs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, 
shared_values, tgpig, tiisg, sgitg); + kernel_mul_mv_iq4_xs_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb01, nb02, nb03, ne10, ne12, nb11, nb12, nb13, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg); } //============================= templates and their specializations ============================= @@ -5833,10 +6000,12 @@ kernel void kernel_mul_mm(device const uchar * src0, constant int64_t & ne02, constant uint64_t & nb01, constant uint64_t & nb02, + constant uint64_t & nb03, constant int64_t & ne12, constant uint64_t & nb10, constant uint64_t & nb11, constant uint64_t & nb12, + constant uint64_t & nb13, constant int64_t & ne0, constant int64_t & ne1, constant uint & r2, @@ -5873,12 +6042,13 @@ kernel void kernel_mul_mm(device const uchar * src0, const uint i12 = im%ne12; const uint i13 = im/ne12; - uint offset0 = (i12/r2)*nb02 + (i13/r3)*(nb02*ne02); + uint offset0 = (i12/r2)*nb02 + (i13/r3)*nb03; ushort offset1 = il/nl; device const block_q * x = (device const block_q *)(src0 + (r0 * BLOCK_SIZE_M + thread_row) * nb01 + offset0) + offset1; device const float * y = (device const float *)(src1 - + nb12 * im + + nb13 * i13 + + nb12 * i12 + nb11 * (r1 * BLOCK_SIZE_N + thread_col) + nb10 * (BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL))); @@ -6257,12 +6427,14 @@ typedef void (kernel_mul_mv_impl_t)( uint64_t nb00, uint64_t nb01, uint64_t nb02, + uint64_t nb03, int64_t ne10, int64_t ne11, int64_t ne12, uint64_t nb10, uint64_t nb11, uint64_t nb12, + uint64_t nb13, int64_t ne0, int64_t ne1, uint r2, @@ -6277,8 +6449,14 @@ typedef void (kernel_mul_mv2_impl_t)( int64_t ne00, int64_t ne01, int64_t ne02, + uint64_t nb01, + uint64_t nb02, + uint64_t nb03, int64_t ne10, int64_t ne12, + uint64_t nb11, + uint64_t nb12, + uint64_t nb13, int64_t ne0, int64_t ne1, uint r2, @@ -6299,6 +6477,7 @@ void mmv_fn( uint64_t nb00, uint64_t nb01, uint64_t nb02, + uint64_t nb03, int64_t ne10, int64_t ne11, int64_t ne12, @@ -6306,6 +6485,7 @@ void mmv_fn( uint64_t nb10, 
uint64_t nb11, uint64_t nb12, + uint64_t nb13, int64_t ne0, int64_t ne1, uint64_t nb1, @@ -6316,7 +6496,7 @@ void mmv_fn( uint tiitg, uint tiisg, uint sgitg) { - impl_fn(src0,src1,dst,ne00,ne01,ne02,nb00,nb01,nb02,ne10,ne11,ne12,nb10,nb11,nb12,ne0,ne1,r2,r3,tgpig,tiisg); + impl_fn(src0,src1,dst,ne00,ne01,ne02,nb00,nb01,nb02,nb03,ne10,ne11,ne12,nb10,nb11,nb12,nb13,ne0,ne1,r2,r3,tgpig,tiisg); } template @@ -6330,6 +6510,7 @@ void mmv_fn( uint64_t nb00, uint64_t nb01, uint64_t nb02, + uint64_t nb03, int64_t ne10, int64_t ne11, int64_t ne12, @@ -6337,6 +6518,7 @@ void mmv_fn( uint64_t nb10, uint64_t nb11, uint64_t nb12, + uint64_t nb13, int64_t ne0, int64_t ne1, uint64_t nb1, @@ -6347,7 +6529,7 @@ void mmv_fn( uint tiitg, uint tiisg, uint sgitg) { - impl_fn(src0,(const device float *)src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,shared_values,tgpig,tiisg,sgitg); + impl_fn(src0,(const device float *)src1,dst,ne00,ne01,ne02,nb01,nb02,nb03,ne10,ne12,nb11,nb12,nb13,ne0,ne1,r2,r3,shared_values,tgpig,tiisg,sgitg); } typedef decltype(mmv_fn>) mul_mv_impl_fn_t; @@ -6396,8 +6578,8 @@ kernel void kernel_mul_mv_id( const int64_t i2 = i12; device const char * src0_cur = src0s + i02*nb02; - device const char * src1_cur = src1 + i11*nb11 + i12*nb12; - device float * dst_cur = dst + i1*ne0 + i2*ne1*ne0; + device const char * src1_cur = src1 + i11*nb11 + i12*nb12; + device float * dst_cur = dst + i1*ne0 + i2*ne1*ne0; impl_fn( /* src0 */ src0_cur, @@ -6405,19 +6587,21 @@ kernel void kernel_mul_mv_id( /* dst */ dst_cur, /* ne00 */ ne00, /* ne01 */ ne01, - /* ne02 */ 1,//ne02, + /* ne02 */ 1, // ne02, /* nb00 */ nb00, /* nb01 */ nb01, /* nb02 */ nb02, + /* nb03 */ nb02, // ne02 == 1 /* ne10 */ ne10, - /* ne11 */ 1,//ne11, - /* ne12 */ 1,//ne12, - /* ne13 */ 1,//ne13, + /* ne11 */ 1, // ne11, + /* ne12 */ 1, // ne12, + /* ne13 */ 1, // ne13, /* nb10 */ nb10, /* nb11 */ nb11, /* nb12 */ nb12, + /* ne13 */ nb12, // ne12 == 1 /* ne0 */ ne0, - /* ne1 */ 1,//ne1, + /* ne1 */ 1, // ne1, /* 
nb1 */ nb1, /* r2 */ 1, /* r3 */ 1, diff --git a/ggml/src/llamafile/sgemm.cpp b/ggml/src/llamafile/sgemm.cpp index 0193a463aefec..9eead3f61e090 100644 --- a/ggml/src/llamafile/sgemm.cpp +++ b/ggml/src/llamafile/sgemm.cpp @@ -942,6 +942,36 @@ class tinyBLAS_Q0_AVX { return _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 4)), _mm_set1_epi8(8)); } + inline __m256i load(const block_q5_0 *b) { + return _mm256_or_si256(denibble(b->qs), bittobyte(b->qh)); + } + + inline __m128i load0(const block_q5_0* b) { + const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs)); + uint32_t x32; + memcpy(&x32, b->qh, sizeof(uint32_t)); + __m128i qxl = _mm_and_si128(_mm_set1_epi8(15), x); + __m128i bytesl = _mm_cmpeq_epi8(_mm_set1_epi64x(-1), + _mm_or_si128(_mm_set1_epi64x(0x7fbfdfeff7fbfdfe), + _mm_shuffle_epi8(_mm_set1_epi32(x32), + _mm_set_epi64x(0x0101010101010101, 0x0000000000000000)))); + bytesl = _mm_andnot_si128(bytesl, _mm_set1_epi8((char)0xF0)); + return _mm_or_si128(qxl, bytesl); + } + + inline __m128i load1(const block_q5_0* b) { + const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs)); + uint32_t x32; + memcpy(&x32, b->qh, sizeof(uint32_t)); + __m128i qxh = _mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 4)); + __m128i bytesh = _mm_cmpeq_epi8(_mm_set1_epi64x(-1), + _mm_or_si128(_mm_set1_epi64x(0x7fbfdfeff7fbfdfe), + _mm_shuffle_epi8(_mm_set1_epi32(x32), + _mm_set_epi64x(0x0303030303030303, 0x0202020202020202)))); + bytesh = _mm_andnot_si128(bytesh, _mm_set1_epi8((char)0xF0)); + return _mm_or_si128(qxh, bytesh); + } + inline __m256i load(const block_iq4_nl *b) { return MM256_SET_M128I(load1(b), load0(b)); } @@ -973,6 +1003,17 @@ class tinyBLAS_Q0_AVX { _mm_srli_epi16(x, 4), 1)); } + static inline __m256i bittobyte(const uint8_t *p) { + uint32_t x32; + memcpy(&x32, p, sizeof(uint32_t)); + __m256i bytes = _mm256_cmpeq_epi8(_mm256_set1_epi64x(-1), + _mm256_or_si256(_mm256_set1_epi64x(0x7fbfdfeff7fbfdfe), + _mm256_shuffle_epi8(_mm256_set1_epi32(x32), + 
_mm256_set_epi64x(0x0303030303030303, 0x0202020202020202, + 0x0101010101010101, 0x0000000000000000)))); + return _mm256_andnot_si256(bytes, _mm256_set1_epi8((char)0xF0)); + } + const TA *const A; const TB *const B; TC *const C; @@ -1182,6 +1223,22 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda #endif } + case GGML_TYPE_Q5_0: { + if (Btype != GGML_TYPE_Q8_0) + return false; +#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__) + tinyBLAS_Q0_AVX tb{ + k, (const block_q5_0 *)A, lda, + (const block_q8_0 *)B, ldb, + (float *)C, ldc, + ith, nth}; + tb.matmul(m, n); + return true; +#else + return false; +#endif + } + case GGML_TYPE_IQ4_NL: { if (Btype != GGML_TYPE_Q8_0) return false; diff --git a/include/llama.h b/include/llama.h index d4059c8dd0431..5837d74bc415b 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1102,6 +1102,9 @@ extern "C" { /// @details XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335 LLAMA_API struct llama_sampler * llama_sampler_init_xtc (float p, float t, size_t min_keep, uint32_t seed); + + LLAMA_API struct llama_sampler * llama_sampler_init_k_shift (int32_t k); + /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. 
@@ -1141,6 +1144,16 @@ extern "C" { bool penalize_nl, // consider newlines as a repeatable token bool ignore_eos); // ignore the end-of-sequence token + /// @details DRY sampler, designed by p-e-w, as described in: https://github.com/oobabooga/text-generation-webui/pull/5677, porting Koboldcpp implementation authored by pi6am: https://github.com/LostRuins/koboldcpp/pull/982 + LLAMA_API struct llama_sampler * llama_sampler_init_dry( + const struct llama_model * model, + float dry_multiplier, + float dry_base, + int32_t dry_allowed_length, + int32_t dry_penalty_last_n, + const char ** seq_breakers, + size_t num_breakers); + LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias( int32_t n_vocab, int32_t n_logit_bias, diff --git a/scripts/sync-ggml-am.sh b/scripts/sync-ggml-am.sh index ffce2aab0918e..fba29b9352e68 100755 --- a/scripts/sync-ggml-am.sh +++ b/scripts/sync-ggml-am.sh @@ -76,6 +76,7 @@ while read c; do src/ggml*.m \ src/ggml*.metal \ src/ggml*.cu \ + src/ggml-amx/* \ src/ggml-cann/* \ src/ggml-cuda/* \ src/ggml-sycl/* \ @@ -121,6 +122,8 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then # src/ggml-aarch64.c -> ggml/src/ggml-aarch64.c # src/ggml-aarch64.h -> ggml/src/ggml-aarch64.h # src/ggml-alloc.c -> ggml/src/ggml-alloc.c + # src/ggml-amx/* -> ggml/src/ggml-amx/ + # src/ggml-amx.cpp -> ggml/src/ggml-amx.cpp # src/ggml-backend-impl.h -> ggml/src/ggml-backend-impl.h # src/ggml-backend.cpp -> ggml/src/ggml-backend.cpp # src/ggml-cann/* -> ggml/src/ggml-cann/ @@ -141,6 +144,7 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then # # include/ggml.h -> ggml/include/ggml.h # include/ggml-alloc.h -> ggml/include/ggml-alloc.h + # include/ggml-amx.h -> ggml/include/ggml-amx.h # include/ggml-backend.h -> ggml/include/ggml-backend.h # include/ggml-blas.h -> ggml/include/ggml-blas.h # include/ggml-cann.h -> ggml/include/ggml-cann.h @@ -168,6 +172,8 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then -e 's/([[:space:]]|[ab]\/)src\/ggml-aarch64\.c/\1ggml\/src\/ggml-aarch64.c/g' \ -e 
's/([[:space:]]|[ab]\/)src\/ggml-aarch64\.h/\1ggml\/src\/ggml-aarch64.h/g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-alloc\.c/\1ggml\/src\/ggml-alloc.c/g' \ + -e 's/([[:space:]]|[ab]\/)src\/ggml-amx\//\1ggml\/src\/ggml-amx\//g' \ + -e 's/([[:space:]]|[ab]\/)src\/ggml-amx\.cpp/\1ggml\/src\/ggml-amx.cpp/g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-backend-impl\.h/\1ggml\/src\/ggml-backend-impl.h/g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-backend\.cpp/\1ggml\/src\/ggml-backend.cpp/g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-cann\//\1ggml\/src\/ggml-cann\//g' \ @@ -187,6 +193,7 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then -e 's/([[:space:]]|[ab]\/)src\/vulkan-shaders\//\1ggml\/src\/vulkan-shaders\//g' \ -e 's/([[:space:]]|[ab]\/)include\/ggml\.h/\1ggml\/include\/ggml.h/g' \ -e 's/([[:space:]]|[ab]\/)include\/ggml-alloc\.h/\1ggml\/include\/ggml-alloc.h/g' \ + -e 's/([[:space:]]|[ab]\/)include\/ggml-amx\.h/\1ggml\/include\/ggml-amx.h/g' \ -e 's/([[:space:]]|[ab]\/)include\/ggml-backend\.h/\1ggml\/include\/ggml-backend.h/g' \ -e 's/([[:space:]]|[ab]\/)include\/ggml-blas\.h/\1ggml\/include\/ggml-blas.h/g' \ -e 's/([[:space:]]|[ab]\/)include\/ggml-cann\.h/\1ggml\/include\/ggml-cann.h/g' \ diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index 7f689f6328879..da40927e196cd 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -6dccc647264f5429df2624f36138f601e7ce23e5 +162e232411ee98ceb0cccfa84886118d917d2123 diff --git a/scripts/sync-ggml.sh b/scripts/sync-ggml.sh index f6ff5e68354f1..f5d87324ab366 100755 --- a/scripts/sync-ggml.sh +++ b/scripts/sync-ggml.sh @@ -8,6 +8,8 @@ cp -rpv ../ggml/src/ggml.c ./ggml/src/ggml.c cp -rpv ../ggml/src/ggml-aarch64.c ./ggml/src/ggml-aarch64.c cp -rpv ../ggml/src/ggml-aarch64.h ./ggml/src/ggml-aarch64.h cp -rpv ../ggml/src/ggml-alloc.c ./ggml/src/ggml-alloc.c +cp -rpv ../ggml/src/ggml-amx/* ./ggml/src/ggml-amx/ +cp -rpv ../ggml/src/ggml-amx.cpp ./ggml/src/ggml-amx.cpp cp -rpv ../ggml/src/ggml-backend-impl.h 
./ggml/src/ggml-backend-impl.h cp -rpv ../ggml/src/ggml-backend.cpp ./ggml/src/ggml-backend.cpp cp -rpv ../ggml/src/ggml-cann/* ./ggml/src/ggml-cann/ @@ -29,6 +31,7 @@ cp -rpv ../ggml/src/vulkan-shaders/* ./ggml/src/vulkan-shaders/ cp -rpv ../ggml/include/ggml.h ./ggml/include/ggml.h cp -rpv ../ggml/include/ggml-alloc.h ./ggml/include/ggml-alloc.h +cp -rpv ../ggml/include/ggml-amx.h ./ggml/include/ggml-amx.h cp -rpv ../ggml/include/ggml-backend.h ./ggml/include/ggml-backend.h cp -rpv ../ggml/include/ggml-blas.h ./ggml/include/ggml-blas.h cp -rpv ../ggml/include/ggml-cann.h ./ggml/include/ggml-cann.h diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index d71516153cf82..ba370147225c1 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -188,6 +188,17 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) cur_p->size = k; } +static void llama_sampler_top_shift_impl(llama_token_data_array * cur_p, int k) { + // sort before shifting + std::sort(cur_p->data, cur_p->data + cur_p->size, [](const llama_token_data & a, const llama_token_data & b) { + return a.logit > b.logit; + }); + + // shift to a token #[k] + cur_p->data += k; + cur_p->size -= k; +} + static uint32_t get_rng_seed(uint32_t seed) { if (seed == LLAMA_DEFAULT_SEED) { // use system clock if std::random_device is not a true RNG @@ -1177,6 +1188,64 @@ struct llama_sampler * llama_sampler_init_xtc(float p, float t, size_t min_keep, }; } +// k-shift + +struct llama_sampler_k_shift { + const int32_t k; + bool k_set; +}; + +static const char * llama_sampler_k_shift_name(const struct llama_sampler * /*smpl*/) { + return "k-shift"; +} + +static void llama_sampler_k_shift_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + auto * ctx = (llama_sampler_k_shift *) smpl->ctx; + + if (ctx->k_set == true + || ctx->k <= 0 + || ctx->k >= (int) cur_p->size) { + return; + } + + llama_sampler_top_shift_impl(cur_p, ctx->k); + ctx->k_set = true; +} + +static 
struct llama_sampler * llama_sampler_k_shift_clone(const struct llama_sampler * smpl) { + auto * ctx = (const llama_sampler_k_shift *) smpl->ctx; + + return llama_sampler_init_k_shift(ctx->k); +} + +static void llama_sampler_k_shift_free(struct llama_sampler * smpl) { + delete (llama_sampler_k_shift *) smpl->ctx; +} + +static void llama_sampler_k_shift_reset(struct llama_sampler * smpl) { + auto * ctx = (llama_sampler_k_shift *) smpl->ctx; + ctx->k_set = false; +} + +static struct llama_sampler_i llama_sampler_k_shift_i = { + /* .name = */ llama_sampler_k_shift_name, + /* .accept = */ nullptr, + /* .apply = */ llama_sampler_k_shift_apply, + /* .reset = */ llama_sampler_k_shift_reset, + /* .clone = */ llama_sampler_k_shift_clone, + /* .free = */ llama_sampler_k_shift_free, +}; + +struct llama_sampler * llama_sampler_init_k_shift(int32_t k) { + return new llama_sampler { + /* .iface = */ &llama_sampler_k_shift_i, + /* .ctx = */ new llama_sampler_k_shift { + /* .k = */ k, + /* .k_set = */ false, + }, + }; +} + // mirostat struct llama_sampler_mirostat { @@ -1683,6 +1752,397 @@ struct llama_sampler * llama_sampler_init_penalties( }; } +// DRY + +struct llama_sampler_dry { + int32_t total_context_size; + + const float dry_multiplier; + const float dry_base; + const int32_t dry_allowed_length; + const int32_t dry_penalty_last_n; + + std::unordered_multimap> dry_processed_breakers; + std::vector dry_repeat_count; + std::unordered_map dry_max_token_repeat; + ring_buffer last_tokens; +}; + +// Ported from Koboldcpp, original PR: https://github.com/LostRuins/koboldcpp/pull/982 (Original author: pi6am) +static void get_overlapping_token_sequences(const llama_vocab & vocab, const std::string& str, std::unordered_multimap>& token_sequences, int max_tail_len = -1) { + for (llama_token token_id = 0; token_id < (llama_token)vocab.n_vocab; token_id++) { + std::string word = llama_detokenize(vocab, {token_id}, true); + if (word.find(str) != std::string::npos) { + 
token_sequences.emplace(token_id, std::vector()); + } else { + size_t word_len = word.size(), str_len = str.size(); + size_t pos = -1; + while ((pos = word.find(str[0], pos + 1)) != std::string::npos) { + bool match = true; + size_t i; + for (i = 1; i < str_len && i + pos < word_len; ++i) { + if (word[pos + i] != str[i]) { + match = false; + break; + } + } + if (match) { + std::vector tokenization = llama_tokenize_internal(vocab, str.substr(i), false, false); + if (max_tail_len >= 0 && tokenization.size() > (size_t)max_tail_len) { + tokenization.resize(max_tail_len); + } + + // Ensure we don't already have a duplicate matching tokenization + auto its = token_sequences.equal_range(token_id); + bool found = false; + for (auto it = its.first; it != its.second; ++it) { + if (tokenization == it->second) { + found = true; + break; + } + } + if (!found) { + token_sequences.emplace(token_id, tokenization); + } + } + } + } + } +} + +static const char * llama_sampler_dry_name(const struct llama_sampler * /*smpl*/) { + return "dry"; +} + +static void llama_sampler_dry_accept(struct llama_sampler * smpl, llama_token token) { + auto * ctx = (llama_sampler_dry *) smpl->ctx; + if (ctx->dry_multiplier == 0.0f || ctx->dry_base < 1.0f || ctx->dry_penalty_last_n == 0) { + return; + } + + ctx->last_tokens.push_back(token); +} + +// Ported from Koboldcpp, original PR: https://github.com/LostRuins/koboldcpp/pull/982 (Original author: pi6am) +static void llama_sampler_dry_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + auto * ctx = (llama_sampler_dry *) smpl->ctx; + + if (ctx->dry_multiplier == 0.0f || ctx->dry_base < 1.0f || ctx->dry_penalty_last_n == 0) { + return; + } + + int32_t effective_dry_penalty_last_n = (ctx->dry_penalty_last_n == -1) ? 
ctx->total_context_size : std::max(ctx->dry_penalty_last_n, 0); + int last_n_repeat = std::min(std::min((int)ctx->last_tokens.size(), effective_dry_penalty_last_n), ctx->total_context_size); + + if (last_n_repeat <= ctx->dry_allowed_length) { + return; + } + + ctx->dry_repeat_count.assign(last_n_repeat, 0); + ctx->dry_max_token_repeat.clear(); + + // Step 1: Look for restart sequences to limit the maximum repetition length. + // Work backwards through the context looking for any token that begins a restart sequence. + // + // The collection `restart_sequences` is a mapping from a "head" token to all "tail" + // sequences that together comprise a restart sequence. This allows us to quickly check + // whether each token is the head of a complete sequence. Most restart sequences are actually + // a single token, and for these the "tail" is an empty vector. + // + // If the token is a "head", test all restart sequences that begin with this token + // (there will often only be one sequence for each token, but if sequences like 'aaaq1' and + // 'aaa1' are used as restart strings, both could start with 'aaa' when tokenized). The + // longest matching sequence (if any) is used to limit the maximum repetition length. + // + // Note that in the case case of a short sequence contained in a longer one, this might fail to + // find the smallest value for `rep_limit`. For example, if 'amniotic' and 'ni' are both used as + // restart sequences, 'ni' will be found first, and since it's shorter it will fail to suppress + // 'otic'. This is a minor issue since fully contained restart sequences are likely to be rare. + // + // This is theoretically worst-case O(N^2) for arbitrary restart sequences, which is why we + // have already clamped the maximum tail sequence length when generating `restart_sequences`. + // With clamping, this scan is O(N) in the context length. 
+ + int rep_limit = last_n_repeat; + for (int i = 0; i < last_n_repeat; ++i) { + llama_token token = ctx->last_tokens.rat(i); + auto its = ctx->dry_processed_breakers.equal_range(token); + if (its.first == ctx->dry_processed_breakers.end()) { + continue; + } + int longest_match = -1; + for (auto it = its.first; it != its.second; ++it) { + // Note that (*it) does not contain the head character, so seq_len will be + // the restart sequence length minus 1. + // In the common case of a single-token restart sequence, (*it) will be empty + // and we will trivially match. + int seq_len = (int)it->second.size(); + if (seq_len > longest_match && seq_len <= (int)i) { + bool match = true; + for (int offset = 0; offset < seq_len; ++offset) { + // The -1 when indexing `last_tokens` is because we already matched the head. + if (it->second[offset] != ctx->last_tokens.rat(i - offset - 1)) { + match = false; + break; + } + } + if (match) { + longest_match = seq_len; + } + } + } + if (longest_match >= 0) { + // We found a restart sequence starting `i` tokens from the end and continuing for + // `longest_match` tokens. + rep_limit = i - longest_match; + break; + } + } + if (rep_limit < ctx->dry_allowed_length) { + return; + } + + // Step 2: Iterate in reverse over the last N tokens of the context, using the "Z-algorithm" (in + // the reverse direction) to efficiently compute the positions and lengths of suffixes appearing + // elsewhere in the context. We limit the suffix length to `rep_limit` to respect restart sequences. 
+ // + // This algorithm is not currently documented on Wikipedia, but there is a clear description here: + // https://ivanyu.me/blog/2014/10/15/z-algorithm/ + // + // The code below is adapted from the public domain implementation by the same author here: + // https://github.com/ivanyu/string-algorithms/blob/master/z_algorithm.py + // + // Example: + // Last N tokens: a b c c b c y a b c + // Repeat counts: 0 0 3 1 0 2 0 0 0 0 + // ^ + // This `3` means that the last three tokens of the context (a b c) also appear here. + // + // This step is worst case O(N) since the Z-algorithm is linear, despite the appearance of nested + // for/while loops. This can be seen by observing that the `lt` and `rt` bounds are set after each + // repeated suffix is detected (i.e. after each while loop when n > 0). These bound variables + // ensure that the inner while loops only examine each token in the context once as the outer + // for loop iterates over the context. + + { + const int last = last_n_repeat - 1; + int rt = 0, lt = 0; + + for (int k = 1; k < last_n_repeat; ++k) { + if (k > rt) { + // If k is outside the current Z-box, do naive computation. + int n = 0; + while (n + k < last_n_repeat && ctx->last_tokens.rat(n) == ctx->last_tokens.rat(n+k)) { + ++n; + } + ctx->dry_repeat_count[last - k] = std::min(n, rep_limit); + if (n > 0) { + lt = k; + rt = k+n-1; + } + } else { + // If k is inside the current Z-box, consider two cases. + + int p = k - lt; // Pair index. 
+ int right_part_len = rt - k + 1; + + if (ctx->dry_repeat_count[last - p] < right_part_len) { + int n = std::min(ctx->dry_repeat_count[last - p], rep_limit); + ctx->dry_repeat_count[last - k] = n; + } else { + int i = rt + 1; + while (i < last_n_repeat && ctx->last_tokens.rat(i) == ctx->last_tokens.rat(i - k)) { + i += 1; + } + + int n = std::min(i - k, rep_limit); + ctx->dry_repeat_count[last - k] = n; + lt = k; + rt = i - 1; + } + } + } + } + + // Step 3: Iterate over dry_repeat_count and last_tokens, examining the maximum repeat length + // that would be generated by emitting each new token that would extend a sequence. + // + // Following the same example as above: + // Last N tokens: a b c c b c y a b c + // Repeat counts: 0 0 3 1 0 2 0 0 0 0 + // + // For each non-zero, look ahead one token. This token, if emitted, would extend the repetition. + // c: 3 -> 4 (from `a b c` to `a b c c`) + // b: 1 -> 2 (from `c` to `c b`) + // y: 2 -> 3 (from `b c` to `b c y`) + + for (int i = 0; i < last_n_repeat - 1; ++i) { + int repeat_len = ctx->dry_repeat_count[i]; + if (repeat_len >= ctx->dry_allowed_length) { + // This token ends a repeat, so the next token would continue one. + // By convention, the value of `repeat_len` only includes the tokens currently + // in the context, not the new token that would be added. + llama_token token = ctx->last_tokens.rat(last_n_repeat - 2 - i); + // Track the maximum sequence ending in this token. + const auto& it = ctx->dry_max_token_repeat.find(token); + if (it == ctx->dry_max_token_repeat.end() || it->second < repeat_len) { + ctx->dry_max_token_repeat[token] = repeat_len; + } + } + } + + // Step 4: Apply logit penalties based on the maximum repeat length for relevant tokens. + + // Prevent floating point overflow in `pow(penalty_base, exponent)` by clamping to `max_exponent`. 
+ // Compute it from `penalty_base` and the approximate log of `std::numeric_limits::max()` + const float FLOAT_MAX_LOG = 88.7228391f; + int max_exponent = 0; + if (ctx->dry_base > 1.000001f) { + max_exponent = FLOAT_MAX_LOG / std::log(ctx->dry_base); + } + + for (size_t i = 0; i < cur_p->size; ++i) { + const auto& af_kvp = ctx->dry_max_token_repeat.find(cur_p->data[i].id); + if (af_kvp != ctx->dry_max_token_repeat.end()) { + // Check all sequence breakers starting with this token + auto range = ctx->dry_processed_breakers.equal_range(cur_p->data[i].id); + bool is_single_token_breaker = false; + + for (auto it = range.first; it != range.second; ++it) { + if (it->second.empty()) { + is_single_token_breaker = true; + break; + } + } + + // Apply penalty only if it's not a single-token sequence breaker + if (!is_single_token_breaker) { + int repeat_exp = af_kvp->second - ctx->dry_allowed_length; + if (max_exponent > 0 && repeat_exp > max_exponent) { + repeat_exp = max_exponent; + } + float penalty = ctx->dry_multiplier * std::pow(ctx->dry_base, repeat_exp); + cur_p->data[i].logit -= penalty; + } + } + } + + cur_p->sorted = false; +} + +static void llama_sampler_dry_reset(struct llama_sampler * smpl) { + auto * ctx = (llama_sampler_dry *) smpl->ctx; + ctx->last_tokens.clear(); + ctx->dry_repeat_count.clear(); + ctx->dry_max_token_repeat.clear(); +} + +static struct llama_sampler * llama_sampler_dry_clone(const struct llama_sampler * smpl) { + const auto * ctx = (llama_sampler_dry *) smpl->ctx; + + // nullptr is passed as vocab because it is only needed for raw sequence breaker processing, which we have already done and will be copying + auto * result = llama_sampler_init_dry(nullptr, ctx->dry_multiplier, ctx->dry_base, ctx->dry_allowed_length, ctx->dry_penalty_last_n, NULL, 0); + // Copy the state, including the processed breakers + { + auto * result_ctx = (llama_sampler_dry *) result->ctx; + result_ctx->dry_processed_breakers = ctx->dry_processed_breakers; + 
result_ctx->dry_repeat_count = ctx->dry_repeat_count; + result_ctx->dry_max_token_repeat = ctx->dry_max_token_repeat; + result_ctx->last_tokens = ctx->last_tokens; + } + + return result; +} + +static void llama_sampler_dry_free(struct llama_sampler * smpl) { + delete (llama_sampler_dry *) smpl->ctx; +} + +static struct llama_sampler_i llama_sampler_dry_i = { + /* .name = */ llama_sampler_dry_name, + /* .accept = */ llama_sampler_dry_accept, + /* .apply = */ llama_sampler_dry_apply, + /* .reset = */ llama_sampler_dry_reset, + /* .clone = */ llama_sampler_dry_clone, + /* .free = */ llama_sampler_dry_free, +}; + +struct llama_sampler * llama_sampler_init_dry_impl(const struct llama_vocab & vocab, int32_t context_size, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const char** seq_breakers, size_t num_breakers) { + int32_t effective_dry_penalty_last_n = (dry_penalty_last_n == -1) ? context_size : std::max(dry_penalty_last_n, 0); + std::unordered_multimap> processed_breakers; + const int MAX_CHAR_LEN = 40; + const int MAX_SEQ_LEN = 20; + + const bool dry_enabled = (dry_multiplier != 0.0f && dry_base >= 1.0f && dry_penalty_last_n != 0); + + if (dry_enabled && seq_breakers != nullptr && num_breakers > 0) { + // Process sequence breakers + for (size_t i = 0; i < num_breakers; ++i) { + if (seq_breakers[i] == nullptr || std::strlen(seq_breakers[i]) == 0) { + LLAMA_LOG_WARN("skipping null or empty DRY sequence breaker at index %zu\n", i); + continue; + } + + std::string sequence_break(seq_breakers[i]); + if (sequence_break.empty()) { + LLAMA_LOG_WARN("skipping empty DRY sequence breaker\n"); + continue; + } + + if (sequence_break.size() > MAX_CHAR_LEN) { + LLAMA_LOG_WARN("truncating DRY sequence breaker to %d characters\n", MAX_CHAR_LEN); + sequence_break.resize(MAX_CHAR_LEN); + } + + get_overlapping_token_sequences(vocab, sequence_break, processed_breakers, MAX_SEQ_LEN); + } + } + + return new llama_sampler { + /* .iface = */ 
&llama_sampler_dry_i, + /* .ctx = */ new llama_sampler_dry { + /* .total_context_size = */ context_size, + /* .dry_multiplier = */ dry_multiplier, + /* .dry_base = */ dry_base, + /* .dry_allowed_length = */ dry_allowed_length, + /* .dry_penalty_last_n = */ dry_penalty_last_n, + /* .dry_processed_breakers = */ std::move(processed_breakers), + /* .dry_repeat_count = */ dry_enabled ? std::vector(effective_dry_penalty_last_n, 0) : std::vector{}, + /* .dry_max_token_repeat = */ {}, + /* .last_tokens = */ dry_enabled ? ring_buffer(effective_dry_penalty_last_n) : ring_buffer(0), + }, + }; +} + +// wrapper for test-sampling.cpp +struct llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const std::vector>& seq_breakers) { + llama_vocab dummy_vocab; + auto * result = llama_sampler_init_dry_impl(dummy_vocab, context_size, dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n, NULL, 0); + auto * ctx = (llama_sampler_dry *) result->ctx; + + // Process the token-based sequence breakers + ctx->dry_processed_breakers.clear(); + if (seq_breakers.empty()) { + LLAMA_LOG_WARN("empty DRY sequence breakers list in llama_sampler_init_dry_testing\n"); + } else { + for (const auto& breaker : seq_breakers) { + if (breaker.empty()) { + LLAMA_LOG_WARN("skipping DRY empty sequence breaker\n"); + continue; + } + llama_token head_token = breaker[0]; + std::vector tail_tokens(breaker.begin() + 1, breaker.end()); + ctx->dry_processed_breakers.emplace(head_token, std::move(tail_tokens)); + } + + if (ctx->dry_processed_breakers.empty()) { + LLAMA_LOG_WARN("no valid DRY sequence breakers processed in llama_sampler_init_dry_testing\n"); + } + } + + return result; +} + // logit-bias struct llama_sampler_logit_bias { diff --git a/src/llama-sampling.h b/src/llama-sampling.h index 2683f1b92696f..919f6fdfcefb8 100644 --- a/src/llama-sampling.h +++ b/src/llama-sampling.h @@ -28,3 +28,21 
@@ struct llama_sampler * llama_sampler_init_grammar_impl( struct llama_sampler * llama_sampler_init_infill_impl( const struct llama_vocab & vocab); + +struct llama_sampler * llama_sampler_init_dry_impl( + const struct llama_vocab & vocab, + int32_t context_size, + float dry_multiplier, + float dry_base, + int32_t dry_allowed_length, + int32_t dry_penalty_last_n, + const char ** seq_breakers, + size_t num_breakers); + +struct llama_sampler * llama_sampler_init_dry_testing( + int32_t context_size, + float dry_multiplier, + float dry_base, + int32_t dry_allowed_length, + int32_t dry_penalty_last_n, + const std::vector>& seq_breakers); diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 0a49ddbe3e291..d1dc96276c2a2 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -1966,3 +1966,19 @@ int32_t llama_detokenize_impl( return total <= text_len_max ? total : -total; } + +std::string llama_detokenize(const struct llama_vocab & vocab, const std::vector & tokens, bool special) { + std::string text; + text.resize(std::max(text.capacity(), tokens.size())); + int32_t n_chars = llama_detokenize_impl(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special); + if (n_chars < 0) { + text.resize(-n_chars); + n_chars = llama_detokenize_impl(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special); + GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization + } + + text.resize(n_chars); + + // NOTE: the original tokenizer decodes bytes after collecting the pieces. 
+ return text; +} diff --git a/src/llama-vocab.h b/src/llama-vocab.h index d958d0073be95..4bb16d2e4299f 100644 --- a/src/llama-vocab.h +++ b/src/llama-vocab.h @@ -163,3 +163,8 @@ int32_t llama_detokenize_impl( int32_t text_len_max, bool remove_special, bool unparse_special); + +std::string llama_detokenize( + const struct llama_vocab & vocab, + const std::vector & tokens, + bool special); diff --git a/src/llama.cpp b/src/llama.cpp index 24e1f1f01a857..50eebc2c298f5 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -21775,6 +21775,10 @@ struct llama_sampler * llama_sampler_init_infill(const struct llama_model * mode return llama_sampler_init_infill_impl(model->vocab); } +struct llama_sampler * llama_sampler_init_dry(const struct llama_model * model, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const char** seq_breakers, size_t num_breakers) { + return llama_sampler_init_dry_impl(model->vocab, llama_n_ctx_train(model), dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n, seq_breakers, num_breakers); +} + // // model split // diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp index 05600e6f54e90..22f74ac4cfe1d 100644 --- a/tests/test-sampling.cpp +++ b/tests/test-sampling.cpp @@ -10,6 +10,8 @@ #include #include +extern struct llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const std::vector>& seq_breakers); + static void dump(const llama_token_data_array * cur_p) { for (size_t i = 0; i < cur_p->size; i++) { printf("%d: %f (%f)\n", cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit); @@ -81,6 +83,17 @@ static void test_temp_ext(const std::vector & probs, const std::vector & probs, const std::vector & probs_expected, int k) { + sampler_tester tester(probs, probs_expected); + + DUMP(&tester.cur_p); + tester.apply(llama_sampler_init_k_shift(k)); + 
tester.apply(llama_sampler_init_dist (0)); + DUMP(&tester.cur_p); + + tester.check(); +} + static void test_top_k(const std::vector & probs, const std::vector & probs_expected, int k) { sampler_tester tester(probs, probs_expected); @@ -167,6 +180,29 @@ static void test_penalties( tester.check(); } +static void test_dry( + const std::vector & probs, const std::vector & last_tokens, + const std::vector & expected_probs, float dry_multiplier, float dry_base, + int dry_allowed_length, int dry_penalty_last_n, + const std::vector> & seq_breakers +) { + GGML_ASSERT(probs.size() == expected_probs.size()); + + sampler_tester tester(probs, expected_probs); + + auto * sampler = llama_sampler_init_dry_testing(1024, dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n, seq_breakers); + + for (size_t i = 0; i < last_tokens.size(); i++) { + llama_sampler_accept(sampler, last_tokens[i]); + } + + DUMP(&tester.cur_p); + tester.apply(sampler); + tester.apply(llama_sampler_init_dist(0)); + DUMP(&tester.cur_p); + tester.check(); +} + static void test_sampler_queue(const size_t n_vocab, const std::string & samplers_sequence, const int top_k, const float top_p, const float min_p ) { sampler_tester tester(n_vocab); @@ -274,6 +310,7 @@ static void test_perf() { data.emplace_back(llama_token_data{i, logit, 0.0f}); } + BENCH(llama_sampler_init_k_shift (10), data, 32); BENCH(llama_sampler_init_top_k (40), data, 32); BENCH(llama_sampler_init_top_p (0.8f, 1), data, 32); BENCH(llama_sampler_init_min_p (0.2f, 1), data, 32); @@ -291,6 +328,12 @@ int main(void) { test_temp_ext({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1.0f, 0.0f, 1.0f); test_temp_ext({0.1f, 0.2f, 0.3f, 0.4f}, {1.0f, 0.0f, 0.0f, 0.0f}, 0.0f, 0.0f, 1.0f); + test_k_shift({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 4); + test_k_shift({0.1f, 0.2f, 0.3f, 0.4f}, {1.0f}, 3); + test_k_shift({0.1f, 0.2f, 0.3f, 0.4f}, {0.66666f, 0.33333f}, 2); + test_k_shift({0.1f, 0.2f, 0.3f, 0.4f}, {0.5f, 0.33333f, 0.16666f}, 
1); + test_k_shift({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 0); + test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {1.0f}, 1); test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.44444f, 0.33333f, 0.22222f}, 3); test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 4); @@ -333,6 +376,13 @@ int main(void) { test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.499966f, 0.499966f, 0.000023f, 0.000023f, 0.000023f}, 1.0f, 5.0f, 5.0f); test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.499977f, 0.499977f, 0.000023f, 0.000023f, 0.000000f}, 1.0f, 5.0f, 5.0f); + + test_dry({0.25f, 0.25f, 0.25f, 0.25f}, {0, 1}, {0.25f, 0.25f, 0.25f, 0.25f}, 1.0f, 1.1f, 2, 4, {}); + test_dry({0.25f, 0.25f, 0.25f, 0.25f}, {0, 1, 2, 0, 1}, {0.296923f, 0.296923f, 0.296923f, 0.109232f}, 1.0f, 1.1f, 2, 5, {}); + test_dry({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 3, 4, 0, 1}, {0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, 1.0f, 1.1f, 2, 6, {{3}}); + test_dry({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 1}, {0.241818f, 0.241818f, 0.241818f, 0.241818f, 0.032727f}, 2.0f, 1.1f, 2, 5, {}); + test_dry({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 3, 4, 0, 1}, {0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, 1.0f, 1.1f, 4, 7, {}); + test_sampler_queue(10000, "k", 10000, 1.0f, 1.0f); test_sampler_queue(10000, "k", 1, 1.0f, 1.0f); test_sampler_queue(10000, "p", 10000, 1.0f, 1.0f);