lookup: evaluation tools, use corpus/previous gens
1 parent f9c7ba3 · commit e7d1e38
Showing 13 changed files with 758 additions and 61 deletions.
@@ -0,0 +1,266 @@
#include "ngram-cache.h"
#include "log.h"

#include <cstdio>
#include <fstream>
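
// Update an n-gram cache with (up to) the nnew most recent tokens of inp:
// for every n-gram size in [ngram_min, ngram_max], count how often each token
// follows each n-gram. Passing nnew == inp.size() rebuilds from scratch.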
void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
                              std::vector<llama_token> & inp, int nnew, bool print_progress) {
    const int64_t t_start_ms = ggml_time_ms();
    const int     inp_size   = inp.size();

    for (int ngram_size = ngram_min; ngram_size <= ngram_max; ++ngram_size) {
        const int i_start = std::max(inp_size - nnew, ngram_size);
        for (int i = i_start; i < inp_size; ++i) {
            const int ngram_start = i - ngram_size;
            llama_ngram ngram(&inp[ngram_start], ngram_size);
            const llama_token token = inp[i];

            llama_ngram_cache::iterator part_it = ngram_cache.find(ngram);
            if (part_it == ngram_cache.end()) {
                llama_ngram_cache_part part;
                part.emplace(token, 1);
                ngram_cache.emplace(ngram, part);
            } else {
                llama_ngram_cache_part::iterator token_count_it = part_it->second.find(token);
                if (token_count_it == part_it->second.end()) {
                    part_it->second.emplace(token, 1);
                } else {
                    token_count_it->second++;
                }
            }
            if (print_progress && i % 10000000 == 0) {
                const int64_t t_now_ms = ggml_time_ms();
                const int64_t eta_ms   = (inp_size - i) * (t_now_ms - t_start_ms) / i;
                const int64_t eta_min  = eta_ms / (60*1000);
                const int64_t eta_s    = (eta_ms - 60*1000*eta_min) / 1000;

                fprintf(stderr, "%s: %d/%d done, ETA: %02ld:%02ld\n", __func__, i, inp_size, eta_min, eta_s);
            }
        }
    }
}
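
// Example usage (hypothetical driver code, not part of this commit): after
// sampling n_new tokens and appending them to inp, the context cache can be
// refreshed incrementally with e.g.
//     llama_ngram_cache_update(nc_context, 1, 4, inp, n_new, false);
// the ngram_min/ngram_max values of 1 and 4 are illustrative only.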

// Helper function to get a token from the combined, speculative sequence of inp and draft.
static llama_token get_token(const std::vector<llama_token> & inp, const std::vector<llama_token> & draft, const size_t i) {
    return i < inp.size() ? inp[i] : draft[1 + i - inp.size()];
}

// If sample size or percentage are below these thresholds the draft is aborted early:
constexpr int    draft_min_sample_size_lax[LLAMA_NGRAM_MAX] = { 2,  2,  1,  1};
constexpr int        draft_min_percent_lax[LLAMA_NGRAM_MAX] = {66, 50, 50, 50};
constexpr int draft_min_sample_size_strict[LLAMA_NGRAM_MAX] = { 4,  3,  2,  2};
constexpr int     draft_min_percent_strict[LLAMA_NGRAM_MAX] = {75, 66, 66, 66};
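// One entry per draftable n-gram size: the static-cache path reads index
// LLAMA_NGRAM_STATIC-1, while the primary try_draft() below indexes these
// arrays in lockstep with its ngrams_primary vector. The lax thresholds gate
// the context cache (and the static-only fallback), the strict ones the
// dynamic cache.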

// Helper function that tries to draft a token from only the static ngram cache:
static llama_token try_draft(llama_ngram_cache & nc_static, const llama_ngram ngram_static) {
    llama_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
    if (part_static_it == nc_static.end()) {
        return -1;
    }
    const llama_ngram_cache_part part_static = part_static_it->second;

    int max_count_static = 0;
    int sum_count_static = 0;
    llama_token max_token = -1;

    for (std::pair<llama_token, int> token_count_static : part_static) {
        const llama_token token        = token_count_static.first;
        const int32_t     count_static = token_count_static.second;

        if (count_static > max_count_static) {
            max_token        = token;
            max_count_static = count_static;
        }
        sum_count_static += count_static;
    }

    if (sum_count_static < draft_min_sample_size_lax[LLAMA_NGRAM_STATIC-1]) {
        return -1;
    }
    if (100*max_count_static < draft_min_percent_lax[LLAMA_NGRAM_STATIC-1]*sum_count_static) {
        return -1;
    }
    return max_token;
}
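
// The overload below scores each candidate token by count_primary*count_static:
// tokens that the static (corpus) cache also predicts have their primary counts
// boosted by a factor of 100, while tokens absent from the static cache keep a
// neutral weight of 1, so corpus evidence acts as a validator/tie-breaker.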

// Try to draft a token from primary cache (context/dynamic), validate with static cache:
static llama_token try_draft(
    llama_ngram_cache & nc_primary, const std::vector<llama_ngram> & ngrams_primary, llama_ngram_cache_part & part_static,
    const int * min_sample_size, const int * min_percent) {

    llama_token drafted_token = -1;

    for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == -1; --i) {
        const llama_ngram ngram_primary = ngrams_primary[i];

        llama_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary);
        if (part_primary_it == nc_primary.end()) {
            continue;
        }
        const llama_ngram_cache_part part_primary = part_primary_it->second;

        int max_count_primary = 0;
        int max_count_static  = 0;
        int sum_count_primary = 0;
        llama_token max_token = -1;

        for (std::pair<llama_token, int> token_count_primary : part_primary) {
            const llama_token token = token_count_primary.first;

            llama_ngram_cache_part::iterator token_count_static_it = part_static.find(token);

            const int32_t count_primary = token_count_primary.second;
            const int32_t count_static  = token_count_static_it != part_static.end() ? 100*token_count_static_it->second : 1;

            if (count_primary*count_static > max_count_primary*max_count_static) {
                max_token         = token;
                max_count_primary = count_primary;
                max_count_static  = count_static;
            }
            sum_count_primary += count_primary;
        }

        if (sum_count_primary < min_sample_size[i]) {
            continue;
        }
        if (100*max_count_primary < min_percent[i]*sum_count_primary) {
            continue;
        }
        drafted_token = max_token;
    }

    return drafted_token;
}
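
// Extend draft with up to n_draft speculative tokens. draft must arrive holding
// exactly one seed token (asserted below); get_token() then treats inp followed
// by draft[1:] as the combined sequence while new candidates are appended.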

void llama_ngram_cache_draft(
    std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
    llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static
) {
    GGML_ASSERT(draft.size() == 1);
    const int inp_size = inp.size();

    if (inp_size < LLAMA_NGRAM_STATIC) {
        return;
    }

    while ((int) draft.size()-1 < n_draft) {
        llama_token drafted_token = -1;

        const int ngram_start_static = inp_size-LLAMA_NGRAM_STATIC + draft.size()-1;
        llama_ngram ngram_static;
        for (int j = ngram_start_static; j < ngram_start_static + LLAMA_NGRAM_STATIC; ++j) {
            ngram_static.tokens[j-ngram_start_static] = get_token(inp, draft, j);
        }
        llama_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
        llama_ngram_cache_part part_static;
        if (part_static_it != nc_static.end()) {
            part_static = part_static_it->second;
        }

        // cd = context + dynamic
        std::vector<llama_ngram> ngrams_cd;
        for (int ngram_size_cd = ngram_min; ngram_size_cd <= ngram_max; ++ngram_size_cd) {
            const int ngram_start_cd = inp_size-ngram_size_cd + draft.size()-1;
            llama_ngram ngram_cd;
            for (int j = ngram_start_cd; j < ngram_start_cd + ngram_size_cd; ++j) {
                ngram_cd.tokens[j-ngram_start_cd] = get_token(inp, draft, j);
            }
            ngrams_cd.push_back(ngram_cd);
        }
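        // Cascade in order of preference: the context cache with lax thresholds,
        // then the dynamic cache with strict thresholds, then the static cache
        // on its own as a last resort.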
        if (drafted_token == -1) {
            drafted_token = try_draft(nc_context, ngrams_cd, part_static, draft_min_sample_size_lax, draft_min_percent_lax);
        }
        if (drafted_token == -1) {
            drafted_token = try_draft(nc_dynamic, ngrams_cd, part_static, draft_min_sample_size_strict, draft_min_percent_strict);
        }
        if (drafted_token == -1) {
            drafted_token = try_draft(nc_static, ngram_static);
        }

        if (drafted_token == -1) {
            break;
        }

        LOG(" - draft candidate: token=%d\n", drafted_token);
        draft.push_back(drafted_token);
    }
}
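
// On-disk format written below: for each cache entry, the raw llama_ngram bytes,
// an int32_t with the number of (token, count) pairs, then the pairs themselves;
// llama_ngram_cache_load mirrors this layout exactly.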
void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename) {
    std::ofstream file_out(filename, std::ios::binary);
    for (std::pair<llama_ngram, llama_ngram_cache_part> item : ngram_cache) {
        const llama_ngram      ngram        = item.first;
        llama_ngram_cache_part token_counts = item.second;
        GGML_ASSERT(!token_counts.empty());
        const int32_t ntokens = token_counts.size();

        file_out.write(reinterpret_cast<const char *>(&ngram),   sizeof(llama_ngram));
        file_out.write(reinterpret_cast<const char *>(&ntokens), sizeof(int32_t));
        for (std::pair<llama_token, int32_t> item2 : token_counts) {
            const llama_token token = item2.first;
            const int32_t     count = item2.second;
            file_out.write(reinterpret_cast<const char *>(&token), sizeof(llama_token));
            file_out.write(reinterpret_cast<const char *>(&count), sizeof(int32_t));
        }
    }
}

llama_ngram_cache llama_ngram_cache_load(std::string & filename) {
    std::ifstream hashmap_file(filename, std::ios::binary);
    if (!hashmap_file) {
        throw std::ifstream::failure("failed to open file " + filename);
    }
    llama_ngram_cache ngram_cache;

    llama_ngram ngram;
    int32_t     ntokens;
    llama_token token;
    int32_t     count;

    char * ngramc   = reinterpret_cast<char *>(&ngram);
    char * ntokensc = reinterpret_cast<char *>(&ntokens);
    char * tokenc   = reinterpret_cast<char *>(&token);
    char * countc   = reinterpret_cast<char *>(&count);
    while (hashmap_file.read(ngramc, sizeof(llama_ngram))) {
        GGML_ASSERT(hashmap_file.read(ntokensc, sizeof(int32_t)));
        llama_ngram_cache_part token_counts;

        for (int i = 0; i < ntokens; ++i) {
            GGML_ASSERT(hashmap_file.read(tokenc, sizeof(llama_token)));
            GGML_ASSERT(hashmap_file.read(countc, sizeof(int32_t)));
            token_counts.emplace(token, count);
        }

        ngram_cache.emplace(ngram, token_counts);
    }
    GGML_ASSERT(hashmap_file.eof());

    return ngram_cache;
}
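
// Merge ngram_cache_add into ngram_cache_target, summing counts for tokens
// present in both; n-grams found only in ngram_cache_add are inserted wholesale.
// (The commit title suggests this is how caches built from a corpus or from
// previous generations get folded together; the call sites live outside this file.)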
void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add) {
    for (std::pair<llama_ngram, llama_ngram_cache_part> ngram_part : ngram_cache_add) {
        const llama_ngram ngram      = ngram_part.first;
        llama_ngram_cache_part part  = ngram_part.second;

        llama_ngram_cache::iterator part_merged_it = ngram_cache_target.find(ngram);
        if (part_merged_it == ngram_cache_target.end()) {
            ngram_cache_target.emplace(ngram, part);
            continue;
        }

        for (std::pair<llama_token, int32_t> token_count : part) {
            const llama_token token = token_count.first;
            const int32_t     count = token_count.second;

            llama_ngram_cache_part::iterator token_count_merged_it = part_merged_it->second.find(token);
            if (token_count_merged_it == part_merged_it->second.end()) {
                part_merged_it->second.emplace(token, count);
                continue;
            }

            token_count_merged_it->second += count;
        }
    }
}