-
Notifications
You must be signed in to change notification settings - Fork 39
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Judd
committed
Jul 6, 2024
1 parent
493846f
commit 8173f62
Showing
7 changed files
with
107 additions
and
43 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
namespace v2 | ||
{ | ||
struct Config : public glm::v2::Config | ||
{ | ||
}; | ||
|
||
class ChatHistoryEncoder : public BaseHistoryEncoder | ||
{ | ||
public: | ||
void do_append_user(int round_idx, const std::string &user, std::vector<int> &ids) const override; | ||
}; | ||
|
||
static ChatHistoryEncoder _chat_encoder; | ||
|
||
class Tokenizer : public glm::v2::Tokenizer | ||
{ | ||
public: | ||
Tokenizer(const Config &config) : glm::v2::Tokenizer::Tokenizer(config, &_chat_encoder) | ||
{ | ||
sys_prompt = "# language: Python"; | ||
} | ||
}; | ||
|
||
class ConditionalGeneration : public glm::v2::ConditionalGeneration | ||
{ | ||
public: | ||
ConditionalGeneration() = default; | ||
ConditionalGeneration(const Config &config) | ||
: glm::v2::ConditionalGeneration(config, MODEL_TYPE_CODEGEEX2) | ||
{ | ||
} | ||
}; | ||
|
||
void ChatHistoryEncoder::do_append_user(int round_idx, const std::string &user, std::vector<int> &ids) const | ||
{ | ||
std::string combined = tokenizer->get_system_prompt() + "\n" + user + "\n"; | ||
tokenizer->encode(combined, ids); | ||
} | ||
} | ||
|
||
namespace v4 | ||
{ | ||
typedef glm::v4::Config Config; | ||
|
||
class Tokenizer : public glm::v4::Tokenizer | ||
{ | ||
public: | ||
Tokenizer(const Config &config) : glm::v4::Tokenizer(config) | ||
{} | ||
|
||
size_t load(tokenizer::DataReader *buffer, int n_vocab) override | ||
{ | ||
size_t r = glm::v4::Tokenizer::load(buffer, n_vocab); | ||
int special_id = observation_token_id + 5; | ||
code_prefix_token_id = special_id++; | ||
code_middle_token_id = special_id++; | ||
code_suffix_token_id = special_id++; | ||
cursor_token_id = special_id++; | ||
tp->AddAddedToken("<|code_prefix|>", code_prefix_token_id); | ||
tp->AddAddedToken("<|code_middle|>", code_middle_token_id); | ||
tp->AddAddedToken("<|code_suffix|>", code_suffix_token_id); | ||
tp->AddAddedToken("<|cursor|>", cursor_token_id); | ||
return r; | ||
} | ||
public: | ||
int code_prefix_token_id; | ||
int code_middle_token_id; | ||
int code_suffix_token_id; | ||
int cursor_token_id; | ||
}; | ||
|
||
class ConditionalGeneration : public glm::v4::ConditionalGeneration | ||
{ | ||
public: | ||
ConditionalGeneration(const Config &config) | ||
: glm::v4::ConditionalGeneration(config, MODEL_TYPE_CODEGEEX4) | ||
{ | ||
} | ||
|
||
// FIXME: this mode seems not support tool calling actually | ||
// https://github.com/THUDM/CodeGeeX4/issues/8 | ||
ChunkInterceptor *get_interceptor(void) override { return nullptr; } | ||
}; | ||
} |
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters