-
-
Notifications
You must be signed in to change notification settings - Fork 5.9k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
7 changed files
with
129 additions
and
83 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
from typing import List | ||
|
||
from transformers_utils.tokenizer import AnyTokenizer | ||
|
||
from vllm.entrypoints.openai.fim.fim_encoder import FIMEncoder | ||
|
||
|
||
class CodeLlamaFIMEncoder(FIMEncoder):
    """
    FIM (fill-in-the-middle) encoder for Meta CodeLlama models.

    Produces the token-id sequence
    ``<s> <PRE> {prefix} <SUF> {suffix} <MID>``.

    Adapted from https://github.com/meta-llama/codellama/blob/e81b597e44dbecc2a0dedb9949fdf84adfc22395/llama/generation.py#L474
    """

    def __init__(self, tokenizer: AnyTokenizer):
        """Resolve the special-token ids needed for infilling prompts.

        Args:
            tokenizer: Tokenizer of the served model. It must expose
                ``convert_tokens_to_ids`` and know the ``<s>``, ``▁<PRE>``,
                ``▁<SUF>`` and ``▁<MID>`` tokens.

        Raises:
            ValueError: If the tokenizer lacks ``convert_tokens_to_ids`` or
                any of the required tokens resolves to ``None`` or to the
                tokenizer's unknown-token id.
        """
        # NOTE(review): the methods below read ``self.tokenizer``; presumably
        # the base-class initializer stores it - confirm against FIMEncoder.
        super().__init__(tokenizer)

        if not hasattr(tokenizer, "convert_tokens_to_ids"):
            raise ValueError(
                "tokenizer incompatible with 'codellama' FIM encoder")

        self.bos_id = tokenizer.convert_tokens_to_ids("<s>")
        self.prefix_id = tokenizer.convert_tokens_to_ids("▁<PRE>")
        self.suffix_id = tokenizer.convert_tokens_to_ids("▁<SUF>")
        self.middle_id = tokenizer.convert_tokens_to_ids("▁<MID>")

        # A missing token shows up either as ``None`` or as the unknown-token
        # id, depending on the tokenizer; both mean the vocabulary has no
        # infilling tokens.
        special_ids = {
            self.bos_id, self.prefix_id, self.suffix_id, self.middle_id
        }
        unk_token_id = getattr(tokenizer, "unk_token_id", None)
        if None in special_ids or unk_token_id in special_ids:
            raise ValueError(
                "tokenizer incompatible with 'codellama' FIM encoder")

    def encode_with_suffix(self, prefix: str, suffix: str) -> List[int]:
        """Encode ``prefix`` and ``suffix`` into a CodeLlama infilling prompt.

        Args:
            prefix: Text that precedes the span to be generated.
            suffix: Text that follows the span to be generated.

        Returns:
            Token ids laid out as
            ``[bos, PRE] + prefix + [SUF] + suffix + [MID]``.
        """
        # ``encode`` returns a plain list of ids. Calling the tokenizer
        # object directly returns an encoding mapping, which cannot be
        # concatenated with lists.
        prefix_ids = self.tokenizer.encode(prefix, add_special_tokens=False)
        return ([self.bos_id, self.prefix_id] + prefix_ids +
                [self.suffix_id] + self._encode_infilling(suffix) +
                [self.middle_id])

    def _encode_infilling(self, s: str) -> List[int]:
        """Encode a string without an implicit leading space."""
        # Prepend a marker character and drop the first two resulting ids,
        # mirroring the upstream CodeLlama implementation.
        marked_ids = self.tokenizer.encode("☺" + s, add_special_tokens=False)
        return marked_ids[2:]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,21 +1,21 @@ | ||
from typing import List | ||
|
||
from mistral_common.tokens.tokenizers.sentencepiece import InstructTokenizerV1 | ||
from transformers_utils.tokenizer import AnyTokenizer | ||
from transformers_utils.tokenizers import MistralTokenizer | ||
|
||
from vllm.entrypoints.openai.fim.fim_encoder import (FIMEncoder, | ||
FIMEncoderManager) | ||
from vllm.entrypoints.openai.fim.fim_encoder import FIMEncoder | ||
|
||
|
||
@FIMEncoderManager.register_module("mistral") | ||
class MistralFIMEncoder(FIMEncoder):
    """FIM (fill-in-the-middle) encoder that delegates to a Mistral tokenizer."""

    def __init__(self, tokenizer: AnyTokenizer):
        """Validate that ``tokenizer`` can be used for Mistral FIM encoding.

        Args:
            tokenizer: Tokenizer of the served model.

        Raises:
            ValueError: If ``tokenizer`` is not a ``MistralTokenizer``, or if
                its ``instruct`` tokenizer is an ``InstructTokenizerV1``.
        """
        super().__init__(tokenizer)

        # NOTE(review): V1 instruct tokenizers are rejected, presumably
        # because they lack FIM support - confirm against mistral_common.
        if (not isinstance(tokenizer, MistralTokenizer)
                or isinstance(tokenizer.instruct, InstructTokenizerV1)):
            raise ValueError(
                "tokenizer incompatible with 'mistral' FIM encoder")

    def encode_with_suffix(self, prefix: str, suffix: str) -> List[int]:
        """Encode ``prefix`` and ``suffix`` into a FIM prompt.

        Args:
            prefix: Text that precedes the span to be generated.
            suffix: Text that follows the span to be generated.

        Returns:
            Whatever the tokenizer's own ``encode_with_suffix`` returns
            (annotated as a list of token ids).
        """
        return self.tokenizer.encode_with_suffix(prefix=prefix, suffix=suffix)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters