Skip to content

Commit

Permalink
convert-hf : fix type of tokens after #3252
Browse files (browse the repository at this point in the history)
  • Loading branch information
cebtenzzre committed Mar 27, 2024
1 parent 8d2ac2c commit 2e6fd63
Showing 1 changed file with 4 additions and 6 deletions.
10 changes: 4 additions & 6 deletions convert-hf-to-gguf.py
Original file line number Diff line number Diff line change
@@ -230,7 +230,7 @@ def _get_part_names(self):
def _set_vocab_gpt2(self):
dir_model = self.dir_model
hparams = self.hparams
- tokens: list[bytearray] = []
+ tokens: list[str] = []
toktypes: list[int] = []

from transformers import AutoTokenizer
@@ -243,8 +243,7 @@ def _set_vocab_gpt2(self):

for i in range(vocab_size):
if i not in reverse_vocab:
- pad_token = f"[PAD{i}]".encode('utf-8')
- tokens.append(bytearray(pad_token))
+ tokens.append(f"[PAD{i}]")
toktypes.append(gguf.TokenType.USER_DEFINED)
elif reverse_vocab[i] in added_vocab:
tokens.append(reverse_vocab[i])
@@ -266,7 +265,7 @@ def _set_vocab_gpt2(self):
def _set_vocab_qwen(self):
dir_model = self.dir_model
hparams = self.hparams
- tokens: list[bytearray] = []
+ tokens: list[str] = []
toktypes: list[int] = []

from transformers import AutoTokenizer
@@ -291,8 +290,7 @@ def _set_vocab_qwen(self):

for i in range(vocab_size):
if i not in reverse_vocab:
- pad_token = f"[PAD{i}]".encode("utf-8")
- tokens.append(bytearray(pad_token))
+ tokens.append(f"[PAD{i}]")
toktypes.append(gguf.TokenType.USER_DEFINED)
elif reverse_vocab[i] in added_vocab:
tokens.append(reverse_vocab[i])
Expand Down

0 comments on commit 2e6fd63

Please sign in to comment.