From 9b5907ead7ec2389747ed8dcb6daa6a04c4775c2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=EA=B9=80=EC=8A=B9=EB=8D=95/Infrastructure=EA=B7=B8?=
 =?UTF-8?q?=EB=A3=B9=28YA=29?=
Date: Thu, 12 Oct 2023 02:53:09 +0900
Subject: [PATCH 1/4] Override tokens if their IDs in added_tokens are below
 the vocab_base size

Mistral Orca has added tokens whose IDs are below 30,000, which causes
issues when converting models.
---
 convert.py | 43 +++++++++++++++++++++++++++----------------
 1 file changed, 27 insertions(+), 16 deletions(-)

diff --git a/convert.py b/convert.py
index e9b08d344f5bd..e8d50b0620420 100755
--- a/convert.py
+++ b/convert.py
@@ -359,51 +359,62 @@ def __repr__(self) -> str:
 class SentencePieceVocab:
     def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
         self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
+        vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
+
         added_tokens: dict[str, int]
         if fname_added_tokens is not None:
             added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
         else:
             added_tokens = {}
+        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
 
-        vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
-        expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
-        actual_ids = sorted(added_tokens.values())
-        if expected_ids != actual_ids:
-            raise Exception(f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}")
+        tokens_to_replace: dict[int, str] = {}
+        new_tokens: dict[int, str] = {}
+        for piece, idx in items:
+            if idx < vocab_size:
+                tokens_to_replace[idx] = piece
+            else:
+                new_tokens[idx] = piece
 
-        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
-        self.added_tokens_list = [text for (text, idx) in items]
+        expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
+        actual_new_ids = sorted(new_tokens.keys())
+
+        if expected_new_ids != actual_new_ids:
+            raise Exception(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
+
+        self.tokens_to_replace = tokens_to_replace
+        self.new_tokens_list = [new_tokens[id] for id in actual_new_ids]
         self.vocab_size_base: int = vocab_size
-        self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
+        self.vocab_size: int = self.vocab_size_base + len(self.new_tokens_list)
         self.fname_tokenizer = fname_tokenizer
         self.fname_added_tokens = fname_added_tokens
 
     def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
         tokenizer = self.sentencepiece_tokenizer
-        for i in range(tokenizer.vocab_size()):
-            piece = tokenizer.id_to_piece(i)
+        for id in range(tokenizer.vocab_size()):
+            piece = tokenizer.id_to_piece(id) if id not in self.tokens_to_replace else self.tokens_to_replace[id]
             text: bytes = piece.encode("utf-8")
-            score: float = tokenizer.get_score(i)
+            score: float = tokenizer.get_score(id)
 
             toktype = gguf.TokenType.NORMAL
-            if tokenizer.is_unknown(i):
+            if tokenizer.is_unknown(id):
                 toktype = gguf.TokenType.UNKNOWN
-            if tokenizer.is_control(i):
+            if tokenizer.is_control(id):
                 toktype = gguf.TokenType.CONTROL
 
             # NOTE: I think added_tokens are user defined.
             # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
             # if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED
 
-            if tokenizer.is_unused(i):
+            if tokenizer.is_unused(id):
                 toktype = gguf.TokenType.UNUSED
-            if tokenizer.is_byte(i):
+            if tokenizer.is_byte(id):
                 toktype = gguf.TokenType.BYTE
 
             yield text, score, toktype
 
     def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        for text in self.added_tokens_list:
+        for text in self.new_tokens_list:
             score = -1000.0
             yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED

From 576df7770aad27a93caaf6d28a70d16c02e6d5d3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=EA=B9=80=EC=8A=B9=EB=8D=95/Infrastructure=EA=B7=B8?=
 =?UTF-8?q?=EB=A3=B9=28YA=29?=
Date: Thu, 12 Oct 2023 03:03:37 +0900
Subject: [PATCH 2/4] beautify

---
 convert.py | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/convert.py b/convert.py
index e8d50b0620420..19840a09e62f8 100755
--- a/convert.py
+++ b/convert.py
@@ -366,7 +366,7 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> No
             added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
         else:
             added_tokens = {}
-        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
+        items: list[tuple[str, int]] = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
 
         tokens_to_replace: dict[int, str] = {}
         new_tokens: dict[int, str] = {}
@@ -376,18 +376,20 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> No
             else:
                 new_tokens[idx] = piece
 
-        expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
-        actual_new_ids = sorted(new_tokens.keys())
+        expected_new_ids: list[int] = list(range(vocab_size, vocab_size + len(new_tokens)))
+        actual_new_ids: list[int] = sorted(new_tokens.keys())
 
         if expected_new_ids != actual_new_ids:
             raise Exception(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
 
-        self.tokens_to_replace = tokens_to_replace
-        self.new_tokens_list = [new_tokens[id] for id in actual_new_ids]
-        self.vocab_size_base: int = vocab_size
-        self.vocab_size: int = self.vocab_size_base + len(self.new_tokens_list)
-        self.fname_tokenizer = fname_tokenizer
-        self.fname_added_tokens = fname_added_tokens
+        # Key is the original token ID, value is the replacement token piece.
+        self.tokens_to_replace = tokens_to_replace
+        # Token pieces that were added to the base vocabulary.
+        self.new_tokens_list: list[str] = [new_tokens[id] for id in actual_new_ids]
+        self.vocab_size_base: int = vocab_size
+        self.vocab_size: int = self.vocab_size_base + len(self.new_tokens_list)
+        self.fname_tokenizer = fname_tokenizer
+        self.fname_added_tokens = fname_added_tokens
 
     def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
         tokenizer = self.sentencepiece_tokenizer

From e6ea63ca7190ae39fa03f8468b4a8cfe51b0aeef Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=EA=B9=80=EC=8A=B9=EB=8D=95/Infrastructure=EA=B7=B8?=
 =?UTF-8?q?=EB=A3=B9=28YA=29?=
Date: Thu, 12 Oct 2023 12:44:33 +0900
Subject: [PATCH 3/4] simply ignore added tokens whose IDs are below the vocab
 size

---
 convert.py | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/convert.py b/convert.py
index 19840a09e62f8..e9fe25b784401 100755
--- a/convert.py
+++ b/convert.py
@@ -366,24 +366,14 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> No
             added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
         else:
             added_tokens = {}
-        items: list[tuple[str, int]] = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
-
-        tokens_to_replace: dict[int, str] = {}
-        new_tokens: dict[int, str] = {}
-        for piece, idx in items:
-            if idx < vocab_size:
-                tokens_to_replace[idx] = piece
-            else:
-                new_tokens[idx] = piece
+        new_tokens: dict[int, str] = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
 
         expected_new_ids: list[int] = list(range(vocab_size, vocab_size + len(new_tokens)))
         actual_new_ids: list[int] = sorted(new_tokens.keys())
 
         if expected_new_ids != actual_new_ids:
             raise Exception(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
 
-        # Key is the original token ID, value is the replacement token piece.
-        self.tokens_to_replace = tokens_to_replace
         # Token pieces that were added to the base vocabulary.
         self.new_tokens_list: list[str] = [new_tokens[id] for id in actual_new_ids]
         self.vocab_size_base: int = vocab_size
@@ -394,7 +384,7 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> No
     def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
         tokenizer = self.sentencepiece_tokenizer
         for id in range(tokenizer.vocab_size()):
-            piece = tokenizer.id_to_piece(id) if id not in self.tokens_to_replace else self.tokens_to_replace[id]
+            piece = tokenizer.id_to_piece(id)
             text: bytes = piece.encode("utf-8")
             score: float = tokenizer.get_score(id)
 

From ff12b8fbd629bfa55e6b8457827e35a88ff2d755 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=EA=B9=80=EC=8A=B9=EB=8D=95/Infrastructure=EA=B7=B8?=
 =?UTF-8?q?=EB=A3=B9=28YA=29?=
Date: Sun, 15 Oct 2023 21:30:09 +0900
Subject: [PATCH 4/4] move vocab_size back to where it was

---
 convert.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/convert.py b/convert.py
index e9fe25b784401..96b6c6f847e1c 100755
--- a/convert.py
+++ b/convert.py
@@ -359,14 +359,14 @@ def __repr__(self) -> str:
 class SentencePieceVocab:
     def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
         self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
-        vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
-
         added_tokens: dict[str, int]
         if fname_added_tokens is not None:
             added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
         else:
             added_tokens = {}
+        vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
+
         new_tokens: dict[int, str] = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
 
         expected_new_ids: list[int] = list(range(vocab_size, vocab_size + len(new_tokens)))
         actual_new_ids: list[int] = sorted(new_tokens.keys())
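
For reference, the net effect of the series (PATCH 3/4 plus 4/4) can be sketched in isolation: added tokens whose IDs fall inside the base SentencePiece vocabulary are simply ignored, and only the genuinely new pieces are appended, after checking that their IDs form a contiguous range starting at the base vocabulary size. The snippet below is a minimal standalone Python sketch, not code taken from convert.py; the vocab_size and added_tokens values are invented for illustration, whereas the real script reads them from SentencePieceProcessor.vocab_size() and added_tokens.json.

    # Minimal sketch with assumed values, not real model data.
    vocab_size = 32000                 # stand-in for SentencePieceProcessor.vocab_size()
    added_tokens = {                   # stand-in for the contents of added_tokens.json
        "<|im_start|>": 32000,         # ID >= vocab_size: a genuinely new piece
        "<|im_end|>": 32001,           # ID >= vocab_size: a genuinely new piece
        "<s>": 1,                      # ID < vocab_size: already in the base vocab, ignored
    }

    # Keep only the tokens whose IDs lie beyond the base vocabulary.
    new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}

    # The surviving IDs must be a contiguous range starting at vocab_size.
    expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
    actual_new_ids = sorted(new_tokens.keys())
    if expected_new_ids != actual_new_ids:
        raise Exception(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")

    new_tokens_list = [new_tokens[id] for id in actual_new_ids]
    print(new_tokens_list)             # ['<|im_start|>', '<|im_end|>']

With these example inputs, the low-ID entry is dropped and only the two new pieces are appended after the base vocabulary, which is exactly the situation the first commit describes for Mistral Orca.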