Add FalconTokenizer #1485

Merged · 6 commits · Mar 8, 2024
2 changes: 2 additions & 0 deletions keras_nlp/models/__init__.py
@@ -75,6 +75,8 @@
)
from keras_nlp.models.f_net.f_net_preprocessor import FNetPreprocessor
from keras_nlp.models.f_net.f_net_tokenizer import FNetTokenizer
from keras_nlp.models.falcon.falcon_backbone import FalconBackbone
from keras_nlp.models.falcon.falcon_tokenizer import FalconTokenizer
from keras_nlp.models.gemma.gemma_backbone import GemmaBackbone
from keras_nlp.models.gemma.gemma_causal_lm import GemmaCausalLM
from keras_nlp.models.gemma.gemma_causal_lm_preprocessor import (
30 changes: 30 additions & 0 deletions keras_nlp/models/falcon/falcon_presets.py
@@ -0,0 +1,30 @@
# Copyright 2024 The KerasNLP Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Falcon model preset configurations."""

backbone_presets = {
"falcon_refinedweb_1b_en": {
"metadata": {
"description": (
"24-layer Falcon model (Falcon with 1B parameters), trained on "
"350B tokens of RefinedWeb dataset."
),
"params": 1311625216,
"official_name": "Falcon",
"path": "falcon",
"model_card": "https://huggingface.co/tiiuae/falcon-rw-1b",
},
"kaggle_handle": "kaggle://keras/falcon/keras/falcon_refinedweb_1b_en/1",
},
}
117 changes: 117 additions & 0 deletions keras_nlp/models/falcon/falcon_tokenizer.py
@@ -0,0 +1,117 @@
# Copyright 2024 The KerasNLP Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import copy

from keras_nlp.api_export import keras_nlp_export
from keras_nlp.models.falcon.falcon_presets import backbone_presets
from keras_nlp.tokenizers.byte_pair_tokenizer import BytePairTokenizer
from keras_nlp.utils.python_utils import classproperty


@keras_nlp_export("keras_nlp.models.FalconTokenizer")
class FalconTokenizer(BytePairTokenizer):
"""Falcon tokenizer based on BytePairTokenizer.

This tokenizer class will tokenize raw strings into integer sequences and
is based on `keras_nlp.tokenizers.BytePairTokenizer`. Unlike the
underlying tokenizer, it will check for all special tokens needed by Falcon
models and provides a `from_preset()` method to automatically download
a matching vocabulary for a Falcon preset.

This tokenizer does not provide truncation or padding of inputs.

If input is a batch of strings (rank > 0), the layer will output a
`tf.RaggedTensor` where the last dimension of the output is ragged.

If input is a scalar string (rank == 0), the layer will output a dense
`tf.Tensor` with static shape `[None]`.

Args:
vocabulary: string or dict, maps token to integer ids. If it is a
string, it should be the file path to a json file.
merges: string or list, contains the merge rule. If it is a string,
it should be the file path to merge rules. The merge rule file
should have one merge rule per line. Every merge rule contains
merge entities separated by a space.

Examples:

```python
# Unbatched input.
tokenizer = keras_nlp.models.FalconTokenizer.from_preset("falcon_refinedweb_1b_en")
tokenizer("The quick brown fox jumped.")

# Batched input.
tokenizer(["The quick brown fox jumped.", "The fox slept."])

# Detokenization.
tokenizer.detokenize(tokenizer("The quick brown fox jumped."))

# Custom vocabulary.
vocab = {"<|endoftext|>": 0, "a": 4, "Ġquick": 5, "Ġfox": 6}
merges = ["Ġ q", "u i", "c k", "ui ck", "Ġq uick"]
merges += ["Ġ f", "o x", "Ġf ox"]
tokenizer = keras_nlp.models.FalconTokenizer(vocabulary=vocab, merges=merges)
tokenizer("a quick fox.")
```
"""

def __init__(
Member:
General question, but what is up with the 7b tokenizer? It looks like it is still basically just BPE, but with extra special tokens? https://huggingface.co/tiiuae/falcon-7b-instruct/raw/main/tokenizer.json

Maybe we can pull the vocab and merges out of this json, so we can handle them normally, and tackle the rest of the weirdness in code?

Member Author:

I think it's not just special token differences. They have different vocab sizes: 1b has 50256 tokens in total while 7b has 65023 (there are only 10 extra special tokens in 7b).

Member (@mattdangerw, Mar 6, 2024):

Yeah, new tokenizer vocab for sure, but I was hoping we could avoid a whole new file format.

It seems like they are still fundamentally BPE, just with a different vocab and more special tokens, right? If we can still save this as a tokenizer.json + assets/tokenizer/merges.txt + assets/tokenizer/vocab.json, that seems ideal to me. But we could also write a custom loader for Falcon's bespoke tokenizer.json format if we think that's better.

Member Author:

I misunderstood what you said.

Do you mean that since they are basically using BPE, we can skip creating FalconTokenizer and use BPE directly?

> I was hoping we could avoid a whole new file format.

Could you explain what you mean by "a new file format"? What's the new file format I'm creating? :D

Member:

Sorry! I am being unclear. Everything you have looks good for the 1b model. I am asking about/trying to think through an upcoming problem with the 7b falcon models.

Take a look at:

The 7b tokenizer assets are different: there is no merges.txt or vocab.json, just one unusual tokenizer.json that combines the two. We don't have any code that can read that bespoke tokenizer.json today, so we could either extract a merges and vocab from it, support loading it directly with new parsing code, or do something else.

Does that clarify or not really?

Member:

And to be clear, we should have a FalconTokenizer for sure. The question I have is just whether it can be a "simple subclass" of `BytePairTokenizer`, or whether we need custom JSON parsing code after we also convert the 7b models.

Member Author:

Oh, I see! Thanks for the clarification!

I agree that it would be better to just extract the vocab and merges from their format and load them like we do for other models, since no other model uses this format.
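
For illustration, a minimal sketch of that extraction, assuming the 7b tokenizer.json follows the standard Hugging Face `tokenizers` BPE layout (vocab and merges nested under `"model"`, with merges stored as `"token_a token_b"` strings); the file paths are placeholders:

```python
import json

# Read the combined 7b tokenizer.json (path is a placeholder).
with open("tokenizer.json") as f:
    hf_tokenizer = json.load(f)

# Assumed layout: the BPE vocab and merges live under the "model" key.
vocab = hf_tokenizer["model"]["vocab"]    # dict: token -> integer id
merges = hf_tokenizer["model"]["merges"]  # list of "token_a token_b" strings

# Write out the same assets the 1b preset already uses.
with open("assets/tokenizer/vocab.json", "w") as f:
    json.dump(vocab, f)
with open("assets/tokenizer/merges.txt", "w") as f:
    f.write("\n".join(merges))
```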

Contributor:

I am not sure I understand the question exactly, but I believe it's about the special tokens the 7B model has and how we can save them properly.
If that is the case, we can add a `special_tokens` arg to `FalconTokenizer`, something like `WhisperTokenizer`, except it can just be a list because the tokens are already included in the vocabulary; during initialization we pass them, together with `<|endoftext|>`, in the `unsplittable_tokens` arg of the superclass constructor.
For the conversion script, while converting the tokenizer we can check the `hf_tokenizer["added_tokens"]` list for any added tokens other than `<|endoftext|>` and pass them to `FalconTokenizer` as `special_tokens`, and then update the config to contain the special tokens. So we would still have only tokenizer.json + assets/tokenizer/merges.txt + assets/tokenizer/vocab.json, but with the config in tokenizer.json carrying a list of `special_tokens`.

self,
vocabulary=None,
merges=None,
**kwargs,
):
# Falcon uses the same start as end token, i.e., "<|endoftext|>".
self.end_token = self.start_token = "<|endoftext|>"

super().__init__(
vocabulary=vocabulary,
merges=merges,
unsplittable_tokens=[self.end_token],
**kwargs,
)

def set_vocabulary_and_merges(self, vocabulary, merges):
super().set_vocabulary_and_merges(vocabulary, merges)

if vocabulary is not None:
# Check for necessary special tokens.
if self.end_token not in self.get_vocabulary():
raise ValueError(
f"Cannot find token `'{self.end_token}'` in the provided "
f"`vocabulary`. Please provide `'{self.end_token}'` in "
"your `vocabulary` or use a pretrained `vocabulary` name."
)

self.end_token_id = self.token_to_id(self.end_token)
self.start_token_id = self.end_token_id
self.pad_token_id = 0
else:
self.end_token_id = None
self.start_token_id = None
self.pad_token_id = None

@classproperty
def presets(cls):
return copy.deepcopy(backbone_presets)

def get_config(self):
config = super().get_config()
# In the constructor, we pass the list of special tokens to the
# `unsplittable_tokens` arg of the superclass' constructor. Hence, we
# delete it from the config here.
del config["unsplittable_tokens"]
return config
62 changes: 62 additions & 0 deletions keras_nlp/models/falcon/falcon_tokenizer_test.py
@@ -0,0 +1,62 @@
# Copyright 2024 The KerasNLP Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pytest

from keras_nlp.models.falcon.falcon_tokenizer import FalconTokenizer
from keras_nlp.tests.test_case import TestCase


class FalconTokenizerTest(TestCase):
def setUp(self):
self.vocab = ["!", "air", "Ġair", "plane", "Ġat", "port"]
self.vocab += ["<|endoftext|>"]
self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"]
self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"]
self.merges += ["Ġai r", "Ġa i", "pla ne"]
self.init_kwargs = {"vocabulary": self.vocab, "merges": self.merges}
self.input_data = [
" airplane at airport<|endoftext|>",
" airplane airport",
]

def test_tokenizer_basics(self):
self.run_preprocessing_layer_test(
cls=FalconTokenizer,
init_kwargs=self.init_kwargs,
input_data=self.input_data,
expected_output=[[2, 3, 4, 2, 5, 6], [2, 3, 2, 5]],
)

def test_errors_missing_special_tokens(self):
with self.assertRaises(ValueError):
FalconTokenizer(vocabulary=["a", "b", "c"], merges=[])

@pytest.mark.large
def test_smallest_preset(self):
self.run_preset_test(
cls=FalconTokenizer,
preset="falcon_refinedweb_1b_en",
input_data=["The quick brown fox."],
expected_output=[[464, 2068, 7586, 21831, 13]],
)

@pytest.mark.extra_large
def test_all_presets(self):
for preset in FalconTokenizer.presets:
self.run_preset_test(
cls=FalconTokenizer,
preset=preset,
input_data=self.input_data,
)