diff --git a/keras_nlp/models/albert/albert_preprocessor.py b/keras_nlp/models/albert/albert_preprocessor.py index 61d37c34d2..e34fe54536 100644 --- a/keras_nlp/models/albert/albert_preprocessor.py +++ b/keras_nlp/models/albert/albert_preprocessor.py @@ -16,6 +16,8 @@ from tensorflow import keras from keras_nlp.layers.multi_segment_packer import MultiSegmentPacker +from keras_nlp.models.albert.albert_tokenizer import AlbertTokenizer +from keras_nlp.models.preprocessor import Preprocessor from keras_nlp.utils.keras_utils import ( convert_inputs_to_list_of_tensor_segments, ) @@ -24,7 +26,7 @@ @keras.utils.register_keras_serializable(package="keras_nlp") -class AlbertPreprocessor(keras.layers.Layer): +class AlbertPreprocessor(Preprocessor): """An ALBERT preprocessing layer which tokenizes and packs inputs. This preprocessing layer will do three things: @@ -153,28 +155,16 @@ def __init__( sequence_length=sequence_length, ) - @property - def tokenizer(self): - """The `keras_nlp.models.AlbertTokenizer` used to tokenize strings.""" - return self._tokenizer - def get_config(self): config = super().get_config() config.update( { - "tokenizer": keras.layers.serialize(self.tokenizer), "sequence_length": self.packer.sequence_length, "truncate": self.packer.truncate, } ) return config - @classmethod - def from_config(cls, config): - if "tokenizer" in config and isinstance(config["tokenizer"], dict): - config["tokenizer"] = keras.layers.deserialize(config["tokenizer"]) - return cls(**config) - def call(self, x, y=None, sample_weight=None): x = convert_inputs_to_list_of_tensor_segments(x) x = [self.tokenizer(segment) for segment in x] @@ -187,15 +177,5 @@ def call(self, x, y=None, sample_weight=None): return pack_x_y_sample_weight(x, y, sample_weight) @classproperty - def presets(cls): - return {} - - @classmethod - def from_preset( - cls, - preset, - sequence_length=None, - truncate="round_robin", - **kwargs, - ): - raise NotImplementedError + def tokenizer_cls(cls): + return AlbertTokenizer diff --git a/keras_nlp/models/bert/bert_preprocessor.py b/keras_nlp/models/bert/bert_preprocessor.py index f376c47f87..b1ce9c3293 100644 --- a/keras_nlp/models/bert/bert_preprocessor.py +++ b/keras_nlp/models/bert/bert_preprocessor.py @@ -21,6 +21,7 @@ from keras_nlp.models.bert.bert_presets import backbone_presets from keras_nlp.models.bert.bert_presets import classifier_presets from keras_nlp.models.bert.bert_tokenizer import BertTokenizer +from keras_nlp.models.preprocessor import Preprocessor from keras_nlp.utils.keras_utils import ( convert_inputs_to_list_of_tensor_segments, ) @@ -32,7 +33,7 @@ @keras.utils.register_keras_serializable(package="keras_nlp") -class BertPreprocessor(keras.layers.Layer): +class BertPreprocessor(Preprocessor): """A BERT preprocessing layer which tokenizes and packs inputs. This preprocessing layer will do three things: @@ -76,13 +77,8 @@ class BertPreprocessor(keras.layers.Layer): Examples: ```python - vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]"] - vocab += ["The", "qu", "##ick", "br", "##own", "fox", "tripped"] - vocab += ["Call", "me", "Ish", "##mael", "."] - vocab += ["Oh", "look", "a", "whale"] - vocab += ["I", "forgot", "my", "home", "##work"] - tokenizer = keras_nlp.models.BertTokenizer(vocabulary=vocab) - preprocessor = keras_nlp.models.BertPreprocessor(tokenizer) + # Load the preprocessor from a preset. + preprocessor = keras_nlp.models.BertPreprocessor.from_preset("bert_base_en_uncased") # Tokenize and pack a single sentence. 
sentence = tf.constant("The quick brown fox jumped.") @@ -142,6 +138,16 @@ class BertPreprocessor(keras.layers.Layer): lambda s1, s2: preprocessor(x=(s1, s2)), num_parallel_calls=tf.data.AUTOTUNE, ) + + # Alternatively, you can create a preprocessor from your own vocabulary. + # The usage is exactly the same as shown above. + vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]"] + vocab += ["The", "qu", "##ick", "br", "##own", "fox", "tripped"] + vocab += ["Call", "me", "Ish", "##mael", "."] + vocab += ["Oh", "look", "a", "whale"] + vocab += ["I", "forgot", "my", "home", "##work"] + tokenizer = keras_nlp.models.BertTokenizer(vocabulary=vocab) + preprocessor = keras_nlp.models.BertPreprocessor(tokenizer) ``` """ @@ -162,28 +168,16 @@ def __init__( sequence_length=sequence_length, ) - @property - def tokenizer(self): - """The `keras_nlp.models.BertTokenizer` used to tokenize strings.""" - return self._tokenizer - def get_config(self): config = super().get_config() config.update( { - "tokenizer": keras.layers.serialize(self.tokenizer), "sequence_length": self.packer.sequence_length, "truncate": self.packer.truncate, } ) return config - @classmethod - def from_config(cls, config): - if "tokenizer" in config and isinstance(config["tokenizer"], dict): - config["tokenizer"] = keras.layers.deserialize(config["tokenizer"]) - return cls(**config) - def call(self, x, y=None, sample_weight=None): x = convert_inputs_to_list_of_tensor_segments(x) x = [self.tokenizer(segment) for segment in x] @@ -195,85 +189,22 @@ def call(self, x, y=None, sample_weight=None): } return pack_x_y_sample_weight(x, y, sample_weight) + @classproperty + def tokenizer_cls(cls): + return BertTokenizer + @classproperty def presets(cls): return copy.deepcopy({**backbone_presets, **classifier_presets}) @classmethod - @format_docstring(names=PRESET_NAMES) - def from_preset( - cls, - preset, - sequence_length=None, - truncate="round_robin", - **kwargs, - ): - """Instantiate BERT preprocessor from preset architecture. - - Args: - preset: string. Must be one of {{names}}. - sequence_length: int, optional. The length of the packed inputs. - Must be equal to or smaller than the `max_sequence_length` of - the preset. If left as default, the `max_sequence_length` of - the preset will be used. - truncate: string. The algorithm to truncate a list of batched - segments to fit within `sequence_length`. The value can be - either `round_robin` or `waterfall`: - - `"round_robin"`: Available space is assigned one token at - a time in a round-robin fashion to the inputs that still - need some, until the limit is reached. - - `"waterfall"`: The allocation of the budget is done using - a "waterfall" algorithm that allocates quota in a - left-to-right manner and fills up the buckets until we - run out of budget. It supports an arbitrary number of - segments. + def from_preset(cls, preset, **kwargs): + return super().from_preset(preset, **kwargs) - Examples: - ```python - # Load preprocessor from preset - preprocessor = keras_nlp.models.BertPreprocessor.from_preset( - "bert_base_en_uncased", - ) - preprocessor("The quick brown fox jumped.") - - # Override sequence_length - preprocessor = keras_nlp.models.BertPreprocessor.from_preset( - "bert_base_en_uncased", - sequence_length=64 - ) - preprocessor("The quick brown fox jumped.") - ``` - """ - if preset not in cls.presets: - raise ValueError( - "`preset` must be one of " - f"""{", ".join(cls.presets)}. 
Received: {preset}.""" - ) - - tokenizer = BertTokenizer.from_preset(preset) - # Use model's `max_sequence_length` if `sequence_length` unspecified; - # otherwise check that `sequence_length` not too long. - metadata = cls.presets[preset] - if preset in backbone_presets: - backbone_config = metadata["config"] - else: - # For task model presets, the backbone config is nested. - backbone_config = metadata["config"]["backbone"]["config"] - max_sequence_length = backbone_config["max_sequence_length"] - if sequence_length is not None: - if sequence_length > max_sequence_length: - raise ValueError( - f"`sequence_length` cannot be longer than `{preset}` " - f"preset's `max_sequence_length` of {max_sequence_length}. " - f"Received: {sequence_length}." - ) - else: - sequence_length = max_sequence_length - - return cls( - tokenizer=tokenizer, - sequence_length=sequence_length, - truncate=truncate, - **kwargs, - ) +BertPreprocessor.from_preset.__func__.__doc__ = Preprocessor.from_preset.__doc__ +format_docstring( + preprocessor_name=BertPreprocessor.__name__, + example_preset_name="bert_base_en_uncased", + preset_names='", "'.join(BertPreprocessor.presets), +)(BertPreprocessor.from_preset.__func__) diff --git a/keras_nlp/models/deberta_v3/deberta_v3_classifier_test.py b/keras_nlp/models/deberta_v3/deberta_v3_classifier_test.py index 81645c9dbe..775b254fe3 100644 --- a/keras_nlp/models/deberta_v3/deberta_v3_classifier_test.py +++ b/keras_nlp/models/deberta_v3/deberta_v3_classifier_test.py @@ -28,9 +28,7 @@ from keras_nlp.models.deberta_v3.deberta_v3_preprocessor import ( DebertaV3Preprocessor, ) -from keras_nlp.models.deberta_v3.deberta_v3_preprocessor import ( - DebertaV3Tokenizer, -) +from keras_nlp.models.deberta_v3.deberta_v3_tokenizer import DebertaV3Tokenizer class DebertaV3ClassifierTest(tf.test.TestCase, parameterized.TestCase): diff --git a/keras_nlp/models/deberta_v3/deberta_v3_preprocessor.py b/keras_nlp/models/deberta_v3/deberta_v3_preprocessor.py index 649318ffdc..9050c20292 100644 --- a/keras_nlp/models/deberta_v3/deberta_v3_preprocessor.py +++ b/keras_nlp/models/deberta_v3/deberta_v3_preprocessor.py @@ -20,6 +20,7 @@ from keras_nlp.layers.multi_segment_packer import MultiSegmentPacker from keras_nlp.models.deberta_v3.deberta_v3_presets import backbone_presets from keras_nlp.models.deberta_v3.deberta_v3_tokenizer import DebertaV3Tokenizer +from keras_nlp.models.preprocessor import Preprocessor from keras_nlp.utils.keras_utils import ( convert_inputs_to_list_of_tensor_segments, ) @@ -29,7 +30,7 @@ @keras.utils.register_keras_serializable(package="keras_nlp") -class DebertaV3Preprocessor(keras.layers.Layer): +class DebertaV3Preprocessor(Preprocessor): """A DeBERTa preprocessing layer which tokenizes and packs inputs. This preprocessing layer will do three things: @@ -73,11 +74,8 @@ class DebertaV3Preprocessor(keras.layers.Layer): Examples: ```python - tokenizer = keras_nlp.models.DebertaV3Tokenizer(proto="model.spm") - preprocessor = keras_nlp.models.DebertaV3Preprocessor( - tokenizer=tokenizer, - sequence_length=10, - ) + # Load the preprocessor from a preset. + preprocessor = keras_nlp.models.DebertaV3Preprocessor.from_preset("deberta_v3_base_en") # Tokenize and pack a single sentence. sentence = tf.constant("The quick brown fox jumped.") @@ -137,6 +135,14 @@ class DebertaV3Preprocessor(keras.layers.Layer): lambda s1, s2: preprocessor(x=(s1, s2)), num_parallel_calls=tf.data.AUTOTUNE, ) + + # Alternatively, you can create a preprocessor from your own vocabulary. 
+ # The usage is exactly the same as above. + tokenizer = keras_nlp.models.DebertaV3Tokenizer(proto="model.spm") + preprocessor = keras_nlp.models.DebertaV3Preprocessor( + tokenizer=tokenizer, + sequence_length=10, + ) ``` """ @@ -157,28 +163,16 @@ def __init__( sequence_length=sequence_length, ) - @property - def tokenizer(self): - """The `keras_nlp.models.DebertaV3Tokenizer` used to tokenize strings.""" - return self._tokenizer - def get_config(self): config = super().get_config() config.update( { - "tokenizer": keras.layers.serialize(self.tokenizer), "sequence_length": self.packer.sequence_length, "truncate": self.packer.truncate, } ) return config - @classmethod - def from_config(cls, config): - if "tokenizer" in config and isinstance(config["tokenizer"], dict): - config["tokenizer"] = keras.layers.deserialize(config["tokenizer"]) - return cls(**config) - def call(self, x, y=None, sample_weight=None): x = convert_inputs_to_list_of_tensor_segments(x) x = [self.tokenizer(segment) for segment in x] @@ -189,80 +183,24 @@ def call(self, x, y=None, sample_weight=None): } return pack_x_y_sample_weight(x, y, sample_weight) + @classproperty + def tokenizer_cls(cls): + return DebertaV3Tokenizer + @classproperty def presets(cls): return copy.deepcopy(backbone_presets) @classmethod - @format_docstring(names=", ".join(backbone_presets)) - def from_preset( - cls, - preset, - sequence_length=None, - truncate="round_robin", - **kwargs, - ): - """Instantiate DeBERTa preprocessor from preset architecture. - - Args: - preset: string. Must be one of {{names}}. - sequence_length: int, optional. The length of the packed inputs. - Must be equal to or smaller than the `max_sequence_length` of - the preset. If left as default, the `max_sequence_length` of - the preset will be used. - truncate: string. The algorithm to truncate a list of batched - segments to fit within `sequence_length`. The value can be - either `round_robin` or `waterfall`: - - `"round_robin"`: Available space is assigned one token at - a time in a round-robin fashion to the inputs that still - need some, until the limit is reached. - - `"waterfall"`: The allocation of the budget is done using - a "waterfall" algorithm that allocates quota in a - left-to-right manner and fills up the buckets until we - run out of budget. It supports an arbitrary number of - segments. - - Examples: - ```python - # Load preprocessor from preset - preprocessor = keras_nlp.models.DebertaV3Preprocessor.from_preset( - "deberta_v3_base_en", - ) - preprocessor("The quick brown fox jumped.") + def from_preset(cls, preset, **kwargs): + return super().from_preset(preset, **kwargs) - # Override sequence_length - preprocessor = keras_nlp.models.DebertaV3Preprocessor.from_preset( - "deberta_v3_base_en", - sequence_length=64 - ) - preprocessor("The quick brown fox jumped.") - ``` - """ - if preset not in cls.presets: - raise ValueError( - "`preset` must be one of " - f"""{", ".join(cls.presets)}. Received: {preset}.""" - ) - - tokenizer = DebertaV3Tokenizer.from_preset(preset) - # Use model's `max_sequence_length` if `sequence_length` unspecified; - # otherwise check that `sequence_length` not too long. - metadata = cls.presets[preset] - max_sequence_length = metadata["config"]["max_sequence_length"] - if sequence_length is not None: - if sequence_length > max_sequence_length: - raise ValueError( - f"`sequence_length` cannot be longer than `{preset}` " - f"preset's `max_sequence_length` of {max_sequence_length}. " - f"Received: {sequence_length}."
- ) - else: - sequence_length = max_sequence_length - - return cls( - tokenizer=tokenizer, - sequence_length=sequence_length, - truncate=truncate, - **kwargs, - ) +DebertaV3Preprocessor.from_preset.__func__.__doc__ = ( + Preprocessor.from_preset.__doc__ +) +format_docstring( + preprocessor_name=DebertaV3Preprocessor.__name__, + example_preset_name="deberta_v3_base_en", + preset_names='", "'.join(DebertaV3Preprocessor.presets), +)(DebertaV3Preprocessor.from_preset.__func__) diff --git a/keras_nlp/models/distil_bert/distil_bert_preprocessor.py b/keras_nlp/models/distil_bert/distil_bert_preprocessor.py index 75cc3cb845..6a190fda3b 100644 --- a/keras_nlp/models/distil_bert/distil_bert_preprocessor.py +++ b/keras_nlp/models/distil_bert/distil_bert_preprocessor.py @@ -22,6 +22,7 @@ from keras_nlp.models.distil_bert.distil_bert_tokenizer import ( DistilBertTokenizer, ) +from keras_nlp.models.preprocessor import Preprocessor from keras_nlp.utils.keras_utils import ( convert_inputs_to_list_of_tensor_segments, ) @@ -31,7 +32,7 @@ @keras.utils.register_keras_serializable(package="keras_nlp") -class DistilBertPreprocessor(keras.layers.Layer): +class DistilBertPreprocessor(Preprocessor): """A DistilBERT preprocessing layer which tokenizes and packs inputs. This preprocessing layer will do three things: @@ -75,13 +76,8 @@ class DistilBertPreprocessor(keras.layers.Layer): Examples: ```python - vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]"] - vocab += ["The", "qu", "##ick", "br", "##own", "fox", "tripped"] - vocab += ["Call", "me", "Ish", "##mael", "."] - vocab += ["Oh", "look", "a", "whale"] - vocab += ["I", "forgot", "my", "home", "##work"] - tokenizer = keras_nlp.models.DistilBertTokenizer(vocabulary=vocab) - preprocessor = keras_nlp.models.DistilBertPreprocessor(tokenizer) + # Load the preprocessor from a preset. + preprocessor = keras_nlp.models.DistilBertPreprocessor.from_preset("distil_bert_base_en_uncased") # Tokenize and pack a single sentence. sentence = tf.constant("The quick brown fox jumped.") @@ -141,6 +137,16 @@ class DistilBertPreprocessor(keras.layers.Layer): lambda s1, s2: preprocessor(x=(s1, s2)), num_parallel_calls=tf.data.AUTOTUNE, ) + + # Alternatively, you can create a preprocessor from your own vocabulary. + # The usage is exactly the same as above. 
+ vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]"] + vocab += ["The", "qu", "##ick", "br", "##own", "fox", "tripped"] + vocab += ["Call", "me", "Ish", "##mael", "."] + vocab += ["Oh", "look", "a", "whale"] + vocab += ["I", "forgot", "my", "home", "##work"] + tokenizer = keras_nlp.models.DistilBertTokenizer(vocabulary=vocab) + preprocessor = keras_nlp.models.DistilBertPreprocessor(tokenizer) ``` """ @@ -161,28 +167,16 @@ def __init__( sequence_length=sequence_length, ) - @property - def tokenizer(self): - """The `keras_nlp.models.DistilBertTokenizer` used to tokenize strings.""" - return self._tokenizer - def get_config(self): config = super().get_config() config.update( { - "tokenizer": keras.layers.serialize(self.tokenizer), "sequence_length": self.packer.sequence_length, "truncate": self.packer.truncate, } ) return config - @classmethod - def from_config(cls, config): - if "tokenizer" in config: - config["tokenizer"] = keras.layers.deserialize(config["tokenizer"]) - return cls(**config) - def call(self, x, y=None, sample_weight=None): x = convert_inputs_to_list_of_tensor_segments(x) x = [self.tokenizer(segment) for segment in x] @@ -193,80 +187,24 @@ def call(self, x, y=None, sample_weight=None): } return pack_x_y_sample_weight(x, y, sample_weight) + @classproperty + def tokenizer_cls(cls): + return DistilBertTokenizer + @classproperty def presets(cls): return copy.deepcopy(backbone_presets) @classmethod - @format_docstring(names=", ".join(backbone_presets)) - def from_preset( - cls, - preset, - sequence_length=None, - truncate="round_robin", - **kwargs, - ): - """Instantiate DistilBERT preprocessor from preset architecture. - - Args: - preset: string. Must be one of {{names}}. - sequence_length: int, optional. The length of the packed inputs. - Must be equal to or smaller than the `max_sequence_length` of - the preset. If left as default, the `max_sequence_length` of - the preset will be used. - truncate: string. The algorithm to truncate a list of batched - segments to fit within `sequence_length`. The value can be - either `round_robin` or `waterfall`: - - `"round_robin"`: Available space is assigned one token at - a time in a round-robin fashion to the inputs that still - need some, until the limit is reached. - - `"waterfall"`: The allocation of the budget is done using - a "waterfall" algorithm that allocates quota in a - left-to-right manner and fills up the buckets until we - run out of budget. It supports an arbitrary number of - segments. - - Examples: - ```python - # Load preprocessor from preset - preprocessor = keras_nlp.models.DistilBertPreprocessor.from_preset( - "distil_bert_base_en_uncased", - ) - preprocessor("The quick brown fox jumped.") - - # Override sequence_length - preprocessor = keras_nlp.models.DistilBertPreprocessor.from_preset( - "distil_bert_base_en_uncased", - sequence_length=64 - ) - preprocessor("The quick brown fox jumped.") - ``` - """ - if preset not in cls.presets: - raise ValueError( - "`preset` must be one of " - f"""{", ".join(cls.presets)}. Received: {preset}.""" - ) - - tokenizer = DistilBertTokenizer.from_preset(preset) + def from_preset(cls, preset, **kwargs): + return super().from_preset(preset, **kwargs) - # Use model's `max_sequence_length` if `sequence_length` unspecified; - # otherwise check that `sequence_length` not too long. 
- metadata = cls.presets[preset] - max_sequence_length = metadata["config"]["max_sequence_length"] - if sequence_length is not None: - if sequence_length > max_sequence_length: - raise ValueError( - f"`sequence_length` cannot be longer than `{preset}` " - f"preset's `max_sequence_length` of {max_sequence_length}. " - f"Received: {sequence_length}." - ) - else: - sequence_length = max_sequence_length - return cls( - tokenizer=tokenizer, - sequence_length=sequence_length, - truncate=truncate, - **kwargs, - ) +DistilBertPreprocessor.from_preset.__func__.__doc__ = ( + Preprocessor.from_preset.__doc__ +) +format_docstring( + preprocessor_name=DistilBertPreprocessor.__name__, + example_preset_name="distil_bert_base_en_uncased", + preset_names='", "'.join(DistilBertPreprocessor.presets), +)(DistilBertPreprocessor.from_preset.__func__) diff --git a/keras_nlp/models/distil_bert/distil_bert_tokenizer_test.py b/keras_nlp/models/distil_bert/distil_bert_tokenizer_test.py index ca13f75177..c14cd3d163 100644 --- a/keras_nlp/models/distil_bert/distil_bert_tokenizer_test.py +++ b/keras_nlp/models/distil_bert/distil_bert_tokenizer_test.py @@ -19,7 +19,7 @@ from absl.testing import parameterized from tensorflow import keras -from keras_nlp.models.distil_bert.distil_bert_preprocessor import ( +from keras_nlp.models.distil_bert.distil_bert_tokenizer import ( DistilBertTokenizer, ) diff --git a/keras_nlp/models/preprocessor.py b/keras_nlp/models/preprocessor.py new file mode 100644 index 0000000000..ec99ccc61f --- /dev/null +++ b/keras_nlp/models/preprocessor.py @@ -0,0 +1,112 @@ +# Copyright 2022 The KerasNLP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from tensorflow import keras + +from keras_nlp.utils.python_utils import classproperty + + +@keras.utils.register_keras_serializable(package="keras_nlp") +class Preprocessor(keras.layers.Layer): + """Base class for model preprocessors.""" + + @property + def tokenizer(self): + """The tokenizer used to tokenize strings.""" + return self._tokenizer + + def get_config(self): + config = super().get_config() + config["tokenizer"] = keras.layers.serialize(self.tokenizer) + return config + + @classmethod + def from_config(cls, config): + if "tokenizer" in config and isinstance(config["tokenizer"], dict): + config["tokenizer"] = keras.layers.deserialize(config["tokenizer"]) + return cls(**config) + + @classproperty + def tokenizer_cls(cls): + return None + + @classproperty + def presets(cls): + return {} + + @classmethod + def from_preset( + cls, + preset, + **kwargs, + ): + """Instantiate {{preprocessor_name}} from preset architecture. + + Args: + preset: string. Must be one of "{{preset_names}}". 
+ + Examples: + ```python + # Load preprocessor from preset + preprocessor = keras_nlp.models.{{preprocessor_name}}.from_preset( + "{{example_preset_name}}", + ) + preprocessor("The quick brown fox jumped.") + + # Override sequence_length + preprocessor = keras_nlp.models.{{preprocessor_name}}.from_preset( + "{{example_preset_name}}", + sequence_length=64 + ) + preprocessor("The quick brown fox jumped.") + ``` + """ + if not cls.presets: + raise NotImplementedError( + "No presets have been created for this class." + ) + if preset not in cls.presets: + raise ValueError( + "`preset` must be one of " + f"""{", ".join(cls.presets)}. Received: {preset}.""" + ) + + tokenizer = cls.tokenizer_cls.from_preset(preset) + + metadata = cls.presets[preset] + # For task model presets, the backbone config is nested. + if "backbone" in metadata["config"]: + backbone_config = metadata["config"]["backbone"]["config"] + else: + backbone_config = metadata["config"] + + # Use model's `max_sequence_length` if `sequence_length` unspecified; + # otherwise check that `sequence_length` not too long. + sequence_length = kwargs.pop("sequence_length", None) + max_sequence_length = backbone_config["max_sequence_length"] + if sequence_length is not None: + if sequence_length > max_sequence_length: + raise ValueError( + f"`sequence_length` cannot be longer than `{preset}` " + f"preset's `max_sequence_length` of {max_sequence_length}. " + f"Received: {sequence_length}." + ) + else: + sequence_length = max_sequence_length + + return cls( + tokenizer=tokenizer, + sequence_length=sequence_length, + **kwargs, + ) diff --git a/keras_nlp/models/roberta/roberta_preprocessor.py b/keras_nlp/models/roberta/roberta_preprocessor.py index 8eaa1595e0..ac28f49c2d 100644 --- a/keras_nlp/models/roberta/roberta_preprocessor.py +++ b/keras_nlp/models/roberta/roberta_preprocessor.py @@ -20,6 +20,7 @@ import tensorflow_text as tf_text from tensorflow import keras +from keras_nlp.models.preprocessor import Preprocessor from keras_nlp.models.roberta.roberta_presets import backbone_presets from keras_nlp.models.roberta.roberta_tokenizer import RobertaTokenizer from keras_nlp.utils.keras_utils import ( @@ -31,7 +32,7 @@ @keras.utils.register_keras_serializable(package="keras_nlp") -class RobertaPreprocessor(keras.layers.Layer): +class RobertaPreprocessor(Preprocessor): """RoBERTa preprocessing layer which tokenizes and packs inputs. This preprocessing layer will do three things: @@ -77,32 +78,8 @@ class RobertaPreprocessor(keras.layers.Layer): Examples: ```python - vocab = { - "<s>": 0, - "<pad>": 1, - "</s>": 2, - "reful": 3, - "gent": 4, - "Ġafter": 5, - "noon": 6, - "Ġsun": 7, - "Ġbright": 8, - "Ġnight": 9, - "Ġmoon": 10, - } - merges = ["Ġ a", "Ġ m", "Ġ s", "Ġ b", "Ġ n", "r e", "f u", "g e", "n t"] - merges += ["e r", "n o", "o n", "i g", "h t"] - merges += ["Ġs u", "Ġa f", "Ġm o", "Ġb r","ge nt", "no on", "re fu", "ig ht"] - merges += ["Ġn ight", "Ġsu n", "Ġaf t", "Ġmo on", "Ġbr ight", "refu l", "Ġaft er"] - - tokenizer = keras_nlp.models.RobertaTokenizer( - vocabulary=vocab, - merges=merges, - ) - preprocessor = keras_nlp.models.RobertaPreprocessor( - tokenizer=tokenizer, - sequence_length=20, - ) + # Load the preprocessor from a preset. + preprocessor = keras_nlp.models.RobertaPreprocessor.from_preset("roberta_base_en") # Tokenize and pack a single sentence.
sentence = tf.constant(" afternoon sun") @@ -152,6 +129,28 @@ class RobertaPreprocessor(keras.layers.Layer): lambda s1, s2: preprocessor(x=(s1, s2)), num_parallel_calls=tf.data.AUTOTUNE, ) + + # Alternatively, you can create a preprocessor from your own vocabulary. + # The usage is exactly the same as above. + vocab = { + "<s>": 0, + "<pad>": 1, + "</s>": 2, + "Ġafter": 5, + "noon": 6, + "Ġsun": 7, + } + merges = ["Ġ a", "Ġ s", "Ġ n", "e r", "n o", "o n", "Ġs u", "Ġa f", "no on"] + merges += ["Ġsu n", "Ġaf t", "Ġaft er"] + + tokenizer = keras_nlp.models.RobertaTokenizer( + vocabulary=vocab, + merges=merges, + ) + preprocessor = keras_nlp.models.RobertaPreprocessor( + tokenizer=tokenizer, + sequence_length=20, + ) ``` """ @@ -165,7 +164,6 @@ def __init__( super().__init__(**kwargs) self._tokenizer = tokenizer - self.packer = RobertaMultiSegmentPacker( start_value=self.tokenizer.start_token_id, end_value=self.tokenizer.end_token_id, @@ -174,28 +172,16 @@ def __init__( sequence_length=sequence_length, ) - @property - def tokenizer(self): - """The `keras_nlp.models.RobertaTokenizer` used to tokenize strings.""" - return self._tokenizer - def get_config(self): config = super().get_config() config.update( { - "tokenizer": keras.layers.serialize(self.tokenizer), "sequence_length": self.packer.sequence_length, "truncate": self.packer.truncate, } ) return config - @classmethod - def from_config(cls, config): - if "tokenizer" in config and isinstance(config["tokenizer"], dict): - config["tokenizer"] = keras.layers.deserialize(config["tokenizer"]) - return cls(**config) - def call(self, x, y=None, sample_weight=None): x = convert_inputs_to_list_of_tensor_segments(x) x = [self.tokenizer(segment) for segment in x] @@ -206,83 +192,27 @@ def call(self, x, y=None, sample_weight=None): } return pack_x_y_sample_weight(x, y, sample_weight) + @classproperty + def tokenizer_cls(cls): + return RobertaTokenizer + @classproperty def presets(cls): return copy.deepcopy(backbone_presets) @classmethod - @format_docstring(names=", ".join(backbone_presets)) - def from_preset( - cls, - preset, - sequence_length=None, - truncate="round_robin", - **kwargs, - ): - """Instantiate RoBERTa preprocessor from preset architecture. - - Args: - preset: string. Must be one of {{names}}. - sequence_length: int, optional. The length of the packed inputs. - Must be equal to or smaller than the `max_sequence_length` of - the preset. If left as default, the `max_sequence_length` of - the preset will be used. - truncate: string. The algorithm to truncate a list of batched - segments to fit within `sequence_length`. The value can be - either `round_robin` or `waterfall`: - - `"round_robin"`: Available space is assigned one token at - a time in a round-robin fashion to the inputs that still - need some, until the limit is reached. - - `"waterfall"`: The allocation of the budget is done using - a "waterfall" algorithm that allocates quota in a - left-to-right manner and fills up the buckets until we - run out of budget. It supports an arbitrary number of - segments.
- - Examples: - ```python - # Load preprocessor from preset - preprocessor = keras_nlp.models.RobertPreprocessor.from_preset( - "roberta_base_en", - ) - preprocessor("The quick brown fox jumped.") + def from_preset(cls, preset, **kwargs): + return super().from_preset(preset, **kwargs) - # Override sequence_length - preprocessor = keras_nlp.models.BertPreprocessor.from_preset( - "roberta_base_en", - sequence_length=64 - ) - preprocessor("The quick brown fox jumped.") - ``` - """ - if preset not in cls.presets: - raise ValueError( - "`preset` must be one of " - f"""{", ".join(cls.presets)}. Received: {preset}.""" - ) - tokenizer = RobertaTokenizer.from_preset(preset) - - # Use model's `max_sequence_length` if `sequence_length` unspecified; - # otherwise check that `sequence_length` not too long. - metadata = cls.presets[preset] - max_sequence_length = metadata["config"]["max_sequence_length"] - if sequence_length is not None: - if sequence_length > max_sequence_length: - raise ValueError( - f"`sequence_length` cannot be longer than `{preset}` " - f"preset's `max_sequence_length` of {max_sequence_length}. " - f"Received: {sequence_length}." - ) - else: - sequence_length = max_sequence_length - - return cls( - tokenizer=tokenizer, - sequence_length=sequence_length, - truncate=truncate, - **kwargs, - ) +RobertaPreprocessor.from_preset.__func__.__doc__ = ( + Preprocessor.from_preset.__doc__ +) +format_docstring( + preprocessor_name=RobertaPreprocessor.__name__, + example_preset_name="roberta_base_en", + preset_names='", "'.join(RobertaPreprocessor.presets), +)(RobertaPreprocessor.from_preset.__func__) # TODO: This is a temporary, unexported layer until we find a way to make the diff --git a/keras_nlp/models/roberta/roberta_preprocessor_test.py b/keras_nlp/models/roberta/roberta_preprocessor_test.py index d4c6653e20..2045408547 100644 --- a/keras_nlp/models/roberta/roberta_preprocessor_test.py +++ b/keras_nlp/models/roberta/roberta_preprocessor_test.py @@ -21,7 +21,7 @@ from tensorflow import keras from keras_nlp.models.roberta.roberta_preprocessor import RobertaPreprocessor -from keras_nlp.models.roberta.roberta_preprocessor import RobertaTokenizer +from keras_nlp.models.roberta.roberta_tokenizer import RobertaTokenizer class RobertaPreprocessorTest(tf.test.TestCase, parameterized.TestCase): diff --git a/keras_nlp/models/roberta/roberta_tokenizer_test.py b/keras_nlp/models/roberta/roberta_tokenizer_test.py index 558ca100fb..0d09b48451 100644 --- a/keras_nlp/models/roberta/roberta_tokenizer_test.py +++ b/keras_nlp/models/roberta/roberta_tokenizer_test.py @@ -20,7 +20,7 @@ from absl.testing import parameterized from tensorflow import keras -from keras_nlp.models.roberta.roberta_preprocessor import RobertaTokenizer +from keras_nlp.models.roberta.roberta_tokenizer import RobertaTokenizer class RobertaTokenizerTest(tf.test.TestCase, parameterized.TestCase): diff --git a/keras_nlp/models/xlm_roberta/xlm_roberta_classifier_test.py b/keras_nlp/models/xlm_roberta/xlm_roberta_classifier_test.py index 17e30618bd..eaf83147a3 100644 --- a/keras_nlp/models/xlm_roberta/xlm_roberta_classifier_test.py +++ b/keras_nlp/models/xlm_roberta/xlm_roberta_classifier_test.py @@ -28,7 +28,7 @@ from keras_nlp.models.xlm_roberta.xlm_roberta_preprocessor import ( XLMRobertaPreprocessor, ) -from keras_nlp.models.xlm_roberta.xlm_roberta_preprocessor import ( +from keras_nlp.models.xlm_roberta.xlm_roberta_tokenizer import ( XLMRobertaTokenizer, ) diff --git a/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor.py 
b/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor.py index ff9ef35914..a21bddae10 100644 --- a/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor.py +++ b/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor.py @@ -18,6 +18,7 @@ from tensorflow import keras +from keras_nlp.models.preprocessor import Preprocessor from keras_nlp.models.roberta.roberta_preprocessor import ( RobertaMultiSegmentPacker, ) @@ -34,7 +35,7 @@ @keras.utils.register_keras_serializable(package="keras_nlp") -class XLMRobertaPreprocessor(keras.layers.Layer): +class XLMRobertaPreprocessor(Preprocessor): """XLM-RoBERTa preprocessing layer. This preprocessing layer will do three things: @@ -80,11 +81,8 @@ class XLMRobertaPreprocessor(keras.layers.Layer): Examples: ```python - tokenizer = keras_nlp.models.XLMRobertaTokenizer(proto="model.spm") - preprocessor = keras_nlp.models.XLMRobertaPreprocessor( - tokenizer=tokenizer, - sequence_length=10, - ) + # Load the preprocessor from a preset. + preprocessor = keras_nlp.models.XLMRobertaPreprocessor.from_preset("xlm_roberta_base_multi") # Tokenize and pack a single sentence. sentence = tf.constant("The quick brown fox jumped.") @@ -144,6 +142,14 @@ class XLMRobertaPreprocessor(keras.layers.Layer): lambda s1, s2: preprocessor(x=(s1, s2)), num_parallel_calls=tf.data.AUTOTUNE, ) + + # Alternatively, you can create a preprocessor from your own vocabulary. + # The usage is exactly the same as above. + tokenizer = keras_nlp.models.XLMRobertaTokenizer(proto="model.spm") + preprocessor = keras_nlp.models.XLMRobertaPreprocessor( + tokenizer=tokenizer, + sequence_length=10, + ) ``` """ @@ -166,28 +172,16 @@ def __init__( sequence_length=sequence_length, ) - @property - def tokenizer(self): - """The `keras_nlp.models.XLMRobertaTokenizer` used to tokenize strings.""" - return self._tokenizer - def get_config(self): config = super().get_config() config.update( { - "tokenizer": keras.layers.serialize(self.tokenizer), "sequence_length": self.packer.sequence_length, "truncate": self.packer.truncate, } ) return config - @classmethod - def from_config(cls, config): - if "tokenizer" in config: - config["tokenizer"] = keras.layers.deserialize(config["tokenizer"]) - return cls(**config) - def call(self, x, y=None, sample_weight=None): x = convert_inputs_to_list_of_tensor_segments(x) x = [self.tokenizer(segment) for segment in x] @@ -198,80 +192,24 @@ def call(self, x, y=None, sample_weight=None): } return pack_x_y_sample_weight(x, y, sample_weight) + @classproperty + def tokenizer_cls(cls): + return XLMRobertaTokenizer + @classproperty def presets(cls): return copy.deepcopy(backbone_presets) @classmethod - @format_docstring(names=", ".join(backbone_presets)) - def from_preset( - cls, - preset, - sequence_length=None, - truncate="round_robin", - **kwargs, - ): - """Instantiate XLM-RoBERTa preprocessor from preset architecture. - - Args: - preset: string. Must be one of {{names}}. - sequence_length: int, optional. The length of the packed inputs. - Must be equal to or smaller than the `max_sequence_length` of - the preset. If left as default, the `max_sequence_length` of - the preset will be used. - truncate: string. The algorithm to truncate a list of batched - segments to fit within `sequence_length`. The value can be - either `round_robin` or `waterfall`: - - `"round_robin"`: Available space is assigned one token at - a time in a round-robin fashion to the inputs that still - need some, until the limit is reached. 
- - `"waterfall"`: The allocation of the budget is done using - a "waterfall" algorithm that allocates quota in a - left-to-right manner and fills up the buckets until we - run out of budget. It supports an arbitrary number of - segments. - - Examples: - ```python - # Load preprocessor from preset - preprocessor = keras_nlp.models.XLMRobertaPreprocessor.from_preset( - "xlm_roberta_base_multi", - ) - preprocessor("The quick brown fox jumped.") + def from_preset(cls, preset, **kwargs): + return super().from_preset(preset, **kwargs) - # Override sequence_length - preprocessor = keras_nlp.models.XLMRobertaPreprocessor.from_preset( - "xlm_roberta_base_multi", - sequence_length=64 - ) - preprocessor("The quick brown fox jumped.") - ``` - """ - if preset not in cls.presets: - raise ValueError( - "`preset` must be one of " - f"""{", ".join(cls.presets)}. Received: {preset}.""" - ) - - tokenizer = XLMRobertaTokenizer.from_preset(preset) - # Use model's `max_sequence_length` if `sequence_length` unspecified; - # otherwise check that `sequence_length` not too long. - metadata = cls.presets[preset] - max_sequence_length = metadata["config"]["max_sequence_length"] - if sequence_length is not None: - if sequence_length > max_sequence_length: - raise ValueError( - f"`sequence_length` cannot be longer than `{preset}` " - f"preset's `max_sequence_length` of {max_sequence_length}. " - f"Received: {sequence_length}." - ) - else: - sequence_length = max_sequence_length - - return cls( - tokenizer=tokenizer, - sequence_length=sequence_length, - truncate=truncate, - **kwargs, - ) +XLMRobertaPreprocessor.from_preset.__func__.__doc__ = ( + Preprocessor.from_preset.__doc__ +) +format_docstring( + preprocessor_name=XLMRobertaPreprocessor.__name__, + example_preset_name="xlm_roberta_base_multi", + preset_names='", "'.join(XLMRobertaPreprocessor.presets), +)(XLMRobertaPreprocessor.from_preset.__func__) diff --git a/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor_test.py b/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor_test.py index 279d43cdcb..8074aa1d8d 100644 --- a/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor_test.py +++ b/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor_test.py @@ -25,7 +25,7 @@ from keras_nlp.models.xlm_roberta.xlm_roberta_preprocessor import ( XLMRobertaPreprocessor, ) -from keras_nlp.models.xlm_roberta.xlm_roberta_preprocessor import ( +from keras_nlp.models.xlm_roberta.xlm_roberta_tokenizer import ( XLMRobertaTokenizer, ) diff --git a/keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer_test.py b/keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer_test.py index 8d1c6c9213..095f92060e 100644 --- a/keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer_test.py +++ b/keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer_test.py @@ -22,7 +22,7 @@ from absl.testing import parameterized from tensorflow import keras -from keras_nlp.models.xlm_roberta.xlm_roberta_preprocessor import ( +from keras_nlp.models.xlm_roberta.xlm_roberta_tokenizer import ( XLMRobertaTokenizer, )
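Reviewer sketch (not part of this diff): a minimal, self-contained illustration of the three hooks a model preprocessor supplies after this refactor — `tokenizer_cls`, `presets`, and the post-hoc `format_docstring` templating of `from_preset` — mirroring what the BERT, DeBERTa, DistilBERT, RoBERTa, and XLM-RoBERTa files now do. `StubTokenizer`, `STUB_PRESETS`, and `"stub_base_en"` are hypothetical placeholders, and the sketch assumes `classproperty` and `format_docstring` are the same helpers the touched model files already import from `keras_nlp.utils.python_utils`.

```python
# Sketch only: how a model preprocessor plugs into the new shared base class.
# `StubTokenizer`, `STUB_PRESETS`, and "stub_base_en" are placeholders.
import copy

from keras_nlp.models.preprocessor import Preprocessor
from keras_nlp.utils.python_utils import classproperty
from keras_nlp.utils.python_utils import format_docstring


class StubTokenizer:
    """Hypothetical stand-in for a real tokenizer such as `BertTokenizer`."""

    @classmethod
    def from_preset(cls, preset, **kwargs):
        return cls()


# Hypothetical preset metadata, shaped like the entries `from_preset` reads.
STUB_PRESETS = {
    "stub_base_en": {"config": {"max_sequence_length": 512}},
}


class StubPreprocessor(Preprocessor):
    def __init__(self, tokenizer, sequence_length=512, **kwargs):
        super().__init__(**kwargs)
        self._tokenizer = tokenizer
        self.sequence_length = sequence_length

    @classproperty
    def tokenizer_cls(cls):
        # `Preprocessor.from_preset` calls `cls.tokenizer_cls.from_preset(preset)`.
        return StubTokenizer

    @classproperty
    def presets(cls):
        # Consulted for preset validation and the `max_sequence_length` check.
        return copy.deepcopy(STUB_PRESETS)

    @classmethod
    def from_preset(cls, preset, **kwargs):
        # Thin override so this class carries its own templated docstring.
        return super().from_preset(preset, **kwargs)


# Fill in the shared docstring template after the class body, as this diff
# does for each model preprocessor.
StubPreprocessor.from_preset.__func__.__doc__ = Preprocessor.from_preset.__doc__
format_docstring(
    preprocessor_name=StubPreprocessor.__name__,
    example_preset_name="stub_base_en",
    preset_names='", "'.join(StubPreprocessor.presets),
)(StubPreprocessor.from_preset.__func__)

# `StubPreprocessor.from_preset("stub_base_en")` now resolves the tokenizer,
# validates `sequence_length`, and builds the layer entirely via the base class.
```

Because the docstring is assigned and templated after the class body, each subclass's `from_preset` help text lists its own preset names, while preset validation, tokenizer construction, and `sequence_length` handling live once in `Preprocessor.from_preset`.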