diff --git a/keras_nlp/models/albert/albert_preprocessor.py b/keras_nlp/models/albert/albert_preprocessor.py
index 61d37c34d2..e34fe54536 100644
--- a/keras_nlp/models/albert/albert_preprocessor.py
+++ b/keras_nlp/models/albert/albert_preprocessor.py
@@ -16,6 +16,8 @@
from tensorflow import keras
from keras_nlp.layers.multi_segment_packer import MultiSegmentPacker
+from keras_nlp.models.albert.albert_tokenizer import AlbertTokenizer
+from keras_nlp.models.preprocessor import Preprocessor
from keras_nlp.utils.keras_utils import (
convert_inputs_to_list_of_tensor_segments,
)
@@ -24,7 +26,7 @@
@keras.utils.register_keras_serializable(package="keras_nlp")
-class AlbertPreprocessor(keras.layers.Layer):
+class AlbertPreprocessor(Preprocessor):
"""An ALBERT preprocessing layer which tokenizes and packs inputs.
This preprocessing layer will do three things:
@@ -153,28 +155,16 @@ def __init__(
sequence_length=sequence_length,
)
- @property
- def tokenizer(self):
- """The `keras_nlp.models.AlbertTokenizer` used to tokenize strings."""
- return self._tokenizer
-
def get_config(self):
config = super().get_config()
config.update(
{
- "tokenizer": keras.layers.serialize(self.tokenizer),
"sequence_length": self.packer.sequence_length,
"truncate": self.packer.truncate,
}
)
return config
- @classmethod
- def from_config(cls, config):
- if "tokenizer" in config and isinstance(config["tokenizer"], dict):
- config["tokenizer"] = keras.layers.deserialize(config["tokenizer"])
- return cls(**config)
-
def call(self, x, y=None, sample_weight=None):
x = convert_inputs_to_list_of_tensor_segments(x)
x = [self.tokenizer(segment) for segment in x]
@@ -187,15 +177,5 @@ def call(self, x, y=None, sample_weight=None):
return pack_x_y_sample_weight(x, y, sample_weight)
@classproperty
- def presets(cls):
- return {}
-
- @classmethod
- def from_preset(
- cls,
- preset,
- sequence_length=None,
- truncate="round_robin",
- **kwargs,
- ):
- raise NotImplementedError
+ def tokenizer_cls(cls):
+ return AlbertTokenizer
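
To make the pattern in this hunk concrete, here is a small standalone sketch, not the keras_nlp code itself, of how a `classproperty`-backed `tokenizer_cls` hook works: the subclass only declares which tokenizer it pairs with, and a shared base can read that attribute at class level. All names below are illustrative stand-ins.

```python
# Illustrative stand-ins only; the real classes live in keras_nlp.
class classproperty(property):
    """Minimal class-level property, mirroring keras_nlp.utils.python_utils."""

    def __get__(self, _, owner_cls):
        return self.fget(owner_cls)


class BasePreprocessor:
    @classproperty
    def tokenizer_cls(cls):
        return None

    @classproperty
    def presets(cls):
        return {}

    @classmethod
    def from_preset(cls, preset, **kwargs):
        if not cls.presets:
            # Mirrors the new base Preprocessor: subclasses without presets
            # (like AlbertPreprocessor in this hunk) fail loudly.
            raise NotImplementedError(
                "No presets have been created for this class."
            )


class ToyAlbertPreprocessor(BasePreprocessor):
    # The only override the refactor needs: which tokenizer class to pair with.
    @classproperty
    def tokenizer_cls(cls):
        return str  # stand-in for AlbertTokenizer


print(ToyAlbertPreprocessor.tokenizer_cls)  # <class 'str'>
```

Reading `tokenizer_cls` without an instance is what lets the shared `from_preset` call `cls.tokenizer_cls.from_preset(preset)` generically, as the new base class later in this diff does.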
diff --git a/keras_nlp/models/bert/bert_preprocessor.py b/keras_nlp/models/bert/bert_preprocessor.py
index f376c47f87..b1ce9c3293 100644
--- a/keras_nlp/models/bert/bert_preprocessor.py
+++ b/keras_nlp/models/bert/bert_preprocessor.py
@@ -21,6 +21,7 @@
from keras_nlp.models.bert.bert_presets import backbone_presets
from keras_nlp.models.bert.bert_presets import classifier_presets
from keras_nlp.models.bert.bert_tokenizer import BertTokenizer
+from keras_nlp.models.preprocessor import Preprocessor
from keras_nlp.utils.keras_utils import (
convert_inputs_to_list_of_tensor_segments,
)
@@ -32,7 +33,7 @@
@keras.utils.register_keras_serializable(package="keras_nlp")
-class BertPreprocessor(keras.layers.Layer):
+class BertPreprocessor(Preprocessor):
"""A BERT preprocessing layer which tokenizes and packs inputs.
This preprocessing layer will do three things:
@@ -76,13 +77,8 @@ class BertPreprocessor(keras.layers.Layer):
Examples:
```python
- vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]"]
- vocab += ["The", "qu", "##ick", "br", "##own", "fox", "tripped"]
- vocab += ["Call", "me", "Ish", "##mael", "."]
- vocab += ["Oh", "look", "a", "whale"]
- vocab += ["I", "forgot", "my", "home", "##work"]
- tokenizer = keras_nlp.models.BertTokenizer(vocabulary=vocab)
- preprocessor = keras_nlp.models.BertPreprocessor(tokenizer)
+ # Load the preprocessor from a preset.
+ preprocessor = keras_nlp.models.BertPreprocessor.from_preset("bert_base_en_uncased")
# Tokenize and pack a single sentence.
sentence = tf.constant("The quick brown fox jumped.")
@@ -142,6 +138,16 @@ class BertPreprocessor(keras.layers.Layer):
lambda s1, s2: preprocessor(x=(s1, s2)),
num_parallel_calls=tf.data.AUTOTUNE,
)
+
+ # Alternatively, you can create a preprocessor from your own vocabulary.
+ # The usage is exactly the same as shown above.
+ vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]"]
+ vocab += ["The", "qu", "##ick", "br", "##own", "fox", "tripped"]
+ vocab += ["Call", "me", "Ish", "##mael", "."]
+ vocab += ["Oh", "look", "a", "whale"]
+ vocab += ["I", "forgot", "my", "home", "##work"]
+ tokenizer = keras_nlp.models.BertTokenizer(vocabulary=vocab)
+ preprocessor = keras_nlp.models.BertPreprocessor(tokenizer)
```
"""
@@ -162,28 +168,16 @@ def __init__(
sequence_length=sequence_length,
)
- @property
- def tokenizer(self):
- """The `keras_nlp.models.BertTokenizer` used to tokenize strings."""
- return self._tokenizer
-
def get_config(self):
config = super().get_config()
config.update(
{
- "tokenizer": keras.layers.serialize(self.tokenizer),
"sequence_length": self.packer.sequence_length,
"truncate": self.packer.truncate,
}
)
return config
- @classmethod
- def from_config(cls, config):
- if "tokenizer" in config and isinstance(config["tokenizer"], dict):
- config["tokenizer"] = keras.layers.deserialize(config["tokenizer"])
- return cls(**config)
-
def call(self, x, y=None, sample_weight=None):
x = convert_inputs_to_list_of_tensor_segments(x)
x = [self.tokenizer(segment) for segment in x]
@@ -195,85 +189,22 @@ def call(self, x, y=None, sample_weight=None):
}
return pack_x_y_sample_weight(x, y, sample_weight)
+ @classproperty
+ def tokenizer_cls(cls):
+ return BertTokenizer
+
@classproperty
def presets(cls):
return copy.deepcopy({**backbone_presets, **classifier_presets})
@classmethod
- @format_docstring(names=PRESET_NAMES)
- def from_preset(
- cls,
- preset,
- sequence_length=None,
- truncate="round_robin",
- **kwargs,
- ):
- """Instantiate BERT preprocessor from preset architecture.
-
- Args:
- preset: string. Must be one of {{names}}.
- sequence_length: int, optional. The length of the packed inputs.
- Must be equal to or smaller than the `max_sequence_length` of
- the preset. If left as default, the `max_sequence_length` of
- the preset will be used.
- truncate: string. The algorithm to truncate a list of batched
- segments to fit within `sequence_length`. The value can be
- either `round_robin` or `waterfall`:
- - `"round_robin"`: Available space is assigned one token at
- a time in a round-robin fashion to the inputs that still
- need some, until the limit is reached.
- - `"waterfall"`: The allocation of the budget is done using
- a "waterfall" algorithm that allocates quota in a
- left-to-right manner and fills up the buckets until we
- run out of budget. It supports an arbitrary number of
- segments.
+ def from_preset(cls, preset, **kwargs):
+ return super().from_preset(preset, **kwargs)
- Examples:
- ```python
- # Load preprocessor from preset
- preprocessor = keras_nlp.models.BertPreprocessor.from_preset(
- "bert_base_en_uncased",
- )
- preprocessor("The quick brown fox jumped.")
-
- # Override sequence_length
- preprocessor = keras_nlp.models.BertPreprocessor.from_preset(
- "bert_base_en_uncased",
- sequence_length=64
- )
- preprocessor("The quick brown fox jumped.")
- ```
- """
- if preset not in cls.presets:
- raise ValueError(
- "`preset` must be one of "
- f"""{", ".join(cls.presets)}. Received: {preset}."""
- )
-
- tokenizer = BertTokenizer.from_preset(preset)
- # Use model's `max_sequence_length` if `sequence_length` unspecified;
- # otherwise check that `sequence_length` not too long.
- metadata = cls.presets[preset]
- if preset in backbone_presets:
- backbone_config = metadata["config"]
- else:
- # For task model presets, the backbone config is nested.
- backbone_config = metadata["config"]["backbone"]["config"]
- max_sequence_length = backbone_config["max_sequence_length"]
- if sequence_length is not None:
- if sequence_length > max_sequence_length:
- raise ValueError(
- f"`sequence_length` cannot be longer than `{preset}` "
- f"preset's `max_sequence_length` of {max_sequence_length}. "
- f"Received: {sequence_length}."
- )
- else:
- sequence_length = max_sequence_length
-
- return cls(
- tokenizer=tokenizer,
- sequence_length=sequence_length,
- truncate=truncate,
- **kwargs,
- )
+BertPreprocessor.from_preset.__func__.__doc__ = Preprocessor.from_preset.__doc__
+format_docstring(
+ preprocessor_name=BertPreprocessor.__name__,
+ example_preset_name="bert_base_en_uncased",
+ preset_names='", "'.join(BertPreprocessor.presets),
+)(BertPreprocessor.from_preset.__func__)
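
The two statements appended after the class body above are what keep a fully rendered docstring on each subclass's `from_preset` without repeating it per model. A rough sketch of that trick follows; `format_docstring` here is written as an assumed placeholder-substituting helper, not the real keras_nlp utility.

```python
def format_docstring(**replacements):
    """Assumed behavior: substitute `{{key}}` markers in a function's __doc__."""

    def decorate(fn):
        doc = fn.__doc__ or ""
        for key, value in replacements.items():
            doc = doc.replace("{{" + key + "}}", str(value))
        fn.__doc__ = doc
        return fn

    return decorate


class Base:
    @classmethod
    def from_preset(cls, preset):
        """Instantiate {{preprocessor_name}} from preset architecture."""
        return None


class BertLike(Base):
    @classmethod
    def from_preset(cls, preset):
        return super().from_preset(preset)


# Same two-step move as above: copy the templated docstring onto the
# subclass's function, then fill in the placeholders. `.__func__` is needed
# because a bound method's `__doc__` attribute is read-only.
BertLike.from_preset.__func__.__doc__ = Base.from_preset.__doc__
format_docstring(preprocessor_name="BertLike")(BertLike.from_preset.__func__)

print(BertLike.from_preset.__doc__)
# -> Instantiate BertLike from preset architecture.
```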
diff --git a/keras_nlp/models/deberta_v3/deberta_v3_classifier_test.py b/keras_nlp/models/deberta_v3/deberta_v3_classifier_test.py
index 81645c9dbe..775b254fe3 100644
--- a/keras_nlp/models/deberta_v3/deberta_v3_classifier_test.py
+++ b/keras_nlp/models/deberta_v3/deberta_v3_classifier_test.py
@@ -28,9 +28,7 @@
from keras_nlp.models.deberta_v3.deberta_v3_preprocessor import (
DebertaV3Preprocessor,
)
-from keras_nlp.models.deberta_v3.deberta_v3_preprocessor import (
- DebertaV3Tokenizer,
-)
+from keras_nlp.models.deberta_v3.deberta_v3_tokenizer import DebertaV3Tokenizer
class DebertaV3ClassifierTest(tf.test.TestCase, parameterized.TestCase):
diff --git a/keras_nlp/models/deberta_v3/deberta_v3_preprocessor.py b/keras_nlp/models/deberta_v3/deberta_v3_preprocessor.py
index 649318ffdc..9050c20292 100644
--- a/keras_nlp/models/deberta_v3/deberta_v3_preprocessor.py
+++ b/keras_nlp/models/deberta_v3/deberta_v3_preprocessor.py
@@ -20,6 +20,7 @@
from keras_nlp.layers.multi_segment_packer import MultiSegmentPacker
from keras_nlp.models.deberta_v3.deberta_v3_presets import backbone_presets
from keras_nlp.models.deberta_v3.deberta_v3_tokenizer import DebertaV3Tokenizer
+from keras_nlp.models.preprocessor import Preprocessor
from keras_nlp.utils.keras_utils import (
convert_inputs_to_list_of_tensor_segments,
)
@@ -29,7 +30,7 @@
@keras.utils.register_keras_serializable(package="keras_nlp")
-class DebertaV3Preprocessor(keras.layers.Layer):
+class DebertaV3Preprocessor(Preprocessor):
"""A DeBERTa preprocessing layer which tokenizes and packs inputs.
This preprocessing layer will do three things:
@@ -73,11 +74,8 @@ class DebertaV3Preprocessor(keras.layers.Layer):
Examples:
```python
- tokenizer = keras_nlp.models.DebertaV3Tokenizer(proto="model.spm")
- preprocessor = keras_nlp.models.DebertaV3Preprocessor(
- tokenizer=tokenizer,
- sequence_length=10,
- )
+ # Load the preprocessor from a preset.
+ preprocessor = keras_nlp.models.DebertaV3Preprocessor.from_preset("deberta_v3_base_en")
# Tokenize and pack a single sentence.
sentence = tf.constant("The quick brown fox jumped.")
@@ -137,6 +135,14 @@ class DebertaV3Preprocessor(keras.layers.Layer):
lambda s1, s2: preprocessor(x=(s1, s2)),
num_parallel_calls=tf.data.AUTOTUNE,
)
+
+ # Alternatively, you can create a preprocessor from your own vocabulary.
+ # The usage is exactly the same as above.
+ tokenizer = keras_nlp.models.DebertaV3Tokenizer(proto="model.spm")
+ preprocessor = keras_nlp.models.DebertaV3Preprocessor(
+ tokenizer=tokenizer,
+ sequence_length=10,
+ )
```
"""
@@ -157,28 +163,16 @@ def __init__(
sequence_length=sequence_length,
)
- @property
- def tokenizer(self):
- """The `keras_nlp.models.DebertaV3Tokenizer` used to tokenize strings."""
- return self._tokenizer
-
def get_config(self):
config = super().get_config()
config.update(
{
- "tokenizer": keras.layers.serialize(self.tokenizer),
"sequence_length": self.packer.sequence_length,
"truncate": self.packer.truncate,
}
)
return config
- @classmethod
- def from_config(cls, config):
- if "tokenizer" in config and isinstance(config["tokenizer"], dict):
- config["tokenizer"] = keras.layers.deserialize(config["tokenizer"])
- return cls(**config)
-
def call(self, x, y=None, sample_weight=None):
x = convert_inputs_to_list_of_tensor_segments(x)
x = [self.tokenizer(segment) for segment in x]
@@ -189,80 +183,24 @@ def call(self, x, y=None, sample_weight=None):
}
return pack_x_y_sample_weight(x, y, sample_weight)
+ @classproperty
+ def tokenizer_cls(cls):
+ return DebertaV3Tokenizer
+
@classproperty
def presets(cls):
return copy.deepcopy(backbone_presets)
@classmethod
- @format_docstring(names=", ".join(backbone_presets))
- def from_preset(
- cls,
- preset,
- sequence_length=None,
- truncate="round_robin",
- **kwargs,
- ):
- """Instantiate DeBERTa preprocessor from preset architecture.
-
- Args:
- preset: string. Must be one of {{names}}.
- sequence_length: int, optional. The length of the packed inputs.
- Must be equal to or smaller than the `max_sequence_length` of
- the preset. If left as default, the `max_sequence_length` of
- the preset will be used.
- truncate: string. The algorithm to truncate a list of batched
- segments to fit within `sequence_length`. The value can be
- either `round_robin` or `waterfall`:
- - `"round_robin"`: Available space is assigned one token at
- a time in a round-robin fashion to the inputs that still
- need some, until the limit is reached.
- - `"waterfall"`: The allocation of the budget is done using
- a "waterfall" algorithm that allocates quota in a
- left-to-right manner and fills up the buckets until we
- run out of budget. It supports an arbitrary number of
- segments.
-
- Examples:
- ```python
- # Load preprocessor from preset
- preprocessor = keras_nlp.models.DebertaV3Preprocessor.from_preset(
- "deberta_v3_base_en",
- )
- preprocessor("The quick brown fox jumped.")
+ def from_preset(cls, preset, **kwargs):
+ return super().from_preset(preset, **kwargs)
- # Override sequence_length
- preprocessor = keras_nlp.models.DebertaV3Preprocessor.from_preset(
- "deberta_v3_base_en",
- sequence_length=64
- )
- preprocessor("The quick brown fox jumped.")
- ```
- """
- if preset not in cls.presets:
- raise ValueError(
- "`preset` must be one of "
- f"""{", ".join(cls.presets)}. Received: {preset}."""
- )
-
- tokenizer = DebertaV3Tokenizer.from_preset(preset)
- # Use model's `max_sequence_length` if `sequence_length` unspecified;
- # otherwise check that `sequence_length` not too long.
- metadata = cls.presets[preset]
- max_sequence_length = metadata["config"]["max_sequence_length"]
- if sequence_length is not None:
- if sequence_length > max_sequence_length:
- raise ValueError(
- f"`sequence_length` cannot be longer than `{preset}` "
- f"preset's `max_sequence_length` of {max_sequence_length}. "
- f"Received: {sequence_length}."
- )
- else:
- sequence_length = max_sequence_length
-
- return cls(
- tokenizer=tokenizer,
- sequence_length=sequence_length,
- truncate=truncate,
- **kwargs,
- )
+DebertaV3Preprocessor.from_preset.__func__.__doc__ = (
+ Preprocessor.from_preset.__doc__
+)
+format_docstring(
+ preprocessor_name=DebertaV3Preprocessor.__name__,
+ example_preset_name="deberta_v3_base_en",
+ preset_names='", "'.join(DebertaV3Preprocessor.presets),
+)(DebertaV3Preprocessor.from_preset.__func__)
diff --git a/keras_nlp/models/distil_bert/distil_bert_preprocessor.py b/keras_nlp/models/distil_bert/distil_bert_preprocessor.py
index 75cc3cb845..6a190fda3b 100644
--- a/keras_nlp/models/distil_bert/distil_bert_preprocessor.py
+++ b/keras_nlp/models/distil_bert/distil_bert_preprocessor.py
@@ -22,6 +22,7 @@
from keras_nlp.models.distil_bert.distil_bert_tokenizer import (
DistilBertTokenizer,
)
+from keras_nlp.models.preprocessor import Preprocessor
from keras_nlp.utils.keras_utils import (
convert_inputs_to_list_of_tensor_segments,
)
@@ -31,7 +32,7 @@
@keras.utils.register_keras_serializable(package="keras_nlp")
-class DistilBertPreprocessor(keras.layers.Layer):
+class DistilBertPreprocessor(Preprocessor):
"""A DistilBERT preprocessing layer which tokenizes and packs inputs.
This preprocessing layer will do three things:
@@ -75,13 +76,8 @@ class DistilBertPreprocessor(keras.layers.Layer):
Examples:
```python
- vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]"]
- vocab += ["The", "qu", "##ick", "br", "##own", "fox", "tripped"]
- vocab += ["Call", "me", "Ish", "##mael", "."]
- vocab += ["Oh", "look", "a", "whale"]
- vocab += ["I", "forgot", "my", "home", "##work"]
- tokenizer = keras_nlp.models.DistilBertTokenizer(vocabulary=vocab)
- preprocessor = keras_nlp.models.DistilBertPreprocessor(tokenizer)
+ # Load the preprocessor from a preset.
+ preprocessor = keras_nlp.models.DistilBertPreprocessor.from_preset("distil_bert_base_en_uncased")
# Tokenize and pack a single sentence.
sentence = tf.constant("The quick brown fox jumped.")
@@ -141,6 +137,16 @@ class DistilBertPreprocessor(keras.layers.Layer):
lambda s1, s2: preprocessor(x=(s1, s2)),
num_parallel_calls=tf.data.AUTOTUNE,
)
+
+ # Alternatively, you can create a preprocessor from your own vocabulary.
+ # The usage is exactly the same as above.
+ vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]"]
+ vocab += ["The", "qu", "##ick", "br", "##own", "fox", "tripped"]
+ vocab += ["Call", "me", "Ish", "##mael", "."]
+ vocab += ["Oh", "look", "a", "whale"]
+ vocab += ["I", "forgot", "my", "home", "##work"]
+ tokenizer = keras_nlp.models.DistilBertTokenizer(vocabulary=vocab)
+ preprocessor = keras_nlp.models.DistilBertPreprocessor(tokenizer)
```
"""
@@ -161,28 +167,16 @@ def __init__(
sequence_length=sequence_length,
)
- @property
- def tokenizer(self):
- """The `keras_nlp.models.DistilBertTokenizer` used to tokenize strings."""
- return self._tokenizer
-
def get_config(self):
config = super().get_config()
config.update(
{
- "tokenizer": keras.layers.serialize(self.tokenizer),
"sequence_length": self.packer.sequence_length,
"truncate": self.packer.truncate,
}
)
return config
- @classmethod
- def from_config(cls, config):
- if "tokenizer" in config:
- config["tokenizer"] = keras.layers.deserialize(config["tokenizer"])
- return cls(**config)
-
def call(self, x, y=None, sample_weight=None):
x = convert_inputs_to_list_of_tensor_segments(x)
x = [self.tokenizer(segment) for segment in x]
@@ -193,80 +187,24 @@ def call(self, x, y=None, sample_weight=None):
}
return pack_x_y_sample_weight(x, y, sample_weight)
+ @classproperty
+ def tokenizer_cls(cls):
+ return DistilBertTokenizer
+
@classproperty
def presets(cls):
return copy.deepcopy(backbone_presets)
@classmethod
- @format_docstring(names=", ".join(backbone_presets))
- def from_preset(
- cls,
- preset,
- sequence_length=None,
- truncate="round_robin",
- **kwargs,
- ):
- """Instantiate DistilBERT preprocessor from preset architecture.
-
- Args:
- preset: string. Must be one of {{names}}.
- sequence_length: int, optional. The length of the packed inputs.
- Must be equal to or smaller than the `max_sequence_length` of
- the preset. If left as default, the `max_sequence_length` of
- the preset will be used.
- truncate: string. The algorithm to truncate a list of batched
- segments to fit within `sequence_length`. The value can be
- either `round_robin` or `waterfall`:
- - `"round_robin"`: Available space is assigned one token at
- a time in a round-robin fashion to the inputs that still
- need some, until the limit is reached.
- - `"waterfall"`: The allocation of the budget is done using
- a "waterfall" algorithm that allocates quota in a
- left-to-right manner and fills up the buckets until we
- run out of budget. It supports an arbitrary number of
- segments.
-
- Examples:
- ```python
- # Load preprocessor from preset
- preprocessor = keras_nlp.models.DistilBertPreprocessor.from_preset(
- "distil_bert_base_en_uncased",
- )
- preprocessor("The quick brown fox jumped.")
-
- # Override sequence_length
- preprocessor = keras_nlp.models.DistilBertPreprocessor.from_preset(
- "distil_bert_base_en_uncased",
- sequence_length=64
- )
- preprocessor("The quick brown fox jumped.")
- ```
- """
- if preset not in cls.presets:
- raise ValueError(
- "`preset` must be one of "
- f"""{", ".join(cls.presets)}. Received: {preset}."""
- )
-
- tokenizer = DistilBertTokenizer.from_preset(preset)
+ def from_preset(cls, preset, **kwargs):
+ return super().from_preset(preset, **kwargs)
- # Use model's `max_sequence_length` if `sequence_length` unspecified;
- # otherwise check that `sequence_length` not too long.
- metadata = cls.presets[preset]
- max_sequence_length = metadata["config"]["max_sequence_length"]
- if sequence_length is not None:
- if sequence_length > max_sequence_length:
- raise ValueError(
- f"`sequence_length` cannot be longer than `{preset}` "
- f"preset's `max_sequence_length` of {max_sequence_length}. "
- f"Received: {sequence_length}."
- )
- else:
- sequence_length = max_sequence_length
- return cls(
- tokenizer=tokenizer,
- sequence_length=sequence_length,
- truncate=truncate,
- **kwargs,
- )
+DistilBertPreprocessor.from_preset.__func__.__doc__ = (
+ Preprocessor.from_preset.__doc__
+)
+format_docstring(
+ preprocessor_name=DistilBertPreprocessor.__name__,
+ example_preset_name="distil_bert_base_en_uncased",
+ preset_names='", "'.join(DistilBertPreprocessor.presets),
+)(DistilBertPreprocessor.from_preset.__func__)
diff --git a/keras_nlp/models/distil_bert/distil_bert_tokenizer_test.py b/keras_nlp/models/distil_bert/distil_bert_tokenizer_test.py
index ca13f75177..c14cd3d163 100644
--- a/keras_nlp/models/distil_bert/distil_bert_tokenizer_test.py
+++ b/keras_nlp/models/distil_bert/distil_bert_tokenizer_test.py
@@ -19,7 +19,7 @@
from absl.testing import parameterized
from tensorflow import keras
-from keras_nlp.models.distil_bert.distil_bert_preprocessor import (
+from keras_nlp.models.distil_bert.distil_bert_tokenizer import (
DistilBertTokenizer,
)
diff --git a/keras_nlp/models/preprocessor.py b/keras_nlp/models/preprocessor.py
new file mode 100644
index 0000000000..ec99ccc61f
--- /dev/null
+++ b/keras_nlp/models/preprocessor.py
@@ -0,0 +1,112 @@
+# Copyright 2022 The KerasNLP Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from tensorflow import keras
+
+from keras_nlp.utils.python_utils import classproperty
+
+
+@keras.utils.register_keras_serializable(package="keras_nlp")
+class Preprocessor(keras.layers.Layer):
+ """Base class for model preprocessors."""
+
+ @property
+ def tokenizer(self):
+ """The tokenizer used to tokenize strings."""
+ return self._tokenizer
+
+ def get_config(self):
+ config = super().get_config()
+ config["tokenizer"] = keras.layers.serialize(self.tokenizer)
+ return config
+
+ @classmethod
+ def from_config(cls, config):
+ if "tokenizer" in config and isinstance(config["tokenizer"], dict):
+ config["tokenizer"] = keras.layers.deserialize(config["tokenizer"])
+ return cls(**config)
+
+ @classproperty
+ def tokenizer_cls(cls):
+ return None
+
+ @classproperty
+ def presets(cls):
+ return {}
+
+ @classmethod
+ def from_preset(
+ cls,
+ preset,
+ **kwargs,
+ ):
+ """Instantiate {{preprocessor_name}} from preset architecture.
+
+ Args:
+ preset: string. Must be one of "{{preset_names}}".
+
+ Examples:
+ ```python
+ # Load preprocessor from preset
+ preprocessor = keras_nlp.models.{{preprocessor_name}}.from_preset(
+ "{{example_preset_name}}",
+ )
+ preprocessor("The quick brown fox jumped.")
+
+ # Override sequence_length
+ preprocessor = keras_nlp.models.{{preprocessor_name}}.from_preset(
+ "{{example_preset_name}}",
+ sequence_length=64
+ )
+ preprocessor("The quick brown fox jumped.")
+ ```
+ """
+ if not cls.presets:
+ raise NotImplementedError(
+ "No presets have been created for this class."
+ )
+ if preset not in cls.presets:
+ raise ValueError(
+ "`preset` must be one of "
+ f"""{", ".join(cls.presets)}. Received: {preset}."""
+ )
+
+ tokenizer = cls.tokenizer_cls.from_preset(preset)
+
+ metadata = cls.presets[preset]
+ # For task model presets, the backbone config is nested.
+ if "backbone" in metadata["config"]:
+ backbone_config = metadata["config"]["backbone"]["config"]
+ else:
+ backbone_config = metadata["config"]
+
+ # Use model's `max_sequence_length` if `sequence_length` unspecified;
+ # otherwise check that `sequence_length` not too long.
+ sequence_length = kwargs.pop("sequence_length", None)
+ max_sequence_length = backbone_config["max_sequence_length"]
+ if sequence_length is not None:
+ if sequence_length > max_sequence_length:
+ raise ValueError(
+ f"`sequence_length` cannot be longer than `{preset}` "
+ f"preset's `max_sequence_length` of {max_sequence_length}. "
+ f"Received: {sequence_length}."
+ )
+ else:
+ sequence_length = max_sequence_length
+
+ return cls(
+ tokenizer=tokenizer,
+ sequence_length=sequence_length,
+ **kwargs,
+ )
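
A usage sketch of the shared `from_preset` defined above, following the behavior spelled out in this file; it assumes keras_nlp with the `bert_base_en_uncased` preset is installed and that the preset assets can be downloaded.

```python
import tensorflow as tf
import keras_nlp

# `sequence_length` is optional; when given it must not exceed the preset
# backbone's `max_sequence_length` (512 for BERT base, per its config).
preprocessor = keras_nlp.models.BertPreprocessor.from_preset(
    "bert_base_en_uncased",
    sequence_length=64,
)
outputs = preprocessor(tf.constant("The quick brown fox jumped."))
print(outputs["token_ids"].shape)  # (64,)

# Requesting more than `max_sequence_length` hits the ValueError branch above.
try:
    keras_nlp.models.BertPreprocessor.from_preset(
        "bert_base_en_uncased",
        sequence_length=100_000,
    )
except ValueError as err:
    print(err)
```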
diff --git a/keras_nlp/models/roberta/roberta_preprocessor.py b/keras_nlp/models/roberta/roberta_preprocessor.py
index 8eaa1595e0..ac28f49c2d 100644
--- a/keras_nlp/models/roberta/roberta_preprocessor.py
+++ b/keras_nlp/models/roberta/roberta_preprocessor.py
@@ -20,6 +20,7 @@
import tensorflow_text as tf_text
from tensorflow import keras
+from keras_nlp.models.preprocessor import Preprocessor
from keras_nlp.models.roberta.roberta_presets import backbone_presets
from keras_nlp.models.roberta.roberta_tokenizer import RobertaTokenizer
from keras_nlp.utils.keras_utils import (
@@ -31,7 +32,7 @@
@keras.utils.register_keras_serializable(package="keras_nlp")
-class RobertaPreprocessor(keras.layers.Layer):
+class RobertaPreprocessor(Preprocessor):
"""RoBERTa preprocessing layer which tokenizes and packs inputs.
This preprocessing layer will do three things:
@@ -77,32 +78,8 @@ class RobertaPreprocessor(keras.layers.Layer):
Examples:
```python
- vocab = {
- "": 0,
- "": 1,
- "": 2,
- "reful": 3,
- "gent": 4,
- "Ġafter": 5,
- "noon": 6,
- "Ġsun": 7,
- "Ġbright": 8,
- "Ġnight": 9,
- "Ġmoon": 10,
- }
- merges = ["Ġ a", "Ġ m", "Ġ s", "Ġ b", "Ġ n", "r e", "f u", "g e", "n t"]
- merges += ["e r", "n o", "o n", "i g", "h t"]
- merges += ["Ġs u", "Ġa f", "Ġm o", "Ġb r","ge nt", "no on", "re fu", "ig ht"]
- merges += ["Ġn ight", "Ġsu n", "Ġaf t", "Ġmo on", "Ġbr ight", "refu l", "Ġaft er"]
-
- tokenizer = keras_nlp.models.RobertaTokenizer(
- vocabulary=vocab,
- merges=merges,
- )
- preprocessor = keras_nlp.models.RobertaPreprocessor(
- tokenizer=tokenizer,
- sequence_length=20,
- )
+ # Load the preprocessor from a preset.
+ preprocessor = keras_nlp.models.RobertaPreprocessor.from_preset("roberta_base_en")
# Tokenize and pack a single sentence.
sentence = tf.constant(" afternoon sun")
@@ -152,6 +129,28 @@ class RobertaPreprocessor(keras.layers.Layer):
lambda s1, s2: preprocessor(x=(s1, s2)),
num_parallel_calls=tf.data.AUTOTUNE,
)
+
+ # Alternatively, you can create a preprocessor from your own vocabulary.
+ # The usage is exactly the same as above.
+ vocab = {
+ "": 0,
+ "": 1,
+ "": 2,
+ "Ġafter": 5,
+ "noon": 6,
+ "Ġsun": 7,
+ }
+ merges = ["Ġ a", "Ġ s", "Ġ n", "e r", "n o", "o n", "Ġs u", "Ġa f", "no on"]
+ merges += ["Ġsu n", "Ġaf t", "Ġaft er"]
+
+ tokenizer = keras_nlp.models.RobertaTokenizer(
+ vocabulary=vocab,
+ merges=merges,
+ )
+ preprocessor = keras_nlp.models.RobertaPreprocessor(
+ tokenizer=tokenizer,
+ sequence_length=20,
+ )
```
"""
@@ -165,7 +164,6 @@ def __init__(
super().__init__(**kwargs)
self._tokenizer = tokenizer
-
self.packer = RobertaMultiSegmentPacker(
start_value=self.tokenizer.start_token_id,
end_value=self.tokenizer.end_token_id,
@@ -174,28 +172,16 @@ def __init__(
sequence_length=sequence_length,
)
- @property
- def tokenizer(self):
- """The `keras_nlp.models.RobertaTokenizer` used to tokenize strings."""
- return self._tokenizer
-
def get_config(self):
config = super().get_config()
config.update(
{
- "tokenizer": keras.layers.serialize(self.tokenizer),
"sequence_length": self.packer.sequence_length,
"truncate": self.packer.truncate,
}
)
return config
- @classmethod
- def from_config(cls, config):
- if "tokenizer" in config and isinstance(config["tokenizer"], dict):
- config["tokenizer"] = keras.layers.deserialize(config["tokenizer"])
- return cls(**config)
-
def call(self, x, y=None, sample_weight=None):
x = convert_inputs_to_list_of_tensor_segments(x)
x = [self.tokenizer(segment) for segment in x]
@@ -206,83 +192,27 @@ def call(self, x, y=None, sample_weight=None):
}
return pack_x_y_sample_weight(x, y, sample_weight)
+ @classproperty
+ def tokenizer_cls(cls):
+ return RobertaTokenizer
+
@classproperty
def presets(cls):
return copy.deepcopy(backbone_presets)
@classmethod
- @format_docstring(names=", ".join(backbone_presets))
- def from_preset(
- cls,
- preset,
- sequence_length=None,
- truncate="round_robin",
- **kwargs,
- ):
- """Instantiate RoBERTa preprocessor from preset architecture.
-
- Args:
- preset: string. Must be one of {{names}}.
- sequence_length: int, optional. The length of the packed inputs.
- Must be equal to or smaller than the `max_sequence_length` of
- the preset. If left as default, the `max_sequence_length` of
- the preset will be used.
- truncate: string. The algorithm to truncate a list of batched
- segments to fit within `sequence_length`. The value can be
- either `round_robin` or `waterfall`:
- - `"round_robin"`: Available space is assigned one token at
- a time in a round-robin fashion to the inputs that still
- need some, until the limit is reached.
- - `"waterfall"`: The allocation of the budget is done using
- a "waterfall" algorithm that allocates quota in a
- left-to-right manner and fills up the buckets until we
- run out of budget. It supports an arbitrary number of
- segments.
-
- Examples:
- ```python
- # Load preprocessor from preset
- preprocessor = keras_nlp.models.RobertPreprocessor.from_preset(
- "roberta_base_en",
- )
- preprocessor("The quick brown fox jumped.")
+ def from_preset(cls, preset, **kwargs):
+ return super().from_preset(preset, **kwargs)
- # Override sequence_length
- preprocessor = keras_nlp.models.BertPreprocessor.from_preset(
- "roberta_base_en",
- sequence_length=64
- )
- preprocessor("The quick brown fox jumped.")
- ```
- """
- if preset not in cls.presets:
- raise ValueError(
- "`preset` must be one of "
- f"""{", ".join(cls.presets)}. Received: {preset}."""
- )
- tokenizer = RobertaTokenizer.from_preset(preset)
-
- # Use model's `max_sequence_length` if `sequence_length` unspecified;
- # otherwise check that `sequence_length` not too long.
- metadata = cls.presets[preset]
- max_sequence_length = metadata["config"]["max_sequence_length"]
- if sequence_length is not None:
- if sequence_length > max_sequence_length:
- raise ValueError(
- f"`sequence_length` cannot be longer than `{preset}` "
- f"preset's `max_sequence_length` of {max_sequence_length}. "
- f"Received: {sequence_length}."
- )
- else:
- sequence_length = max_sequence_length
-
- return cls(
- tokenizer=tokenizer,
- sequence_length=sequence_length,
- truncate=truncate,
- **kwargs,
- )
+RobertaPreprocessor.from_preset.__func__.__doc__ = (
+ Preprocessor.from_preset.__doc__
+)
+format_docstring(
+ preprocessor_name=RobertaPreprocessor.__name__,
+ example_preset_name="roberta_base_en",
+ preset_names='", "'.join(RobertaPreprocessor.presets),
+)(RobertaPreprocessor.from_preset.__func__)
# TODO: This is a temporary, unexported layer until we find a way to make the
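
For the RoBERTa variant, a quick end-to-end sketch that reuses the tiny vocabulary and merge list from the updated docstring above; it assumes keras_nlp and tensorflow_text are installed, and the packing described in the comments follows from that toy vocabulary.

```python
import tensorflow as tf
import keras_nlp

# Toy assets copied from the docstring example in this file.
vocab = {"<s>": 0, "<pad>": 1, "</s>": 2, "Ġafter": 5, "noon": 6, "Ġsun": 7}
merges = ["Ġ a", "Ġ s", "Ġ n", "e r", "n o", "o n", "Ġs u", "Ġa f", "no on"]
merges += ["Ġsu n", "Ġaf t", "Ġaft er"]

tokenizer = keras_nlp.models.RobertaTokenizer(vocabulary=vocab, merges=merges)
preprocessor = keras_nlp.models.RobertaPreprocessor(
    tokenizer=tokenizer,
    sequence_length=12,
)

# Single sentence: packed as "<s> ... </s>" plus "<pad>" up to length 12.
print(preprocessor(tf.constant(" afternoon sun"))["token_ids"])

# Sentence pair: RoBERTa joins segments with "</s> </s>" and returns only
# token ids and a padding mask (no segment ids).
outputs = preprocessor(x=(tf.constant(" afternoon sun"), tf.constant(" sun")))
print(outputs["padding_mask"])
```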
diff --git a/keras_nlp/models/roberta/roberta_preprocessor_test.py b/keras_nlp/models/roberta/roberta_preprocessor_test.py
index d4c6653e20..2045408547 100644
--- a/keras_nlp/models/roberta/roberta_preprocessor_test.py
+++ b/keras_nlp/models/roberta/roberta_preprocessor_test.py
@@ -21,7 +21,7 @@
from tensorflow import keras
from keras_nlp.models.roberta.roberta_preprocessor import RobertaPreprocessor
-from keras_nlp.models.roberta.roberta_preprocessor import RobertaTokenizer
+from keras_nlp.models.roberta.roberta_tokenizer import RobertaTokenizer
class RobertaPreprocessorTest(tf.test.TestCase, parameterized.TestCase):
diff --git a/keras_nlp/models/roberta/roberta_tokenizer_test.py b/keras_nlp/models/roberta/roberta_tokenizer_test.py
index 558ca100fb..0d09b48451 100644
--- a/keras_nlp/models/roberta/roberta_tokenizer_test.py
+++ b/keras_nlp/models/roberta/roberta_tokenizer_test.py
@@ -20,7 +20,7 @@
from absl.testing import parameterized
from tensorflow import keras
-from keras_nlp.models.roberta.roberta_preprocessor import RobertaTokenizer
+from keras_nlp.models.roberta.roberta_tokenizer import RobertaTokenizer
class RobertaTokenizerTest(tf.test.TestCase, parameterized.TestCase):
diff --git a/keras_nlp/models/xlm_roberta/xlm_roberta_classifier_test.py b/keras_nlp/models/xlm_roberta/xlm_roberta_classifier_test.py
index 17e30618bd..eaf83147a3 100644
--- a/keras_nlp/models/xlm_roberta/xlm_roberta_classifier_test.py
+++ b/keras_nlp/models/xlm_roberta/xlm_roberta_classifier_test.py
@@ -28,7 +28,7 @@
from keras_nlp.models.xlm_roberta.xlm_roberta_preprocessor import (
XLMRobertaPreprocessor,
)
-from keras_nlp.models.xlm_roberta.xlm_roberta_preprocessor import (
+from keras_nlp.models.xlm_roberta.xlm_roberta_tokenizer import (
XLMRobertaTokenizer,
)
diff --git a/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor.py b/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor.py
index ff9ef35914..a21bddae10 100644
--- a/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor.py
+++ b/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor.py
@@ -18,6 +18,7 @@
from tensorflow import keras
+from keras_nlp.models.preprocessor import Preprocessor
from keras_nlp.models.roberta.roberta_preprocessor import (
RobertaMultiSegmentPacker,
)
@@ -34,7 +35,7 @@
@keras.utils.register_keras_serializable(package="keras_nlp")
-class XLMRobertaPreprocessor(keras.layers.Layer):
+class XLMRobertaPreprocessor(Preprocessor):
"""XLM-RoBERTa preprocessing layer.
This preprocessing layer will do three things:
@@ -80,11 +81,8 @@ class XLMRobertaPreprocessor(keras.layers.Layer):
Examples:
```python
- tokenizer = keras_nlp.models.XLMRobertaTokenizer(proto="model.spm")
- preprocessor = keras_nlp.models.XLMRobertaPreprocessor(
- tokenizer=tokenizer,
- sequence_length=10,
- )
+ # Load the preprocessor from a preset.
+ preprocessor = keras_nlp.models.XLMRobertaPreprocessor.from_preset("xlm_roberta_base_multi")
# Tokenize and pack a single sentence.
sentence = tf.constant("The quick brown fox jumped.")
@@ -144,6 +142,14 @@ class XLMRobertaPreprocessor(keras.layers.Layer):
lambda s1, s2: preprocessor(x=(s1, s2)),
num_parallel_calls=tf.data.AUTOTUNE,
)
+
+ # Alternatively, you can create a preprocessor from your own vocabulary.
+ # The usage is exactly the same as above.
+ tokenizer = keras_nlp.models.XLMRobertaTokenizer(proto="model.spm")
+ preprocessor = keras_nlp.models.XLMRobertaPreprocessor(
+ tokenizer=tokenizer,
+ sequence_length=10,
+ )
```
"""
@@ -166,28 +172,16 @@ def __init__(
sequence_length=sequence_length,
)
- @property
- def tokenizer(self):
- """The `keras_nlp.models.XLMRobertaTokenizer` used to tokenize strings."""
- return self._tokenizer
-
def get_config(self):
config = super().get_config()
config.update(
{
- "tokenizer": keras.layers.serialize(self.tokenizer),
"sequence_length": self.packer.sequence_length,
"truncate": self.packer.truncate,
}
)
return config
- @classmethod
- def from_config(cls, config):
- if "tokenizer" in config:
- config["tokenizer"] = keras.layers.deserialize(config["tokenizer"])
- return cls(**config)
-
def call(self, x, y=None, sample_weight=None):
x = convert_inputs_to_list_of_tensor_segments(x)
x = [self.tokenizer(segment) for segment in x]
@@ -198,80 +192,24 @@ def call(self, x, y=None, sample_weight=None):
}
return pack_x_y_sample_weight(x, y, sample_weight)
+ @classproperty
+ def tokenizer_cls(cls):
+ return XLMRobertaTokenizer
+
@classproperty
def presets(cls):
return copy.deepcopy(backbone_presets)
@classmethod
- @format_docstring(names=", ".join(backbone_presets))
- def from_preset(
- cls,
- preset,
- sequence_length=None,
- truncate="round_robin",
- **kwargs,
- ):
- """Instantiate XLM-RoBERTa preprocessor from preset architecture.
-
- Args:
- preset: string. Must be one of {{names}}.
- sequence_length: int, optional. The length of the packed inputs.
- Must be equal to or smaller than the `max_sequence_length` of
- the preset. If left as default, the `max_sequence_length` of
- the preset will be used.
- truncate: string. The algorithm to truncate a list of batched
- segments to fit within `sequence_length`. The value can be
- either `round_robin` or `waterfall`:
- - `"round_robin"`: Available space is assigned one token at
- a time in a round-robin fashion to the inputs that still
- need some, until the limit is reached.
- - `"waterfall"`: The allocation of the budget is done using
- a "waterfall" algorithm that allocates quota in a
- left-to-right manner and fills up the buckets until we
- run out of budget. It supports an arbitrary number of
- segments.
-
- Examples:
- ```python
- # Load preprocessor from preset
- preprocessor = keras_nlp.models.XLMRobertaPreprocessor.from_preset(
- "xlm_roberta_base_multi",
- )
- preprocessor("The quick brown fox jumped.")
+ def from_preset(cls, preset, **kwargs):
+ return super().from_preset(preset, **kwargs)
- # Override sequence_length
- preprocessor = keras_nlp.models.XLMRobertaPreprocessor.from_preset(
- "xlm_roberta_base_multi",
- sequence_length=64
- )
- preprocessor("The quick brown fox jumped.")
- ```
- """
- if preset not in cls.presets:
- raise ValueError(
- "`preset` must be one of "
- f"""{", ".join(cls.presets)}. Received: {preset}."""
- )
-
- tokenizer = XLMRobertaTokenizer.from_preset(preset)
- # Use model's `max_sequence_length` if `sequence_length` unspecified;
- # otherwise check that `sequence_length` not too long.
- metadata = cls.presets[preset]
- max_sequence_length = metadata["config"]["max_sequence_length"]
- if sequence_length is not None:
- if sequence_length > max_sequence_length:
- raise ValueError(
- f"`sequence_length` cannot be longer than `{preset}` "
- f"preset's `max_sequence_length` of {max_sequence_length}. "
- f"Received: {sequence_length}."
- )
- else:
- sequence_length = max_sequence_length
-
- return cls(
- tokenizer=tokenizer,
- sequence_length=sequence_length,
- truncate=truncate,
- **kwargs,
- )
+XLMRobertaPreprocessor.from_preset.__func__.__doc__ = (
+ Preprocessor.from_preset.__doc__
+)
+format_docstring(
+ preprocessor_name=XLMRobertaPreprocessor.__name__,
+ example_preset_name="xlm_roberta_base_multi",
+ preset_names='", "'.join(XLMRobertaPreprocessor.presets),
+)(XLMRobertaPreprocessor.from_preset.__func__)
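
Finally, a self-contained sketch of the `get_config` / `from_config` contract that the new base `Preprocessor` centralizes for every model above: the tokenizer is embedded as a nested Keras layer config and revived with `keras.layers.deserialize`. The `Toy*` layers below are stand-ins, not the keras_nlp classes.

```python
from tensorflow import keras


@keras.utils.register_keras_serializable(package="sketch")
class ToyTokenizer(keras.layers.Layer):
    def __init__(self, lowercase=True, **kwargs):
        super().__init__(**kwargs)
        self.lowercase = lowercase

    def get_config(self):
        return {**super().get_config(), "lowercase": self.lowercase}


@keras.utils.register_keras_serializable(package="sketch")
class ToyPreprocessor(keras.layers.Layer):
    def __init__(self, tokenizer, **kwargs):
        super().__init__(**kwargs)
        self._tokenizer = tokenizer

    def get_config(self):
        config = super().get_config()
        # Same move as Preprocessor.get_config above.
        config["tokenizer"] = keras.layers.serialize(self._tokenizer)
        return config

    @classmethod
    def from_config(cls, config):
        # Same move as Preprocessor.from_config above.
        if isinstance(config.get("tokenizer"), dict):
            config["tokenizer"] = keras.layers.deserialize(config["tokenizer"])
        return cls(**config)


original = ToyPreprocessor(tokenizer=ToyTokenizer(lowercase=False))
restored = ToyPreprocessor.from_config(original.get_config())
assert restored._tokenizer.lowercase is False
```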
diff --git a/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor_test.py b/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor_test.py
index 279d43cdcb..8074aa1d8d 100644
--- a/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor_test.py
+++ b/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor_test.py
@@ -25,7 +25,7 @@
from keras_nlp.models.xlm_roberta.xlm_roberta_preprocessor import (
XLMRobertaPreprocessor,
)
-from keras_nlp.models.xlm_roberta.xlm_roberta_preprocessor import (
+from keras_nlp.models.xlm_roberta.xlm_roberta_tokenizer import (
XLMRobertaTokenizer,
)
diff --git a/keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer_test.py b/keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer_test.py
index 8d1c6c9213..095f92060e 100644
--- a/keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer_test.py
+++ b/keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer_test.py
@@ -22,7 +22,7 @@
from absl.testing import parameterized
from tensorflow import keras
-from keras_nlp.models.xlm_roberta.xlm_roberta_preprocessor import (
+from keras_nlp.models.xlm_roberta.xlm_roberta_tokenizer import (
XLMRobertaTokenizer,
)