Add Base Preprocessor Class #638

Merged · 18 commits · Jan 9, 2023
30 changes: 5 additions & 25 deletions keras_nlp/models/albert/albert_preprocessor.py
@@ -16,6 +16,8 @@
from tensorflow import keras

from keras_nlp.layers.multi_segment_packer import MultiSegmentPacker
from keras_nlp.models.albert.albert_tokenizer import AlbertTokenizer
from keras_nlp.models.preprocessor import Preprocessor
from keras_nlp.utils.keras_utils import (
convert_inputs_to_list_of_tensor_segments,
)
@@ -24,7 +26,7 @@


@keras.utils.register_keras_serializable(package="keras_nlp")
class AlbertPreprocessor(keras.layers.Layer):
class AlbertPreprocessor(Preprocessor):
"""An ALBERT preprocessing layer which tokenizes and packs inputs.

This preprocessing layer will do three things:
@@ -153,28 +155,16 @@ def __init__(
sequence_length=sequence_length,
)

@property
def tokenizer(self):
"""The `keras_nlp.models.AlbertTokenizer` used to tokenize strings."""
return self._tokenizer

def get_config(self):
config = super().get_config()
config.update(
{
"tokenizer": keras.layers.serialize(self.tokenizer),
"sequence_length": self.packer.sequence_length,
"truncate": self.packer.truncate,
}
)
return config

@classmethod
def from_config(cls, config):
if "tokenizer" in config and isinstance(config["tokenizer"], dict):
config["tokenizer"] = keras.layers.deserialize(config["tokenizer"])
return cls(**config)

def call(self, x, y=None, sample_weight=None):
x = convert_inputs_to_list_of_tensor_segments(x)
x = [self.tokenizer(segment) for segment in x]
@@ -187,15 +177,5 @@ def call(self, x, y=None, sample_weight=None):
return pack_x_y_sample_weight(x, y, sample_weight)

@classproperty
def presets(cls):
return {}

@classmethod
def from_preset(
cls,
preset,
sequence_length=None,
truncate="round_robin",
**kwargs,
):
raise NotImplementedError
def tokenizer_cls(cls):
return AlbertTokenizer
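
The `tokenizer` property, the `get_config`/`from_config` pair, and the per-model `from_preset` removed above now live on the shared `Preprocessor` base class, so each subclass only has to point at its tokenizer via `tokenizer_cls`. The base class file (`keras_nlp/models/preprocessor.py`) is not shown in this excerpt; the sketch below is an assumption about its shape, reconstructed from what the subclasses delegate to, not the code actually added in this PR.

```python
# Hypothetical sketch of the new base class -- reconstructed from the
# delegation seen in the subclass diffs above, not copied from this PR.
from tensorflow import keras

from keras_nlp.utils.python_utils import classproperty  # assumed location


class Preprocessor(keras.layers.Layer):
    """Base class for model preprocessing layers."""

    @property
    def tokenizer(self):
        """The tokenizer used to tokenize strings."""
        return self._tokenizer

    def get_config(self):
        # Serialize the tokenizer once here instead of in every subclass.
        config = super().get_config()
        config.update({"tokenizer": keras.layers.serialize(self.tokenizer)})
        return config

    @classmethod
    def from_config(cls, config):
        if "tokenizer" in config and isinstance(config["tokenizer"], dict):
            config["tokenizer"] = keras.layers.deserialize(config["tokenizer"])
        return cls(**config)

    @classproperty
    def tokenizer_cls(cls):
        # Subclasses override this to return their matching tokenizer class.
        return None

    @classproperty
    def presets(cls):
        return {}

    @classmethod
    def from_preset(cls, preset, **kwargs):
        """Instantiate a preprocessor from a preset architecture."""
        if not cls.presets:
            raise NotImplementedError(
                "No presets have been created for this class."
            )
        if preset not in cls.presets:
            raise ValueError(
                "`preset` must be one of "
                f"""{", ".join(cls.presets)}. Received: {preset}."""
            )
        # Build the matching tokenizer, then construct the preprocessor.
        tokenizer = cls.tokenizer_cls.from_preset(preset)
        return cls(tokenizer=tokenizer, **kwargs)
```

With a base class along these lines, `AlbertPreprocessor` shrinks to its `__init__`, `call`, and the one-line `tokenizer_cls` override shown above.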
121 changes: 26 additions & 95 deletions keras_nlp/models/bert/bert_preprocessor.py
@@ -21,6 +21,7 @@
from keras_nlp.models.bert.bert_presets import backbone_presets
from keras_nlp.models.bert.bert_presets import classifier_presets
from keras_nlp.models.bert.bert_tokenizer import BertTokenizer
from keras_nlp.models.preprocessor import Preprocessor
from keras_nlp.utils.keras_utils import (
convert_inputs_to_list_of_tensor_segments,
)
@@ -32,7 +33,7 @@


@keras.utils.register_keras_serializable(package="keras_nlp")
class BertPreprocessor(keras.layers.Layer):
class BertPreprocessor(Preprocessor):
"""A BERT preprocessing layer which tokenizes and packs inputs.

This preprocessing layer will do three things:
@@ -76,13 +77,8 @@ class BertPreprocessor(keras.layers.Layer):

Examples:
```python
vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]"]
vocab += ["The", "qu", "##ick", "br", "##own", "fox", "tripped"]
vocab += ["Call", "me", "Ish", "##mael", "."]
vocab += ["Oh", "look", "a", "whale"]
vocab += ["I", "forgot", "my", "home", "##work"]
tokenizer = keras_nlp.models.BertTokenizer(vocabulary=vocab)
preprocessor = keras_nlp.models.BertPreprocessor(tokenizer)
# Load the preprocessor from a preset.
preprocessor = keras_nlp.models.BertPreprocessor.from_preset("bert_base_en_uncased")

# Tokenize and pack a single sentence.
sentence = tf.constant("The quick brown fox jumped.")
@@ -142,6 +138,16 @@ class BertPreprocessor(keras.layers.Layer):
lambda s1, s2: preprocessor(x=(s1, s2)),
num_parallel_calls=tf.data.AUTOTUNE,
)

# Alternatively, you can create a preprocessor from your own vocabulary.
# The usage is exactly the same as shown above.
vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]"]
vocab += ["The", "qu", "##ick", "br", "##own", "fox", "tripped"]
vocab += ["Call", "me", "Ish", "##mael", "."]
vocab += ["Oh", "look", "a", "whale"]
vocab += ["I", "forgot", "my", "home", "##work"]
tokenizer = keras_nlp.models.BertTokenizer(vocabulary=vocab)
preprocessor = keras_nlp.models.BertPreprocessor(tokenizer)
```
"""

@@ -162,28 +168,16 @@ def __init__(
sequence_length=sequence_length,
)

@property
def tokenizer(self):
"""The `keras_nlp.models.BertTokenizer` used to tokenize strings."""
return self._tokenizer

def get_config(self):
config = super().get_config()
config.update(
{
"tokenizer": keras.layers.serialize(self.tokenizer),
"sequence_length": self.packer.sequence_length,
"truncate": self.packer.truncate,
}
)
return config

@classmethod
def from_config(cls, config):
if "tokenizer" in config and isinstance(config["tokenizer"], dict):
config["tokenizer"] = keras.layers.deserialize(config["tokenizer"])
return cls(**config)

def call(self, x, y=None, sample_weight=None):
x = convert_inputs_to_list_of_tensor_segments(x)
x = [self.tokenizer(segment) for segment in x]
@@ -195,85 +189,22 @@ def call(self, x, y=None, sample_weight=None):
}
return pack_x_y_sample_weight(x, y, sample_weight)

@classproperty
def tokenizer_cls(cls):
return BertTokenizer

@classproperty
def presets(cls):
return copy.deepcopy({**backbone_presets, **classifier_presets})

@classmethod
@format_docstring(names=PRESET_NAMES)
def from_preset(
cls,
preset,
sequence_length=None,
truncate="round_robin",
**kwargs,
):
"""Instantiate BERT preprocessor from preset architecture.

Args:
preset: string. Must be one of {{names}}.
sequence_length: int, optional. The length of the packed inputs.
Must be equal to or smaller than the `max_sequence_length` of
the preset. If left as default, the `max_sequence_length` of
the preset will be used.
truncate: string. The algorithm to truncate a list of batched
segments to fit within `sequence_length`. The value can be
either `round_robin` or `waterfall`:
- `"round_robin"`: Available space is assigned one token at
a time in a round-robin fashion to the inputs that still
need some, until the limit is reached.
- `"waterfall"`: The allocation of the budget is done using
a "waterfall" algorithm that allocates quota in a
left-to-right manner and fills up the buckets until we
run out of budget. It supports an arbitrary number of
segments.
def from_preset(cls, preset, **kwargs):
return super().from_preset(preset, **kwargs)

Examples:
```python
# Load preprocessor from preset
preprocessor = keras_nlp.models.BertPreprocessor.from_preset(
"bert_base_en_uncased",
)
preprocessor("The quick brown fox jumped.")

# Override sequence_length
preprocessor = keras_nlp.models.BertPreprocessor.from_preset(
"bert_base_en_uncased",
sequence_length=64
)
preprocessor("The quick brown fox jumped.")
```
"""
if preset not in cls.presets:
raise ValueError(
"`preset` must be one of "
f"""{", ".join(cls.presets)}. Received: {preset}."""
)

tokenizer = BertTokenizer.from_preset(preset)

# Use model's `max_sequence_length` if `sequence_length` unspecified;
# otherwise check that `sequence_length` not too long.
metadata = cls.presets[preset]
if preset in backbone_presets:
backbone_config = metadata["config"]
else:
# For task model presets, the backbone config is nested.
backbone_config = metadata["config"]["backbone"]["config"]
max_sequence_length = backbone_config["max_sequence_length"]
if sequence_length is not None:
if sequence_length > max_sequence_length:
raise ValueError(
f"`sequence_length` cannot be longer than `{preset}` "
f"preset's `max_sequence_length` of {max_sequence_length}. "
f"Received: {sequence_length}."
)
else:
sequence_length = max_sequence_length

return cls(
tokenizer=tokenizer,
sequence_length=sequence_length,
truncate=truncate,
**kwargs,
)
BertPreprocessor.from_preset.__func__.__doc__ = Preprocessor.from_preset.__doc__
format_docstring(
preprocessor_name=BertPreprocessor.__name__,
example_preset_name="bert_base_en_uncased",
preset_names='", "'.join(BertPreprocessor.presets),
)(BertPreprocessor.from_preset.__func__)
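
The last few lines above copy the shared `from_preset` docstring from `Preprocessor` onto `BertPreprocessor.from_preset` and then substitute model-specific values into its placeholders. The `format_docstring` utility itself is not shown in this excerpt; the sketch below is an assumed minimal implementation of that templating pattern, keyed off the `{{names}}` placeholder visible in the removed docstring, and is not copied from the keras_nlp source.

```python
# Assumed minimal sketch of a docstring templating helper, illustrating the
# pattern used above; not the actual keras_nlp implementation.
def format_docstring(**replacements):
    """Return a decorator that fills `{{key}}` placeholders in a docstring."""

    def decorator(obj):
        doc = obj.__doc__ or ""
        for key, value in replacements.items():
            doc = doc.replace("{{" + key + "}}", str(value))
        obj.__doc__ = doc
        return obj

    return decorator


# Example usage on a hypothetical function, mirroring the decorator form
# seen in the removed BERT `from_preset`.
@format_docstring(names='"bert_base_en_uncased"')
def load_preprocessor(preset):
    """Load a preprocessor from a preset.

    Args:
        preset: string. Must be one of {{names}}.
    """
    ...
```

Going through `from_preset.__func__`, as the diff does, is necessary because the bound classmethod itself does not expose a writable `__doc__`; only the underlying function does.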
4 changes: 1 addition & 3 deletions keras_nlp/models/deberta_v3/deberta_v3_classifier_test.py
@@ -28,9 +28,7 @@
from keras_nlp.models.deberta_v3.deberta_v3_preprocessor import (
DebertaV3Preprocessor,
)
from keras_nlp.models.deberta_v3.deberta_v3_preprocessor import (
DebertaV3Tokenizer,
)
from keras_nlp.models.deberta_v3.deberta_v3_tokenizer import DebertaV3Tokenizer


class DebertaV3ClassifierTest(tf.test.TestCase, parameterized.TestCase):