diff --git a/keras_nlp/models/albert/albert_preprocessor.py b/keras_nlp/models/albert/albert_preprocessor.py index 61d37c34d2..e34fe54536 100644 --- a/keras_nlp/models/albert/albert_preprocessor.py +++ b/keras_nlp/models/albert/albert_preprocessor.py @@ -16,6 +16,8 @@ from tensorflow import keras from keras_nlp.layers.multi_segment_packer import MultiSegmentPacker +from keras_nlp.models.albert.albert_tokenizer import AlbertTokenizer +from keras_nlp.models.preprocessor import Preprocessor from keras_nlp.utils.keras_utils import ( convert_inputs_to_list_of_tensor_segments, ) @@ -24,7 +26,7 @@ @keras.utils.register_keras_serializable(package="keras_nlp") -class AlbertPreprocessor(keras.layers.Layer): +class AlbertPreprocessor(Preprocessor): """An ALBERT preprocessing layer which tokenizes and packs inputs. This preprocessing layer will do three things: @@ -153,28 +155,16 @@ def __init__( sequence_length=sequence_length, ) - @property - def tokenizer(self): - """The `keras_nlp.models.AlbertTokenizer` used to tokenize strings.""" - return self._tokenizer - def get_config(self): config = super().get_config() config.update( { - "tokenizer": keras.layers.serialize(self.tokenizer), "sequence_length": self.packer.sequence_length, "truncate": self.packer.truncate, } ) return config - @classmethod - def from_config(cls, config): - if "tokenizer" in config and isinstance(config["tokenizer"], dict): - config["tokenizer"] = keras.layers.deserialize(config["tokenizer"]) - return cls(**config) - def call(self, x, y=None, sample_weight=None): x = convert_inputs_to_list_of_tensor_segments(x) x = [self.tokenizer(segment) for segment in x] @@ -187,15 +177,5 @@ def call(self, x, y=None, sample_weight=None): return pack_x_y_sample_weight(x, y, sample_weight) @classproperty - def presets(cls): - return {} - - @classmethod - def from_preset( - cls, - preset, - sequence_length=None, - truncate="round_robin", - **kwargs, - ): - raise NotImplementedError + def tokenizer_cls(cls): + return AlbertTokenizer diff --git a/keras_nlp/models/bert/bert_preprocessor.py b/keras_nlp/models/bert/bert_preprocessor.py index f376c47f87..b1ce9c3293 100644 --- a/keras_nlp/models/bert/bert_preprocessor.py +++ b/keras_nlp/models/bert/bert_preprocessor.py @@ -21,6 +21,7 @@ from keras_nlp.models.bert.bert_presets import backbone_presets from keras_nlp.models.bert.bert_presets import classifier_presets from keras_nlp.models.bert.bert_tokenizer import BertTokenizer +from keras_nlp.models.preprocessor import Preprocessor from keras_nlp.utils.keras_utils import ( convert_inputs_to_list_of_tensor_segments, ) @@ -32,7 +33,7 @@ @keras.utils.register_keras_serializable(package="keras_nlp") -class BertPreprocessor(keras.layers.Layer): +class BertPreprocessor(Preprocessor): """A BERT preprocessing layer which tokenizes and packs inputs. This preprocessing layer will do three things: @@ -76,13 +77,8 @@ class BertPreprocessor(keras.layers.Layer): Examples: ```python - vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]"] - vocab += ["The", "qu", "##ick", "br", "##own", "fox", "tripped"] - vocab += ["Call", "me", "Ish", "##mael", "."] - vocab += ["Oh", "look", "a", "whale"] - vocab += ["I", "forgot", "my", "home", "##work"] - tokenizer = keras_nlp.models.BertTokenizer(vocabulary=vocab) - preprocessor = keras_nlp.models.BertPreprocessor(tokenizer) + # Load the preprocessor from a preset. + preprocessor = keras_nlp.models.BertPreprocessor.from_preset("bert_base_en_uncased") # Tokenize and pack a single sentence. 
sentence = tf.constant("The quick brown fox jumped.") @@ -142,6 +138,16 @@ class BertPreprocessor(keras.layers.Layer): lambda s1, s2: preprocessor(x=(s1, s2)), num_parallel_calls=tf.data.AUTOTUNE, ) + + # Alternatively, you can create a preprocessor from your own vocabulary. + # The usage is exactly the same as shown above. + vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]"] + vocab += ["The", "qu", "##ick", "br", "##own", "fox", "tripped"] + vocab += ["Call", "me", "Ish", "##mael", "."] + vocab += ["Oh", "look", "a", "whale"] + vocab += ["I", "forgot", "my", "home", "##work"] + tokenizer = keras_nlp.models.BertTokenizer(vocabulary=vocab) + preprocessor = keras_nlp.models.BertPreprocessor(tokenizer) ``` """ @@ -162,28 +168,16 @@ def __init__( sequence_length=sequence_length, ) - @property - def tokenizer(self): - """The `keras_nlp.models.BertTokenizer` used to tokenize strings.""" - return self._tokenizer - def get_config(self): config = super().get_config() config.update( { - "tokenizer": keras.layers.serialize(self.tokenizer), "sequence_length": self.packer.sequence_length, "truncate": self.packer.truncate, } ) return config - @classmethod - def from_config(cls, config): - if "tokenizer" in config and isinstance(config["tokenizer"], dict): - config["tokenizer"] = keras.layers.deserialize(config["tokenizer"]) - return cls(**config) - def call(self, x, y=None, sample_weight=None): x = convert_inputs_to_list_of_tensor_segments(x) x = [self.tokenizer(segment) for segment in x] @@ -195,85 +189,22 @@ def call(self, x, y=None, sample_weight=None): } return pack_x_y_sample_weight(x, y, sample_weight) + @classproperty + def tokenizer_cls(cls): + return BertTokenizer + @classproperty def presets(cls): return copy.deepcopy({**backbone_presets, **classifier_presets}) @classmethod - @format_docstring(names=PRESET_NAMES) - def from_preset( - cls, - preset, - sequence_length=None, - truncate="round_robin", - **kwargs, - ): - """Instantiate BERT preprocessor from preset architecture. - - Args: - preset: string. Must be one of {{names}}. - sequence_length: int, optional. The length of the packed inputs. - Must be equal to or smaller than the `max_sequence_length` of - the preset. If left as default, the `max_sequence_length` of - the preset will be used. - truncate: string. The algorithm to truncate a list of batched - segments to fit within `sequence_length`. The value can be - either `round_robin` or `waterfall`: - - `"round_robin"`: Available space is assigned one token at - a time in a round-robin fashion to the inputs that still - need some, until the limit is reached. - - `"waterfall"`: The allocation of the budget is done using - a "waterfall" algorithm that allocates quota in a - left-to-right manner and fills up the buckets until we - run out of budget. It supports an arbitrary number of - segments. + def from_preset(cls, preset, **kwargs): + return super().from_preset(preset, **kwargs) - Examples: - ```python - # Load preprocessor from preset - preprocessor = keras_nlp.models.BertPreprocessor.from_preset( - "bert_base_en_uncased", - ) - preprocessor("The quick brown fox jumped.") - - # Override sequence_length - preprocessor = keras_nlp.models.BertPreprocessor.from_preset( - "bert_base_en_uncased", - sequence_length=64 - ) - preprocessor("The quick brown fox jumped.") - ``` - """ - if preset not in cls.presets: - raise ValueError( - "`preset` must be one of " - f"""{", ".join(cls.presets)}. 
Received: {preset}.""" - ) - - tokenizer = BertTokenizer.from_preset(preset) - # Use model's `max_sequence_length` if `sequence_length` unspecified; - # otherwise check that `sequence_length` not too long. - metadata = cls.presets[preset] - if preset in backbone_presets: - backbone_config = metadata["config"] - else: - # For task model presets, the backbone config is nested. - backbone_config = metadata["config"]["backbone"]["config"] - max_sequence_length = backbone_config["max_sequence_length"] - if sequence_length is not None: - if sequence_length > max_sequence_length: - raise ValueError( - f"`sequence_length` cannot be longer than `{preset}` " - f"preset's `max_sequence_length` of {max_sequence_length}. " - f"Received: {sequence_length}." - ) - else: - sequence_length = max_sequence_length - - return cls( - tokenizer=tokenizer, - sequence_length=sequence_length, - truncate=truncate, - **kwargs, - ) +BertPreprocessor.from_preset.__func__.__doc__ = Preprocessor.from_preset.__doc__ +format_docstring( + preprocessor_name=BertPreprocessor.__name__, + example_preset_name="bert_base_en_uncased", + preset_names='", "'.join(BertPreprocessor.presets), +)(BertPreprocessor.from_preset.__func__) diff --git a/keras_nlp/models/deberta_v3/deberta_v3_classifier_test.py b/keras_nlp/models/deberta_v3/deberta_v3_classifier_test.py index 81645c9dbe..775b254fe3 100644 --- a/keras_nlp/models/deberta_v3/deberta_v3_classifier_test.py +++ b/keras_nlp/models/deberta_v3/deberta_v3_classifier_test.py @@ -28,9 +28,7 @@ from keras_nlp.models.deberta_v3.deberta_v3_preprocessor import ( DebertaV3Preprocessor, ) -from keras_nlp.models.deberta_v3.deberta_v3_preprocessor import ( - DebertaV3Tokenizer, -) +from keras_nlp.models.deberta_v3.deberta_v3_tokenizer import DebertaV3Tokenizer class DebertaV3ClassifierTest(tf.test.TestCase, parameterized.TestCase): diff --git a/keras_nlp/models/deberta_v3/deberta_v3_preprocessor.py b/keras_nlp/models/deberta_v3/deberta_v3_preprocessor.py index 649318ffdc..9050c20292 100644 --- a/keras_nlp/models/deberta_v3/deberta_v3_preprocessor.py +++ b/keras_nlp/models/deberta_v3/deberta_v3_preprocessor.py @@ -20,6 +20,7 @@ from keras_nlp.layers.multi_segment_packer import MultiSegmentPacker from keras_nlp.models.deberta_v3.deberta_v3_presets import backbone_presets from keras_nlp.models.deberta_v3.deberta_v3_tokenizer import DebertaV3Tokenizer +from keras_nlp.models.preprocessor import Preprocessor from keras_nlp.utils.keras_utils import ( convert_inputs_to_list_of_tensor_segments, ) @@ -29,7 +30,7 @@ @keras.utils.register_keras_serializable(package="keras_nlp") -class DebertaV3Preprocessor(keras.layers.Layer): +class DebertaV3Preprocessor(Preprocessor): """A DeBERTa preprocessing layer which tokenizes and packs inputs. This preprocessing layer will do three things: @@ -73,11 +74,8 @@ class DebertaV3Preprocessor(keras.layers.Layer): Examples: ```python - tokenizer = keras_nlp.models.DebertaV3Tokenizer(proto="model.spm") - preprocessor = keras_nlp.models.DebertaV3Preprocessor( - tokenizer=tokenizer, - sequence_length=10, - ) + # Load the preprocessor from a preset. + preprocessor = keras_nlp.models.DebertaV3Preprocessor.from_preset("deberta_v3_base_en") # Tokenize and pack a single sentence. sentence = tf.constant("The quick brown fox jumped.") @@ -137,6 +135,14 @@ class DebertaV3Preprocessor(keras.layers.Layer): lambda s1, s2: preprocessor(x=(s1, s2)), num_parallel_calls=tf.data.AUTOTUNE, ) + + # Alternatively, you can create a preprocessor from your own vocabulary. 
+ # The usage is exactly the same as above. + tokenizer = keras_nlp.models.DebertaV3Tokenizer(proto="model.spm") + preprocessor = keras_nlp.models.DebertaV3Preprocessor( + tokenizer=tokenizer, + sequence_length=10, + ) ``` """ @@ -157,28 +163,16 @@ def __init__( sequence_length=sequence_length, ) - @property - def tokenizer(self): - """The `keras_nlp.models.DebertaV3Tokenizer` used to tokenize strings.""" - return self._tokenizer - def get_config(self): config = super().get_config() config.update( { - "tokenizer": keras.layers.serialize(self.tokenizer), "sequence_length": self.packer.sequence_length, "truncate": self.packer.truncate, } ) return config - @classmethod - def from_config(cls, config): - if "tokenizer" in config and isinstance(config["tokenizer"], dict): - config["tokenizer"] = keras.layers.deserialize(config["tokenizer"]) - return cls(**config) - def call(self, x, y=None, sample_weight=None): x = convert_inputs_to_list_of_tensor_segments(x) x = [self.tokenizer(segment) for segment in x] @@ -189,80 +183,24 @@ def call(self, x, y=None, sample_weight=None): } return pack_x_y_sample_weight(x, y, sample_weight) + @classproperty + def tokenizer_cls(cls): + return DebertaV3Tokenizer + @classproperty def presets(cls): return copy.deepcopy(backbone_presets) @classmethod - @format_docstring(names=", ".join(backbone_presets)) - def from_preset( - cls, - preset, - sequence_length=None, - truncate="round_robin", - **kwargs, - ): - """Instantiate DeBERTa preprocessor from preset architecture. - - Args: - preset: string. Must be one of {{names}}. - sequence_length: int, optional. The length of the packed inputs. - Must be equal to or smaller than the `max_sequence_length` of - the preset. If left as default, the `max_sequence_length` of - the preset will be used. - truncate: string. The algorithm to truncate a list of batched - segments to fit within `sequence_length`. The value can be - either `round_robin` or `waterfall`: - - `"round_robin"`: Available space is assigned one token at - a time in a round-robin fashion to the inputs that still - need some, until the limit is reached. - - `"waterfall"`: The allocation of the budget is done using - a "waterfall" algorithm that allocates quota in a - left-to-right manner and fills up the buckets until we - run out of budget. It supports an arbitrary number of - segments. - - Examples: - ```python - # Load preprocessor from preset - preprocessor = keras_nlp.models.DebertaV3Preprocessor.from_preset( - "deberta_v3_base_en", - ) - preprocessor("The quick brown fox jumped.") + def from_preset(cls, preset, **kwargs): + return super().from_preset(preset, **kwargs) - # Override sequence_length - preprocessor = keras_nlp.models.DebertaV3Preprocessor.from_preset( - "deberta_v3_base_en", - sequence_length=64 - ) - preprocessor("The quick brown fox jumped.") - ``` - """ - if preset not in cls.presets: - raise ValueError( - "`preset` must be one of " - f"""{", ".join(cls.presets)}. Received: {preset}.""" - ) - - tokenizer = DebertaV3Tokenizer.from_preset(preset) - # Use model's `max_sequence_length` if `sequence_length` unspecified; - # otherwise check that `sequence_length` not too long. - metadata = cls.presets[preset] - max_sequence_length = metadata["config"]["max_sequence_length"] - if sequence_length is not None: - if sequence_length > max_sequence_length: - raise ValueError( - f"`sequence_length` cannot be longer than `{preset}` " - f"preset's `max_sequence_length` of {max_sequence_length}. " - f"Received: {sequence_length}."
- ) - else: - sequence_length = max_sequence_length - - return cls( - tokenizer=tokenizer, - sequence_length=sequence_length, - truncate=truncate, - **kwargs, - ) +DebertaV3Preprocessor.from_preset.__func__.__doc__ = ( + Preprocessor.from_preset.__doc__ +) +format_docstring( + preprocessor_name=DebertaV3Preprocessor.__name__, + example_preset_name="deberta_v3_base_en", + preset_names='", "'.join(DebertaV3Preprocessor.presets), +)(DebertaV3Preprocessor.from_preset.__func__) diff --git a/keras_nlp/models/distil_bert/distil_bert_preprocessor.py b/keras_nlp/models/distil_bert/distil_bert_preprocessor.py index 75cc3cb845..6a190fda3b 100644 --- a/keras_nlp/models/distil_bert/distil_bert_preprocessor.py +++ b/keras_nlp/models/distil_bert/distil_bert_preprocessor.py @@ -22,6 +22,7 @@ from keras_nlp.models.distil_bert.distil_bert_tokenizer import ( DistilBertTokenizer, ) +from keras_nlp.models.preprocessor import Preprocessor from keras_nlp.utils.keras_utils import ( convert_inputs_to_list_of_tensor_segments, ) @@ -31,7 +32,7 @@ @keras.utils.register_keras_serializable(package="keras_nlp") -class DistilBertPreprocessor(keras.layers.Layer): +class DistilBertPreprocessor(Preprocessor): """A DistilBERT preprocessing layer which tokenizes and packs inputs. This preprocessing layer will do three things: @@ -75,13 +76,8 @@ class DistilBertPreprocessor(keras.layers.Layer): Examples: ```python - vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]"] - vocab += ["The", "qu", "##ick", "br", "##own", "fox", "tripped"] - vocab += ["Call", "me", "Ish", "##mael", "."] - vocab += ["Oh", "look", "a", "whale"] - vocab += ["I", "forgot", "my", "home", "##work"] - tokenizer = keras_nlp.models.DistilBertTokenizer(vocabulary=vocab) - preprocessor = keras_nlp.models.DistilBertPreprocessor(tokenizer) + # Load the preprocessor from a preset. + preprocessor = keras_nlp.models.DistilBertPreprocessor.from_preset("distil_bert_base_en_uncased") # Tokenize and pack a single sentence. sentence = tf.constant("The quick brown fox jumped.") @@ -141,6 +137,16 @@ class DistilBertPreprocessor(keras.layers.Layer): lambda s1, s2: preprocessor(x=(s1, s2)), num_parallel_calls=tf.data.AUTOTUNE, ) + + # Alternatively, you can create a preprocessor from your own vocabulary. + # The usage is exactly the same as above. 
+ vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]"] + vocab += ["The", "qu", "##ick", "br", "##own", "fox", "tripped"] + vocab += ["Call", "me", "Ish", "##mael", "."] + vocab += ["Oh", "look", "a", "whale"] + vocab += ["I", "forgot", "my", "home", "##work"] + tokenizer = keras_nlp.models.DistilBertTokenizer(vocabulary=vocab) + preprocessor = keras_nlp.models.DistilBertPreprocessor(tokenizer) ``` """ @@ -161,28 +167,16 @@ def __init__( sequence_length=sequence_length, ) - @property - def tokenizer(self): - """The `keras_nlp.models.DistilBertTokenizer` used to tokenize strings.""" - return self._tokenizer - def get_config(self): config = super().get_config() config.update( { - "tokenizer": keras.layers.serialize(self.tokenizer), "sequence_length": self.packer.sequence_length, "truncate": self.packer.truncate, } ) return config - @classmethod - def from_config(cls, config): - if "tokenizer" in config: - config["tokenizer"] = keras.layers.deserialize(config["tokenizer"]) - return cls(**config) - def call(self, x, y=None, sample_weight=None): x = convert_inputs_to_list_of_tensor_segments(x) x = [self.tokenizer(segment) for segment in x] @@ -193,80 +187,24 @@ def call(self, x, y=None, sample_weight=None): } return pack_x_y_sample_weight(x, y, sample_weight) + @classproperty + def tokenizer_cls(cls): + return DistilBertTokenizer + @classproperty def presets(cls): return copy.deepcopy(backbone_presets) @classmethod - @format_docstring(names=", ".join(backbone_presets)) - def from_preset( - cls, - preset, - sequence_length=None, - truncate="round_robin", - **kwargs, - ): - """Instantiate DistilBERT preprocessor from preset architecture. - - Args: - preset: string. Must be one of {{names}}. - sequence_length: int, optional. The length of the packed inputs. - Must be equal to or smaller than the `max_sequence_length` of - the preset. If left as default, the `max_sequence_length` of - the preset will be used. - truncate: string. The algorithm to truncate a list of batched - segments to fit within `sequence_length`. The value can be - either `round_robin` or `waterfall`: - - `"round_robin"`: Available space is assigned one token at - a time in a round-robin fashion to the inputs that still - need some, until the limit is reached. - - `"waterfall"`: The allocation of the budget is done using - a "waterfall" algorithm that allocates quota in a - left-to-right manner and fills up the buckets until we - run out of budget. It supports an arbitrary number of - segments. - - Examples: - ```python - # Load preprocessor from preset - preprocessor = keras_nlp.models.DistilBertPreprocessor.from_preset( - "distil_bert_base_en_uncased", - ) - preprocessor("The quick brown fox jumped.") - - # Override sequence_length - preprocessor = keras_nlp.models.DistilBertPreprocessor.from_preset( - "distil_bert_base_en_uncased", - sequence_length=64 - ) - preprocessor("The quick brown fox jumped.") - ``` - """ - if preset not in cls.presets: - raise ValueError( - "`preset` must be one of " - f"""{", ".join(cls.presets)}. Received: {preset}.""" - ) - - tokenizer = DistilBertTokenizer.from_preset(preset) + def from_preset(cls, preset, **kwargs): + return super().from_preset(preset, **kwargs) - # Use model's `max_sequence_length` if `sequence_length` unspecified; - # otherwise check that `sequence_length` not too long. 
- metadata = cls.presets[preset] - max_sequence_length = metadata["config"]["max_sequence_length"] - if sequence_length is not None: - if sequence_length > max_sequence_length: - raise ValueError( - f"`sequence_length` cannot be longer than `{preset}` " - f"preset's `max_sequence_length` of {max_sequence_length}. " - f"Received: {sequence_length}." - ) - else: - sequence_length = max_sequence_length - return cls( - tokenizer=tokenizer, - sequence_length=sequence_length, - truncate=truncate, - **kwargs, - ) +DistilBertPreprocessor.from_preset.__func__.__doc__ = ( + Preprocessor.from_preset.__doc__ +) +format_docstring( + preprocessor_name=DistilBertPreprocessor.__name__, + example_preset_name="distil_bert_base_en_uncased", + preset_names='", "'.join(DistilBertPreprocessor.presets), +)(DistilBertPreprocessor.from_preset.__func__) diff --git a/keras_nlp/models/distil_bert/distil_bert_tokenizer_test.py b/keras_nlp/models/distil_bert/distil_bert_tokenizer_test.py index ca13f75177..c14cd3d163 100644 --- a/keras_nlp/models/distil_bert/distil_bert_tokenizer_test.py +++ b/keras_nlp/models/distil_bert/distil_bert_tokenizer_test.py @@ -19,7 +19,7 @@ from absl.testing import parameterized from tensorflow import keras -from keras_nlp.models.distil_bert.distil_bert_preprocessor import ( +from keras_nlp.models.distil_bert.distil_bert_tokenizer import ( DistilBertTokenizer, ) diff --git a/keras_nlp/models/preprocessor.py b/keras_nlp/models/preprocessor.py new file mode 100644 index 0000000000..ec99ccc61f --- /dev/null +++ b/keras_nlp/models/preprocessor.py @@ -0,0 +1,112 @@ +# Copyright 2022 The KerasNLP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from tensorflow import keras + +from keras_nlp.utils.python_utils import classproperty + + +@keras.utils.register_keras_serializable(package="keras_nlp") +class Preprocessor(keras.layers.Layer): + """Base class for model preprocessors.""" + + @property + def tokenizer(self): + """The tokenizer used to tokenize strings.""" + return self._tokenizer + + def get_config(self): + config = super().get_config() + config["tokenizer"] = keras.layers.serialize(self.tokenizer) + return config + + @classmethod + def from_config(cls, config): + if "tokenizer" in config and isinstance(config["tokenizer"], dict): + config["tokenizer"] = keras.layers.deserialize(config["tokenizer"]) + return cls(**config) + + @classproperty + def tokenizer_cls(cls): + return None + + @classproperty + def presets(cls): + return {} + + @classmethod + def from_preset( + cls, + preset, + **kwargs, + ): + """Instantiate {{preprocessor_name}} from preset architecture. + + Args: + preset: string. Must be one of "{{preset_names}}". 
+ + Examples: + ```python + # Load preprocessor from preset + preprocessor = keras_nlp.models.{{preprocessor_name}}.from_preset( + "{{example_preset_name}}", + ) + preprocessor("The quick brown fox jumped.") + + # Override sequence_length + preprocessor = keras_nlp.models.{{preprocessor_name}}.from_preset( + "{{example_preset_name}}", + sequence_length=64 + ) + preprocessor("The quick brown fox jumped.") + ``` + """ + if not cls.presets: + raise NotImplementedError( + "No presets have been created for this class." + ) + if preset not in cls.presets: + raise ValueError( + "`preset` must be one of " + f"""{", ".join(cls.presets)}. Received: {preset}.""" + ) + + tokenizer = cls.tokenizer_cls.from_preset(preset) + + metadata = cls.presets[preset] + # For task model presets, the backbone config is nested. + if "backbone" in metadata["config"]: + backbone_config = metadata["config"]["backbone"]["config"] + else: + backbone_config = metadata["config"] + + # Use model's `max_sequence_length` if `sequence_length` unspecified; + # otherwise check that `sequence_length` not too long. + sequence_length = kwargs.pop("sequence_length", None) + max_sequence_length = backbone_config["max_sequence_length"] + if sequence_length is not None: + if sequence_length > max_sequence_length: + raise ValueError( + f"`sequence_length` cannot be longer than `{preset}` " + f"preset's `max_sequence_length` of {max_sequence_length}. " + f"Received: {sequence_length}." + ) + else: + sequence_length = max_sequence_length + + return cls( + tokenizer=tokenizer, + sequence_length=sequence_length, + **kwargs, + ) diff --git a/keras_nlp/models/roberta/roberta_preprocessor.py b/keras_nlp/models/roberta/roberta_preprocessor.py index 8eaa1595e0..ac28f49c2d 100644 --- a/keras_nlp/models/roberta/roberta_preprocessor.py +++ b/keras_nlp/models/roberta/roberta_preprocessor.py @@ -20,6 +20,7 @@ import tensorflow_text as tf_text from tensorflow import keras +from keras_nlp.models.preprocessor import Preprocessor from keras_nlp.models.roberta.roberta_presets import backbone_presets from keras_nlp.models.roberta.roberta_tokenizer import RobertaTokenizer from keras_nlp.utils.keras_utils import ( @@ -31,7 +32,7 @@ @keras.utils.register_keras_serializable(package="keras_nlp") -class RobertaPreprocessor(keras.layers.Layer): +class RobertaPreprocessor(Preprocessor): """RoBERTa preprocessing layer which tokenizes and packs inputs. This preprocessing layer will do three things: @@ -77,32 +78,8 @@ class RobertaPreprocessor(keras.layers.Layer): Examples: ```python - vocab = { - "<s>": 0, - "<pad>": 1, - "</s>": 2, - "reful": 3, - "gent": 4, - "Ġafter": 5, - "noon": 6, - "Ġsun": 7, - "Ġbright": 8, - "Ġnight": 9, - "Ġmoon": 10, - } - merges = ["Ġ a", "Ġ m", "Ġ s", "Ġ b", "Ġ n", "r e", "f u", "g e", "n t"] - merges += ["e r", "n o", "o n", "i g", "h t"] - merges += ["Ġs u", "Ġa f", "Ġm o", "Ġb r","ge nt", "no on", "re fu", "ig ht"] - merges += ["Ġn ight", "Ġsu n", "Ġaf t", "Ġmo on", "Ġbr ight", "refu l", "Ġaft er"] - - tokenizer = keras_nlp.models.RobertaTokenizer( - vocabulary=vocab, - merges=merges, - ) - preprocessor = keras_nlp.models.RobertaPreprocessor( - tokenizer=tokenizer, - sequence_length=20, - ) + # Load the preprocessor from a preset. + preprocessor = keras_nlp.models.RobertaPreprocessor.from_preset("roberta_base_en") # Tokenize and pack a single sentence.
sentence = tf.constant(" afternoon sun") @@ -152,6 +129,28 @@ class RobertaPreprocessor(keras.layers.Layer): lambda s1, s2: preprocessor(x=(s1, s2)), num_parallel_calls=tf.data.AUTOTUNE, ) + + # Alternatively, you can create a preprocessor from your own vocabulary. + # The usage is exactly the same as above. + vocab = { + "<s>": 0, + "<pad>": 1, + "</s>": 2, + "Ġafter": 5, + "noon": 6, + "Ġsun": 7, + } + merges = ["Ġ a", "Ġ s", "Ġ n", "e r", "n o", "o n", "Ġs u", "Ġa f", "no on"] + merges += ["Ġsu n", "Ġaf t", "Ġaft er"] + + tokenizer = keras_nlp.models.RobertaTokenizer( + vocabulary=vocab, + merges=merges, + ) + preprocessor = keras_nlp.models.RobertaPreprocessor( + tokenizer=tokenizer, + sequence_length=20, + ) ``` """ @@ -165,7 +164,6 @@ def __init__( super().__init__(**kwargs) self._tokenizer = tokenizer - self.packer = RobertaMultiSegmentPacker( start_value=self.tokenizer.start_token_id, end_value=self.tokenizer.end_token_id, @@ -174,28 +172,16 @@ def __init__( sequence_length=sequence_length, ) - @property - def tokenizer(self): - """The `keras_nlp.models.RobertaTokenizer` used to tokenize strings.""" - return self._tokenizer - def get_config(self): config = super().get_config() config.update( { - "tokenizer": keras.layers.serialize(self.tokenizer), "sequence_length": self.packer.sequence_length, "truncate": self.packer.truncate, } ) return config - @classmethod - def from_config(cls, config): - if "tokenizer" in config and isinstance(config["tokenizer"], dict): - config["tokenizer"] = keras.layers.deserialize(config["tokenizer"]) - return cls(**config) - def call(self, x, y=None, sample_weight=None): x = convert_inputs_to_list_of_tensor_segments(x) x = [self.tokenizer(segment) for segment in x] @@ -206,83 +192,27 @@ def call(self, x, y=None, sample_weight=None): } return pack_x_y_sample_weight(x, y, sample_weight) + @classproperty + def tokenizer_cls(cls): + return RobertaTokenizer + @classproperty def presets(cls): return copy.deepcopy(backbone_presets) @classmethod - @format_docstring(names=", ".join(backbone_presets)) - def from_preset( - cls, - preset, - sequence_length=None, - truncate="round_robin", - **kwargs, - ): - """Instantiate RoBERTa preprocessor from preset architecture. - - Args: - preset: string. Must be one of {{names}}. - sequence_length: int, optional. The length of the packed inputs. - Must be equal to or smaller than the `max_sequence_length` of - the preset. If left as default, the `max_sequence_length` of - the preset will be used. - truncate: string. The algorithm to truncate a list of batched - segments to fit within `sequence_length`. The value can be - either `round_robin` or `waterfall`: - - `"round_robin"`: Available space is assigned one token at - a time in a round-robin fashion to the inputs that still - need some, until the limit is reached. - - `"waterfall"`: The allocation of the budget is done using - a "waterfall" algorithm that allocates quota in a - left-to-right manner and fills up the buckets until we - run out of budget. It supports an arbitrary number of - segments.
- - Examples: - ```python - # Load preprocessor from preset - preprocessor = keras_nlp.models.RobertPreprocessor.from_preset( - "roberta_base_en", - ) - preprocessor("The quick brown fox jumped.") + def from_preset(cls, preset, **kwargs): + return super().from_preset(preset, **kwargs) - # Override sequence_length - preprocessor = keras_nlp.models.BertPreprocessor.from_preset( - "roberta_base_en", - sequence_length=64 - ) - preprocessor("The quick brown fox jumped.") - ``` - """ - if preset not in cls.presets: - raise ValueError( - "`preset` must be one of " - f"""{", ".join(cls.presets)}. Received: {preset}.""" - ) - tokenizer = RobertaTokenizer.from_preset(preset) - - # Use model's `max_sequence_length` if `sequence_length` unspecified; - # otherwise check that `sequence_length` not too long. - metadata = cls.presets[preset] - max_sequence_length = metadata["config"]["max_sequence_length"] - if sequence_length is not None: - if sequence_length > max_sequence_length: - raise ValueError( - f"`sequence_length` cannot be longer than `{preset}` " - f"preset's `max_sequence_length` of {max_sequence_length}. " - f"Received: {sequence_length}." - ) - else: - sequence_length = max_sequence_length - - return cls( - tokenizer=tokenizer, - sequence_length=sequence_length, - truncate=truncate, - **kwargs, - ) +RobertaPreprocessor.from_preset.__func__.__doc__ = ( + Preprocessor.from_preset.__doc__ +) +format_docstring( + preprocessor_name=RobertaPreprocessor.__name__, + example_preset_name="roberta_base_en", + preset_names='", "'.join(RobertaPreprocessor.presets), +)(RobertaPreprocessor.from_preset.__func__) # TODO: This is a temporary, unexported layer until we find a way to make the diff --git a/keras_nlp/models/roberta/roberta_preprocessor_test.py b/keras_nlp/models/roberta/roberta_preprocessor_test.py index d4c6653e20..2045408547 100644 --- a/keras_nlp/models/roberta/roberta_preprocessor_test.py +++ b/keras_nlp/models/roberta/roberta_preprocessor_test.py @@ -21,7 +21,7 @@ from tensorflow import keras from keras_nlp.models.roberta.roberta_preprocessor import RobertaPreprocessor -from keras_nlp.models.roberta.roberta_preprocessor import RobertaTokenizer +from keras_nlp.models.roberta.roberta_tokenizer import RobertaTokenizer class RobertaPreprocessorTest(tf.test.TestCase, parameterized.TestCase): diff --git a/keras_nlp/models/roberta/roberta_tokenizer_test.py b/keras_nlp/models/roberta/roberta_tokenizer_test.py index 558ca100fb..0d09b48451 100644 --- a/keras_nlp/models/roberta/roberta_tokenizer_test.py +++ b/keras_nlp/models/roberta/roberta_tokenizer_test.py @@ -20,7 +20,7 @@ from absl.testing import parameterized from tensorflow import keras -from keras_nlp.models.roberta.roberta_preprocessor import RobertaTokenizer +from keras_nlp.models.roberta.roberta_tokenizer import RobertaTokenizer class RobertaTokenizerTest(tf.test.TestCase, parameterized.TestCase): diff --git a/keras_nlp/models/xlm_roberta/xlm_roberta_classifier_test.py b/keras_nlp/models/xlm_roberta/xlm_roberta_classifier_test.py index 17e30618bd..eaf83147a3 100644 --- a/keras_nlp/models/xlm_roberta/xlm_roberta_classifier_test.py +++ b/keras_nlp/models/xlm_roberta/xlm_roberta_classifier_test.py @@ -28,7 +28,7 @@ from keras_nlp.models.xlm_roberta.xlm_roberta_preprocessor import ( XLMRobertaPreprocessor, ) -from keras_nlp.models.xlm_roberta.xlm_roberta_preprocessor import ( +from keras_nlp.models.xlm_roberta.xlm_roberta_tokenizer import ( XLMRobertaTokenizer, ) diff --git a/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor.py 
b/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor.py index ff9ef35914..a21bddae10 100644 --- a/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor.py +++ b/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor.py @@ -18,6 +18,7 @@ from tensorflow import keras +from keras_nlp.models.preprocessor import Preprocessor from keras_nlp.models.roberta.roberta_preprocessor import ( RobertaMultiSegmentPacker, ) @@ -34,7 +35,7 @@ @keras.utils.register_keras_serializable(package="keras_nlp") -class XLMRobertaPreprocessor(keras.layers.Layer): +class XLMRobertaPreprocessor(Preprocessor): """XLM-RoBERTa preprocessing layer. This preprocessing layer will do three things: @@ -80,11 +81,8 @@ class XLMRobertaPreprocessor(keras.layers.Layer): Examples: ```python - tokenizer = keras_nlp.models.XLMRobertaTokenizer(proto="model.spm") - preprocessor = keras_nlp.models.XLMRobertaPreprocessor( - tokenizer=tokenizer, - sequence_length=10, - ) + # Load the preprocessor from a preset. + preprocessor = keras_nlp.models.XLMRobertaPreprocessor.from_preset("xlm_roberta_base_multi") # Tokenize and pack a single sentence. sentence = tf.constant("The quick brown fox jumped.") @@ -144,6 +142,14 @@ class XLMRobertaPreprocessor(keras.layers.Layer): lambda s1, s2: preprocessor(x=(s1, s2)), num_parallel_calls=tf.data.AUTOTUNE, ) + + # Alternatively, you can create a preprocessor from your own vocabulary. + # The usage is exactly the same as above. + tokenizer = keras_nlp.models.XLMRobertaTokenizer(proto="model.spm") + preprocessor = keras_nlp.models.XLMRobertaPreprocessor( + tokenizer=tokenizer, + sequence_length=10, + ) ``` """ @@ -166,28 +172,16 @@ def __init__( sequence_length=sequence_length, ) - @property - def tokenizer(self): - """The `keras_nlp.models.XLMRobertaTokenizer` used to tokenize strings.""" - return self._tokenizer - def get_config(self): config = super().get_config() config.update( { - "tokenizer": keras.layers.serialize(self.tokenizer), "sequence_length": self.packer.sequence_length, "truncate": self.packer.truncate, } ) return config - @classmethod - def from_config(cls, config): - if "tokenizer" in config: - config["tokenizer"] = keras.layers.deserialize(config["tokenizer"]) - return cls(**config) - def call(self, x, y=None, sample_weight=None): x = convert_inputs_to_list_of_tensor_segments(x) x = [self.tokenizer(segment) for segment in x] @@ -198,80 +192,24 @@ def call(self, x, y=None, sample_weight=None): } return pack_x_y_sample_weight(x, y, sample_weight) + @classproperty + def tokenizer_cls(cls): + return XLMRobertaTokenizer + @classproperty def presets(cls): return copy.deepcopy(backbone_presets) @classmethod - @format_docstring(names=", ".join(backbone_presets)) - def from_preset( - cls, - preset, - sequence_length=None, - truncate="round_robin", - **kwargs, - ): - """Instantiate XLM-RoBERTa preprocessor from preset architecture. - - Args: - preset: string. Must be one of {{names}}. - sequence_length: int, optional. The length of the packed inputs. - Must be equal to or smaller than the `max_sequence_length` of - the preset. If left as default, the `max_sequence_length` of - the preset will be used. - truncate: string. The algorithm to truncate a list of batched - segments to fit within `sequence_length`. The value can be - either `round_robin` or `waterfall`: - - `"round_robin"`: Available space is assigned one token at - a time in a round-robin fashion to the inputs that still - need some, until the limit is reached. 
- - `"waterfall"`: The allocation of the budget is done using - a "waterfall" algorithm that allocates quota in a - left-to-right manner and fills up the buckets until we - run out of budget. It supports an arbitrary number of - segments. - - Examples: - ```python - # Load preprocessor from preset - preprocessor = keras_nlp.models.XLMRobertaPreprocessor.from_preset( - "xlm_roberta_base_multi", - ) - preprocessor("The quick brown fox jumped.") + def from_preset(cls, preset, **kwargs): + return super().from_preset(preset, **kwargs) - # Override sequence_length - preprocessor = keras_nlp.models.XLMRobertaPreprocessor.from_preset( - "xlm_roberta_base_multi", - sequence_length=64 - ) - preprocessor("The quick brown fox jumped.") - ``` - """ - if preset not in cls.presets: - raise ValueError( - "`preset` must be one of " - f"""{", ".join(cls.presets)}. Received: {preset}.""" - ) - - tokenizer = XLMRobertaTokenizer.from_preset(preset) - # Use model's `max_sequence_length` if `sequence_length` unspecified; - # otherwise check that `sequence_length` not too long. - metadata = cls.presets[preset] - max_sequence_length = metadata["config"]["max_sequence_length"] - if sequence_length is not None: - if sequence_length > max_sequence_length: - raise ValueError( - f"`sequence_length` cannot be longer than `{preset}` " - f"preset's `max_sequence_length` of {max_sequence_length}. " - f"Received: {sequence_length}." - ) - else: - sequence_length = max_sequence_length - - return cls( - tokenizer=tokenizer, - sequence_length=sequence_length, - truncate=truncate, - **kwargs, - ) +XLMRobertaPreprocessor.from_preset.__func__.__doc__ = ( + Preprocessor.from_preset.__doc__ +) +format_docstring( + preprocessor_name=XLMRobertaPreprocessor.__name__, + example_preset_name="xlm_roberta_base_multi", + preset_names='", "'.join(XLMRobertaPreprocessor.presets), +)(XLMRobertaPreprocessor.from_preset.__func__) diff --git a/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor_test.py b/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor_test.py index 279d43cdcb..8074aa1d8d 100644 --- a/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor_test.py +++ b/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor_test.py @@ -25,7 +25,7 @@ from keras_nlp.models.xlm_roberta.xlm_roberta_preprocessor import ( XLMRobertaPreprocessor, ) -from keras_nlp.models.xlm_roberta.xlm_roberta_preprocessor import ( +from keras_nlp.models.xlm_roberta.xlm_roberta_tokenizer import ( XLMRobertaTokenizer, ) diff --git a/keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer_test.py b/keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer_test.py index 8d1c6c9213..095f92060e 100644 --- a/keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer_test.py +++ b/keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer_test.py @@ -22,7 +22,7 @@ from absl.testing import parameterized from tensorflow import keras -from keras_nlp.models.xlm_roberta.xlm_roberta_preprocessor import ( +from keras_nlp.models.xlm_roberta.xlm_roberta_tokenizer import ( XLMRobertaTokenizer, )
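Reviewer sketch (not part of this diff): a minimal, self-contained illustration of the three hooks a model preprocessor supplies after this refactor — `tokenizer_cls`, `presets`, and the post-hoc `format_docstring` templating of `from_preset` — mirroring what the BERT, DeBERTa, DistilBERT, RoBERTa, and XLM-RoBERTa files now do. `StubTokenizer`, `STUB_PRESETS`, and `"stub_base_en"` are hypothetical placeholders, and the sketch assumes `classproperty` and `format_docstring` are the same helpers the touched model files already import from `keras_nlp.utils.python_utils`.

```python
# Sketch only: how a model preprocessor plugs into the new shared base class.
# `StubTokenizer`, `STUB_PRESETS`, and "stub_base_en" are placeholders.
import copy

from keras_nlp.models.preprocessor import Preprocessor
from keras_nlp.utils.python_utils import classproperty
from keras_nlp.utils.python_utils import format_docstring


class StubTokenizer:
    """Hypothetical stand-in for a real tokenizer such as `BertTokenizer`."""

    @classmethod
    def from_preset(cls, preset, **kwargs):
        return cls()


# Hypothetical preset metadata, shaped like the entries `from_preset` reads.
STUB_PRESETS = {
    "stub_base_en": {"config": {"max_sequence_length": 512}},
}


class StubPreprocessor(Preprocessor):
    def __init__(self, tokenizer, sequence_length=512, **kwargs):
        super().__init__(**kwargs)
        self._tokenizer = tokenizer
        self.sequence_length = sequence_length

    @classproperty
    def tokenizer_cls(cls):
        # `Preprocessor.from_preset` calls `cls.tokenizer_cls.from_preset(preset)`.
        return StubTokenizer

    @classproperty
    def presets(cls):
        # Consulted for preset validation and the `max_sequence_length` check.
        return copy.deepcopy(STUB_PRESETS)

    @classmethod
    def from_preset(cls, preset, **kwargs):
        # Thin override so this class carries its own templated docstring.
        return super().from_preset(preset, **kwargs)


# Fill in the shared docstring template after the class body, as this diff
# does for each model preprocessor.
StubPreprocessor.from_preset.__func__.__doc__ = Preprocessor.from_preset.__doc__
format_docstring(
    preprocessor_name=StubPreprocessor.__name__,
    example_preset_name="stub_base_en",
    preset_names='", "'.join(StubPreprocessor.presets),
)(StubPreprocessor.from_preset.__func__)

# `StubPreprocessor.from_preset("stub_base_en")` now resolves the tokenizer,
# validates `sequence_length`, and builds the layer entirely via the base class.
```

Because the docstring is assigned and templated after the class body, each subclass's `from_preset` help text lists its own preset names, while preset validation, tokenizer construction, and `sequence_length` handling live once in `Preprocessor.from_preset`.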