diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
index db1320533ff0..a200d9d64547 100644
--- a/.devcontainer/Dockerfile
+++ b/.devcontainer/Dockerfile
@@ -1,4 +1,4 @@
-FROM mcr.microsoft.com/vscode/devcontainers/python:3.8
+FROM mcr.microsoft.com/vscode/devcontainers/python:3.9
 COPY setup.sh /setup.sh
 
 # Install Bazel
diff --git a/keras/__init__.py b/keras/__init__.py
index 7c020265fdac..9a57f0ffe48c 100644
--- a/keras/__init__.py
+++ b/keras/__init__.py
@@ -28,6 +28,6 @@
 from tensorflow.python import tf2
 from tensorflow.python.util.tf_export import keras_export
 
-__version__ = "2.13.0"
+__version__ = "2.14.0"
 
 keras_export("keras.__version__").export_constant(__name__, "__version__")
diff --git a/keras/constraints.py b/keras/constraints.py
index 30c23adf6d16..5bc0fe1d8043 100644
--- a/keras/constraints.py
+++ b/keras/constraints.py
@@ -16,6 +16,8 @@
 
 """Constraints: functions that impose constraints on weight values."""
 
+import warnings
+
 import tensorflow.compat.v2 as tf
 
 from keras import backend
@@ -357,6 +359,27 @@ def body_fn(i, array):
 
 @keras_export("keras.constraints.serialize")
 def serialize(constraint, use_legacy_format=False):
+    """Serializes a constraint, warning on non-`Constraint` objects.
+
+    Args:
+        constraint: A `Constraint` instance, or `None`.
+        use_legacy_format: If True, serialize with
+            `legacy_serialization.serialize_keras_object`.
+
+    Returns:
+        `None` if `constraint` is `None`, otherwise the serializer's output.
+    """
+    # Callers may pass `None` for an unset constraint; don't warn about it.
+    if constraint is None:
+        return None
+    if not isinstance(constraint, Constraint):
+        warnings.warn(
+            "The `keras.constraints.serialize()` API should only be used for "
+            "objects of type `keras.constraints.Constraint`. Found an instance "
+            f"of type {type(constraint)}, which may lead to improper "
+            "serialization.",
+            stacklevel=2,
+        )
     if use_legacy_format:
         return legacy_serialization.serialize_keras_object(constraint)
     return serialize_keras_object(constraint)
diff --git a/keras/datasets/reuters.py b/keras/datasets/reuters.py
index 19b27949d84e..38cc15e33d98 100644
--- a/keras/datasets/reuters.py
+++ b/keras/datasets/reuters.py
@@ -69,13 +69,13 @@ def load_data(
       skip_top: skip the top N most frequently occurring words
           (which may not be informative). These words will appear as
           `oov_char` value in the dataset. 0 means no words are
-          skipped. Defaults to 0
+          skipped. Defaults to `0`.
       maxlen: int or None. Maximum sequence length.
           Any longer sequence will be truncated. None means no truncation.
           Defaults to `None`.
-      test_split: Float between 0 and 1. Fraction of the dataset to be used
-        as test data. 0.2 means that 20% of the dataset is used as
-        test data. Defaults to 0.2
+      test_split: Float between `0.` and `1.`. Fraction of the dataset to be
+        used as test data. `0.2` means that 20% of the dataset is used as
+        test data. Defaults to `0.2`.
       seed: int. Seed for reproducible data shuffling.
       start_char: int. The start of a sequence will be marked with this
           character. 0 is usually the padding character. Defaults to `1`.
diff --git a/keras/distribute/distributed_file_utils.py b/keras/distribute/distributed_file_utils.py
index 8ff5f280d92a..fec668cfaa59 100644
--- a/keras/distribute/distributed_file_utils.py
+++ b/keras/distribute/distributed_file_utils.py
@@ -84,7 +84,7 @@ def write_dirpath(dirpath, strategy):
       The writing dir path that should be used to save with distribution.
     """
     if strategy is None:
-        # Infer strategy from `distribution_strategy_context` if not given.
+        # Infer strategy from `tf.distribute` if not given.
         strategy = tf.distribute.get_strategy()
     if strategy is None:
         # If strategy is still not available, this is not in distributed
@@ -107,7 +107,7 @@ def remove_temp_dirpath(dirpath, strategy):
       strategy: The tf.distribute strategy object currently used.
     """
     if strategy is None:
-        # Infer strategy from `distribution_strategy_context` if not given.
+        # Infer strategy from `tf.distribute` if not given.
         strategy = tf.distribute.get_strategy()
     if strategy is None:
         # If strategy is still not available, this is not in distributed
diff --git a/keras/engine/deferred_sequential_test.py b/keras/engine/deferred_sequential_test.py
index 66e05d1a596e..8d72abbef0d6 100644
--- a/keras/engine/deferred_sequential_test.py
+++ b/keras/engine/deferred_sequential_test.py
@@ -120,6 +120,23 @@ def test_feature_extraction(self):
         # Check that inputs and outputs are connected
         _ = extractor(np.random.random((4, 6)))
 
+    @test_combinations.run_all_keras_modes(always_skip_v1=True)
+    def test_saving_keras_v3(self):
+        """Saving to `.keras` and reloading keeps layer names and weights."""
+        original = get_model()
+        original(np.random.random((3, 6)))  # Calling the model builds it.
+        path = os.path.join(self.get_temp_dir(), "model_path.keras")
+        original.save(path)
+        restored = keras.models.load_model(path)
+        flatten_args = {"include_self": True, "recursive": False}
+        for saved, loaded in zip(
+            original._flatten_layers(**flatten_args),
+            restored._flatten_layers(**flatten_args),
+        ):
+            self.assertEqual(saved.name, loaded.name)
+            for w_saved, w_loaded in zip(saved.weights, loaded.weights):
+                self.assertAllClose(w_saved, w_loaded)
+
     @test_combinations.run_all_keras_modes(always_skip_v1=True)
     def test_saving_savedmodel(self):
         model = get_model()
diff --git a/keras/engine/functional_test.py b/keras/engine/functional_test.py
index 747144caceef..302eae9d82bb 100644
--- a/keras/engine/functional_test.py
+++ b/keras/engine/functional_test.py
@@ -28,6 +28,7 @@
 from keras.engine import input_layer as input_layer_lib
 from keras.engine import sequential
 from keras.engine import training as training_lib
+from keras.saving import object_registration
 from keras.saving.legacy import save
 from keras.testing_infra import test_combinations
 from keras.testing_infra import test_utils
@@ -1875,7 +1876,7 @@ def test_external_keras_serialization_compat_input_layers(self):
         test_combinations.combine(mode=["graph", "eager"])
     )
     @test_utils.run_v2_only
-    def test_save_load_with_single_elem_list_inputs(self):
+    def test_save_load_with_single_elem_list_inputs_saved_model(self):
         class MyLayer(layers.Layer):
             def __init__(self):
                 super().__init__()
@@ -1893,6 +1894,26 @@ def call(self, inputs):
 
         save.load_model("/tmp/km2")
 
+    @test_utils.run_v2_only
+    def test_save_load_with_single_elem_list_inputs_keras_v3(self):
+        """A layer called on a one-element list survives `.keras` saving."""
+        @object_registration.register_keras_serializable()
+        class MyLayer(layers.Layer):
+            def __init__(self):
+                super().__init__()
+                self._preserve_input_structure_in_config = True
+
+            def call(self, inputs):
+                return inputs[0]
+
+        inputs = input_layer_lib.Input(shape=(3,))
+        outputs = MyLayer()([inputs])
+        model = training_lib.Model(inputs=inputs, outputs=outputs)
+        # Use the per-test temp dir so runs cannot collide on a shared /tmp.
+        path = self.get_temp_dir() + "/model.keras"
+        model.save(path)
+        models.load_model(path)
+
     @test_combinations.generate(
         test_combinations.combine(mode=["graph", "eager"])
     )
diff --git a/keras/engine/functional_utils_test.py b/keras/engine/functional_utils_test.py
index cf771e392679..3d5be79a157c 100644
--- a/keras/engine/functional_utils_test.py
+++ b/keras/engine/functional_utils_test.py
@@ -151,11 +151,6 @@ def test_build_model_from_intermediate_tensor(self):
         model.fit(
             np.random.randn(batch_size, 32), np.random.randn(batch_size, 16)
         )
-        # Test for model saving
-        output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_model")
-        model.save(output_path, save_format="tf")
-        loaded_model = models.load_model(output_path)
-        self.assertEqual(model.summary(), loaded_model.summary())
 
         # Also make sure the original inputs and y can still be used to build
         # model
@@ -167,6 +162,27 @@ def test_build_model_from_intermediate_tensor(self):
         self.assertIs(new_model.layers[1], layer1)
         self.assertIs(new_model.layers[2], layer2)
 
+        # Test for model saving
+        with self.subTest("savedmodel"):
+            output_path = os.path.join(
+                self.get_temp_dir(), "tf_keras_saved_model"
+            )
+            model.save(output_path, save_format="tf")
+            loaded_model = models.load_model(output_path)
+            self.assertEqual(model.summary(), loaded_model.summary())
+
+        with self.subTest("keras_v3"):
+            if not tf.__internal__.tf2.enabled():
+                self.skipTest(
+                    "TF2 must be enabled to use the new `.keras` saving."
+                )
+            output_path = os.path.join(
+                self.get_temp_dir(), "tf_keras_v3_model.keras"
+            )
+            model.save(output_path, save_format="keras_v3")
+            loaded_model = models.load_model(output_path)
+            self.assertEqual(model.summary(), loaded_model.summary())
+
     def test_build_model_from_intermediate_tensor_with_complicated_model(self):
         # The topology is like below:
         # input1 -> dense1 -> a
@@ -212,10 +228,6 @@ def test_build_model_from_intermediate_tensor_with_complicated_model(self):
             ],
             np.random.randn(batch_size, 8),
         )
-        output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_model")
-        model.save(output_path, save_format="tf")
-        loaded_model = models.load_model(output_path)
-        self.assertEqual(model.summary(), loaded_model.summary())
 
         model2 = models.Model([a, b], d)
         # 2 input layers and 2 Add layer.
@@ -230,6 +242,26 @@ def test_build_model_from_intermediate_tensor_with_complicated_model(self):
             np.random.randn(batch_size, 8),
         )
 
+        with self.subTest("savedmodel"):
+            output_path = os.path.join(
+                self.get_temp_dir(), "tf_keras_saved_model"
+            )
+            model.save(output_path, save_format="tf")
+            loaded_model = models.load_model(output_path)
+            self.assertEqual(model.summary(), loaded_model.summary())
+
+        with self.subTest("keras_v3"):
+            if not tf.__internal__.tf2.enabled():
+                self.skipTest(
+                    "TF2 must be enabled to use the new `.keras` saving."
+                )
+            output_path = os.path.join(
+                self.get_temp_dir(), "tf_keras_v3_model.keras"
+            )
+            model.save(output_path, save_format="keras_v3")
+            loaded_model = models.load_model(output_path)
+            self.assertEqual(model.summary(), loaded_model.summary())
+
 
 if __name__ == "__main__":
     tf.test.main()
diff --git a/keras/initializers/__init__.py b/keras/initializers/__init__.py
index f89514750adb..586b4e17e60e 100644
--- a/keras/initializers/__init__.py
+++ b/keras/initializers/__init__.py
@@ -15,6 +15,7 @@
 """Keras initializer serialization / deserialization."""
 
 import threading
+import warnings
 
 import tensorflow.compat.v2 as tf
 
@@ -136,6 +137,14 @@ def populate_deserializable_objects():
 
 @keras_export("keras.initializers.serialize")
 def serialize(initializer, use_legacy_format=False):
+    populate_deserializable_objects()
+    if not isinstance(initializer, tuple(LOCAL.ALL_OBJECTS.values())):
+        warnings.warn(
+            "The `keras.initializers.serialize()` API should only be used for "
+            "objects of type `keras.initializers.Initializer`. Found an "
+            f"instance of type {type(initializer)}, which may lead to improper "
+            "serialization."
+        )
     if use_legacy_format:
         return legacy_serialization.serialize_keras_object(initializer)
 
diff --git a/keras/layers/activation/leaky_relu.py b/keras/layers/activation/leaky_relu.py
index 8c21188432c6..fa3e373d734c 100644
--- a/keras/layers/activation/leaky_relu.py
+++ b/keras/layers/activation/leaky_relu.py
@@ -54,7 +54,7 @@ class LeakyReLU(Layer):
         Same shape as the input.
 
     Args:
-        alpha: Float >= 0. Negative slope coefficient. Defaults to `0.3`.
+        alpha: Float >= `0.`. Negative slope coefficient. Defaults to `0.3`.
 
     """
 
diff --git a/keras/layers/attention/multi_head_attention_test.py b/keras/layers/attention/multi_head_attention_test.py
index 96b939ccd248..e9508cf86f4b 100644
--- a/keras/layers/attention/multi_head_attention_test.py
+++ b/keras/layers/attention/multi_head_attention_test.py
@@ -19,6 +19,7 @@
 from absl.testing import parameterized
 
 import keras
+from keras.saving import object_registration
 from keras.testing_infra import test_combinations
 from keras.testing_infra import test_utils
 
@@ -515,6 +516,7 @@ def test_initializer(self):
         self.assertEqual(output.shape.as_list(), [None, 40, 80])
 
 
+@object_registration.register_keras_serializable()
 class TestModel(keras.Model):
     def __init__(self):
         super().__init__()
@@ -540,12 +542,19 @@ def call(self, x, training=False):
 
 @test_combinations.run_all_keras_modes(always_skip_v1=True)
 class KerasModelSavingTest(test_combinations.TestCase):
-    def test_keras_saving_subclass(self):
+    @parameterized.parameters("tf", "keras_v3")
+    def test_keras_saving_subclass(self, save_format):
         model = TestModel()
         query = keras.Input(shape=(40, 80))
         _ = model(query)
         model_path = self.get_temp_dir() + "/tmp_model"
-        keras.models.save_model(model, model_path, save_format="tf")
+        if save_format == "keras_v3":
+            if not tf.__internal__.tf2.enabled():
+                self.skipTest(
+                    "TF2 must be enabled to use the new `.keras` saving."
+                )
+            model_path += ".keras"
+        keras.models.save_model(model, model_path, save_format=save_format)
         reloaded_model = keras.models.load_model(model_path)
         self.assertEqual(
             len(model.trainable_variables),
@@ -556,7 +565,7 @@ def test_keras_saving_subclass(self):
         ):
             self.assertAllEqual(src_v, loaded_v)
 
-    @parameterized.parameters("h5", "tf")
+    @parameterized.parameters("h5", "tf", "keras_v3")
     def test_keras_saving_functional(self, save_format):
         model = TestModel()
         query = keras.Input(shape=(40, 80))
@@ -565,6 +574,12 @@ def test_keras_saving_functional(self, save_format):
         )(query, query)
         model = keras.Model(inputs=query, outputs=output)
         model_path = self.get_temp_dir() + "/tmp_model"
+        if save_format == "keras_v3":
+            if not tf.__internal__.tf2.enabled():
+                self.skipTest(
+                    "TF2 must be enabled to use the new `.keras` saving."
+                )
+            model_path += ".keras"
         keras.models.save_model(model, model_path, save_format=save_format)
         reloaded_model = keras.models.load_model(model_path)
         self.assertEqual(
diff --git a/keras/layers/convolutional/separable_conv2d.py b/keras/layers/convolutional/separable_conv2d.py
index 8290758b48c0..18e9ad49555c 100644
--- a/keras/layers/convolutional/separable_conv2d.py
+++ b/keras/layers/convolutional/separable_conv2d.py
@@ -185,13 +185,13 @@ def call(self, inputs):
             strides = (1,) + self.strides + (1,)
         else:
             strides = (1, 1) + self.strides
-        outputs = tf.compat.v1.nn.separable_conv2d(
+        outputs = tf.nn.separable_conv2d(
             inputs,
             self.depthwise_kernel,
             self.pointwise_kernel,
             strides=strides,
             padding=self.padding.upper(),
-            rate=self.dilation_rate,
+            dilations=self.dilation_rate,
             data_format=conv_utils.convert_data_format(
                 self.data_format, ndim=4
             ),
diff --git a/keras/layers/core/core_test.py b/keras/layers/core/core_test.py
index 7a869a367fce..345eb9e33c20 100644
--- a/keras/layers/core/core_test.py
+++ b/keras/layers/core/core_test.py
@@ -89,7 +89,7 @@ def test_dropout_partial_noise_shape(self):
         # Test that dropout mask is shared across second dim.
         self.assertAllClose(out_np[:, 0, :], out_np[:, 1, :])
 
-    def test_dropout_with_savemodel(self):
+    def test_dropout_with_saving(self):
         inputs = keras.Input(shape=(5, 10))
         layer = keras.layers.Dropout(0.5, force_generator=True)
         outputs = layer(inputs)
@@ -105,32 +105,52 @@ def test_dropout_with_savemodel(self):
         # Make sure the layer does dropout value when training
         self.assertNotAllClose(train, predict)
 
-        model.save(
-            os.path.join(self.get_temp_dir(), "savedmodel"), save_format="tf"
-        )
-        loaded_model = keras.models.load_model(
-            os.path.join(self.get_temp_dir(), "savedmodel")
-        )
-        predict2 = loaded_model(np.ones((20, 5, 10)))
-
-        self.assertAllClose(predict, predict2)
-        # Make sure the model dropout different value after loading
-        train2 = loaded_model(np.ones((20, 5, 10)), training=True)
-        self.assertNotAllClose(train, train2)
-        self.assertIsNotNone(loaded_model.layers[1]._random_generator)
-
-        # Also make sure the checkpoint doesn't contain any variable from the
-        # dropout layer, to keep the backward compatibility.
-        checkpoint = tf.train.Checkpoint(model)
-        save_path = checkpoint.save(
-            os.path.join(self.get_temp_dir(), "checkpoint")
-        )
-        checkpoint_var_names = [
-            name_value_tuple[0]
-            for name_value_tuple in tf.train.list_variables(save_path)
-        ]
-        for name in checkpoint_var_names:
-            self.assertNotIn("dropout", name)
+        with self.subTest("savedmodel"):
+            model.save(
+                os.path.join(self.get_temp_dir(), "savedmodel"),
+                save_format="tf",
+            )
+            loaded_model = keras.models.load_model(
+                os.path.join(self.get_temp_dir(), "savedmodel")
+            )
+            predict2 = loaded_model(np.ones((20, 5, 10)))
+
+            self.assertAllClose(predict, predict2)
+            # Make sure the model drops out different values after loading
+            train2 = loaded_model(np.ones((20, 5, 10)), training=True)
+            self.assertNotAllClose(train, train2)
+            self.assertIsNotNone(loaded_model.layers[1]._random_generator)
+
+        with self.subTest("keras_v3"):
+            if not tf.__internal__.tf2.enabled():
+                self.skipTest(
+                    "TF2 must be enabled to use the new `.keras` saving."
+                )
+            model.save(os.path.join(self.get_temp_dir(), "model.keras"))
+            loaded_model = keras.models.load_model(
+                os.path.join(self.get_temp_dir(), "model.keras")
+            )
+            predict2 = loaded_model(np.ones((20, 5, 10)))
+
+            self.assertAllClose(predict, predict2)
+            # Make sure the model drops out different values after loading
+            train2 = loaded_model(np.ones((20, 5, 10)), training=True)
+            self.assertNotAllClose(train, train2)
+            self.assertIsNotNone(loaded_model.layers[1]._random_generator)
+
+        with self.subTest("checkpoint"):
+            # Also make sure the checkpoint doesn't contain any variable from
+            # the dropout layer, to keep the backward compatibility.
+            checkpoint = tf.train.Checkpoint(model)
+            save_path = checkpoint.save(
+                os.path.join(self.get_temp_dir(), "checkpoint")
+            )
+            checkpoint_var_names = [
+                name_value_tuple[0]
+                for name_value_tuple in tf.train.list_variables(save_path)
+            ]
+            for name in checkpoint_var_names:
+                self.assertNotIn("dropout", name)
 
 
 @test_combinations.run_all_keras_modes
diff --git a/keras/layers/normalization/group_normalization.py b/keras/layers/normalization/group_normalization.py
index 0a4c0cdde2ed..5d883b8fd260 100644
--- a/keras/layers/normalization/group_normalization.py
+++ b/keras/layers/normalization/group_normalization.py
@@ -50,18 +50,19 @@ class GroupNormalization(Layer):
     Args:
       groups: Integer, the number of groups for Group Normalization. Can be in
         the range [1, N] where N is the input dimension. The input dimension
-        must be divisible by the number of groups. Defaults to 32.
+        must be divisible by the number of groups. Defaults to `32`.
       axis: Integer or List/Tuple. The axis or axes to normalize across.
-        Typically this is the features axis/axes. The left-out axes are
-        typically the batch axis/axes. This argument defaults to `-1`, the last
-        dimension in the input.
+        Typically, this is the features axis/axes. The left-out axes are
+        typically the batch axis/axes. `-1` is the last dimension in the
+        input. Defaults to `-1`.
       epsilon: Small float added to variance to avoid dividing by zero. Defaults
         to 1e-3
       center: If True, add offset of `beta` to normalized tensor. If False,
-        `beta` is ignored. Defaults to True.
+        `beta` is ignored. Defaults to `True`.
       scale: If True, multiply by `gamma`. If False, `gamma` is not used.
-        Defaults to True. When the next layer is linear (also e.g. `nn.relu`),
-        this can be disabled since the scaling will be done by the next layer.
+        When the next layer is linear (also e.g. `nn.relu`), this can be
+        disabled since the scaling will be done by the next layer.
+        Defaults to `True`.
       beta_initializer: Initializer for the beta weight. Defaults to zeros.
       gamma_initializer: Initializer for the gamma weight. Defaults to ones.
       beta_regularizer: Optional regularizer for the beta weight. None by
diff --git a/keras/layers/normalization/layer_normalization.py b/keras/layers/normalization/layer_normalization.py
index 9a07c65b7bf0..0227bdb27630 100644
--- a/keras/layers/normalization/layer_normalization.py
+++ b/keras/layers/normalization/layer_normalization.py
@@ -120,16 +120,17 @@ class LayerNormalization(Layer):
 
     Args:
       axis: Integer or List/Tuple. The axis or axes to normalize across.
-        Typically this is the features axis/axes. The left-out axes are
-        typically the batch axis/axes. This argument defaults to `-1`, the last
-        dimension in the input.
+        Typically, this is the features axis/axes. The left-out axes are
+        typically the batch axis/axes. `-1` is the last dimension in the
+        input. Defaults to `-1`.
       epsilon: Small float added to variance to avoid dividing by zero. Defaults
         to 1e-3
       center: If True, add offset of `beta` to normalized tensor. If False,
-        `beta` is ignored. Defaults to True.
+        `beta` is ignored. Defaults to `True`.
       scale: If True, multiply by `gamma`. If False, `gamma` is not used.
-        Defaults to True. When the next layer is linear (also e.g. `nn.relu`),
-        this can be disabled since the scaling will be done by the next layer.
+        When the next layer is linear (also e.g. `nn.relu`), this can be
+        disabled since the scaling will be done by the next layer.
+        Defaults to `True`.
       beta_initializer: Initializer for the beta weight. Defaults to zeros.
       gamma_initializer: Initializer for the gamma weight. Defaults to ones.
       beta_regularizer: Optional regularizer for the beta weight. None by
diff --git a/keras/layers/normalization/spectral_normalization_test.py b/keras/layers/normalization/spectral_normalization_test.py
index 8d673879cd67..555850291af3 100644
--- a/keras/layers/normalization/spectral_normalization_test.py
+++ b/keras/layers/normalization/spectral_normalization_test.py
@@ -51,12 +51,27 @@ def test_save_load_model(self):
         # initialize model
         model.predict(tf.random.uniform((2, 1)))
 
-        model.save("test.h5")
-        new_model = keras.models.load_model("test.h5")
+        with self.subTest("h5"):
+            model.save("test.h5")
+            new_model = keras.models.load_model("test.h5")
 
-        self.assertEqual(
-            model.layers[0].get_config(), new_model.layers[0].get_config()
-        )
+            self.assertEqual(
+                model.layers[0].get_config(), new_model.layers[0].get_config()
+            )
+        with self.subTest("savedmodel"):
+            model.save("test")
+            new_model = keras.models.load_model("test")
+
+            self.assertEqual(
+                model.layers[0].get_config(), new_model.layers[0].get_config()
+            )
+        with self.subTest("keras_v3"):
+            model.save("test.keras")
+            new_model = keras.models.load_model("test.keras")
+
+            self.assertEqual(
+                model.layers[0].get_config(), new_model.layers[0].get_config()
+            )
 
     @test_combinations.run_all_keras_modes
     def test_normalization(self):
diff --git a/keras/layers/normalization/unit_normalization.py b/keras/layers/normalization/unit_normalization.py
index 843ecb88c4b9..eb1746fdde15 100644
--- a/keras/layers/normalization/unit_normalization.py
+++ b/keras/layers/normalization/unit_normalization.py
@@ -40,9 +40,9 @@ class UnitNormalization(base_layer.Layer):
 
     Args:
       axis: Integer or list/tuple. The axis or axes to normalize across.
-        Typically this is the features axis or axes. The left-out axes are
-        typically the batch axis or axes. Defaults to `-1`, the last dimension
-        in the input.
+        Typically, this is the features axis or axes. The left-out axes are
+        typically the batch axis or axes. `-1` is the last dimension
+        in the input. Defaults to `-1`.
     """
 
     def __init__(self, axis=-1, **kwargs):
diff --git a/keras/layers/preprocessing/hashed_crossing_test.py b/keras/layers/preprocessing/hashed_crossing_test.py
index 948dda50c328..6fa5163fb784 100644
--- a/keras/layers/preprocessing/hashed_crossing_test.py
+++ b/keras/layers/preprocessing/hashed_crossing_test.py
@@ -154,7 +154,7 @@ def test_from_config(self):
             tf.sparse.to_dense(original_outputs),
         )
 
-    def test_saved_model_keras(self):
+    def test_saving_keras(self):
         string_in = keras.Input(shape=(1,), dtype=tf.string)
         int_in = keras.Input(shape=(1,), dtype=tf.int64)
         out = hashed_crossing.HashedCrossing(num_bins=10)((string_in, int_in))
@@ -167,17 +167,39 @@ def test_saved_model_keras(self):
         output_data = model((string_data, int_data))
         self.assertAllClose(output_data, expected_output)
 
-        # Save the model to disk.
-        output_path = os.path.join(self.get_temp_dir(), "saved_model")
-        model.save(output_path, save_format="tf")
-        loaded_model = keras.models.load_model(
-            output_path,
-            custom_objects={"HashedCrossing": hashed_crossing.HashedCrossing},
-        )
+        with self.subTest("savedmodel"):
+            # Save the model to disk.
+            output_path = os.path.join(self.get_temp_dir(), "saved_model")
+            model.save(output_path, save_format="tf")
+            loaded_model = keras.models.load_model(
+                output_path,
+                custom_objects={
+                    "HashedCrossing": hashed_crossing.HashedCrossing
+                },
+            )
+
+            # Validate correctness of the new model.
+            new_output_data = loaded_model((string_data, int_data))
+            self.assertAllClose(new_output_data, expected_output)
+
+        with self.subTest("keras_v3"):
+            if not tf.__internal__.tf2.enabled():
+                self.skipTest(
+                    "TF2 must be enabled to use the new `.keras` saving."
+                )
+            # Save the model to disk.
+            output_path = os.path.join(self.get_temp_dir(), "model.keras")
+            model.save(output_path, save_format="keras_v3")
+            loaded_model = keras.models.load_model(
+                output_path,
+                custom_objects={
+                    "HashedCrossing": hashed_crossing.HashedCrossing
+                },
+            )
 
-        # Validate correctness of the new model.
-        new_output_data = loaded_model((string_data, int_data))
-        self.assertAllClose(new_output_data, expected_output)
+            # Validate correctness of the new model.
+            new_output_data = loaded_model((string_data, int_data))
+            self.assertAllClose(new_output_data, expected_output)
 
 
 if __name__ == "__main__":
diff --git a/keras/layers/preprocessing/hashing_test.py b/keras/layers/preprocessing/hashing_test.py
index 76f20719f6ed..7bb20dc1eab8 100644
--- a/keras/layers/preprocessing/hashing_test.py
+++ b/keras/layers/preprocessing/hashing_test.py
@@ -414,6 +414,30 @@ def test_saved_model(self):
         new_output_data = loaded_model(input_data)
         self.assertAllClose(new_output_data, original_output_data)
 
+    @test_utils.run_v2_only
+    def test_save_keras_v3(self):
+        """Hashing output is unchanged by a `.keras` save/load round trip."""
+        words = np.array(["omar", "stringer", "marlo", "wire", "skywalker"])
+
+        # Functional model wrapping a single 100-bin Hashing layer.
+        string_input = keras.Input(shape=(None,), dtype=tf.string)
+        hashed = hashing.Hashing(num_bins=100)(string_input)
+        model = keras.Model(inputs=string_input, outputs=hashed)
+
+        expected = model(words)
+
+        # Round-trip through a `.keras` file in the test's temp dir.
+        saved_path = os.path.join(self.get_temp_dir(), "tf_keras_model.keras")
+        model.save(saved_path, save_format="keras_v3")
+        restored = keras.models.load_model(saved_path)
+
+        # The loaded model must be a distinct object from the saved one.
+        self.assertIsNot(model, restored)
+
+        # The restored model must produce the same output for the same input.
+        actual = restored(words)
+        self.assertAllClose(actual, expected)
+
     @parameterized.named_parameters(
         (
             "list_input",
diff --git a/keras/layers/preprocessing/index_lookup_test.py b/keras/layers/preprocessing/index_lookup_test.py
index 91a8fc8b771e..ca488eb4c54e 100644
--- a/keras/layers/preprocessing/index_lookup_test.py
+++ b/keras/layers/preprocessing/index_lookup_test.py
@@ -2211,6 +2211,7 @@ def test_vocabulary_persistence_across_saving(self):
             ]
         )
         expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
+        vocab_file = self._write_to_temp_file("temp", vocab_data)
 
         # Build and validate a golden model.
         input_data = keras.Input(shape=(None,), dtype=tf.string)
@@ -2220,32 +2221,57 @@ def test_vocabulary_persistence_across_saving(self):
             mask_token="",
             oov_token="[OOV]",
             vocabulary_dtype=tf.string,
+            vocabulary=vocab_file,
         )
-        layer.set_vocabulary(vocab_data)
         int_data = layer(input_data)
         model = keras.Model(inputs=input_data, outputs=int_data)
         output_dataset = model.predict(input_array)
         self.assertAllEqual(output_dataset, expected_output)
 
-        # Save the model to disk.
-        output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_model")
-        model.save(output_path, save_format="tf")
+        with self.subTest("keras_v3"):
+            # Save the model to disk.
+            output_path = os.path.join(
+                self.get_temp_dir(), "tf_keras_model.keras"
+            )
+            model.save(output_path, save_format="keras_v3")
 
-        # Delete the session and graph to ensure that the loaded model is
-        # generated from scratch.
-        keras.backend.clear_session()
+            loaded_model = keras.models.load_model(
+                output_path,
+                custom_objects={"IndexLookup": index_lookup.IndexLookup},
+            )
 
-        loaded_model = keras.models.load_model(
-            output_path,
-            custom_objects={"IndexLookup": index_lookup.IndexLookup},
-        )
+            # Ensure that the loaded model is unique
+            # (so that the save/load is real)
+            self.assertIsNot(model, loaded_model)
 
-        # Ensure that the loaded model is unique (so that the save/load is real)
-        self.assertIsNot(model, loaded_model)
+            # Validate correctness of the new model.
+            new_output_dataset = loaded_model.predict(input_array)
+            self.assertAllEqual(new_output_dataset, expected_output)
 
-        # Validate correctness of the new model.
-        new_output_dataset = loaded_model.predict(input_array)
-        self.assertAllEqual(new_output_dataset, expected_output)
+        with self.subTest("savedmodel"):
+            # Save the model to disk.
+            output_path = os.path.join(
+                self.get_temp_dir(), "tf_keras_saved_model"
+            )
+            model.save(output_path, save_format="tf")
+
+            # Delete the session and graph to ensure that the loaded model is
+            # generated from scratch.
+            keras.backend.clear_session()
+            tf.io.gfile.remove(vocab_file)
+
+            loaded_model = keras.models.load_model(
+                output_path,
+                custom_objects={"IndexLookup": index_lookup.IndexLookup},
+            )
+
+            # Ensure that the loaded model is unique
+            # (so that the save/load is real)
+            self.assertIsNot(model, loaded_model)
+
+            # Validate correctness of the new model.
+            new_output_dataset = loaded_model.predict(input_array)
+            self.assertAllEqual(new_output_dataset, expected_output)
 
     def test_vocabulary_persistence_file_across_cloning(self):
         vocab_data = ["earth", "wind", "and", "fire"]
@@ -2401,56 +2427,108 @@ def test_persistence_file_vocab_keras_save_keras_load(self):
         output_dataset = model.predict(input_array)
         self.assertAllEqual(output_dataset, expected_output)
 
-        # Save the model to disk.
-        output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_model")
-        model.save(output_path, save_format="tf")
+        with self.subTest("keras_v3"):
+            # Save the model to disk.
+            output_path = os.path.join(
+                self.get_temp_dir(), "tf_keras_model.keras"
+            )
+            model.save(output_path, save_format="keras_v3")
 
-        # Delete the session and graph to ensure that the loaded model is
-        # generated from scratch.
-        keras.backend.clear_session()
-        tf.io.gfile.remove(vocab_file)
+            loaded_model = keras.models.load_model(
+                output_path,
+                custom_objects={"IndexLookup": index_lookup.IndexLookup},
+            )
 
-        loaded_model = keras.models.load_model(
-            output_path,
-            custom_objects={"IndexLookup": index_lookup.IndexLookup},
-        )
+            # Ensure that the loaded model is unique
+            # (so that the save/load is real)
+            self.assertIsNot(model, loaded_model)
+
+            # Validate correctness of the new model.
+            new_output_dataset = loaded_model.predict(input_array)
+            self.assertAllEqual(new_output_dataset, expected_output)
+
+            # Try re-saving the layer. This simulates saving a layer
+            # contained in a hub Module.
+            input_data_2 = keras.Input(shape=(None,), dtype=tf.string)
+            output_2 = loaded_model(input_data_2)
+            model_2 = keras.Model(inputs=input_data_2, outputs=output_2)
+            new_output_dataset = model_2.predict(input_array)
+            self.assertAllEqual(new_output_dataset, expected_output)
+
+            # Save the model to disk.
+            output_path = os.path.join(
+                self.get_temp_dir(), "tf_keras_model_2.keras"
+            )
+            model_2.save(output_path, save_format="keras_v3")
 
-        # Ensure that the loaded model is unique (so that the save/load is real)
-        self.assertIsNot(model, loaded_model)
+            loaded_model = keras.models.load_model(
+                output_path,
+                custom_objects={"IndexLookup": index_lookup.IndexLookup},
+            )
 
-        # Validate correctness of the new model.
-        new_output_dataset = loaded_model.predict(input_array)
-        self.assertAllEqual(new_output_dataset, expected_output)
+            # Ensure that the loaded model is unique
+            # (so that the save/load is real)
+            self.assertIsNot(model, loaded_model)
 
-        # Try re-saving the layer. This simulates saving a layer contained at
-        # a hub Module.
-        input_data_2 = keras.Input(shape=(None,), dtype=tf.string)
-        output_2 = loaded_model(input_data_2)
-        model_2 = keras.Model(inputs=input_data_2, outputs=output_2)
-        new_output_dataset = model_2.predict(input_array)
-        self.assertAllEqual(new_output_dataset, expected_output)
+            # Validate correctness of the new model.
+            new_output_dataset = loaded_model.predict(input_array)
+            self.assertAllEqual(new_output_dataset, expected_output)
 
-        # Save the model to disk.
-        output_path = os.path.join(
-            self.get_temp_dir(), "tf_keras_saved_model_2"
-        )
-        model_2.save(output_path, save_format="tf")
+        with self.subTest("saved_model"):
+            # Save the model to disk.
+            output_path = os.path.join(
+                self.get_temp_dir(), "tf_keras_saved_model"
+            )
+            model.save(output_path, save_format="tf")
 
-        # Delete the session and graph to ensure that the loaded model is
-        # generated from scratch.
-        keras.backend.clear_session()
+            # Delete the session and graph to ensure that the loaded model is
+            # generated from scratch.
+            keras.backend.clear_session()
+            tf.io.gfile.remove(vocab_file)
 
-        loaded_model = keras.models.load_model(
-            output_path,
-            custom_objects={"IndexLookup": index_lookup.IndexLookup},
-        )
+            loaded_model = keras.models.load_model(
+                output_path,
+                custom_objects={"IndexLookup": index_lookup.IndexLookup},
+            )
 
-        # Ensure that the loaded model is unique (so that the save/load is real)
-        self.assertIsNot(model, loaded_model)
+            # Ensure that the loaded model is unique
+            # (so that the save/load is real)
+            self.assertIsNot(model, loaded_model)
+
+            # Validate correctness of the new model.
+            new_output_dataset = loaded_model.predict(input_array)
+            self.assertAllEqual(new_output_dataset, expected_output)
+
+            # Try re-saving the layer. This simulates saving a layer
+            # contained in a hub Module.
+            input_data_2 = keras.Input(shape=(None,), dtype=tf.string)
+            output_2 = loaded_model(input_data_2)
+            model_2 = keras.Model(inputs=input_data_2, outputs=output_2)
+            new_output_dataset = model_2.predict(input_array)
+            self.assertAllEqual(new_output_dataset, expected_output)
+
+            # Save the model to disk.
+            output_path = os.path.join(
+                self.get_temp_dir(), "tf_keras_saved_model_2"
+            )
+            model_2.save(output_path, save_format="tf")
 
-        # Validate correctness of the new model.
-        new_output_dataset = loaded_model.predict(input_array)
-        self.assertAllEqual(new_output_dataset, expected_output)
+            # Delete the session and graph to ensure that the loaded model is
+            # generated from scratch.
+            keras.backend.clear_session()
+
+            loaded_model = keras.models.load_model(
+                output_path,
+                custom_objects={"IndexLookup": index_lookup.IndexLookup},
+            )
+
+            # Ensure that the loaded model is unique
+            # (so that the save/load is real)
+            self.assertIsNot(model, loaded_model)
+
+            # Validate correctness of the new model.
+            new_output_dataset = loaded_model.predict(input_array)
+            self.assertAllEqual(new_output_dataset, expected_output)
 
     def test_persistence_file_vocab_keras_save_keras_load_tf_save_tf_load(self):
         vocab_data = ["earth", "wind", "and", "fire"]
diff --git a/keras/layers/preprocessing/integer_lookup_test.py b/keras/layers/preprocessing/integer_lookup_test.py
index a99075db4d60..4a06475880cb 100644
--- a/keras/layers/preprocessing/integer_lookup_test.py
+++ b/keras/layers/preprocessing/integer_lookup_test.py
@@ -630,27 +630,56 @@ def test_vocabulary_persistence_across_saving(self):
         output_dataset = model.predict(input_array)
         self.assertAllEqual(output_dataset, expected_output)
 
-        # Save the model to disk.
-        output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_model")
-        model.save(output_path, save_format="tf")
-
-        # Delete the session and graph to ensure that the loaded model is
-        # generated from scratch.
-        # TODO(b/149526183): Can't clear session when TF2 is disabled.
-        if tf.__internal__.tf2.enabled():
-            keras.backend.clear_session()
-
-        loaded_model = keras.models.load_model(
-            output_path,
-            custom_objects={"IntegerLookup": integer_lookup.IntegerLookup},
-        )
+        with self.subTest("keras_v3"):
+            if not tf.__internal__.tf2.enabled():
+                self.skipTest(
+                    "TF2 must be enabled to use the new `.keras` saving."
+                )
+
+            # Save the model to disk.
+            output_path = os.path.join(
+                self.get_temp_dir(), "tf_keras_model.keras"
+            )
+            model.save(output_path, save_format="keras_v3")
+
+            loaded_model = keras.models.load_model(
+                output_path,
+                custom_objects={"IntegerLookup": integer_lookup.IntegerLookup},
+            )
+
+            # Ensure that the loaded model is unique
+            # (so that the save/load is real)
+            self.assertIsNot(model, loaded_model)
+
+            # Validate correctness of the new model.
+            new_output_dataset = loaded_model.predict(input_array)
+            self.assertAllEqual(new_output_dataset, expected_output)
+
+        with self.subTest("savedmodel"):
+            # Save the model to disk.
+            output_path = os.path.join(
+                self.get_temp_dir(), "tf_keras_saved_model"
+            )
+            model.save(output_path, save_format="tf")
+
+            # Delete the session and graph to ensure that the loaded model is
+            # generated from scratch.
+            # TODO(b/149526183): Can't clear session when TF2 is disabled.
+            if tf.__internal__.tf2.enabled():
+                keras.backend.clear_session()
+
+            loaded_model = keras.models.load_model(
+                output_path,
+                custom_objects={"IntegerLookup": integer_lookup.IntegerLookup},
+            )
 
-        # Ensure that the loaded model is unique (so that the save/load is real)
-        self.assertIsNot(model, loaded_model)
+            # Ensure that the loaded model is unique
+            # (so that the save/load is real)
+            self.assertIsNot(model, loaded_model)
 
-        # Validate correctness of the new model.
-        new_output_dataset = loaded_model.predict(input_array)
-        self.assertAllEqual(new_output_dataset, expected_output)
+            # Validate correctness of the new model.
+            new_output_dataset = loaded_model.predict(input_array)
+            self.assertAllEqual(new_output_dataset, expected_output)
 
 
 if __name__ == "__main__":
diff --git a/keras/layers/preprocessing/normalization_test.py b/keras/layers/preprocessing/normalization_test.py
index c0ffdb26fa85..d948f34d38fa 100644
--- a/keras/layers/preprocessing/normalization_test.py
+++ b/keras/layers/preprocessing/normalization_test.py
@@ -392,7 +392,7 @@ def test_multiple_adapts(self):
         {"adapted": True},
         {"adapted": False},
     )
-    def test_saved_model_tf(self, adapted):
+    def test_saving_tf(self, adapted):
         input_data = [[0.0], [2.0], [0.0], [2.0]]
         expected_output = [[-1.0], [1.0], [-1.0], [1.0]]
 
@@ -422,10 +422,10 @@ def test_saved_model_tf(self, adapted):
         self.assertAllClose(new_output_data, expected_output)
 
     @parameterized.product(
-        save_format=["tf", "h5"],
+        save_format=["tf", "h5", "keras_v3"],
         adapt=[True, False],
     )
-    def test_saved_model_keras(self, save_format, adapt):
+    def test_saving_keras(self, save_format, adapt):
         input_data = [[0.0], [2.0], [0.0], [2.0]]
         expected_output = [[-1.0], [1.0], [-1.0], [1.0]]
 
@@ -443,7 +443,13 @@ def test_saved_model_keras(self, save_format, adapt):
         self.assertAllClose(output_data, expected_output)
 
         # Save the model to disk.
-        output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_model")
+        output_path = os.path.join(self.get_temp_dir(), "tf_keras_model")
+        if save_format == "keras_v3":
+            if not tf.__internal__.tf2.enabled():
+                self.skipTest(
+                    "TF2 must be enabled to use the new `.keras` saving."
+                )
+            output_path += ".keras"
         model.save(output_path, save_format=save_format)
         loaded_model = keras.models.load_model(
             output_path, custom_objects={"Normalization": cls}
@@ -457,10 +463,10 @@ def test_saved_model_keras(self, save_format, adapt):
         self.assertAllClose(new_output_data, expected_output)
 
     @parameterized.product(
-        save_format=["tf", "h5"],
+        save_format=["tf", "h5", "keras_v3"],
         adapt=[True, False],
     )
-    def test_saved_model_keras_invert(self, save_format, adapt):
+    def test_saving_keras_invert(self, save_format, adapt):
         expected_output = [[0.0], [2.0], [0.0], [2.0]]
         input_data = [[-1.0], [1.0], [-1.0], [1.0]]
 
@@ -478,9 +484,13 @@ def test_saved_model_keras_invert(self, save_format, adapt):
         self.assertAllClose(output_data, expected_output)
 
         # Save the model to disk.
-        output_path = os.path.join(
-            self.get_temp_dir(), "tf_keras_saved_model_invert"
-        )
+        output_path = os.path.join(self.get_temp_dir(), "tf_keras_model_invert")
+        if save_format == "keras_v3":
+            if not tf.__internal__.tf2.enabled():
+                self.skipTest(
+                    "TF2 must be enabled to use the new `.keras` saving."
+                )
+            output_path += ".keras"
         model.save(output_path, save_format=save_format)
         loaded_model = keras.models.load_model(
             output_path, custom_objects={"Normalization": cls}
diff --git a/keras/layers/regularization/dropout_test.py b/keras/layers/regularization/dropout_test.py
index bf53b4a44ad8..2239338b8af4 100644
--- a/keras/layers/regularization/dropout_test.py
+++ b/keras/layers/regularization/dropout_test.py
@@ -67,7 +67,7 @@ def test_dropout_with_zero_rate(self):
             rng_state_var, dropout._random_generator._generator._state_var
         )
 
-    def test_dropout_with_savemodel(self):
+    def test_dropout_with_saving(self):
         inputs = keras.Input(shape=(5, 10))
         layer = keras.layers.Dropout(0.5, force_generator=True)
         outputs = layer(inputs)
@@ -83,45 +83,68 @@ def test_dropout_with_savemodel(self):
         # Make sure the layer does dropout value when training
         self.assertNotAllClose(train, predict)
 
-        model.save(
-            os.path.join(self.get_temp_dir(), "savedmodel"), save_format="tf"
-        )
-        loaded_model = keras.models.load_model(
-            os.path.join(self.get_temp_dir(), "savedmodel")
-        )
-        predict2 = loaded_model(np.ones((20, 5, 10)))
-
-        self.assertAllClose(predict, predict2)
-        # Make sure the model dropout different value after loading
-        train2 = loaded_model(np.ones((20, 5, 10)), training=True)
-        self.assertNotAllClose(train, train2)
-        self.assertIsNotNone(loaded_model.layers[1]._random_generator)
-
-        # Also make sure the checkpoint doesn't contain any variable from the
-        # dropout layer, to keep the backward compatibility.
-        checkpoint = tf.train.Checkpoint(model)
-        save_path = checkpoint.save(
-            os.path.join(self.get_temp_dir(), "checkpoint")
-        )
-        checkpoint_var_names = [
-            name_value_tuple[0]
-            for name_value_tuple in tf.train.list_variables(save_path)
-        ]
-        for name in checkpoint_var_names:
-            self.assertNotIn("dropout", name)
-
-        # Make sure the checkpoint can be loaded
-        clone_model = keras.models.clone_model(model)
-        checkpoint = tf.train.Checkpoint(clone_model)
-        status = checkpoint.restore(
-            os.path.join(self.get_temp_dir(), "checkpoint-1")
-        )
-        self.assertTrue(status.assert_consumed())
-        self.assertTrue(status.assert_existing_objects_matched())
-        # Make sure the output is differnt from the original model, since
-        # the StateVar is not preserved.
-        train3 = clone_model(np.ones((20, 5, 10)), training=True)
-        self.assertNotAllClose(train3, train2)
+        with self.subTest("savedmodel"):
+            model.save(
+                os.path.join(self.get_temp_dir(), "savedmodel"),
+                save_format="tf",
+            )
+            loaded_model = keras.models.load_model(
+                os.path.join(self.get_temp_dir(), "savedmodel")
+            )
+            predict2 = loaded_model(np.ones((20, 5, 10)))
+
+            self.assertAllClose(predict, predict2)
+            # Make sure the model drops out different values after loading
+            train2 = loaded_model(np.ones((20, 5, 10)), training=True)
+            self.assertNotAllClose(train, train2)
+            self.assertIsNotNone(loaded_model.layers[1]._random_generator)
+
+        with self.subTest("keras_v3"):
+            if not tf.__internal__.tf2.enabled():
+                self.skipTest(
+                    "TF2 must be enabled to use the new `.keras` saving."
+                )
+            model.save(
+                os.path.join(self.get_temp_dir(), "model.keras"),
+                save_format="keras_v3",
+            )
+            loaded_model = keras.models.load_model(
+                os.path.join(self.get_temp_dir(), "model.keras")
+            )
+            predict2 = loaded_model(np.ones((20, 5, 10)))
+
+            self.assertAllClose(predict, predict2)
+            # Make sure the model drops out different values after loading
+            train2 = loaded_model(np.ones((20, 5, 10)), training=True)
+            self.assertNotAllClose(train, train2)
+            self.assertIsNotNone(loaded_model.layers[1]._random_generator)
+
+        with self.subTest("checkpoint"):
+            # Also make sure the checkpoint doesn't contain any variable from
+            # the dropout layer, to keep the backward compatibility.
+            checkpoint = tf.train.Checkpoint(model)
+            save_path = checkpoint.save(
+                os.path.join(self.get_temp_dir(), "checkpoint")
+            )
+            checkpoint_var_names = [
+                name_value_tuple[0]
+                for name_value_tuple in tf.train.list_variables(save_path)
+            ]
+            for name in checkpoint_var_names:
+                self.assertNotIn("dropout", name)
+
+            # Make sure the checkpoint can be loaded
+            clone_model = keras.models.clone_model(model)
+            checkpoint = tf.train.Checkpoint(clone_model)
+            status = checkpoint.restore(
+                os.path.join(self.get_temp_dir(), "checkpoint-1")
+            )
+            self.assertTrue(status.assert_consumed())
+            self.assertTrue(status.assert_existing_objects_matched())
+            # Make sure the output is different from the original model, since
+            # the StateVar is not preserved.
+            train3 = clone_model(np.ones((20, 5, 10)), training=True)
+            self.assertNotAllClose(train3, train2)
 
     @test_utils.run_v2_only
     def test_state_variable_name(self):
diff --git a/keras/layers/rnn/BUILD b/keras/layers/rnn/BUILD
index 69124a325d37..f0691dd2eecc 100644
--- a/keras/layers/rnn/BUILD
+++ b/keras/layers/rnn/BUILD
@@ -396,6 +396,9 @@ cuda_py_test(
     srcs = ["gru_lstm_test.py"],
     python_version = "PY3",
     shard_count = 2,
+    tags = [
+        "no_oss",  # TODO(b/277925387)
+    ],
     deps = [
         ":gru",
         ":lstm",
@@ -414,6 +417,9 @@ cuda_py_test(
     srcs = ["gru_test.py"],
     python_version = "PY3",
     shard_count = 12,
+    tags = [
+        "no_oss",  # TODO(b/277925387)
+    ],
     deps = [
         ":gru_lstm_utils",
         "//:expect_absl_installed",
diff --git a/keras/layers/serialization.py b/keras/layers/serialization.py
index fd0e6b0a6e58..e35761b5b273 100644
--- a/keras/layers/serialization.py
+++ b/keras/layers/serialization.py
@@ -50,6 +50,7 @@
 from keras.layers.rnn import cell_wrappers
 from keras.layers.rnn import gru
 from keras.layers.rnn import lstm
+from keras.metrics import base_metric
 from keras.saving import serialization_lib
 from keras.saving.legacy import serialization as legacy_serialization
 from keras.saving.legacy.saved_model import json_utils
@@ -208,6 +209,13 @@ def serialize(layer, use_legacy_format=False):
     pprint(tf.keras.layers.serialize(model))
     # prints the configuration of the model, as a dict.
     """
+    if isinstance(layer, base_metric.Metric):
+        raise ValueError(
+            f"Cannot serialize {layer} since it is a metric. "
+            "Please use the `keras.metrics.serialize()` and "
+            "`keras.metrics.deserialize()` APIs to serialize "
+            "and deserialize metrics."
+        )
     if use_legacy_format:
         return legacy_serialization.serialize_keras_object(layer)
 
diff --git a/keras/layers/serialization_test.py b/keras/layers/serialization_test.py
index c457ccd621e3..688466be0b74 100644
--- a/keras/layers/serialization_test.py
+++ b/keras/layers/serialization_test.py
@@ -24,6 +24,7 @@
 from keras.layers.rnn import gru_v1
 from keras.layers.rnn import lstm
 from keras.layers.rnn import lstm_v1
+from keras.metrics import Mean
 from keras.testing_infra import test_combinations
 
 
@@ -191,6 +192,11 @@ def test_serialize_deserialize_gru(self, layer):
             self.assertIsInstance(new_layer, gru_v1.GRU)
             self.assertNotIsInstance(new_layer, gru.GRU)
 
+    def test_serialize_metric_throws_error(self):
+        metric = Mean()
+        with self.assertRaisesRegex(ValueError, "since it is a metric."):
+            _ = keras.layers.serialize(metric)
+
 
 if __name__ == "__main__":
     tf.test.main()
diff --git a/keras/losses.py b/keras/losses.py
index 178cfb863bc2..21841e2f5e74 100644
--- a/keras/losses.py
+++ b/keras/losses.py
@@ -2854,6 +2854,12 @@ def serialize(loss, use_legacy_format=False):
     Returns:
       Loss configuration dictionary.
     """
+    if not isinstance(loss, Loss):
+        warnings.warn(
+            "The `keras.losses.serialize()` API should only be used for "
+            "objects of type `keras.losses.Loss`. Found an instance of type "
+            f"{type(loss)}, which may lead to improper serialization."
+        )
     if use_legacy_format:
         return legacy_serialization.serialize_keras_object(loss)
     return serialize_keras_object(loss)
diff --git a/keras/metrics/__init__.py b/keras/metrics/__init__.py
index 8943a7a4f7c0..373ac99492ba 100644
--- a/keras/metrics/__init__.py
+++ b/keras/metrics/__init__.py
@@ -15,6 +15,7 @@
 """All Keras metrics."""
 
 # isort: off
+import warnings
 from tensorflow.python.util.tf_export import keras_export
 
 # Base classes and utilities
@@ -138,6 +139,12 @@ def serialize(metric, use_legacy_format=False):
     Returns:
       Metric configuration dictionary.
     """
+    if not isinstance(metric, Metric):
+        warnings.warn(
+            "The `keras.metrics.serialize()` API should only be used for "
+            "objects of type `keras.metrics.Metric`. Found an instance of "
+            f"type {type(metric)}, which may lead to improper serialization."
+        )
     if use_legacy_format:
         return legacy_serialization.serialize_keras_object(metric)
     return serialize_keras_object(metric)
diff --git a/keras/metrics/accuracy_metrics.py b/keras/metrics/accuracy_metrics.py
index 17cb1849e015..98e130a8efc7 100644
--- a/keras/metrics/accuracy_metrics.py
+++ b/keras/metrics/accuracy_metrics.py
@@ -261,7 +261,7 @@ class TopKCategoricalAccuracy(base_metric.MeanMetricWrapper):
 
     Args:
       k: (Optional) Number of top elements to look at for computing accuracy.
-        Defaults to 5.
+        Defaults to `5`.
       name: (Optional) string name of the metric instance.
       dtype: (Optional) data type of the metric result.
 
@@ -307,7 +307,7 @@ class SparseTopKCategoricalAccuracy(base_metric.MeanMetricWrapper):
 
     Args:
       k: (Optional) Number of top elements to look at for computing accuracy.
-        Defaults to 5.
+        Defaults to `5`.
       name: (Optional) string name of the metric instance.
       dtype: (Optional) data type of the metric result.
 
@@ -482,7 +482,7 @@ def top_k_categorical_accuracy(y_true, y_pred, k=5):
       y_true: The ground truth values.
       y_pred: The prediction values.
       k: (Optional) Number of top elements to look at for computing accuracy.
-        Defaults to 5.
+        Defaults to `5`.
 
     Returns:
       Top K categorical accuracy value.
@@ -514,7 +514,7 @@ def sparse_top_k_categorical_accuracy(y_true, y_pred, k=5):
       y_true: tensor of true targets.
       y_pred: tensor of predicted targets.
       k: (Optional) Number of top elements to look at for computing accuracy.
-        Defaults to 5.
+        Defaults to `5`.
 
     Returns:
       Sparse top K categorical accuracy value.
diff --git a/keras/metrics/confusion_metrics.py b/keras/metrics/confusion_metrics.py
index 6a1af4ea22fa..e5651156c1be 100644
--- a/keras/metrics/confusion_metrics.py
+++ b/keras/metrics/confusion_metrics.py
@@ -36,11 +36,11 @@ class _ConfusionMatrixConditionCount(base_metric.Metric):
 
     Args:
       confusion_matrix_cond: One of `metrics_utils.ConfusionMatrix` conditions.
-      thresholds: (Optional) Defaults to 0.5. A float value or a python
-        list/tuple of float threshold values in [0, 1]. A threshold is compared
-        with prediction values to determine the truth value of predictions
+      thresholds: (Optional) A float value or a python list/tuple of float
+        threshold values in [0, 1]. A threshold is compared with prediction
+        values to determine the truth value of predictions
         (i.e., above the threshold is `true`, below is `false`). One metric
-        value is generated for each threshold value.
+        value is generated for each threshold value. Defaults to `0.5`.
       name: (Optional) string name of the metric instance.
       dtype: (Optional) data type of the metric result.
     """
@@ -67,9 +67,9 @@ def update_state(self, y_true, y_pred, sample_weight=None):
         Args:
           y_true: The ground truth values.
           y_pred: The predicted values.
-          sample_weight: Optional weighting of each example. Defaults to 1. Can
+          sample_weight: Optional weighting of each example. Can
             be a `Tensor` whose rank is either 0, or the same rank as `y_true`,
-            and must be broadcastable to `y_true`.
+            and must be broadcastable to `y_true`. Defaults to `1`.
 
         Returns:
           Update op.
@@ -113,13 +113,13 @@ class FalsePositives(_ConfusionMatrixConditionCount):
     Use `sample_weight` of 0 to mask values.
 
     Args:
-      thresholds: (Optional) Defaults to 0.5. A float value, or a Python
+      thresholds: (Optional) A float value, or a Python
         list/tuple of float threshold values in [0, 1]. A threshold is compared
         with prediction values to determine the truth value of predictions
         (i.e., above the threshold is `true`, below is `false`). If used with a
         loss function that sets `from_logits=True` (i.e. no sigmoid applied to
         predictions), `thresholds` should be set to 0. One metric value is
-        generated for each threshold value.
+        generated for each threshold value. Defaults to `0.5`.
       name: (Optional) string name of the metric instance.
       dtype: (Optional) data type of the metric result.
 
@@ -174,13 +174,13 @@ class FalseNegatives(_ConfusionMatrixConditionCount):
     Use `sample_weight` of 0 to mask values.
 
     Args:
-      thresholds: (Optional) Defaults to 0.5. A float value, or a Python
+      thresholds: (Optional) A float value, or a Python
         list/tuple of float threshold values in [0, 1]. A threshold is compared
         with prediction values to determine the truth value of predictions
         (i.e., above the threshold is `true`, below is `false`). If used with a
         loss function that sets `from_logits=True` (i.e. no sigmoid applied to
         predictions), `thresholds` should be set to 0. One metric value is
-        generated for each threshold value.
+        generated for each threshold value. Defaults to `0.5`.
       name: (Optional) string name of the metric instance.
       dtype: (Optional) data type of the metric result.
 
@@ -235,13 +235,13 @@ class TrueNegatives(_ConfusionMatrixConditionCount):
     Use `sample_weight` of 0 to mask values.
 
     Args:
-      thresholds: (Optional) Defaults to 0.5. A float value, or a Python
+      thresholds: (Optional) A float value, or a Python
         list/tuple of float threshold values in [0, 1]. A threshold is compared
         with prediction values to determine the truth value of predictions
         (i.e., above the threshold is `true`, below is `false`). If used with a
         loss function that sets `from_logits=True` (i.e. no sigmoid applied to
         predictions), `thresholds` should be set to 0. One metric value is
-        generated for each threshold value.
+        generated for each threshold value. Defaults to `0.5`.
       name: (Optional) string name of the metric instance.
       dtype: (Optional) data type of the metric result.
 
@@ -296,13 +296,13 @@ class TruePositives(_ConfusionMatrixConditionCount):
     Use `sample_weight` of 0 to mask values.
 
     Args:
-      thresholds: (Optional) Defaults to 0.5. A float value, or a Python
+      thresholds: (Optional) A float value, or a Python
         list/tuple of float threshold values in [0, 1]. A threshold is compared
         with prediction values to determine the truth value of predictions
         (i.e., above the threshold is `true`, below is `false`). If used with a
         loss function that sets `from_logits=True` (i.e. no sigmoid applied to
         predictions), `thresholds` should be set to 0. One metric value is
-        generated for each threshold value.
+        generated for each threshold value. Defaults to `0.5`.
       name: (Optional) string name of the metric instance.
       dtype: (Optional) data type of the metric result.
 
@@ -460,9 +460,9 @@ def update_state(self, y_true, y_pred, sample_weight=None):
             Will be cast to `bool`.
           y_pred: The predicted values. Each element must be in the range
             `[0, 1]`.
-          sample_weight: Optional weighting of each example. Defaults to 1. Can
+          sample_weight: Optional weighting of each example. Can
             be a `Tensor` whose rank is either 0, or the same rank as `y_true`,
-            and must be broadcastable to `y_true`.
+            and must be broadcastable to `y_true`. Defaults to `1`.
 
         Returns:
           Update op.
@@ -606,9 +606,9 @@ def update_state(self, y_true, y_pred, sample_weight=None):
             Will be cast to `bool`.
           y_pred: The predicted values. Each element must be in the range
             `[0, 1]`.
-          sample_weight: Optional weighting of each example. Defaults to 1. Can
+          sample_weight: Optional weighting of each example. Can
             be a `Tensor` whose rank is either 0, or the same rank as `y_true`,
-            and must be broadcastable to `y_true`.
+            and must be broadcastable to `y_true`. Defaults to `1`.
 
         Returns:
           Update op.
@@ -702,9 +702,9 @@ def update_state(self, y_true, y_pred, sample_weight=None):
         Args:
           y_true: The ground truth values.
           y_pred: The predicted values.
-          sample_weight: Optional weighting of each example. Defaults to 1. Can
+          sample_weight: Optional weighting of each example. Can
             be a `Tensor` whose rank is either 0, or the same rank as `y_true`,
-            and must be broadcastable to `y_true`.
+            and must be broadcastable to `y_true`. Defaults to `1`.
 
         Returns:
           Update op.
@@ -798,8 +798,8 @@ class SensitivityAtSpecificity(SensitivitySpecificityBase):
 
     Args:
       specificity: A scalar value in range `[0, 1]`.
-      num_thresholds: (Optional) Defaults to 200. The number of thresholds to
-        use for matching the given specificity.
+      num_thresholds: (Optional) The number of thresholds to
+        use for matching the given specificity. Defaults to `200`.
       class_id: (Optional) Integer class ID for which we want binary metrics.
         This must be in the half-open interval `[0, num_classes)`, where
         `num_classes` is the last dimension of predictions.
@@ -903,8 +903,8 @@ class SpecificityAtSensitivity(SensitivitySpecificityBase):
 
     Args:
       sensitivity: A scalar value in range `[0, 1]`.
-      num_thresholds: (Optional) Defaults to 200. The number of thresholds to
-        use for matching the given sensitivity.
+      num_thresholds: (Optional) The number of thresholds to
+        use for matching the given sensitivity. Defaults to `200`.
       class_id: (Optional) Integer class ID for which we want binary metrics.
         This must be in the half-open interval `[0, num_classes)`, where
         `num_classes` is the last dimension of predictions.
@@ -999,8 +999,8 @@ class PrecisionAtRecall(SensitivitySpecificityBase):
 
     Args:
       recall: A scalar value in range `[0, 1]`.
-      num_thresholds: (Optional) Defaults to 200. The number of thresholds to
-        use for matching the given recall.
+      num_thresholds: (Optional) The number of thresholds to
+        use for matching the given recall. Defaults to `200`.
       class_id: (Optional) Integer class ID for which we want binary metrics.
         This must be in the half-open interval `[0, num_classes)`, where
         `num_classes` is the last dimension of predictions.
@@ -1090,8 +1090,8 @@ class RecallAtPrecision(SensitivitySpecificityBase):
 
     Args:
       precision: A scalar value in range `[0, 1]`.
-      num_thresholds: (Optional) Defaults to 200. The number of thresholds to
-        use for matching the given precision.
+      num_thresholds: (Optional) The number of thresholds to
+        use for matching the given precision. Defaults to `200`.
       class_id: (Optional) Integer class ID for which we want binary metrics.
         This must be in the half-open interval `[0, num_classes)`, where
         `num_classes` is the last dimension of predictions.
@@ -1209,8 +1209,9 @@ class AUC(base_metric.Metric):
     Use `sample_weight` of 0 to mask values.
 
     Args:
-      num_thresholds: (Optional) Defaults to 200. The number of thresholds to
+      num_thresholds: (Optional) The number of thresholds to
         use when discretizing the roc curve. Values must be > 1.
+        Defaults to `200`.
       curve: (Optional) Specifies the name of the curve to be computed, 'ROC'
         [default] or 'PR' for the Precision-Recall-curve.
       summation_method: (Optional) Specifies the [Riemann summation method](
@@ -1400,9 +1401,9 @@ def _build(self, shape):
         if self.multi_label:
             if shape.ndims != 2:
                 raise ValueError(
-                    "`y_true` must have rank 2 when `multi_label=True`. "
+                    "`y_pred` must have rank 2 when `multi_label=True`. "
                     f"Found rank {shape.ndims}. "
-                    f"Full shape received for `y_true`: {shape}"
+                    f"Full shape received for `y_pred`: {shape}"
                 )
             self._num_labels = shape[1]
             variable_shape = tf.TensorShape(
@@ -1442,9 +1443,9 @@ def update_state(self, y_true, y_pred, sample_weight=None):
         Args:
           y_true: The ground truth values.
           y_pred: The predicted values.
-          sample_weight: Optional weighting of each example. Defaults to 1. Can
+          sample_weight: Optional weighting of each example. Can
             be a `Tensor` whose rank is either 0, or the same rank as `y_true`,
-            and must be broadcastable to `y_true`.
+            and must be broadcastable to `y_true`. Defaults to `1`.
 
         Returns:
           Update op.
diff --git a/keras/metrics/iou_metrics.py b/keras/metrics/iou_metrics.py
index 83aac5b94a18..377ef8858f96 100644
--- a/keras/metrics/iou_metrics.py
+++ b/keras/metrics/iou_metrics.py
@@ -67,7 +67,8 @@ class _IoUBase(base_metric.Metric):
       sparse_y_pred: Whether predictions are encoded using integers or
         dense floating point vectors. If `False`, the `tf.argmax` function
         will be used to determine each sample's most likely associated label.
-      axis: (Optional) Defaults to -1. The dimension containing the logits.
+      axis: (Optional) The dimension containing the logits.
+        Defaults to `-1`.
     """
 
     def __init__(
@@ -100,9 +101,9 @@ def update_state(self, y_true, y_pred, sample_weight=None):
         Args:
           y_true: The ground truth values.
           y_pred: The predicted values.
-          sample_weight: Optional weighting of each example. Defaults to 1. Can
+          sample_weight: Optional weighting of each example. Can
             be a `Tensor` whose rank is either 0, or the same rank as `y_true`,
-            and must be broadcastable to `y_true`.
+            and must be broadcastable to `y_true`. Defaults to `1`.
 
         Returns:
           Update op.
@@ -197,7 +198,8 @@ class IoU(_IoUBase):
       sparse_y_pred: Whether predictions are encoded using integers or
         dense floating point vectors. If `False`, the `tf.argmax` function
         will be used to determine each sample's most likely associated label.
-      axis: (Optional) Defaults to -1. The dimension containing the logits.
+      axis: (Optional) The dimension containing the logits.
+        Defaults to `-1`.
 
     Standalone usage:
 
@@ -405,9 +407,9 @@ def update_state(self, y_true, y_pred, sample_weight=None):
         Args:
           y_true: The ground truth values.
           y_pred: The predicted values.
-          sample_weight: Optional weighting of each example. Defaults to 1. Can
+          sample_weight: Optional weighting of each example. Can
             be a `Tensor` whose rank is either 0, or the same rank as `y_true`,
-            and must be broadcastable to `y_true`.
+            and must be broadcastable to `y_true`. Defaults to `1`.
 
         Returns:
           Update op.
@@ -465,7 +467,7 @@ class MeanIoU(IoU):
       sparse_y_pred: Whether predictions are encoded using integers or
         dense floating point vectors. If `False`, the `tf.argmax` function
         will be used to determine each sample's most likely associated label.
-      axis: (Optional) Defaults to -1. The dimension containing the logits.
+      axis: (Optional) The dimension containing the logits. Defaults to `-1`.
 
     Standalone usage:
 
@@ -581,7 +583,7 @@ class OneHotIoU(IoU):
       sparse_y_pred: Whether predictions are encoded using natural numbers or
         probability distribution vectors. If `False`, the `tf.argmax` function
         will be used to determine each sample's most likely associated label.
-      axis: (Optional) Defaults to -1. The dimension containing the logits.
+      axis: (Optional) The dimension containing the logits. Defaults to `-1`.
 
     Standalone usage:
 
@@ -695,7 +697,7 @@ class apply.
       sparse_y_pred: Whether predictions are encoded using natural numbers or
         probability distribution vectors. If `False`, the `tf.argmax` function
         will be used to determine each sample's most likely associated label.
-      axis: (Optional) Defaults to -1. The dimension containing the logits.
+      axis: (Optional) The dimension containing the logits. Defaults to `-1`.
 
     Standalone usage:
 
diff --git a/keras/metrics/probabilistic_metrics.py b/keras/metrics/probabilistic_metrics.py
index ce4eb419ec20..bf5b0aa01e12 100644
--- a/keras/metrics/probabilistic_metrics.py
+++ b/keras/metrics/probabilistic_metrics.py
@@ -183,8 +183,8 @@ class CategoricalCrossentropy(base_metric.MeanMetricWrapper):
         smoothed, meaning the confidence on label values are relaxed. e.g.
         `label_smoothing=0.2` means that we will use a value of `0.1` for label
         `0` and `0.9` for label `1`"
-      axis: (Optional) Defaults to -1. The dimension along which entropy is
-        computed.
+      axis: (Optional) The dimension along which entropy is
+        computed. Defaults to `-1`.
 
     Standalone usage:
 
@@ -261,8 +261,8 @@ class SparseCategoricalCrossentropy(base_metric.MeanMetricWrapper):
         metric computation. This is useful, for example, in segmentation
         problems featuring a "void" class (commonly -1 or 255) in segmentation
         maps. By default (`ignore_class=None`), all classes are considered.
-      axis: (Optional) Defaults to -1. The dimension along which entropy is
-        computed.
+      axis: (Optional) The dimension along which entropy is
+        computed. Defaults to `-1`.
 
     Standalone usage:
 
diff --git a/keras/metrics/regression_metrics.py b/keras/metrics/regression_metrics.py
index 637706432d54..4e2528ca5cfc 100644
--- a/keras/metrics/regression_metrics.py
+++ b/keras/metrics/regression_metrics.py
@@ -84,9 +84,9 @@ def update_state(self, y_true, y_pred, sample_weight=None):
         Args:
           y_true: The ground truth values.
           y_pred: The predicted values.
-          sample_weight: Optional weighting of each example. Defaults to 1. Can
+          sample_weight: Optional weighting of each example. Can
             be a `Tensor` whose rank is either 0, or the same rank as `y_true`,
-            and must be broadcastable to `y_true`.
+            and must be broadcastable to `y_true`. Defaults to `1`.
 
         Returns:
           Update op.
@@ -138,8 +138,8 @@ class CosineSimilarity(base_metric.MeanMetricWrapper):
     Args:
       name: (Optional) string name of the metric instance.
       dtype: (Optional) data type of the metric result.
-      axis: (Optional) Defaults to -1. The dimension along which the cosine
-        similarity is computed.
+      axis: (Optional) The dimension along which the cosine
+        similarity is computed. Defaults to `-1`.
 
     Standalone usage:
 
@@ -357,9 +357,9 @@ def update_state(self, y_true, y_pred, sample_weight=None):
         Args:
           y_true: The ground truth values.
           y_pred: The predicted values.
-          sample_weight: Optional weighting of each example. Defaults to 1. Can
+          sample_weight: Optional weighting of each example. Can
             be a `Tensor` whose rank is either 0, or the same rank as `y_true`,
-            and must be broadcastable to `y_true`.
+            and must be broadcastable to `y_true`. Defaults to `1`.
 
         Returns:
           Update op.
@@ -443,7 +443,8 @@ class R2Score(base_metric.Metric):
             `None` (no aggregation), `"uniform_average"`,
             `"variance_weighted_average"`.
         num_regressors: Number of independent regressors used
-            ("Adjusted R2" score). Defaults to 0 (standard R2 score).
+            ("Adjusted R2" score). 0 is the standard R2 score.
+            Defaults to `0`.
         name: Optional. string name of the metric instance.
         dtype: Optional. data type of the metric result.
 
@@ -614,8 +615,8 @@ def cosine_similarity(y_true, y_pred, axis=-1):
     Args:
       y_true: The ground truth values.
       y_pred: The prediction values.
-      axis: (Optional) Defaults to -1. The dimension along which the cosine
-        similarity is computed.
+      axis: (Optional) The dimension along which the cosine
+        similarity is computed. Defaults to `-1`.
 
     Returns:
       Cosine similarity value.
diff --git a/keras/mixed_precision/loss_scale_optimizer.py b/keras/mixed_precision/loss_scale_optimizer.py
index ab7105c816ec..e563ca264631 100644
--- a/keras/mixed_precision/loss_scale_optimizer.py
+++ b/keras/mixed_precision/loss_scale_optimizer.py
@@ -1264,6 +1264,12 @@ def compute_gradients(self, loss, var_list, tape=None):
     def apply_gradients(
         self, grads_and_vars, skip_gradients_aggregation=False, **kwargs
     ):
+        grads_and_vars = list(grads_and_vars)
+        grads, trainable_variables = zip(*grads_and_vars)
+        with tf.init_scope():
+            # Lift variable creation to init scope to avoid environment
+            # issues.
+            self.build(trainable_variables)
         if tf.distribute.in_cross_replica_context():
             raise ValueError(
                 "apply_gradients() must be called in a replica context."
diff --git a/keras/models/sharpness_aware_minimization_test.py b/keras/models/sharpness_aware_minimization_test.py
index 34eb06dc0baf..7571f179b5b0 100644
--- a/keras/models/sharpness_aware_minimization_test.py
+++ b/keras/models/sharpness_aware_minimization_test.py
@@ -109,12 +109,21 @@ def test_save_sam(self):
 
         sam_model.fit(data, label)
 
-        path = os.path.join(self.get_temp_dir(), "model")
-        sam_model.save(path)
-        loaded_sam_model = keras.models.load_model(path)
-        loaded_sam_model.load_weights(path)
+        with self.subTest("savedmodel"):
+            path = os.path.join(self.get_temp_dir(), "model")
+            sam_model.save(path)
+            loaded_sam_model = keras.models.load_model(path)
+            loaded_sam_model.load_weights(path)
 
-        self.assertAllClose(sam_model(data), loaded_sam_model(data))
+            self.assertAllClose(sam_model(data), loaded_sam_model(data))
+
+        with self.subTest("keras_v3"):
+            path = os.path.join(self.get_temp_dir(), "model.keras")
+            sam_model.save(path)
+            loaded_sam_model = keras.models.load_model(path)
+            loaded_sam_model.load_weights(path)
+
+            self.assertAllClose(sam_model(data), loaded_sam_model(data))
 
     def test_checkpoint_sam(self):
         model = keras.Sequential(
diff --git a/keras/optimizers/__init__.py b/keras/optimizers/__init__.py
index 8a90757ff3ea..0a8e137c1a88 100644
--- a/keras/optimizers/__init__.py
+++ b/keras/optimizers/__init__.py
@@ -20,6 +20,7 @@
 # Imports needed for deserialization.
 
 import platform
+import warnings
 
 import tensorflow.compat.v2 as tf
 from absl import logging
@@ -86,6 +87,20 @@ def serialize(optimizer, use_legacy_format=False):
     Returns:
       Python dict which contains the configuration of the input optimizer.
     """
+    if not isinstance(
+        optimizer,
+        (
+            base_optimizer.Optimizer,
+            Optimizer,
+            base_optimizer_legacy.OptimizerV2,
+        ),
+    ):
+        warnings.warn(
+            "The `keras.optimizers.serialize()` API should only be used for "
+            "objects of type `keras.optimizers.Optimizer`. Found an instance "
+            f"of type {type(optimizer)}, which may lead to improper "
+            "serialization."
+        )
     if use_legacy_format:
         return legacy_serialization.serialize_keras_object(optimizer)
     return serialize_keras_object(optimizer)
diff --git a/keras/optimizers/optimizer.py b/keras/optimizers/optimizer.py
index 4c5b0b2b9d45..a9b758e1f642 100644
--- a/keras/optimizers/optimizer.py
+++ b/keras/optimizers/optimizer.py
@@ -685,31 +685,21 @@ def _internal_apply_gradients(self, grads_and_vars):
     def _update_model_variables_moving_average(self, var_list):
         """Update the stored moving average using the latest value."""
         if self.use_ema:
-            for var, average in zip(
-                var_list, self._model_variables_moving_average
-            ):
+            for var in var_list:
+                average = self._model_variables_moving_average[
+                    self._index_dict[self._var_key(var)]
+                ]
                 average.assign(
                     self.ema_momentum * average + (1 - self.ema_momentum) * var
                 )
 
     def _overwrite_model_variables_with_average_value(self, var_list):
         """Overwrite model variables with its moving average."""
-        if len(var_list) != len(self._model_variables_moving_average):
-            raise ValueError(
-                f"The length of model variables ({len(var_list)}) to "
-                "override does not match the length of model variables "
-                "stored in the optimizer "
-                f"({len(self._model_variables_moving_average)}). Please "
-                "check if the optimizer was called on your model."
-            )
-        self._overwrite_model_variables_with_average_value_helper(var_list)
-
-    def _overwrite_model_variables_with_average_value_helper(self, var_list):
-        """Helper function that overwrites model variables."""
-        for var, average_var in zip(
-            var_list, self._model_variables_moving_average
-        ):
-            var.assign(average_var)
+        for var in var_list:
+            average = self._model_variables_moving_average[
+                self._index_dict[self._var_key(var)]
+            ]
+            var.assign(average)
 
     def finalize_variable_values(self, var_list):
         """Set the final value of model's trainable variables.
@@ -1263,8 +1253,8 @@ def _internal_apply_gradients(self, grads_and_vars):
             grads_and_vars,
         )
 
-    def _overwrite_model_variables_with_average_value_helper(self, var_list):
-        """Helper function to _overwrite_model_variables_with_average_value.
+    def _overwrite_model_variables_with_average_value(self, var_list):
+        """Overwrite model variables with their moving average values.
 
         This function overwrites variables on each device.
         Args:
@@ -1272,17 +1262,16 @@ def _overwrite_model_variables_with_average_value_helper(self, var_list):
         """
         if self._mesh or self._run_with_dtensor:
             # Skip any usage of strategy logic for DTensor
-            super()._overwrite_model_variables_with_average_value_helper(
-                var_list
-            )
+            super()._overwrite_model_variables_with_average_value(var_list)
 
         strategy = self._distribution_strategy
         # Override model variable by the stored average value on all devices.
-        for var, average_var in zip(
-            var_list, self._model_variables_moving_average
-        ):
+        for var in var_list:
+            average = self._model_variables_moving_average[
+                self._index_dict[self._var_key(var)]
+            ]
             strategy.extended.update(
-                var, lambda a, b: a.assign(b), args=(average_var,)
+                var, lambda a, b: a.assign(b), args=(average,)
             )
 
     def _build_learning_rate(self, learning_rate):
@@ -1330,9 +1319,10 @@ def update_average(average, var):
                     self.ema_momentum * average + (1 - self.ema_momentum) * var
                 )
 
-            for var, average in zip(
-                var_list, self._model_variables_moving_average
-            ):
+            for var in var_list:
+                average = self._model_variables_moving_average[
+                    self._index_dict[self._var_key(var)]
+                ]
                 self._distribution_strategy.extended.update(
                     average, update_average, args=(var,), group=False
                 )
diff --git a/keras/optimizers/optimizer_test.py b/keras/optimizers/optimizer_test.py
index 7e47b4a4793e..f501038a2cd1 100644
--- a/keras/optimizers/optimizer_test.py
+++ b/keras/optimizers/optimizer_test.py
@@ -337,22 +337,33 @@ def testMovingAverageOptimizer(self):
             ema_overwrite_frequency=3,
         )
 
-        var1, var2 = tf.Variable(2.0), tf.Variable(2.0)
+        # `var2` does not produce gradients.
+        var1, var2, var3 = tf.Variable(2.0), tf.Variable(2.0), tf.Variable(2.0)
         with tf.GradientTape() as tape:
-            loss = var1 + var2
-        grads = tape.gradient(loss, [var1, var2])
-        # First iteration: [var1, var2] = [1.0, 1.0]
-        optimizer.apply_gradients(zip(grads, [var1, var2]))
-        self.assertAllEqual([var1.numpy(), var2.numpy()], [1.0, 1.0])
+            loss = var1 + var3
+        grads = tape.gradient(loss, [var1, var2, var3])
+        # First iteration: [var1, var2, var3] = [1.0, 2.0, 1.0]
+        optimizer.apply_gradients(zip(grads, [var1, var2, var3]))
+        self.assertAllEqual(
+            [var1.numpy(), var2.numpy(), var3.numpy()],
+            [1.0, 2.0, 1.0],
+        )
 
-        # Second iteration: [var1, var2] = [0.0, 0.0]
-        optimizer.apply_gradients(zip(grads, [var1, var2]))
-        self.assertAllEqual([var1.numpy(), var2.numpy()], [0.0, 0.0])
+        # Second iteration: [var1, var2, var3] = [0.0, 2.0, 0.0]
+        optimizer.apply_gradients(zip(grads, [var1, var2, var3]))
+        self.assertAllEqual(
+            [var1.numpy(), var2.numpy(), var3.numpy()],
+            [0.0, 2.0, 0.0],
+        )
 
-        # Third iteration, without EMA, we should see [var1, var2] = [-1.0,
-        # -1.0], but overwriting results in [var1, var2] = [-0.125, -0.125].
-        optimizer.apply_gradients(zip(grads, [var1, var2]))
-        self.assertAllEqual([var1.numpy(), var2.numpy()], [-0.125, -0.125])
+        # Third iteration, without EMA, we should see [var1, var2, var3] =
+        # [-1.0, 2.0, -1.0], but overwriting results in [var1, var2, var3] =
+        # [-0.125, 2.0, -0.125].
+        optimizer.apply_gradients(zip(grads, [var1, var2, var3]))
+        self.assertAllEqual(
+            [var1.numpy(), var2.numpy(), var3.numpy()],
+            [-0.125, 2.0, -0.125],
+        )
 
     def testGetAndFromConfig(self):
         class CustomLRSchedule(learning_rate_schedule.LearningRateSchedule):
@@ -527,6 +538,17 @@ def testSaveAndLoadOptimizerWithModel(self, optimizer_fn):
         loaded_optimizer.build(loaded_model.trainable_variables)
         self.assertAllClose(optimizer.variables, loaded_optimizer.variables)
 
+        # Save in `.keras` format.
+        path = os.path.join(self.get_temp_dir(), "model.keras")
+        model.save(path)
+        loaded_model = keras.models.load_model(path)
+        loaded_model.load_weights(path)
+        loaded_optimizer = loaded_model.optimizer
+        self.assertEqual(type(optimizer), type(loaded_optimizer))
+        self.assertEqual(loaded_optimizer.learning_rate, 0.002)
+        self.assertEqual(loaded_optimizer.clipnorm, 0.1)
+        self.assertAllClose(optimizer.variables, loaded_optimizer.variables)
+
     @parameterized.product(optimizer_fn=OPTIMIZER_FN)
     def testSparseGradientsWorkAsExpected(self, optimizer_fn):
         optimizer_1 = optimizer_fn()
diff --git a/keras/regularizers.py b/keras/regularizers.py
index f50fc0a6c8bf..f1161976e6eb 100644
--- a/keras/regularizers.py
+++ b/keras/regularizers.py
@@ -16,6 +16,7 @@
 
 
 import math
+import warnings
 
 import tensorflow.compat.v2 as tf
 
@@ -419,6 +420,13 @@ def l1_l2(l1=0.01, l2=0.01):
 
 @keras_export("keras.regularizers.serialize")
 def serialize(regularizer, use_legacy_format=False):
+    if not isinstance(regularizer, Regularizer):
+        warnings.warn(
+            "The `keras.regularizers.serialize()` API should only be used for "
+            "objects of type `keras.regularizers.Regularizer`. Found an "
+            f"instance of type {type(regularizer)}, which may lead to improper "
+            "serialization."
+        )
     if use_legacy_format:
         return legacy_serialization.serialize_keras_object(regularizer)
     return serialize_keras_object(regularizer)
diff --git a/keras/saving/legacy/saved_model/json_utils.py b/keras/saving/legacy/saved_model/json_utils.py
index 6d133bb1c41f..05b0e285be75 100644
--- a/keras/saving/legacy/saved_model/json_utils.py
+++ b/keras/saving/legacy/saved_model/json_utils.py
@@ -95,8 +95,8 @@ def _decode_helper(
 
     Args:
       obj: A decoded dictionary that may represent an object.
-      deserialize: Boolean, defaults to False. When True, deserializes any Keras
-        objects found in `obj`.
+      deserialize: Boolean. When True, deserializes any Keras
+        objects found in `obj`. Defaults to `False`.
       module_objects: A dictionary of built-in objects to look the name up in.
         Generally, `module_objects` is provided by midlevel library
         implementers.
diff --git a/keras/saving/legacy/saved_model/save.py b/keras/saving/legacy/saved_model/save.py
index 601f4c089ab4..9126275cf3b3 100644
--- a/keras/saving/legacy/saved_model/save.py
+++ b/keras/saving/legacy/saved_model/save.py
@@ -64,9 +64,9 @@ def save(
       save_traces: (only applies to SavedModel format) When enabled, the
         SavedModel will store the function traces for each layer. This
         can be disabled, so that only the configs of each layer are stored.
-        Defaults to `True`. Disabling this will decrease serialization time
-        and reduce file size, but it requires that all custom layers/models
-        implement a `get_config()` method.
+        Disabling this will decrease serialization time and file size, but
+        it requires that all custom layers/models implement a
+        `get_config()` method. Defaults to `True`.
 
     Raises:
       ValueError: if the model's inputs have not been defined.
diff --git a/keras/saving/saving_lib.py b/keras/saving/saving_lib.py
index 3b279d8d4d2f..6b98946f4229 100644
--- a/keras/saving/saving_lib.py
+++ b/keras/saving/saving_lib.py
@@ -42,6 +42,10 @@
 except ImportError:
     h5py = None
 
+keras_saving_gauge = tf.__internal__.monitoring.BoolGauge(
+    "/tensorflow/api/keras/saving", "keras saving usage", "method"
+)
+
 # isort: off
 
 _CONFIG_FILENAME = "config.json"
@@ -127,6 +131,10 @@ def save_model(model, filepath, weights_format="h5"):
     container (list, tuple, or dict), and the container is referenced via a
     layer attribute.
     """
+
+    # API usage tracking for Keras V3 saving
+    keras_saving_gauge.get_cell("save_model_v3").set(True)
+
     filepath = str(filepath)
     if not filepath.endswith(".keras"):
         raise ValueError(
@@ -286,6 +294,10 @@ def save_weights_only(model, filepath):
     """
     # TODO: if h5 filepath is remote, create the file in a temporary directory
     # then upload it
+
+    # API usage tracking for Keras V3 saving
+    keras_saving_gauge.get_cell("save_weights_v3").set(True)
+
     filepath = str(filepath)
     if not filepath.endswith(".weights.h5"):
         raise ValueError(
diff --git a/keras/tools/pip_package/setup.py b/keras/tools/pip_package/setup.py
index 490ff0d8228a..f55b12f8098d 100644
--- a/keras/tools/pip_package/setup.py
+++ b/keras/tools/pip_package/setup.py
@@ -31,7 +31,7 @@
 # This version string is semver compatible, but incompatible with pip.
 # For pip, we will remove all '-' characters from this string, and use the
 # result for pip.
-_VERSION = "2.13.0"
+_VERSION = "2.14.0"
 
 REQUIRED_PACKAGES = [
     # We depend on TensorFlow's declared pip dependencies.
diff --git a/keras/utils/audio_dataset.py b/keras/utils/audio_dataset.py
index 52afba42780d..60d2ec422769 100644
--- a/keras/utils/audio_dataset.py
+++ b/keras/utils/audio_dataset.py
@@ -67,61 +67,70 @@ def audio_dataset_from_directory(
     Only `.wav` files are supported at this time.
 
     Args:
-      directory: Directory where the data is located. If `labels` is "inferred",
-        it should contain subdirectories, each containing audio files for a
-        class. Otherwise, the directory structure is ignored.
-      labels: Either "inferred" (labels are generated from the directory
-        structure), None (no labels), or a list/tuple of integer labels of the
-        same size as the number of audio files found in the directory. Labels
-        should be sorted according to the alphanumeric order of the audio file
-        paths (obtained via `os.walk(directory)` in Python).
-      label_mode: String describing the encoding of `labels`. Options are:
-          - 'int': means that the labels are encoded as integers (e.g. for
-            `sparse_categorical_crossentropy` loss). - 'categorical' means that
-            the labels are encoded as a categorical vector (e.g. for
-            `categorical_crossentropy` loss). - 'binary' means that the labels
-            (there can be only 2) are encoded as `float32` scalars with values 0
-            or 1 (e.g. for `binary_crossentropy`). - None (no labels).
-      class_names: Only valid if "labels" is "inferred". This is the explicit
-        list of class names (must match names of subdirectories). Used to
-        control the order of the classes (otherwise alphanumerical order is
-        used).
-      batch_size: Size of the batches of data. Default: 32. If `None`, the data
-        will not be batched (the dataset will yield individual samples).
-      sampling_rate: Audio sampling rate (in samples per second).
-      output_sequence_length: Maximum length of an audio sequence. Audio files
-        longer than this will be truncated to `output_sequence_length`. If set
-        to `None`, then all sequences in the same batch will be padded to the
-        length of the longest sequence in the batch.
-      ragged: Whether to return a Ragged dataset (where each sequence has its
-        own length). Default: False.
-      shuffle: Whether to shuffle the data. Default: True. If set to False,
-        sorts the data in alphanumeric order.
-      seed: Optional random seed for shuffling and transformations.
-      validation_split: Optional float between 0 and 1, fraction of data to
-        reserve for validation.
-      subset: Subset of the data to return. One of "training", "validation" or
-        "both". Only used if `validation_split` is set.
-      follow_links: Whether to visits subdirectories pointed to by symlinks.
-        Defaults to `False`.
+        directory: Directory where the data is located.
+            If `labels` is `"inferred"`, it should contain subdirectories,
+            each containing audio files for a class. Otherwise, the directory
+            structure is ignored.
+        labels: Either `"inferred"` (labels are generated from the directory
+            structure), `None` (no labels), or a list/tuple of integer labels
+            of the same size as the number of audio files found in
+            the directory. Labels should be sorted according to the
+            alphanumeric order of the audio file paths
+            (obtained via `os.walk(directory)` in Python).
+        label_mode: String describing the encoding of `labels`. Options are:
+            - `"int"`: means that the labels are encoded as integers (e.g. for
+              `sparse_categorical_crossentropy` loss).
+            - `"categorical"` means that the labels are encoded as a categorical
+              vector (e.g. for `categorical_crossentropy` loss).
+            - `"binary"` means that the labels (there can be only 2)
+              are encoded as `float32` scalars with values 0
+              or 1 (e.g. for `binary_crossentropy`).
+            - `None` (no labels).
+        class_names: Only valid if `labels` is `"inferred"`.
+            This is the explicit list of class names
+            (must match names of subdirectories). Used to control the order
+            of the classes (otherwise alphanumerical order is used).
+        batch_size: Size of the batches of data. Default: 32. If `None`,
+            the data will not be batched
+            (the dataset will yield individual samples).
+        sampling_rate: Audio sampling rate (in samples per second).
+        output_sequence_length: Maximum length of an audio sequence. Audio files
+            longer than this will be truncated to `output_sequence_length`.
+            If set to `None`, then all sequences in the same batch will
+            be padded to the
+            length of the longest sequence in the batch.
+        ragged: Whether to return a Ragged dataset (where each sequence has its
+            own length). Defaults to `False`.
+        shuffle: Whether to shuffle the data. Defaults to `True`.
+            If set to `False`, sorts the data in alphanumeric order.
+        seed: Optional random seed for shuffling and transformations.
+        validation_split: Optional float between 0 and 1, fraction of data to
+            reserve for validation.
+        subset: Subset of the data to return. One of `"training"`,
+            `"validation"` or `"both"`. Only used if `validation_split` is set.
+        follow_links: Whether to visit subdirectories pointed to by symlinks.
+            Defaults to `False`.
 
     Returns:
-      A `tf.data.Dataset` object.
-        - If `label_mode` is None, it yields `string` tensors of shape
-          `(batch_size,)`, containing the contents of a batch of audio files.
-        - Otherwise, it yields a tuple `(audio, labels)`, where `audio`
-          has shape `(batch_size, sequence_length, num_channels)` and `labels`
-          follows the format described
-          below.
+
+    A `tf.data.Dataset` object.
+
+    - If `label_mode` is `None`, it yields `string` tensors of shape
+      `(batch_size,)`, containing the contents of a batch of audio files.
+    - Otherwise, it yields a tuple `(audio, labels)`, where `audio`
+      has shape `(batch_size, sequence_length, num_channels)` and `labels`
+      follows the format described
+      below.
 
     Rules regarding labels format:
-      - if `label_mode` is `int`, the labels are an `int32` tensor of shape
-        `(batch_size,)`.
-      - if `label_mode` is `binary`, the labels are a `float32` tensor of
-        1s and 0s of shape `(batch_size, 1)`.
-      - if `label_mode` is `categorical`, the labels are a `float32` tensor
-        of shape `(batch_size, num_classes)`, representing a one-hot
-        encoding of the class index.
+
+    - if `label_mode` is `"int"`, the labels are an `int32` tensor of shape
+      `(batch_size,)`.
+    - if `label_mode` is `"binary"`, the labels are a `float32` tensor of
+      1s and 0s of shape `(batch_size, 1)`.
+    - if `label_mode` is `"categorical"`, the labels are a `float32` tensor
+      of shape `(batch_size, num_classes)`, representing a one-hot
+      encoding of the class index.
     """
     if labels not in ("inferred", None):
         if not isinstance(labels, (list, tuple)):
diff --git a/keras/utils/generic_utils_test.py b/keras/utils/generic_utils_test.py
index a580513a3163..4ed6242bda61 100644
--- a/keras/utils/generic_utils_test.py
+++ b/keras/utils/generic_utils_test.py
@@ -25,6 +25,7 @@
 import keras
 from keras.saving import serialization_lib
 from keras.saving.legacy import serialization
+from keras.testing_infra import test_utils
 from keras.utils import generic_utils
 from keras.utils import io_utils
 
@@ -324,6 +325,30 @@ class MaybeSharedObject:
     pass
 
 
+class CustomModelX(keras.Model):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.dense1 = keras.layers.Dense(1)
+        self.train_step_message = "This is my training step"
+
+    def call(self, inputs):
+        return self.dense1(inputs)
+
+    def train_step(self, data):
+        tf.print(self.train_step_message)
+        x, y = data
+        with tf.GradientTape() as tape:
+            y_pred = self(x)
+            loss = self.compiled_loss(y, y_pred)
+
+        gradients = tape.gradient(loss, self.trainable_variables)
+        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
+        return {}
+
+    def func_that_returns_one(self):
+        return 1
+
+
 class SharedObjectScopeTest(tf.test.TestCase):
     def test_shared_object_saving_scope_single_object_doesnt_export_id(self):
         with serialization.SharedObjectSavingScope() as scope:
@@ -375,33 +400,38 @@ def test_nested_shared_object_saving_scopes(self):
             self.assertIsNotNone(scope_1.get_config(my_obj))
         self.assertIsNone(serialization._shared_object_saving_scope())
 
-    def test_custom_object_scope_correct_class(self):
-        train_step_message = "This is my training step"
+    def test_custom_object_scope_correct_class_saved_model(self):
         temp_dir = os.path.join(self.get_temp_dir(), "my_model")
 
-        class CustomModelX(keras.Model):
-            def __init__(self, *args, **kwargs):
-                super().__init__(*args, **kwargs)
-                self.dense1 = keras.layers.Dense(1)
+        subclassed_model = CustomModelX()
+        subclassed_model.compile(optimizer="adam", loss="mse")
 
-            def call(self, inputs):
-                return self.dense1(inputs)
+        x = np.random.random((100, 32))
+        y = np.random.random((100, 1))
+        subclassed_model.fit(x, y, epochs=1)
 
-            def train_step(self, data):
-                tf.print(train_step_message)
-                x, y = data
-                with tf.GradientTape() as tape:
-                    y_pred = self(x)
-                    loss = self.compiled_loss(y, y_pred)
+        subclassed_model.save(temp_dir, save_format="tf")
 
-                gradients = tape.gradient(loss, self.trainable_variables)
-                self.optimizer.apply_gradients(
-                    zip(gradients, self.trainable_variables)
-                )
-                return {}
+        with keras.utils.custom_object_scope({"CustomModelX": CustomModelX}):
+            loaded_model = keras.models.load_model(temp_dir)
 
-            def func_that_returns_one(self):
-                return 1
+        io_utils.enable_interactive_logging()
+        # `tf.print` writes to stderr.
+        with self.captureWritesToStream(sys.stderr) as printed:
+            loaded_model.fit(x, y, epochs=1)
+            if tf.__internal__.tf2.enabled():
+                # `tf.print` message is only available in stderr in TF2.
+                # Check that custom `train_step` is used.
+                self.assertRegex(printed.contents(), "This is my training step")
+
+        # Check that the custom class does get used.
+        self.assertIsInstance(loaded_model, CustomModelX)
+        # Check that the custom method is available.
+        self.assertEqual(loaded_model.func_that_returns_one(), 1)
+
+    @test_utils.run_v2_only
+    def test_custom_object_scope_correct_class_keras_v3(self):
+        temp_dir = os.path.join(self.get_temp_dir(), "my_model.keras")
 
         subclassed_model = CustomModelX()
         subclassed_model.compile(optimizer="adam", loss="mse")
@@ -409,7 +439,8 @@ def func_that_returns_one(self):
         x = np.random.random((100, 32))
         y = np.random.random((100, 1))
         subclassed_model.fit(x, y, epochs=1)
-        subclassed_model.save(temp_dir, save_format="tf")
+
+        subclassed_model.save(temp_dir, save_format="keras_v3")
 
         with keras.utils.custom_object_scope({"CustomModelX": CustomModelX}):
             loaded_model = keras.models.load_model(temp_dir)
@@ -419,9 +450,9 @@ def func_that_returns_one(self):
         with self.captureWritesToStream(sys.stderr) as printed:
             loaded_model.fit(x, y, epochs=1)
             if tf.__internal__.tf2.enabled():
-                # `tf.print` message is only available in stderr in TF2. Check
-                # that custom `train_step` is used.
-                self.assertRegex(printed.contents(), train_step_message)
+                # `tf.print` message is only available in stderr in TF2.
+                # Check that custom `train_step` is used.
+                self.assertRegex(printed.contents(), "This is my training step")
 
         # Check that the custom class does get used.
         self.assertIsInstance(loaded_model, CustomModelX)
diff --git a/keras/utils/image_dataset.py b/keras/utils/image_dataset.py
index 98876a650197..8fd622af41cb 100644
--- a/keras/utils/image_dataset.py
+++ b/keras/utils/image_dataset.py
@@ -68,96 +68,99 @@ def image_dataset_from_directory(
     images from the subdirectories `class_a` and `class_b`, together with labels
     0 and 1 (0 corresponding to `class_a` and 1 corresponding to `class_b`).
 
-    Supported image formats: jpeg, png, bmp, gif.
+    Supported image formats: `.jpeg`, `.jpg`, `.png`, `.bmp`, `.gif`.
     Animated gifs are truncated to the first frame.
 
     Args:
         directory: Directory where the data is located.
-            If `labels` is "inferred", it should contain
+            If `labels` is `"inferred"`, it should contain
             subdirectories, each containing images for a class.
             Otherwise, the directory structure is ignored.
-      labels: Either "inferred"
+        labels: Either `"inferred"`
             (labels are generated from the directory structure),
-            None (no labels),
+            `None` (no labels),
             or a list/tuple of integer labels of the same size as the number of
             image files found in the directory. Labels should be sorted
             according to the alphanumeric order of the image file paths
             (obtained via `os.walk(directory)` in Python).
-      label_mode: String describing the encoding of `labels`. Options are:
-          - 'int': means that the labels are encoded as integers
-              (e.g. for `sparse_categorical_crossentropy` loss).
-          - 'categorical' means that the labels are
-              encoded as a categorical vector
-              (e.g. for `categorical_crossentropy` loss).
-          - 'binary' means that the labels (there can be only 2)
-              are encoded as `float32` scalars with values 0 or 1
-              (e.g. for `binary_crossentropy`).
-          - None (no labels).
-      class_names: Only valid if "labels" is "inferred". This is the explicit
-          list of class names (must match names of subdirectories). Used
-          to control the order of the classes (otherwise alphanumerical order
-          is used).
-      color_mode: One of "grayscale", "rgb", "rgba". Default: "rgb".
-          Whether the images will be converted to have 1, 3, or 4 channels.
-      batch_size: Size of the batches of data. Default: 32.
+        label_mode: String describing the encoding of `labels`. Options are:
+            - `"int"`: means that the labels are encoded as integers
+                (e.g. for `sparse_categorical_crossentropy` loss).
+            - `"categorical"` means that the labels are
+                encoded as a categorical vector
+                (e.g. for `categorical_crossentropy` loss).
+            - `"binary"` means that the labels (there can be only 2)
+                are encoded as `float32` scalars with values 0 or 1
+                (e.g. for `binary_crossentropy`).
+            - `None` (no labels).
+        class_names: Only valid if `labels` is `"inferred"`.
+            This is the explicit list of class names
+            (must match names of subdirectories). Used to control the order
+            of the classes (otherwise alphanumerical order is used).
+        color_mode: One of `"grayscale"`, `"rgb"`, `"rgba"`.
+            Defaults to `"rgb"`. Whether the images will be converted to
+            have 1, 3, or 4 channels.
+        batch_size: Size of the batches of data.
             If `None`, the data will not be batched
-            (the dataset will yield individual samples).
-      image_size: Size to resize images to after they are read from disk,
-          specified as `(height, width)`. Defaults to `(256, 256)`.
-          Since the pipeline processes batches of images that must all have
-          the same size, this must be provided.
-      shuffle: Whether to shuffle the data. Default: True.
-          If set to False, sorts the data in alphanumeric order.
-      seed: Optional random seed for shuffling and transformations.
-      validation_split: Optional float between 0 and 1,
-          fraction of data to reserve for validation.
-      subset: Subset of the data to return.
-          One of "training", "validation" or "both".
-          Only used if `validation_split` is set.
-          When `subset="both"`, the utility returns a tuple of two datasets
-          (the training and validation datasets respectively).
-      interpolation: String, the interpolation method used when resizing images.
-            Supports `bilinear`, `nearest`, `bicubic`, `area`, `lanczos3`,
-            `lanczos5`, `gaussian`, `mitchellcubic`. Defaults to `bilinear`.
-      follow_links: Whether to visit subdirectories pointed to by symlinks.
-          Defaults to `False`.
-      crop_to_aspect_ratio: If True, resize the images without aspect
+            (the dataset will yield individual samples). Defaults to 32.
+        image_size: Size to resize images to after they are read from disk,
+            specified as `(height, width)`.
+            Since the pipeline processes batches of images that must all have
+            the same size, this must be provided. Defaults to `(256, 256)`.
+        shuffle: Whether to shuffle the data. Defaults to `True`.
+            If set to `False`, sorts the data in alphanumeric order.
+        seed: Optional random seed for shuffling and transformations.
+        validation_split: Optional float between 0 and 1,
+            fraction of data to reserve for validation.
+        subset: Subset of the data to return.
+            One of `"training"`, `"validation"`, or `"both"`.
+            Only used if `validation_split` is set.
+            When `subset="both"`, the utility returns a tuple of two datasets
+            (the training and validation datasets respectively).
+        interpolation: String, the interpolation method used when
+            resizing images. Defaults to `"bilinear"`.
+            Supports `"bilinear"`, `"nearest"`, `"bicubic"`, `"area"`,
+            `"lanczos3"`, `"lanczos5"`, `"gaussian"`, `"mitchellcubic"`.
+        follow_links: Whether to visit subdirectories pointed to by symlinks.
+            Defaults to `False`.
+        crop_to_aspect_ratio: If `True`, resize the images without aspect
             ratio distortion. When the original aspect ratio differs from the
             target aspect ratio, the output image will be cropped so as to
             return the largest possible window in the image
             (of size `image_size`) that matches the target aspect ratio. By
             default (`crop_to_aspect_ratio=False`), aspect ratio may not be
             preserved.
-      **kwargs: Legacy keyword arguments.
+        **kwargs: Legacy keyword arguments.
 
     Returns:
-      A `tf.data.Dataset` object.
 
-        - If `label_mode` is None, it yields `float32` tensors of shape
-            `(batch_size, image_size[0], image_size[1], num_channels)`,
-            encoding images (see below for rules regarding `num_channels`).
-        - Otherwise, it yields a tuple `(images, labels)`, where `images` has
-            shape `(batch_size, image_size[0], image_size[1], num_channels)`,
-            and `labels` follows the format described below.
+    A `tf.data.Dataset` object.
+
+    - If `label_mode` is `None`, it yields `float32` tensors of shape
+        `(batch_size, image_size[0], image_size[1], num_channels)`,
+        encoding images (see below for rules regarding `num_channels`).
+    - Otherwise, it yields a tuple `(images, labels)`, where `images` has
+        shape `(batch_size, image_size[0], image_size[1], num_channels)`,
+        and `labels` follows the format described below.
 
     Rules regarding labels format:
 
-      - if `label_mode` is `int`, the labels are an `int32` tensor of shape
-          `(batch_size,)`.
-      - if `label_mode` is `binary`, the labels are a `float32` tensor of
-          1s and 0s of shape `(batch_size, 1)`.
-      - if `label_mode` is `categorical`, the labels are a `float32` tensor
-          of shape `(batch_size, num_classes)`, representing a one-hot
-          encoding of the class index.
+    - if `label_mode` is `"int"`, the labels are an `int32` tensor of shape
+        `(batch_size,)`.
+    - if `label_mode` is `"binary"`, the labels are a `float32` tensor of
+        1s and 0s of shape `(batch_size, 1)`.
+    - if `label_mode` is `"categorical"`, the labels are a `float32` tensor
+        of shape `(batch_size, num_classes)`, representing a one-hot
+        encoding of the class index.
 
     Rules regarding number of channels in the yielded images:
 
-      - if `color_mode` is `grayscale`,
-          there's 1 channel in the image tensors.
-      - if `color_mode` is `rgb`,
-          there are 3 channels in the image tensors.
-      - if `color_mode` is `rgba`,
-          there are 4 channels in the image tensors.
+    - if `color_mode` is `"grayscale"`,
+        there's 1 channel in the image tensors.
+    - if `color_mode` is `"rgb"`,
+        there are 3 channels in the image tensors.
+    - if `color_mode` is `"rgba"`,
+        there are 4 channels in the image tensors.
     """
     if "smart_resize" in kwargs:
         crop_to_aspect_ratio = kwargs.pop("smart_resize")
diff --git a/keras/utils/text_dataset.py b/keras/utils/text_dataset.py
index f05a6e5f9cbc..37ba1a94b10c 100644
--- a/keras/utils/text_dataset.py
+++ b/keras/utils/text_dataset.py
@@ -64,42 +64,42 @@ def text_dataset_from_directory(
 
     Args:
         directory: Directory where the data is located.
-            If `labels` is "inferred", it should contain
+            If `labels` is `"inferred"`, it should contain
             subdirectories, each containing text files for a class.
             Otherwise, the directory structure is ignored.
-        labels: Either "inferred"
+        labels: Either `"inferred"`
             (labels are generated from the directory structure),
-            None (no labels),
+            `None` (no labels),
             or a list/tuple of integer labels of the same size as the number of
             text files found in the directory. Labels should be sorted according
             to the alphanumeric order of the text file paths
             (obtained via `os.walk(directory)` in Python).
         label_mode: String describing the encoding of `labels`. Options are:
-            - 'int': means that the labels are encoded as integers
+            - `"int"`: means that the labels are encoded as integers
                 (e.g. for `sparse_categorical_crossentropy` loss).
-            - 'categorical' means that the labels are
+            - `"categorical"` means that the labels are
                 encoded as a categorical vector
                 (e.g. for `categorical_crossentropy` loss).
-            - 'binary' means that the labels (there can be only 2)
+            - `"binary"` means that the labels (there can be only 2)
                 are encoded as `float32` scalars with values 0 or 1
                 (e.g. for `binary_crossentropy`).
-            - None (no labels).
-        class_names: Only valid if "labels" is "inferred". This is the explicit
-            list of class names (must match names of subdirectories). Used
-            to control the order of the classes
-            (otherwise alphanumerical order is used).
-        batch_size: Size of the batches of data. Default: 32.
+            - `None` (no labels).
+        class_names: Only valid if `labels` is `"inferred"`.
+            This is the explicit list of class names
+            (must match names of subdirectories). Used to control the order
+            of the classes (otherwise alphanumerical order is used).
+        batch_size: Size of the batches of data. Defaults to 32.
             If `None`, the data will not be batched
             (the dataset will yield individual samples).
         max_length: Maximum size of a text string. Texts longer than this will
             be truncated to `max_length`.
-        shuffle: Whether to shuffle the data. Default: True.
-            If set to False, sorts the data in alphanumeric order.
+        shuffle: Whether to shuffle the data. Defaults to `True`.
+            If set to `False`, sorts the data in alphanumeric order.
         seed: Optional random seed for shuffling and transformations.
         validation_split: Optional float between 0 and 1,
             fraction of data to reserve for validation.
         subset: Subset of the data to return.
-            One of "training", "validation" or "both".
+            One of `"training"`, `"validation"` or `"both"`.
             Only used if `validation_split` is set.
             When `subset="both"`, the utility returns a tuple of two datasets
             (the training and validation datasets respectively).
@@ -107,21 +107,24 @@ def text_dataset_from_directory(
             Defaults to `False`.
 
     Returns:
-        A `tf.data.Dataset` object.
-        - If `label_mode` is None, it yields `string` tensors of shape
-          `(batch_size,)`, containing the contents of a batch of text files.
-        - Otherwise, it yields a tuple `(texts, labels)`, where `texts`
-          has shape `(batch_size,)` and `labels` follows the format described
-          below.
+
+    A `tf.data.Dataset` object.
+
+    - If `label_mode` is `None`, it yields `string` tensors of shape
+        `(batch_size,)`, containing the contents of a batch of text files.
+    - Otherwise, it yields a tuple `(texts, labels)`, where `texts`
+        has shape `(batch_size,)` and `labels` follows the format described
+        below.
 
     Rules regarding labels format:
-        - if `label_mode` is `int`, the labels are an `int32` tensor of shape
-          `(batch_size,)`.
-        - if `label_mode` is `binary`, the labels are a `float32` tensor of
-          1s and 0s of shape `(batch_size, 1)`.
-        - if `label_mode` is `categorical`, the labels are a `float32` tensor
-          of shape `(batch_size, num_classes)`, representing a one-hot
-          encoding of the class index.
+
+    - if `label_mode` is `"int"`, the labels are an `int32` tensor of shape
+        `(batch_size,)`.
+    - if `label_mode` is `"binary"`, the labels are a `float32` tensor of
+        1s and 0s of shape `(batch_size, 1)`.
+    - if `label_mode` is `"categorical"`, the labels are a `float32` tensor
+        of shape `(batch_size, num_classes)`, representing a one-hot
+        encoding of the class index.
     """
     if labels not in ("inferred", None):
         if not isinstance(labels, (list, tuple)):
diff --git a/keras/utils/timeseries_dataset.py b/keras/utils/timeseries_dataset.py
index 60c37b116d94..c81dc18ef32c 100644
--- a/keras/utils/timeseries_dataset.py
+++ b/keras/utils/timeseries_dataset.py
@@ -46,41 +46,43 @@ def timeseries_dataset_from_array(
     to produce batches of timeseries inputs and targets.
 
     Args:
-      data: Numpy array or eager tensor
-        containing consecutive data points (timesteps).
-        Axis 0 is expected to be the time dimension.
-      targets: Targets corresponding to timesteps in `data`.
-        `targets[i]` should be the target
-        corresponding to the window that starts at index `i`
-        (see example 2 below).
-        Pass None if you don't have target data (in this case the dataset will
-        only yield the input data).
-      sequence_length: Length of the output sequences (in number of timesteps).
-      sequence_stride: Period between successive output sequences.
-        For stride `s`, output samples would
-        start at index `data[i]`, `data[i + s]`, `data[i + 2 * s]`, etc.
-      sampling_rate: Period between successive individual timesteps
-        within sequences. For rate `r`, timesteps
-        `data[i], data[i + r], ... data[i + sequence_length]`
-        are used for creating a sample sequence.
-      batch_size: Number of timeseries samples in each batch
-        (except maybe the last one). If `None`, the data will not be batched
-        (the dataset will yield individual samples).
-      shuffle: Whether to shuffle output samples,
-        or instead draw them in chronological order.
-      seed: Optional int; random seed for shuffling.
-      start_index: Optional int; data points earlier (exclusive)
-        than `start_index` will not be used
-        in the output sequences. This is useful to reserve part of the
-        data for test or validation.
-      end_index: Optional int; data points later (exclusive) than `end_index`
-        will not be used in the output sequences.
-        This is useful to reserve part of the data for test or validation.
+        data: Numpy array or eager tensor
+            containing consecutive data points (timesteps).
+            Axis 0 is expected to be the time dimension.
+        targets: Targets corresponding to timesteps in `data`.
+            `targets[i]` should be the target
+            corresponding to the window that starts at index `i`
+            (see example 2 below).
+            Pass `None` if you don't have target data (in this case the dataset
+            will only yield the input data).
+        sequence_length: Length of the output sequences
+            (in number of timesteps).
+        sequence_stride: Period between successive output sequences.
+            For stride `s`, output samples would
+            start at index `data[i]`, `data[i + s]`, `data[i + 2 * s]`, etc.
+        sampling_rate: Period between successive individual timesteps
+            within sequences. For rate `r`, timesteps
+            `data[i], data[i + r], ... data[i + sequence_length]`
+            are used for creating a sample sequence.
+        batch_size: Number of timeseries samples in each batch
+            (except maybe the last one). If `None`, the data will not be batched
+            (the dataset will yield individual samples).
+        shuffle: Whether to shuffle output samples,
+            or instead draw them in chronological order.
+        seed: Optional int; random seed for shuffling.
+        start_index: Optional int; data points earlier (exclusive)
+            than `start_index` will not be used
+            in the output sequences. This is useful to reserve part of the
+            data for test or validation.
+        end_index: Optional int; data points later (exclusive) than `end_index`
+            will not be used in the output sequences.
+            This is useful to reserve part of the data for test or validation.
 
     Returns:
-      A tf.data.Dataset instance. If `targets` was passed, the dataset yields
-      tuple `(batch_of_sequences, batch_of_targets)`. If not, the dataset yields
-      only `batch_of_sequences`.
+
+    A `tf.data.Dataset` instance. If `targets` was passed, the dataset yields
+    tuple `(batch_of_sequences, batch_of_targets)`. If not, the dataset yields
+    only `batch_of_sequences`.
 
     Example 1:
 
@@ -134,17 +136,17 @@ def timeseries_dataset_from_array(
 
     sample_length = 20
     input_dataset = tf.keras.utils.timeseries_dataset_from_array(
-      X, None, sequence_length=sample_length, sequence_stride=sample_length)
+        X, None, sequence_length=sample_length, sequence_stride=sample_length)
     target_dataset = tf.keras.utils.timeseries_dataset_from_array(
-      Y, None, sequence_length=sample_length, sequence_stride=sample_length)
+        Y, None, sequence_length=sample_length, sequence_stride=sample_length)
 
     for batch in zip(input_dataset, target_dataset):
-      inputs, targets = batch
-      assert np.array_equal(inputs[0], X[:sample_length])
+        inputs, targets = batch
+        assert np.array_equal(inputs[0], X[:sample_length])
 
-      # second sample equals output timestamps 20-40
-      assert np.array_equal(targets[1], Y[sample_length:2*sample_length])
-      break
+        # second sample equals output timestamps 20-40
+        assert np.array_equal(targets[1], Y[sample_length:2*sample_length])
+        break
     ```
     """
     if start_index:
diff --git a/pip_build.py b/pip_build.py
index 708f1dc75d5b..6c09c1ccb7a3 100644
--- a/pip_build.py
+++ b/pip_build.py
@@ -338,6 +338,7 @@ def build_pip_package(
     src_directory,
     dist_directory,
     is_nightly=False,
+    rc=None,
 ):
     # Build Keras with Bazel to get the protobuf .py files
     os.chdir(keras_root_directory)
@@ -383,6 +384,8 @@ def build_pip_package(
     if is_nightly:
         date = datetime.datetime.now()
         version += f".dev{date.strftime('%Y%m%d%H')}"
+    elif rc:
+        version += rc
     with open(os.path.join(package_directory, "__init__.py")) as f:
         init_contents = f.read()
     with open(os.path.join(package_directory, "__init__.py"), "w") as f:
@@ -455,8 +458,14 @@ def test_wheel(wheel_path, expected_version, requirements_path):
         action="store_true",
         help="Whether this is for the `keras-nightly` package.",
     )
+    parser.add_argument(
+        "--RC",
+        type=str,
+        help="Whether this is for the release candidate.",
+    )
     args = parser.parse_args()
     is_nightly = args.nightly
+    rc = args.RC
 
     build_directory = os.path.join(tempfile.gettempdir(), TMP_BUILD_DIRNAME)
     keras_root_directory = pathlib.Path(__file__).parent.resolve()
@@ -471,7 +480,8 @@ def test_wheel(wheel_path, expected_version, requirements_path):
             f"dist_directory={dist_directory}\n"
             f"package_directory={package_directory}\n"
             f"src_directory={src_directory}\n"
-            f"is_nightly={is_nightly}"
+            f"is_nightly={is_nightly}\n"
+            f"rc={rc}"
         )
     if os.path.exists(build_directory):
         raise ValueError(f"Directory already exists: {build_directory}")
@@ -487,6 +497,7 @@ def test_wheel(wheel_path, expected_version, requirements_path):
             src_directory,
             dist_directory,
             is_nightly,
+            rc,
         )
         wheel_filename = [f for f in saved_filenames if f.endswith(".whl")][0]
         if VERBOSE: