keras-team · mattdangerw · Jul 31, 2023 · Jul 27, 2023 · Jul 27, 2023 · Jul 27, 2023
diff --git a/keras_nlp/samplers/beam_sampler.py b/keras_nlp/samplers/beam_sampler.py
@@ -14,14 +14,13 @@
 """Beam Sampler."""
 
 import tensorflow as tf
-from tensorflow.compiler.tf2xla.python.xla import dynamic_update_slice
 
 from keras_nlp.api_export import keras_nlp_export
 from keras_nlp.backend import keras
+from keras_nlp.backend import ops
 from keras_nlp.samplers.sampler import Sampler
 from keras_nlp.samplers.sampler import call_args_docstring
 from keras_nlp.utils.python_utils import format_docstring
-from keras_nlp.utils.tensor_utils import assert_tf_backend
 
 
 @format_docstring(call_args=call_args_docstring)
@@ -102,10 +101,6 @@ def __init__(
         return_all_beams=False,
         **kwargs,
     ):
-        # Temporarily turn off beam search in other backends.
-        # No technical blockers here, just need tf -> ops rewrite.
-        assert_tf_backend(self.__class__.__name__)
-
         super().__init__(**kwargs)
         self.num_beams = num_beams
         self.return_all_beams = return_all_beams
@@ -120,89 +115,96 @@ def __call__(
         end_token_id=None,
         hidden_states=None,
     ):
-        batch_size, max_length = tf.shape(prompt)[0], tf.shape(prompt)[1]
-        # Make sure max length and start index are the same dtype.
-        index = tf.cast(index, max_length.dtype)
+        batch_size, max_length = ops.shape(prompt)[0], ops.shape(prompt)[1]
+        index = ops.cast(index, "int32")
 
         def create_beams(x):
             """Add initial beam state."""
-            return tf.repeat(x, self.num_beams, axis=0)
+            return ops.repeat(x, self.num_beams, axis=0)
 
         def flatten_beams(x):
             """Combine the beam dim and batch dim."""
-            flat_shape = [batch_size * self.num_beams] + x.shape.as_list()[2:]
-            return tf.reshape(x, shape=flat_shape)
+            flat_shape = (batch_size * self.num_beams,) + tuple(x.shape)[2:]
+            return ops.reshape(x, flat_shape)
 
         def unflatten_beams(x):
             """Separate the beam dim and batch dim."""
-            unflat_shape = [batch_size, self.num_beams] + x.shape.as_list()[1:]
-            return tf.reshape(x, shape=unflat_shape)
+            unflat_shape = (batch_size, self.num_beams) + tuple(x.shape)[1:]
+            return ops.reshape(x, unflat_shape)
 
         if mask is None:
-            mask = tf.zeros_like(prompt, dtype="bool")
+            mask = ops.zeros_like(prompt, dtype="bool")
         else:
-            mask = tf.cast(mask, dtype="bool")
-        # `tf.while_loop` will not accept `None` as a value for `loop_vars`.
-        cache = () if cache is None else cache
+            mask = ops.cast(mask, dtype="bool")
+        # `ops.while_loop` will not accept `None` as a value for `loop_vars`.
+        has_cache = cache is not None
+        cache = cache if has_cache else ()
         # Add extra sequences for each beam.
         prompt, mask = create_beams(prompt), create_beams(mask)
         cache = tf.nest.map_structure(create_beams, cache)
         # Setup the initial beam log-likelihoods.
         # On the first loop, make sure only the original beam is considered.
-        log_probs = tf.constant([[0.0] + [-1e9] * (self.num_beams - 1)])
-        log_probs = flatten_beams(tf.repeat(log_probs, batch_size, axis=0))
+        log_probs = ops.array(
+            [[0.0] + [-1e9] * (self.num_beams - 1)], dtype="float32"
+        )
+        log_probs = flatten_beams(ops.repeat(log_probs, batch_size, axis=0))
 
         def cond(prompt, cache, index, log_probs):
             if end_token_id is None:
                 return True
             # Stop if all sequences have produced a *new* end_token_id.
             end_tokens = (prompt == end_token_id) & (~mask)
-            prompt_done = tf.reduce_any(end_tokens, axis=-1)
-            return not tf.reduce_all(prompt_done)
+            prompt_done = ops.any(end_tokens, axis=-1)
+            return ops.logical_not(ops.all(prompt_done))
 
         def body(prompt, cache, index, log_probs):
             # Compute the softmax distribution for the next token.
             logits, _, cache = next(prompt, cache, index)
-            vocab_size = tf.shape(logits)[-1]
+            vocab_size = ops.shape(logits)[-1]
             probs = keras.activations.softmax(logits / self.temperature)
 
             # Compute the running log-likelihood of each new candidate.
-            next_log_probs = tf.math.log(probs) + log_probs[..., tf.newaxis]
+            next_log_probs = ops.log(probs) + log_probs[..., None]
             # Reshape `preds` to shape `(batch_size, num_beams * vocab_size)`.
-            next_log_probs = tf.reshape(next_log_probs, shape=[batch_size, -1])
+            next_log_probs = ops.reshape(next_log_probs, [batch_size, -1])
 
             # Compute the top beam indices and next tokens.
-            next_log_probs, indices = tf.math.top_k(
+            next_log_probs, indices = ops.top_k(
                 next_log_probs, k=self.num_beams, sorted=False
             )
             beam_indices = indices // vocab_size
             next_token = flatten_beams(indices % vocab_size)
-            # Ensure shape is `[None]`, otherwise it causes issues after
-            # converting to TFLite.
-            next_token = tf.ensure_shape(next_token, [None])
             # We need `ensure_shape` as `top_k` will change the static shape.
             next_log_probs = flatten_beams(next_log_probs)
-            log_probs = tf.ensure_shape(next_log_probs, log_probs.shape)
+            # Work around for top_k output shape on tf backend.
+            if isinstance(log_probs, tf.Tensor):
+                log_probs = tf.ensure_shape(next_log_probs, log_probs.shape)
+            else:
+                log_probs = next_log_probs
 
             def gather_beams(x):
                 x = unflatten_beams(x)
-                x = tf.gather(x, beam_indices, axis=1, batch_dims=1)
+                indices = beam_indices
+                for axis in range(2, len(x.shape)):
+                    indices = ops.expand_dims(indices, axis=axis)
+                x = ops.take_along_axis(x, indices, axis=1)
                 return flatten_beams(x)
 
             prompt = gather_beams(prompt)
-            cache = tf.nest.map_structure(gather_beams, cache)
+            if has_cache:
+                cache = tf.nest.map_structure(gather_beams, cache)
 
             # Update each beam with the next token.
-            next_token = tf.cast(next_token, prompt.dtype)
+            next_token = ops.cast(next_token, prompt.dtype)
             # Don't overwrite anywhere mask is True.
-            next_token = tf.where(mask[:, index], prompt[:, index], next_token)
+            next_token = ops.where(mask[:, index], prompt[:, index], next_token)
             # Update the prompt with the next token.
-            next_token = next_token[:, tf.newaxis]
-            prompt = dynamic_update_slice(prompt, next_token, [0, index])
+            next_token = next_token[:, None]
+            prompt = ops.slice_update(prompt, [0, index], next_token)
             # Return the iteration of the loop state.
             return (prompt, cache, index + 1, log_probs)
 
-        prompt, _, _, log_probs = tf.while_loop(
+        prompt, _, _, log_probs = self.run_loop(
             cond=cond,
             body=body,
             loop_vars=(prompt, cache, index, log_probs),
@@ -213,21 +215,23 @@ def gather_beams(x):
         all_log_probs = unflatten_beams(log_probs)
 
         if self.return_all_beams:
-            sorted_indices = tf.argsort(
-                all_log_probs, axis=-1, direction="DESCENDING"
-            )
-            sorted_log_probs = tf.gather(
-                all_log_probs, sorted_indices, axis=-1, batch_dims=1
+            sorted_indices = ops.argsort(-all_log_probs, axis=-1)
+            sorted_log_probs = ops.take_along_axis(
+                all_log_probs,
+                sorted_indices,
+                axis=1,
             )
-            sorted_prompts = tf.gather(
-                all_prompts, sorted_indices, axis=1, batch_dims=1
+            sorted_prompts = ops.take_along_axis(
+                all_prompts,
+                ops.expand_dims(sorted_indices, -1),
+                axis=1,
             )
             return sorted_prompts, sorted_log_probs
         else:
             # Gather the top beam at each batch index.
-            top_beams = tf.math.argmax(all_log_probs, axis=-1)[:, tf.newaxis]
-            prompt = tf.gather(all_prompts, top_beams, axis=1, batch_dims=1)
-            return tf.squeeze(prompt, axis=1)
+            top_beams = ops.argmax(all_log_probs, axis=-1)[:, None, None]
+            prompt = ops.take_along_axis(all_prompts, top_beams, axis=1)
+            return ops.squeeze(prompt, axis=1)
 
     def get_config(self):
         config = super().get_config()

diff --git a/keras_nlp/samplers/beam_sampler_test.py b/keras_nlp/samplers/beam_sampler_test.py
@@ -22,7 +22,6 @@
 from keras_nlp.tests.test_case import TestCase
 
 
-@pytest.mark.tf_only
 class BeamSamplerTest(TestCase):
     def setUp(self):
         super().setUp()