From 2ef6b03f89d60dc432cd4118bfeafff988e6baea Mon Sep 17 00:00:00 2001
From: Daniel Deutsch <danfdeutsch@gmail.com>
Date: Mon, 17 May 2021 12:01:05 -0400
Subject: [PATCH 1/7] Implementing minimum number of decoding steps

---
 allennlp/nn/beam_search.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/allennlp/nn/beam_search.py b/allennlp/nn/beam_search.py
index fff07b7dac2..1e7106aa886 100644
--- a/allennlp/nn/beam_search.py
+++ b/allennlp/nn/beam_search.py
@@ -462,6 +462,10 @@ class BeamSearch(FromParams):
 
         Using the [`GumbelSampler`](#gumbelsampler), on the other hand, will give you
         [Stochastic Beam Search](https://api.semanticscholar.org/CorpusID:76662039).
+
+    min_steps : `int`, optional (default = `0`)
+        The minimum number of decoding steps to take, i.e. the minimum length of
+        the predicted sequences.
     """
 
     def __init__(
@@ -471,6 +475,7 @@ def __init__(
         beam_size: int = 10,
         per_node_beam_size: int = None,
         sampler: Sampler = None,
+        min_steps: int = 0,
     ) -> None:
         if not max_steps > 0:
             raise ValueError("max_steps must be positive")
@@ -478,12 +483,17 @@ def __init__(
             raise ValueError("beam_size must be positive")
         if per_node_beam_size is not None and not per_node_beam_size > 0:
             raise ValueError("per_node_beam_size must be positive")
+        if not min_steps >= 0:
+            raise ValueError("min_steps must be non-negative")
+        if not min_steps <= max_steps:
+            raise ValueError("min_steps must be less than or equal to max_steps")
 
         self._end_index = end_index
         self.max_steps = max_steps
         self.beam_size = beam_size
         self.per_node_beam_size = per_node_beam_size or beam_size
         self.sampler = sampler or DeterministicSampler()
+        self.min_steps = min_steps
 
     @staticmethod
     def _reconstruct_sequences(predictions, backpointers):
@@ -629,6 +639,10 @@ def _search(
             start_class_log_probabilities, batch_size, num_classes
         )
 
+        # Prevent selecting the end symbol if there is any min_steps constraint
+        if self.min_steps >= 1:
+            start_class_log_probabilities[:, self._end_index] = float("-inf")
+
         # Get the initial predicted classed and their log probabilities.
         # shape: (batch_size, beam_size), (batch_size, beam_size)
         (
@@ -675,6 +689,13 @@ def _search(
             # shape: (batch_size * beam_size, num_classes)
             class_log_probabilities, state = step(last_predictions, state, timestep + 1)
 
+            # The `timestep`-th iteration of the for loop is generating the `timestep + 2`-th token
+            # of the sequence (because `timestep` is 0-indexed and we generated the first token
+            # before the for loop). Here we block the end index if the search is not allowed to
+            # terminate on this iteration.
+            if timestep + 2 <= self.min_steps:
+                class_log_probabilities[:, self._end_index] = float("-inf")
+
             # shape: (batch_size * beam_size, num_classes)
             last_predictions_expanded = last_predictions.unsqueeze(-1).expand(
                 batch_size * self.beam_size, num_classes

From cb51507dc98c737865033f7472a720e752396aa5 Mon Sep 17 00:00:00 2001
From: Daniel Deutsch <danfdeutsch@gmail.com>
Date: Mon, 17 May 2021 14:06:56 -0400
Subject: [PATCH 2/7] Adding unit tests

---
 tests/nn/beam_search_test.py | 99 +++++++++++++++++++++++++++++++++++-
 1 file changed, 98 insertions(+), 1 deletion(-)

diff --git a/tests/nn/beam_search_test.py b/tests/nn/beam_search_test.py
index 88614f88e9d..8805e2be26d 100644
--- a/tests/nn/beam_search_test.py
+++ b/tests/nn/beam_search_test.py
@@ -27,6 +27,18 @@
     ]  # end token -> jth token
 )
 
+# A transition matrix that favors shorter sequences over longer ones
+short_sequence_transition_probabilities = torch.tensor(
+    [
+        [0.0, 0.1, 0.0, 0.0, 0.0, 0.9],  # start token -> jth token
+        [0.0, 0.0, 0.1, 0.0, 0.0, 0.9],  # 1st token -> jth token
+        [0.0, 0.0, 0.0, 0.1, 0.0, 0.9],  # 2nd token -> jth token
+        [0.0, 0.0, 0.0, 0.0, 0.1, 0.9],  # ...
+        [0.0, 0.0, 0.0, 0.0, 0.0, 1.0],  # ...
+        [0.2, 0.1, 0.2, 0.2, 0.2, 0.3],
+    ]  # end token -> jth token
+)
+
 log_probabilities = torch.log(
     torch.tensor([[0.1, 0.3, 0.3, 0.3, 0.0, 0.0], [0.0, 0.0, 0.4, 0.3, 0.2, 0.1]])
 )
@@ -62,6 +74,25 @@ def take_step_with_timestep(
     return take_step_no_timestep(last_predictions, state)
 
 
+def take_short_sequence_step(
+    last_predictions: torch.Tensor,
+    state: Dict[str, torch.Tensor],
+    timestep: int,
+) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
+    """
+    Take decoding step.
+
+    This method is the same as `take_step_no_timestep` except it uses the
+    `short_sequence_transition_probabilities` transitions instead of `transition_probabilities`
+    """
+    log_probs_list = []
+    for last_token in last_predictions:
+        log_probs = torch.log(short_sequence_transition_probabilities[last_token.item()])
+        log_probs_list.append(log_probs)
+
+    return torch.stack(log_probs_list), state
+
+
 class BeamSearchTest(AllenNlpTestCase):
     def setup_method(self):
         super().setup_method()
@@ -101,7 +132,7 @@ def _check_results(
 
         # log_probs should be shape `(batch_size, beam_size, max_predicted_length)`.
         assert list(log_probs.size()) == [batch_size, beam_size]
-        np.testing.assert_allclose(log_probs[0].numpy(), expected_log_probs)
+        np.testing.assert_allclose(log_probs[0].numpy(), expected_log_probs, rtol=1e-6)
 
     @pytest.mark.parametrize("step_function", [take_step_with_timestep, take_step_no_timestep])
     def test_search(self, step_function):
@@ -211,6 +242,72 @@ def test_early_stopping(self):
             beam_search=beam_search,
         )
 
+    def test_take_short_sequence_step(self):
+        """
+        Tests to ensure the top-k from the short_sequence_transition_probabilities
+        transition matrix is expected
+        """
+        self.beam_search.beam_size = 5
+        expected_top_k = np.array([
+            [5, 5, 5, 5, 5],
+            [1, 5, 5, 5, 5],
+            [1, 2, 5, 5, 5],
+            [1, 2, 3, 5, 5],
+            [1, 2, 3, 4, 5]
+        ])
+        expected_log_probs = np.log(np.array([0.9, 0.09, 0.009, 0.0009, 0.0001]))
+        self._check_results(
+            expected_top_k=expected_top_k,
+            expected_log_probs=expected_log_probs,
+            take_step=take_short_sequence_step,
+        )
+
+    def test_min_steps(self):
+        """
+        Tests to ensure all output sequences are greater than a specified minimum length.
+        It uses the `take_short_sequence_step` step function, which favors shorter sequences.
+        See `test_take_short_sequence_step`.
+        """
+        self.beam_search.beam_size = 1
+
+        # An empty sequence is allowed under this step function
+        self.beam_search.min_steps = 0
+        expected_top_k = np.array([[5]])
+        expected_log_probs = np.log(np.array([0.9]))
+        self._check_results(
+            expected_top_k=expected_top_k,
+            expected_log_probs=expected_log_probs,
+            take_step=take_short_sequence_step,
+        )
+
+        self.beam_search.min_steps = 1
+        expected_top_k = np.array([[1, 5]])
+        expected_log_probs = np.log(np.array([0.09]))
+        self._check_results(
+            expected_top_k=expected_top_k,
+            expected_log_probs=expected_log_probs,
+            take_step=take_short_sequence_step,
+        )
+
+        self.beam_search.min_steps = 2
+        expected_top_k = np.array([[1, 2, 5]])
+        expected_log_probs = np.log(np.array([0.009]))
+        self._check_results(
+            expected_top_k=expected_top_k,
+            expected_log_probs=expected_log_probs,
+            take_step=take_short_sequence_step,
+        )
+
+        self.beam_search.beam_size = 3
+        self.beam_search.min_steps = 2
+        expected_top_k = np.array([[1, 2, 5, 5, 5], [1, 2, 3, 5, 5], [1, 2, 3, 4, 5]])
+        expected_log_probs = np.log(np.array([0.009, 0.0009, 0.0001]))
+        self._check_results(
+            expected_top_k=expected_top_k,
+            expected_log_probs=expected_log_probs,
+            take_step=take_short_sequence_step,
+        )
+
     def test_different_per_node_beam_size(self):
         # per_node_beam_size = 1
         beam_search = BeamSearch(self.end_index, beam_size=3, per_node_beam_size=1)

From 3b08851b4cb3998b8ac698ff99f03c6a26f20e86 Mon Sep 17 00:00:00 2001
From: Daniel Deutsch <danfdeutsch@gmail.com>
Date: Mon, 17 May 2021 14:32:10 -0400
Subject: [PATCH 3/7] Reformatting

---
 tests/nn/beam_search_test.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/tests/nn/beam_search_test.py b/tests/nn/beam_search_test.py
index 8805e2be26d..4fcd892ab91 100644
--- a/tests/nn/beam_search_test.py
+++ b/tests/nn/beam_search_test.py
@@ -248,13 +248,9 @@ def test_take_short_sequence_step(self):
         transition matrix is expected
         """
         self.beam_search.beam_size = 5
-        expected_top_k = np.array([
-            [5, 5, 5, 5, 5],
-            [1, 5, 5, 5, 5],
-            [1, 2, 5, 5, 5],
-            [1, 2, 3, 5, 5],
-            [1, 2, 3, 4, 5]
-        ])
+        expected_top_k = np.array(
+            [[5, 5, 5, 5, 5], [1, 5, 5, 5, 5], [1, 2, 5, 5, 5], [1, 2, 3, 5, 5], [1, 2, 3, 4, 5]]
+        )
         expected_log_probs = np.log(np.array([0.9, 0.09, 0.009, 0.0009, 0.0001]))
         self._check_results(
             expected_top_k=expected_top_k,

From 9e5238a02733da94ca2ab33526326c1e39c3338d Mon Sep 17 00:00:00 2001
From: Daniel Deutsch <danfdeutsch@gmail.com>
Date: Mon, 17 May 2021 14:35:43 -0400
Subject: [PATCH 4/7] Adding entry to changelog

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a55610dea5e..ebeaf83c3db 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -24,6 +24,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   You can do this by setting the parameter `load_weights` to `False`.
   See [PR #5172](https://github.com/allenai/allennlp/pull/5172) for more details.
 - Added `SpanExtractorWithSpanWidthEmbedding`, putting specific span embedding computations into the `_embed_spans` method and leaving the common code in `SpanExtractorWithSpanWidthEmbedding` to unify the arguments, and modified `BidirectionalEndpointSpanExtractor`, `EndpointSpanExtractor` and `SelfAttentiveSpanExtractor` accordingly. Now, `SelfAttentiveSpanExtractor` can also embed span widths.
+- Added a `min_steps` parameter to `BeamSearch` to set a minimum length for the predicted sequences.
 
 ### Fixed
 

From 8e9654397dbcedd179592f1a5aec9565c259e9af Mon Sep 17 00:00:00 2001
From: Daniel Deutsch <danfdeutsch@gmail.com>
Date: Mon, 17 May 2021 15:21:34 -0400
Subject: [PATCH 5/7] Adding end token comment

---
 allennlp/nn/beam_search.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/allennlp/nn/beam_search.py b/allennlp/nn/beam_search.py
index 1e7106aa886..98c7038767f 100644
--- a/allennlp/nn/beam_search.py
+++ b/allennlp/nn/beam_search.py
@@ -465,7 +465,7 @@ class BeamSearch(FromParams):
 
     min_steps : `int`, optional (default = `0`)
         The minimum number of decoding steps to take, i.e. the minimum length of
-        the predicted sequences.
+        the predicted sequences. This does not include the end token.
     """
 
     def __init__(

From 90c8cf7e0f2aa51b1e221879073b700eba833ec8 Mon Sep 17 00:00:00 2001
From: Daniel Deutsch <danfdeutsch@gmail.com>
Date: Mon, 17 May 2021 15:28:49 -0400
Subject: [PATCH 6/7] Adding start token comment

---
 allennlp/nn/beam_search.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/allennlp/nn/beam_search.py b/allennlp/nn/beam_search.py
index 98c7038767f..161134ffb2b 100644
--- a/allennlp/nn/beam_search.py
+++ b/allennlp/nn/beam_search.py
@@ -465,7 +465,7 @@ class BeamSearch(FromParams):
 
     min_steps : `int`, optional (default = `0`)
         The minimum number of decoding steps to take, i.e. the minimum length of
-        the predicted sequences. This does not include the end token.
+        the predicted sequences. This does not include the start or end tokens.
     """
 
     def __init__(

From 0014f72895368319363a020129f71fdef1475b8e Mon Sep 17 00:00:00 2001
From: Daniel Deutsch <danfdeutsch@gmail.com>
Date: Mon, 17 May 2021 17:09:53 -0400
Subject: [PATCH 7/7] Changing param to optional

---
 allennlp/nn/beam_search.py | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/allennlp/nn/beam_search.py b/allennlp/nn/beam_search.py
index 161134ffb2b..f1d43226a83 100644
--- a/allennlp/nn/beam_search.py
+++ b/allennlp/nn/beam_search.py
@@ -1,5 +1,5 @@
 from inspect import signature
-from typing import List, Callable, Tuple, Dict, cast, TypeVar
+from typing import List, Callable, Tuple, Dict, cast, TypeVar, Optional
 import warnings
 
 from overrides import overrides
@@ -463,9 +463,10 @@ class BeamSearch(FromParams):
         Using the [`GumbelSampler`](#gumbelsampler), on the other hand, will give you
         [Stochastic Beam Search](https://api.semanticscholar.org/CorpusID:76662039).
 
-    min_steps : `int`, optional (default = `0`)
+    min_steps : `int`, optional (default = `None`)
         The minimum number of decoding steps to take, i.e. the minimum length of
-        the predicted sequences. This does not include the start or end tokens.
+        the predicted sequences. This does not include the start or end tokens. If `None`,
+        no minimum is enforced.
     """
 
     def __init__(
@@ -475,7 +476,7 @@ def __init__(
         beam_size: int = 10,
         per_node_beam_size: int = None,
         sampler: Sampler = None,
-        min_steps: int = 0,
+        min_steps: Optional[int] = None,
     ) -> None:
         if not max_steps > 0:
             raise ValueError("max_steps must be positive")
@@ -483,17 +484,18 @@ def __init__(
             raise ValueError("beam_size must be positive")
         if per_node_beam_size is not None and not per_node_beam_size > 0:
             raise ValueError("per_node_beam_size must be positive")
-        if not min_steps >= 0:
-            raise ValueError("min_steps must be non-negative")
-        if not min_steps <= max_steps:
-            raise ValueError("min_steps must be less than or equal to max_steps")
+        if min_steps is not None:
+            if not min_steps >= 0:
+                raise ValueError("min_steps must be non-negative")
+            if not min_steps <= max_steps:
+                raise ValueError("min_steps must be less than or equal to max_steps")
 
         self._end_index = end_index
         self.max_steps = max_steps
         self.beam_size = beam_size
         self.per_node_beam_size = per_node_beam_size or beam_size
         self.sampler = sampler or DeterministicSampler()
-        self.min_steps = min_steps
+        self.min_steps = min_steps or 0
 
     @staticmethod
     def _reconstruct_sequences(predictions, backpointers):