From 2ef6b03f89d60dc432cd4118bfeafff988e6baea Mon Sep 17 00:00:00 2001 From: Daniel Deutsch Date: Mon, 17 May 2021 12:01:05 -0400 Subject: [PATCH 1/7] Implementing minimum number of decoding steps --- allennlp/nn/beam_search.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/allennlp/nn/beam_search.py b/allennlp/nn/beam_search.py index fff07b7dac2..1e7106aa886 100644 --- a/allennlp/nn/beam_search.py +++ b/allennlp/nn/beam_search.py @@ -462,6 +462,10 @@ class BeamSearch(FromParams): Using the [`GumbelSampler`](#gumbelsampler), on the other hand, will give you [Stochastic Beam Search](https://api.semanticscholar.org/CorpusID:76662039). + + min_steps : `int`, optional (default = `0`) + The minimum number of decoding steps to take, i.e. the minimum length of + the predicted sequences. """ def __init__( @@ -471,6 +475,7 @@ def __init__( beam_size: int = 10, per_node_beam_size: int = None, sampler: Sampler = None, + min_steps: int = 0, ) -> None: if not max_steps > 0: raise ValueError("max_steps must be positive") @@ -478,12 +483,17 @@ def __init__( raise ValueError("beam_size must be positive") if per_node_beam_size is not None and not per_node_beam_size > 0: raise ValueError("per_node_beam_size must be positive") + if not min_steps >= 0: + raise ValueError("min_steps must be non-negative") + if not min_steps <= max_steps: + raise ValueError("min_steps must be less than or equal to max_steps") self._end_index = end_index self.max_steps = max_steps self.beam_size = beam_size self.per_node_beam_size = per_node_beam_size or beam_size self.sampler = sampler or DeterministicSampler() + self.min_steps = min_steps @staticmethod def _reconstruct_sequences(predictions, backpointers): @@ -629,6 +639,10 @@ def _search( start_class_log_probabilities, batch_size, num_classes ) + # Prevent selecting the end symbol if there is any min_steps constraint + if self.min_steps >= 1: + start_class_log_probabilities[:, self._end_index] = float("-inf") + # Get the initial predicted classed and their log probabilities. # shape: (batch_size, beam_size), (batch_size, beam_size) ( @@ -675,6 +689,13 @@ def _search( # shape: (batch_size * beam_size, num_classes) class_log_probabilities, state = step(last_predictions, state, timestep + 1) + # The `timestep`-th iteration of the for loop is generating the `timestep + 2`-th token + # of the sequence (because `timestep` is 0-indexed and we generated the first token + # before the for loop). Here we block the end index if the search is not allowed to + # terminate on this iteration. + if timestep + 2 <= self.min_steps: + class_log_probabilities[:, self._end_index] = float("-inf") + # shape: (batch_size * beam_size, num_classes) last_predictions_expanded = last_predictions.unsqueeze(-1).expand( batch_size * self.beam_size, num_classes From cb51507dc98c737865033f7472a720e752396aa5 Mon Sep 17 00:00:00 2001 From: Daniel Deutsch Date: Mon, 17 May 2021 14:06:56 -0400 Subject: [PATCH 2/7] Adding unit tests --- tests/nn/beam_search_test.py | 99 +++++++++++++++++++++++++++++++++++- 1 file changed, 98 insertions(+), 1 deletion(-) diff --git a/tests/nn/beam_search_test.py b/tests/nn/beam_search_test.py index 88614f88e9d..8805e2be26d 100644 --- a/tests/nn/beam_search_test.py +++ b/tests/nn/beam_search_test.py @@ -27,6 +27,18 @@ ] # end token -> jth token ) +# A transition matrix that favors shorter sequences over longer ones +short_sequence_transition_probabilities = torch.tensor( + [ + [0.0, 0.1, 0.0, 0.0, 0.0, 0.9], # start token -> jth token + [0.0, 0.0, 0.1, 0.0, 0.0, 0.9], # 1st token -> jth token + [0.0, 0.0, 0.0, 0.1, 0.0, 0.9], # 2nd token -> jth token + [0.0, 0.0, 0.0, 0.0, 0.1, 0.9], # ... + [0.0, 0.0, 0.0, 0.0, 0.0, 1.0], # ... + [0.2, 0.1, 0.2, 0.2, 0.2, 0.3], + ] # end token -> jth token +) + log_probabilities = torch.log( torch.tensor([[0.1, 0.3, 0.3, 0.3, 0.0, 0.0], [0.0, 0.0, 0.4, 0.3, 0.2, 0.1]]) ) @@ -62,6 +74,25 @@ def take_step_with_timestep( return take_step_no_timestep(last_predictions, state) +def take_short_sequence_step( + last_predictions: torch.Tensor, + state: Dict[str, torch.Tensor], + timestep: int, +) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: + """ + Take decoding step. + + This method is the same as `take_step_no_timestep` except it uses the + `short_sequence_transition_probabilities` transitions instead of `transition_probabilities` + """ + log_probs_list = [] + for last_token in last_predictions: + log_probs = torch.log(short_sequence_transition_probabilities[last_token.item()]) + log_probs_list.append(log_probs) + + return torch.stack(log_probs_list), state + + class BeamSearchTest(AllenNlpTestCase): def setup_method(self): super().setup_method() @@ -101,7 +132,7 @@ def _check_results( # log_probs should be shape `(batch_size, beam_size, max_predicted_length)`. assert list(log_probs.size()) == [batch_size, beam_size] - np.testing.assert_allclose(log_probs[0].numpy(), expected_log_probs) + np.testing.assert_allclose(log_probs[0].numpy(), expected_log_probs, rtol=1e-6) @pytest.mark.parametrize("step_function", [take_step_with_timestep, take_step_no_timestep]) def test_search(self, step_function): @@ -211,6 +242,72 @@ def test_early_stopping(self): beam_search=beam_search, ) + def test_take_short_sequence_step(self): + """ + Tests to ensure the top-k from the short_sequence_transition_probabilities + transition matrix is expected + """ + self.beam_search.beam_size = 5 + expected_top_k = np.array([ + [5, 5, 5, 5, 5], + [1, 5, 5, 5, 5], + [1, 2, 5, 5, 5], + [1, 2, 3, 5, 5], + [1, 2, 3, 4, 5] + ]) + expected_log_probs = np.log(np.array([0.9, 0.09, 0.009, 0.0009, 0.0001])) + self._check_results( + expected_top_k=expected_top_k, + expected_log_probs=expected_log_probs, + take_step=take_short_sequence_step, + ) + + def test_min_steps(self): + """ + Tests to ensure all output sequences are greater than a specified minimum length. + It uses the `take_short_sequence_step` step function, which favors shorter sequences. + See `test_take_short_sequence_step`. + """ + self.beam_search.beam_size = 1 + + # An empty sequence is allowed under this step function + self.beam_search.min_steps = 0 + expected_top_k = np.array([[5]]) + expected_log_probs = np.log(np.array([0.9])) + self._check_results( + expected_top_k=expected_top_k, + expected_log_probs=expected_log_probs, + take_step=take_short_sequence_step, + ) + + self.beam_search.min_steps = 1 + expected_top_k = np.array([[1, 5]]) + expected_log_probs = np.log(np.array([0.09])) + self._check_results( + expected_top_k=expected_top_k, + expected_log_probs=expected_log_probs, + take_step=take_short_sequence_step, + ) + + self.beam_search.min_steps = 2 + expected_top_k = np.array([[1, 2, 5]]) + expected_log_probs = np.log(np.array([0.009])) + self._check_results( + expected_top_k=expected_top_k, + expected_log_probs=expected_log_probs, + take_step=take_short_sequence_step, + ) + + self.beam_search.beam_size = 3 + self.beam_search.min_steps = 2 + expected_top_k = np.array([[1, 2, 5, 5, 5], [1, 2, 3, 5, 5], [1, 2, 3, 4, 5]]) + expected_log_probs = np.log(np.array([0.009, 0.0009, 0.0001])) + self._check_results( + expected_top_k=expected_top_k, + expected_log_probs=expected_log_probs, + take_step=take_short_sequence_step, + ) + def test_different_per_node_beam_size(self): # per_node_beam_size = 1 beam_search = BeamSearch(self.end_index, beam_size=3, per_node_beam_size=1) From 3b08851b4cb3998b8ac698ff99f03c6a26f20e86 Mon Sep 17 00:00:00 2001 From: Daniel Deutsch Date: Mon, 17 May 2021 14:32:10 -0400 Subject: [PATCH 3/7] Reformatting --- tests/nn/beam_search_test.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/tests/nn/beam_search_test.py b/tests/nn/beam_search_test.py index 8805e2be26d..4fcd892ab91 100644 --- a/tests/nn/beam_search_test.py +++ b/tests/nn/beam_search_test.py @@ -248,13 +248,9 @@ def test_take_short_sequence_step(self): transition matrix is expected """ self.beam_search.beam_size = 5 - expected_top_k = np.array([ - [5, 5, 5, 5, 5], - [1, 5, 5, 5, 5], - [1, 2, 5, 5, 5], - [1, 2, 3, 5, 5], - [1, 2, 3, 4, 5] - ]) + expected_top_k = np.array( + [[5, 5, 5, 5, 5], [1, 5, 5, 5, 5], [1, 2, 5, 5, 5], [1, 2, 3, 5, 5], [1, 2, 3, 4, 5]] + ) expected_log_probs = np.log(np.array([0.9, 0.09, 0.009, 0.0009, 0.0001])) self._check_results( expected_top_k=expected_top_k, From 9e5238a02733da94ca2ab33526326c1e39c3338d Mon Sep 17 00:00:00 2001 From: Daniel Deutsch Date: Mon, 17 May 2021 14:35:43 -0400 Subject: [PATCH 4/7] Adding entry to changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index a55610dea5e..ebeaf83c3db 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 You can do this by setting the parameter `load_weights` to `False`. See [PR #5172](https://github.com/allenai/allennlp/pull/5172) for more details. - Added `SpanExtractorWithSpanWidthEmbedding`, putting specific span embedding computations into the `_embed_spans` method and leaving the common code in `SpanExtractorWithSpanWidthEmbedding` to unify the arguments, and modified `BidirectionalEndpointSpanExtractor`, `EndpointSpanExtractor` and `SelfAttentiveSpanExtractor` accordingly. Now, `SelfAttentiveSpanExtractor` can also embed span widths. +- Added a `min_steps` parameter to `BeamSearch` to set a minimum length for the predicted sequences. ### Fixed From 8e9654397dbcedd179592f1a5aec9565c259e9af Mon Sep 17 00:00:00 2001 From: Daniel Deutsch Date: Mon, 17 May 2021 15:21:34 -0400 Subject: [PATCH 5/7] Adding end token comment --- allennlp/nn/beam_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/allennlp/nn/beam_search.py b/allennlp/nn/beam_search.py index 1e7106aa886..98c7038767f 100644 --- a/allennlp/nn/beam_search.py +++ b/allennlp/nn/beam_search.py @@ -465,7 +465,7 @@ class BeamSearch(FromParams): min_steps : `int`, optional (default = `0`) The minimum number of decoding steps to take, i.e. the minimum length of - the predicted sequences. + the predicted sequences. This does not include the end token. """ def __init__( From 90c8cf7e0f2aa51b1e221879073b700eba833ec8 Mon Sep 17 00:00:00 2001 From: Daniel Deutsch Date: Mon, 17 May 2021 15:28:49 -0400 Subject: [PATCH 6/7] Adding start token comment --- allennlp/nn/beam_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/allennlp/nn/beam_search.py b/allennlp/nn/beam_search.py index 98c7038767f..161134ffb2b 100644 --- a/allennlp/nn/beam_search.py +++ b/allennlp/nn/beam_search.py @@ -465,7 +465,7 @@ class BeamSearch(FromParams): min_steps : `int`, optional (default = `0`) The minimum number of decoding steps to take, i.e. the minimum length of - the predicted sequences. This does not include the end token. + the predicted sequences. This does not include the start or end tokens. """ def __init__( From 0014f72895368319363a020129f71fdef1475b8e Mon Sep 17 00:00:00 2001 From: Daniel Deutsch Date: Mon, 17 May 2021 17:09:53 -0400 Subject: [PATCH 7/7] Changing param to optional --- allennlp/nn/beam_search.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/allennlp/nn/beam_search.py b/allennlp/nn/beam_search.py index 161134ffb2b..f1d43226a83 100644 --- a/allennlp/nn/beam_search.py +++ b/allennlp/nn/beam_search.py @@ -1,5 +1,5 @@ from inspect import signature -from typing import List, Callable, Tuple, Dict, cast, TypeVar +from typing import List, Callable, Tuple, Dict, cast, TypeVar, Optional import warnings from overrides import overrides @@ -463,9 +463,10 @@ class BeamSearch(FromParams): Using the [`GumbelSampler`](#gumbelsampler), on the other hand, will give you [Stochastic Beam Search](https://api.semanticscholar.org/CorpusID:76662039). - min_steps : `int`, optional (default = `0`) + min_steps : `int`, optional (default = `None`) The minimum number of decoding steps to take, i.e. the minimum length of - the predicted sequences. This does not include the start or end tokens. + the predicted sequences. This does not include the start or end tokens. If `None`, + no minimum is enforced. """ def __init__( @@ -475,7 +476,7 @@ def __init__( beam_size: int = 10, per_node_beam_size: int = None, sampler: Sampler = None, - min_steps: int = 0, + min_steps: Optional[int] = None, ) -> None: if not max_steps > 0: raise ValueError("max_steps must be positive") @@ -483,17 +484,18 @@ def __init__( raise ValueError("beam_size must be positive") if per_node_beam_size is not None and not per_node_beam_size > 0: raise ValueError("per_node_beam_size must be positive") - if not min_steps >= 0: - raise ValueError("min_steps must be non-negative") - if not min_steps <= max_steps: - raise ValueError("min_steps must be less than or equal to max_steps") + if min_steps is not None: + if not min_steps >= 0: + raise ValueError("min_steps must be non-negative") + if not min_steps <= max_steps: + raise ValueError("min_steps must be less than or equal to max_steps") self._end_index = end_index self.max_steps = max_steps self.beam_size = beam_size self.per_node_beam_size = per_node_beam_size or beam_size self.sampler = sampler or DeterministicSampler() - self.min_steps = min_steps + self.min_steps = min_steps or 0 @staticmethod def _reconstruct_sequences(predictions, backpointers):