From c7b9f44eb6dda5199454fee7af18e03b0656607a Mon Sep 17 00:00:00 2001 From: epwalsh Date: Tue, 26 May 2020 17:54:08 -0700 Subject: [PATCH 1/6] modify behavior of deepcopy on TextField --- allennlp/data/fields/text_field.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/allennlp/data/fields/text_field.py b/allennlp/data/fields/text_field.py index b326cd798ca..92ed9575434 100644 --- a/allennlp/data/fields/text_field.py +++ b/allennlp/data/fields/text_field.py @@ -3,6 +3,7 @@ standard word vectors, or pass through an LSTM. """ from collections import defaultdict +from copy import deepcopy from typing import Dict, List, Optional, Iterator import textwrap @@ -153,3 +154,16 @@ def __getitem__(self, idx: int) -> Token: def __len__(self) -> int: return len(self.tokens) + + def __deepcopy__(self, memo): + """ + Overrides the behavior of `deepcopy` so that `self._token_indexers` won't + actually be deep-copied. + + Not only would it be extremely inefficient to deep-copy the token indexers, + but it also fails in many cases since some tokenizers (like those used in + the 'transformers' lib) cannot actually be deep-copied. + """ + new = TextField(deepcopy(self.tokens, memo), self._token_indexers) + new._indexed_tokens = deepcopy(self._indexed_tokens, memo) + return new From 2f27a0c224a43a08510807ed4e91e76040795fbf Mon Sep 17 00:00:00 2001 From: epwalsh Date: Tue, 26 May 2020 17:55:13 -0700 Subject: [PATCH 2/6] update CHANGELOG --- CHANGELOG.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f5eb19c7202..5ed9fe79cd0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,12 +8,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased ### Fixed -- Nothing yet + +- A bug where `TextField`s could not be deep-copied since some tokenizers cannot be deep-copied. + See https://github.com/allenai/allennlp/issues/4270. 
### Added + - Nothing yet ### Changed + - Nothing yet ## [v1.0.0rc5](https://github.com/allenai/allennlp/releases/tag/v1.0.0rc5) - 2020-05-26 From 8c6949ced0b9ba822ce17755f973d59cf72f6f02 Mon Sep 17 00:00:00 2001 From: epwalsh Date: Tue, 26 May 2020 18:00:08 -0700 Subject: [PATCH 3/6] make a little more robust --- allennlp/data/fields/text_field.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/allennlp/data/fields/text_field.py b/allennlp/data/fields/text_field.py index 92ed9575434..36cf96ff401 100644 --- a/allennlp/data/fields/text_field.py +++ b/allennlp/data/fields/text_field.py @@ -164,6 +164,6 @@ def __deepcopy__(self, memo): but it also fails in many cases since some tokenizers (like those used in the 'transformers' lib) cannot actually be deep-copied. """ - new = TextField(deepcopy(self.tokens, memo), self._token_indexers) + new = TextField(deepcopy(self.tokens, memo), {k: v for k, v in self._token_indexers}) new._indexed_tokens = deepcopy(self._indexed_tokens, memo) return new From 44b81205070ddde685826d92ae9b78be8de787c6 Mon Sep 17 00:00:00 2001 From: epwalsh Date: Tue, 26 May 2020 18:09:11 -0700 Subject: [PATCH 4/6] add 'duplicate' method --- allennlp/data/fields/field.py | 4 ++++ allennlp/data/fields/text_field.py | 9 +++++---- allennlp/data/instance.py | 5 +++++ allennlp/predictors/sentence_tagger.py | 3 +-- allennlp/predictors/text_classifier.py | 3 +-- 5 files changed, 16 insertions(+), 8 deletions(-) diff --git a/allennlp/data/fields/field.py b/allennlp/data/fields/field.py index 358bd89937e..b7ac31d6b80 100644 --- a/allennlp/data/fields/field.py +++ b/allennlp/data/fields/field.py @@ -1,3 +1,4 @@ +from copy import deepcopy from typing import Dict, Generic, List, TypeVar import torch @@ -120,3 +121,6 @@ def __eq__(self, other) -> bool: def __len__(self): raise NotImplementedError + + def duplicate(self): + return deepcopy(self) diff --git a/allennlp/data/fields/text_field.py b/allennlp/data/fields/text_field.py index 
36cf96ff401..80a5d238c48 100644 --- a/allennlp/data/fields/text_field.py +++ b/allennlp/data/fields/text_field.py @@ -155,15 +155,16 @@ def __getitem__(self, idx: int) -> Token: def __len__(self) -> int: return len(self.tokens) - def __deepcopy__(self, memo): + @overrides + def duplicate(self): """ - Overrides the behavior of `deepcopy` so that `self._token_indexers` won't + Overrides the behavior of `duplicate` so that `self._token_indexers` won't actually be deep-copied. Not only would it be extremely inefficient to deep-copy the token indexers, but it also fails in many cases since some tokenizers (like those used in the 'transformers' lib) cannot actually be deep-copied. """ - new = TextField(deepcopy(self.tokens, memo), {k: v for k, v in self._token_indexers}) - new._indexed_tokens = deepcopy(self._indexed_tokens, memo) + new = TextField(deepcopy(self.tokens), {k: v for k, v in self._token_indexers.items()}) + new._indexed_tokens = deepcopy(self._indexed_tokens) return new diff --git a/allennlp/data/instance.py b/allennlp/data/instance.py index 84d341b6c3f..6051fae9bc1 100644 --- a/allennlp/data/instance.py +++ b/allennlp/data/instance.py @@ -104,3 +104,8 @@ def __str__(self) -> str: return " ".join( [base_string] + [f"\t {name}: {field} \n" for name, field in self.fields.items()] ) + + def duplicate(self) -> "Instance": + new = Instance({k: field.duplicate() for k, field in self.fields.items()}) + new.indexed = self.indexed + return new diff --git a/allennlp/predictors/sentence_tagger.py b/allennlp/predictors/sentence_tagger.py index 0129a0ca21d..8f74044323d 100644 --- a/allennlp/predictors/sentence_tagger.py +++ b/allennlp/predictors/sentence_tagger.py @@ -1,5 +1,4 @@ from typing import List, Dict -from copy import deepcopy from overrides import overrides import numpy @@ -105,7 +104,7 @@ def predictions_to_labeled_instances( # Creates a new instance for each contiguous tag instances = [] for labels in predicted_spans: - new_instance = deepcopy(instance) + 
new_instance = instance.duplicate() text_field: TextField = instance["tokens"] # type: ignore new_instance.add_field( "tags", SequenceLabelField(labels, text_field), self._model.vocab diff --git a/allennlp/predictors/text_classifier.py b/allennlp/predictors/text_classifier.py index 36c33186aae..f1a7db07e0a 100644 --- a/allennlp/predictors/text_classifier.py +++ b/allennlp/predictors/text_classifier.py @@ -1,4 +1,3 @@ -from copy import deepcopy from typing import List, Dict from overrides import overrides @@ -42,7 +41,7 @@ def _json_to_instance(self, json_dict: JsonDict) -> Instance: def predictions_to_labeled_instances( self, instance: Instance, outputs: Dict[str, numpy.ndarray] ) -> List[Instance]: - new_instance = deepcopy(instance) + new_instance = instance.duplicate() label = numpy.argmax(outputs["probs"]) new_instance.add_field("label", LabelField(int(label), skip_indexing=True)) return [new_instance] From 09460ad6e307acae4561965a4561ebafa8123bea Mon Sep 17 00:00:00 2001 From: epwalsh Date: Tue, 26 May 2020 18:10:38 -0700 Subject: [PATCH 5/6] update CHANGELOG --- CHANGELOG.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5ed9fe79cd0..12c49670e3a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,12 +9,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed -- A bug where `TextField`s could not be deep-copied since some tokenizers cannot be deep-copied. +- A bug where `TextField`s could not be duplicated since some tokenizers cannot be deep-copied. See https://github.com/allenai/allennlp/issues/4270. ### Added -- Nothing yet +- A `duplicate()` method on `Instance`s and `Field`s, to be used instead of `copy.deepcopy()`. 
### Changed From dacd14ab003e46fd7a7710c1d435af9ccbbb196d Mon Sep 17 00:00:00 2001 From: epwalsh Date: Wed, 27 May 2020 08:52:52 -0700 Subject: [PATCH 6/6] add a test --- tests/data/instance_test.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tests/data/instance_test.py b/tests/data/instance_test.py index e9e025b79a5..7c8ef03f29f 100644 --- a/tests/data/instance_test.py +++ b/tests/data/instance_test.py @@ -1,6 +1,7 @@ from allennlp.common.testing import AllenNlpTestCase from allennlp.data import Instance from allennlp.data.fields import TextField, LabelField +from allennlp.data.token_indexers import PretrainedTransformerIndexer from allennlp.data.tokenizers import Token @@ -20,3 +21,22 @@ def test_instance_implements_mutable_mapping(self): values = [v for k, v in instance.items()] assert words_field in values assert label_field in values + + def test_duplicate(self): + # Verify the `duplicate()` method works with a `PretrainedTransformerIndexer` in + # a `TextField`. See https://github.com/allenai/allennlp/issues/4270. + instance = Instance( + { + "words": TextField( + [Token("hello")], {"tokens": PretrainedTransformerIndexer("bert-base-uncased")} + ) + } + ) + + other = instance.duplicate() + assert other == instance + + # Adding new fields to the original instance should not affect the duplicate. + instance.add_field("labels", LabelField("some_label")) + assert "labels" not in other.fields + assert other != instance # sanity check on the '__eq__' method.