Adds a "duplicate()" method on instances and fields (#4294)

* modify behavior of deepcopy on TextField
* update CHANGELOG
* make a little more robust
* add 'duplicate' method
* update CHANGELOG
* add a test
allenai · May 27, 2020 · 79999ec · 79999ec
1 parent 8ff47d3
commit 79999ec
Show file tree

Hide file tree

Showing 7 changed files with 52 additions and 6 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,12 +8,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## Unreleased
 
 ### Fixed
-- Nothing yet
+
+- A bug where `TextField`s could not be duplicated since some tokenizers cannot be deep-copied.
+  See https://github.com/allenai/allennlp/issues/4270.
 
 ### Added
-- Nothing yet
+
+- A `duplicate()` method on `Instance`s and `Field`s, to be used instead of `copy.deepcopy()`.
 
 ### Changed
+
 - Nothing yet
 
 ## [v1.0.0rc5](https://github.com/allenai/allennlp/releases/tag/v1.0.0rc5) - 2020-05-26

diff --git a/allennlp/data/fields/field.py b/allennlp/data/fields/field.py
@@ -1,3 +1,4 @@
+from copy import deepcopy
 from typing import Dict, Generic, List, TypeVar
 
 import torch
@@ -120,3 +121,6 @@ def __eq__(self, other) -> bool:
 
     def __len__(self):
         raise NotImplementedError
+
+    def duplicate(self):
+        return deepcopy(self)
diff --git a/allennlp/data/fields/text_field.py b/allennlp/data/fields/text_field.py
@@ -3,6 +3,7 @@
 standard word vectors, or pass through an LSTM.
 """
 from collections import defaultdict
+from copy import deepcopy
 from typing import Dict, List, Optional, Iterator
 import textwrap
 
@@ -153,3 +154,17 @@ def __getitem__(self, idx: int) -> Token:
 
     def __len__(self) -> int:
         return len(self.tokens)
+
+    @overrides
+    def duplicate(self):
+        """
+        Overrides the behavior of `duplicate` so that `self._token_indexers` won't
+        actually be deep-copied.
+
+        Not only would it be extremely inefficient to deep-copy the token indexers,
+        but it also fails in many cases since some tokenizers (like those used in
+        the 'transformers' lib) cannot actually be deep-copied.
+        """
+        new = TextField(deepcopy(self.tokens), {k: v for k, v in self._token_indexers.items()})
+        new._indexed_tokens = deepcopy(self._indexed_tokens)
+        return new
diff --git a/allennlp/data/instance.py b/allennlp/data/instance.py
@@ -104,3 +104,8 @@ def __str__(self) -> str:
         return " ".join(
             [base_string] + [f"\t {name}: {field} \n" for name, field in self.fields.items()]
         )
+
+    def duplicate(self) -> "Instance":
+        new = Instance({k: field.duplicate() for k, field in self.fields.items()})
+        new.indexed = self.indexed
+        return new
diff --git a/allennlp/predictors/sentence_tagger.py b/allennlp/predictors/sentence_tagger.py
@@ -1,5 +1,4 @@
 from typing import List, Dict
-from copy import deepcopy
 
 from overrides import overrides
 import numpy
@@ -105,7 +104,7 @@ def predictions_to_labeled_instances(
         # Creates a new instance for each contiguous tag
         instances = []
         for labels in predicted_spans:
-            new_instance = deepcopy(instance)
+            new_instance = instance.duplicate()
             text_field: TextField = instance["tokens"]  # type: ignore
             new_instance.add_field(
                 "tags", SequenceLabelField(labels, text_field), self._model.vocab

diff --git a/allennlp/predictors/text_classifier.py b/allennlp/predictors/text_classifier.py
@@ -1,4 +1,3 @@
-from copy import deepcopy
 from typing import List, Dict
 
 from overrides import overrides
@@ -42,7 +41,7 @@ def _json_to_instance(self, json_dict: JsonDict) -> Instance:
     def predictions_to_labeled_instances(
         self, instance: Instance, outputs: Dict[str, numpy.ndarray]
     ) -> List[Instance]:
-        new_instance = deepcopy(instance)
+        new_instance = instance.duplicate()
         label = numpy.argmax(outputs["probs"])
         new_instance.add_field("label", LabelField(int(label), skip_indexing=True))
         return [new_instance]
diff --git a/tests/data/instance_test.py b/tests/data/instance_test.py
@@ -1,6 +1,7 @@
 from allennlp.common.testing import AllenNlpTestCase
 from allennlp.data import Instance
 from allennlp.data.fields import TextField, LabelField
+from allennlp.data.token_indexers import PretrainedTransformerIndexer
 from allennlp.data.tokenizers import Token
 
 
@@ -20,3 +21,22 @@ def test_instance_implements_mutable_mapping(self):
         values = [v for k, v in instance.items()]
         assert words_field in values
         assert label_field in values
+
+    def test_duplicate(self):
+        # Verify the `duplicate()` method works with a `PretrainedTransformerIndexer` in
+        # a `TextField`. See https://github.com/allenai/allennlp/issues/4270.
+        instance = Instance(
+            {
+                "words": TextField(
+                    [Token("hello")], {"tokens": PretrainedTransformerIndexer("bert-base-uncased")}
+                )
+            }
+        )
+
+        other = instance.duplicate()
+        assert other == instance
+
+        # Adding new fields to the original instance should not affect the duplicate.
+        instance.add_field("labels", LabelField("some_label"))
+        assert "labels" not in other.fields
+        assert other != instance  # sanity check on the '__eq__' method.