From c7b9f44eb6dda5199454fee7af18e03b0656607a Mon Sep 17 00:00:00 2001 From: epwalsh Date: Tue, 26 May 2020 17:54:08 -0700 Subject: [PATCH 1/6] modify behavior of deepcopy on TextField --- allennlp/data/fields/text_field.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/allennlp/data/fields/text_field.py b/allennlp/data/fields/text_field.py index b326cd798ca..92ed9575434 100644 --- a/allennlp/data/fields/text_field.py +++ b/allennlp/data/fields/text_field.py @@ -3,6 +3,7 @@ standard word vectors, or pass through an LSTM. """ from collections import defaultdict +from copy import deepcopy from typing import Dict, List, Optional, Iterator import textwrap @@ -153,3 +154,16 @@ def __getitem__(self, idx: int) -> Token: def __len__(self) -> int: return len(self.tokens) + + def __deepcopy__(self, memo): + """ + Overrides the behavior of `deepcopy` so that `self._token_indexers` won't + actually be deep-copied. + + Not only would it be extremely inefficient to deep-copy the token indexers, + but it also fails in many cases since some tokenizers (like those used in + the 'transformers' lib) cannot actually be deep-copied. + """ + new = TextField(deepcopy(self.tokens, memo), self._token_indexers) + new._indexed_tokens = deepcopy(self._indexed_tokens, memo) + return new From 2f27a0c224a43a08510807ed4e91e76040795fbf Mon Sep 17 00:00:00 2001 From: epwalsh Date: Tue, 26 May 2020 17:55:13 -0700 Subject: [PATCH 2/6] update CHANGELOG --- CHANGELOG.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f5eb19c7202..5ed9fe79cd0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,12 +8,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased ### Fixed -- Nothing yet + +- A bug where `TextField`s could not be deep-copied since some tokenizers cannot be deep-copied. + See https://github.com/allenai/allennlp/issues/4270. 
### Added + - Nothing yet ### Changed + - Nothing yet ## [v1.0.0rc5](https://github.com/allenai/allennlp/releases/tag/v1.0.0rc5) - 2020-05-26 From 8c6949ced0b9ba822ce17755f973d59cf72f6f02 Mon Sep 17 00:00:00 2001 From: epwalsh Date: Tue, 26 May 2020 18:00:08 -0700 Subject: [PATCH 3/6] make a little more robust --- allennlp/data/fields/text_field.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/allennlp/data/fields/text_field.py b/allennlp/data/fields/text_field.py index 92ed9575434..36cf96ff401 100644 --- a/allennlp/data/fields/text_field.py +++ b/allennlp/data/fields/text_field.py @@ -164,6 +164,6 @@ def __deepcopy__(self, memo): but it also fails in many cases since some tokenizers (like those used in the 'transformers' lib) cannot actually be deep-copied. """ - new = TextField(deepcopy(self.tokens, memo), self._token_indexers) + new = TextField(deepcopy(self.tokens, memo), {k: v for k, v in self._token_indexers}) new._indexed_tokens = deepcopy(self._indexed_tokens, memo) return new From 44b81205070ddde685826d92ae9b78be8de787c6 Mon Sep 17 00:00:00 2001 From: epwalsh Date: Tue, 26 May 2020 18:09:11 -0700 Subject: [PATCH 4/6] add 'duplicate' method --- allennlp/data/fields/field.py | 4 ++++ allennlp/data/fields/text_field.py | 9 +++++---- allennlp/data/instance.py | 5 +++++ allennlp/predictors/sentence_tagger.py | 3 +-- allennlp/predictors/text_classifier.py | 3 +-- 5 files changed, 16 insertions(+), 8 deletions(-) diff --git a/allennlp/data/fields/field.py b/allennlp/data/fields/field.py index 358bd89937e..b7ac31d6b80 100644 --- a/allennlp/data/fields/field.py +++ b/allennlp/data/fields/field.py @@ -1,3 +1,4 @@ +from copy import deepcopy from typing import Dict, Generic, List, TypeVar import torch @@ -120,3 +121,6 @@ def __eq__(self, other) -> bool: def __len__(self): raise NotImplementedError + + def duplicate(self): + return deepcopy(self) diff --git a/allennlp/data/fields/text_field.py b/allennlp/data/fields/text_field.py index 
36cf96ff401..80a5d238c48 100644 --- a/allennlp/data/fields/text_field.py +++ b/allennlp/data/fields/text_field.py @@ -155,15 +155,16 @@ def __getitem__(self, idx: int) -> Token: def __len__(self) -> int: return len(self.tokens) - def __deepcopy__(self, memo): + @overrides + def duplicate(self): """ - Overrides the behavior of `deepcopy` so that `self._token_indexers` won't + Overrides the behavior of `duplicate` so that `self._token_indexers` won't actually be deep-copied. Not only would it be extremely inefficient to deep-copy the token indexers, but it also fails in many cases since some tokenizers (like those used in the 'transformers' lib) cannot actually be deep-copied. """ - new = TextField(deepcopy(self.tokens, memo), {k: v for k, v in self._token_indexers}) - new._indexed_tokens = deepcopy(self._indexed_tokens, memo) + new = TextField(deepcopy(self.tokens), {k: v for k, v in self._token_indexers.items()}) + new._indexed_tokens = deepcopy(self._indexed_tokens) return new diff --git a/allennlp/data/instance.py b/allennlp/data/instance.py index 84d341b6c3f..6051fae9bc1 100644 --- a/allennlp/data/instance.py +++ b/allennlp/data/instance.py @@ -104,3 +104,8 @@ def __str__(self) -> str: return " ".join( [base_string] + [f"\t {name}: {field} \n" for name, field in self.fields.items()] ) + + def duplicate(self) -> "Instance": + new = Instance({k: field.duplicate() for k, field in self.fields.items()}) + new.indexed = self.indexed + return new diff --git a/allennlp/predictors/sentence_tagger.py b/allennlp/predictors/sentence_tagger.py index 0129a0ca21d..8f74044323d 100644 --- a/allennlp/predictors/sentence_tagger.py +++ b/allennlp/predictors/sentence_tagger.py @@ -1,5 +1,4 @@ from typing import List, Dict -from copy import deepcopy from overrides import overrides import numpy @@ -105,7 +104,7 @@ def predictions_to_labeled_instances( # Creates a new instance for each contiguous tag instances = [] for labels in predicted_spans: - new_instance = deepcopy(instance) + 
new_instance = instance.duplicate() text_field: TextField = instance["tokens"] # type: ignore new_instance.add_field( "tags", SequenceLabelField(labels, text_field), self._model.vocab diff --git a/allennlp/predictors/text_classifier.py b/allennlp/predictors/text_classifier.py index 36c33186aae..f1a7db07e0a 100644 --- a/allennlp/predictors/text_classifier.py +++ b/allennlp/predictors/text_classifier.py @@ -1,4 +1,3 @@ -from copy import deepcopy from typing import List, Dict from overrides import overrides @@ -42,7 +41,7 @@ def _json_to_instance(self, json_dict: JsonDict) -> Instance: def predictions_to_labeled_instances( self, instance: Instance, outputs: Dict[str, numpy.ndarray] ) -> List[Instance]: - new_instance = deepcopy(instance) + new_instance = instance.duplicate() label = numpy.argmax(outputs["probs"]) new_instance.add_field("label", LabelField(int(label), skip_indexing=True)) return [new_instance] From 09460ad6e307acae4561965a4561ebafa8123bea Mon Sep 17 00:00:00 2001 From: epwalsh Date: Tue, 26 May 2020 18:10:38 -0700 Subject: [PATCH 5/6] update CHANGELOG --- CHANGELOG.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5ed9fe79cd0..12c49670e3a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,12 +9,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed -- A bug where `TextField`s could not be deep-copied since some tokenizers cannot be deep-copied. +- A bug where `TextField`s could not be duplicated since some tokenizers cannot be deep-copied. See https://github.com/allenai/allennlp/issues/4270. ### Added -- Nothing yet +- A `duplicate()` method on `Instance`s and `Field`s, to be used instead of `copy.deepcopy()`. 
### Changed From dacd14ab003e46fd7a7710c1d435af9ccbbb196d Mon Sep 17 00:00:00 2001 From: epwalsh Date: Wed, 27 May 2020 08:52:52 -0700 Subject: [PATCH 6/6] add a test --- tests/data/instance_test.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tests/data/instance_test.py b/tests/data/instance_test.py index e9e025b79a5..7c8ef03f29f 100644 --- a/tests/data/instance_test.py +++ b/tests/data/instance_test.py @@ -1,6 +1,7 @@ from allennlp.common.testing import AllenNlpTestCase from allennlp.data import Instance from allennlp.data.fields import TextField, LabelField +from allennlp.data.token_indexers import PretrainedTransformerIndexer from allennlp.data.tokenizers import Token @@ -20,3 +21,22 @@ def test_instance_implements_mutable_mapping(self): values = [v for k, v in instance.items()] assert words_field in values assert label_field in values + + def test_duplicate(self): + # Verify the `duplicate()` method works with a `PretrainedTransformerIndexer` in + # a `TextField`. See https://github.com/allenai/allennlp/issues/4270. + instance = Instance( + { + "words": TextField( + [Token("hello")], {"tokens": PretrainedTransformerIndexer("bert-base-uncased")} + ) + } + ) + + other = instance.duplicate() + assert other == instance + + # Adding new fields to the original instance should not affect the duplicate. + instance.add_field("labels", LabelField("some_label")) + assert "labels" not in other.fields + assert other != instance # sanity check on the '__eq__' method.