This repository has been archived by the owner on Dec 16, 2022. It is now read-only.

Commit

Create Vocabulary from both pretrained transformers and instances (#5368)
* Add Vocabulary constructor from both pretrained transformers and instances

* undo autoformatting on changelog (sorry!)

* update changelog without autoformatting everything

* Remove allowing multiple pretrained transformers to a single namespace

See #5368 (comment) for more information
amitkparekh authored Aug 24, 2021
1 parent 5dc80a6 commit 75af38e
Showing 3 changed files with 168 additions and 0 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -30,6 +30,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html)
- Added `ScaledDotProductMatrixAttention`, and converted the transformer toolkit to use it
- Added tests to ensure that all `Attention` and `MatrixAttention` implementations are interchangeable
- Added a way for AllenNLP Tango to read and write datasets lazily.
- Added `from_pretrained_transformer_and_instances` constructor to `Vocabulary`

### Fixed

75 changes: 75 additions & 0 deletions allennlp/data/vocabulary.py
@@ -423,6 +423,77 @@ def from_files_and_instances(
        )
        return vocab

    @classmethod
    def from_pretrained_transformer_and_instances(
        cls,
        instances: Iterable["adi.Instance"],
        transformers: Dict[str, str],
        min_count: Dict[str, int] = None,
        max_vocab_size: Union[int, Dict[str, int]] = None,
        non_padded_namespaces: Iterable[str] = DEFAULT_NON_PADDED_NAMESPACES,
        pretrained_files: Optional[Dict[str, str]] = None,
        only_include_pretrained_words: bool = False,
        tokens_to_add: Dict[str, List[str]] = None,
        min_pretrained_embeddings: Dict[str, int] = None,
        padding_token: Optional[str] = DEFAULT_PADDING_TOKEN,
        oov_token: Optional[str] = DEFAULT_OOV_TOKEN,
    ) -> "Vocabulary":
        """
        Construct a vocabulary from a collection of `Instance`'s and some parameters, then
        extend it with vocabularies generated from pretrained transformers.

        The vocabulary from instances is built by passing the parameters through to
        :func:`from_instances`, and is then updated by merging in vocabularies from
        :func:`from_pretrained_transformer`. See those methods for full descriptions of
        what the other parameters do.

        The `instances` parameter does not get an entry in a typical AllenNLP configuration
        file, but the other parameters do (if you want non-default values).

        # Parameters

        transformers : `Dict[str, str]`
            Dictionary mapping vocab namespaces (keys) to a transformer model name (value).
            Namespaces not included will be ignored.

        # Examples

        You can use this constructor by modifying the following example within your training
        configuration.

        ```jsonnet
        {
            vocabulary: {
                type: 'from_pretrained_transformer_and_instances',
                transformers: {
                    'namespace1': 'bert-base-cased',
                    'namespace2': 'roberta-base',
                },
            }
        }
        ```
        """
        vocab = cls.from_instances(
            instances=instances,
            min_count=min_count,
            max_vocab_size=max_vocab_size,
            non_padded_namespaces=non_padded_namespaces,
            pretrained_files=pretrained_files,
            only_include_pretrained_words=only_include_pretrained_words,
            tokens_to_add=tokens_to_add,
            min_pretrained_embeddings=min_pretrained_embeddings,
            padding_token=padding_token,
            oov_token=oov_token,
        )

        # Merge each transformer's token vocabulary into its target namespace.
        for namespace, model_name in transformers.items():
            transformer_vocab = cls.from_pretrained_transformer(
                model_name=model_name, namespace=namespace
            )
            vocab.extend_from_vocab(transformer_vocab)

        return vocab

    @classmethod
    def empty(cls) -> "Vocabulary":
        """
@@ -810,6 +881,10 @@ def print_statistics(self) -> None:
Vocabulary.register("from_pretrained_transformer", constructor="from_pretrained_transformer")(
Vocabulary
)
Vocabulary.register(
"from_pretrained_transformer_and_instances",
constructor="from_pretrained_transformer_and_instances",
)(Vocabulary)
Vocabulary.register("from_instances", constructor="from_instances")(Vocabulary)
Vocabulary.register("from_files", constructor="from_files")(Vocabulary)
Vocabulary.register("extend", constructor="from_files_and_instances")(Vocabulary)
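For orientation, here is a minimal sketch of calling the new constructor directly from Python rather than through a jsonnet config. The toy instances and the `tokens` namespace are illustrative, not part of this commit, and the call downloads the `bert-base-cased` tokenizer from HuggingFace on first use:

```python
from allennlp.data.fields import TextField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token
from allennlp.data.vocabulary import Vocabulary

# A tiny dataset with a single "tokens" namespace (illustrative only).
indexer = SingleIdTokenIndexer(namespace="tokens")
field = TextField([Token(t) for t in ["hello", "world"]], {"tokens": indexer})
instances = [Instance({"text": field})]

# Counts tokens from the instances, then merges in the BERT wordpiece
# vocabulary for the "tokens" namespace.
vocab = Vocabulary.from_pretrained_transformer_and_instances(
    instances, transformers={"tokens": "bert-base-cased"}
)
print(vocab.get_vocab_size("tokens"))
```

Because `extend_from_vocab` does not re-add tokens that are already present, the resulting namespace holds the union of the instance tokens and the transformer's vocabulary.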
92 changes: 92 additions & 0 deletions tests/data/vocabulary_test.py
@@ -905,3 +905,95 @@ def test_from_pretrained_transformer(self, model_name):

        vocab1 = Vocabulary.from_files(self.TEST_DIR / "vocab")
        assert vocab1._token_to_index[namespace] == tokenizer.get_vocab()


class TestVocabularyFromPretrainedTransformerAndInstances(AllenNlpTestCase):
    def setup_method(self):
        super().setup_method()

        # Create dataset with single namespace
        token_indexer_1 = SingleIdTokenIndexer("namespace_1")
        text_field_1 = TextField(
            [Token(t) for t in ["a", "a", "a", "a", "b", "b", "c", "c", "c"]],
            {"namespace_1": token_indexer_1},
        )
        single_field_instance = Instance({"text": text_field_1})
        self.single_namespace_dataset = Batch([single_field_instance])

        # Create dataset with multiple namespaces
        token_indexer_2 = SingleIdTokenIndexer("namespace_2")
        text_field_2 = TextField(
            [Token(t) for t in ["d", "d", "d", "d", "e", "e", "f", "f", "f"]],
            {"namespace_2": token_indexer_2},
        )
        multiple_field_instance = Instance(
            {"first_text": text_field_1, "second_text": text_field_2}
        )
        self.multiple_namespace_dataset = Batch([multiple_field_instance])

    @staticmethod
    def _get_expected_vocab(dataset, namespace, model_name):
        vocab_from_instances = Vocabulary.from_instances(dataset)
        instance_tokens = set(vocab_from_instances._token_to_index[namespace].keys())
        transformer_tokens = set(
            Vocabulary.from_pretrained_transformer(model_name, namespace)
            ._token_to_index[namespace]
            .keys()
        )
        return instance_tokens.union(transformer_tokens)

    def _get_expected_vocab_size(self, dataset, namespace, model_name):
        return len(self._get_expected_vocab(dataset, namespace, model_name))

    @pytest.mark.parametrize("model_name", ["bert-base-cased", "roberta-base"])
    def test_with_single_namespace_and_single_model(self, model_name):
        dataset = self.single_namespace_dataset
        namespace = "namespace_1"

        expected_vocab_size = self._get_expected_vocab_size(dataset, namespace, model_name)

        vocab = Vocabulary.from_pretrained_transformer_and_instances(
            dataset, {namespace: model_name}
        )

        assert vocab.get_vocab_size(namespace) == expected_vocab_size

    @pytest.mark.parametrize("model_name", ["bert-base-cased", "roberta-base"])
    def test_only_updates_single_namespace_when_multiple_present(self, model_name):
        dataset = self.multiple_namespace_dataset
        namespace1 = "namespace_1"
        namespace2 = "namespace_2"

        namespace1_vocab_size = self._get_expected_vocab_size(dataset, namespace1, model_name)
        namespace2_vocab_size = Vocabulary.from_instances(dataset).get_vocab_size("namespace_2")

        vocab = Vocabulary.from_pretrained_transformer_and_instances(
            dataset, {namespace1: model_name}
        )

        # Make sure only the desired namespace is extended
        assert vocab.get_vocab_size(namespace1) == namespace1_vocab_size
        assert vocab.get_vocab_size(namespace2) == namespace2_vocab_size

    @pytest.mark.parametrize("namespace1_model_name", ["bert-base-cased", "roberta-base"])
    @pytest.mark.parametrize("namespace2_model_name", ["bert-base-cased", "roberta-base"])
    def test_with_different_models_per_namespace(
        self, namespace1_model_name, namespace2_model_name
    ):
        dataset = self.multiple_namespace_dataset
        namespace1 = "namespace_1"
        namespace2 = "namespace_2"

        namespace1_vocab_size = self._get_expected_vocab_size(
            dataset, namespace1, namespace1_model_name
        )
        namespace2_vocab_size = self._get_expected_vocab_size(
            dataset, namespace2, namespace2_model_name
        )

        vocab = Vocabulary.from_pretrained_transformer_and_instances(
            dataset, {namespace1: namespace1_model_name, namespace2: namespace2_model_name}
        )

        assert vocab.get_vocab_size(namespace1) == namespace1_vocab_size
        assert vocab.get_vocab_size(namespace2) == namespace2_vocab_size
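The `_get_expected_vocab` helper above encodes the invariant these tests check: after extension, a namespace's token set is the union of the tokens counted from instances and the transformer's vocabulary. A minimal sketch of that merge semantics, assuming AllenNLP's default `@@PADDING@@`/`@@UNKNOWN@@` bookkeeping tokens (the namespace and token values are illustrative):

```python
from allennlp.data.vocabulary import Vocabulary

base = Vocabulary()
base.add_tokens_to_namespace(["a", "b"], namespace="tokens")

other = Vocabulary()
other.add_tokens_to_namespace(["b", "c"], namespace="tokens")

# Tokens already present keep their ids; new tokens are appended, so the
# namespace ends up holding the union {a, b, c}.
base.extend_from_vocab(other)
assert base.get_vocab_size("tokens") == 5  # padding + OOV + {a, b, c}
```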
