From a85e5ca7a7687691c4d47b868ed20a2085282011 Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Wed, 17 Jul 2019 11:23:46 +0200 Subject: [PATCH 1/6] pip: pytorch-pretrained-BERT -> pytorch-transformers change --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index fd68303deb..33851a01f2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,7 +9,7 @@ sklearn sqlitedict>=1.6.0 deprecated>=1.2.4 hyperopt>=0.1.1 -pytorch-pretrained-bert>=0.6.1 +pytorch-transformers>=1.0.0 bpemb>=0.2.9 regex tabulate From 3bb7f002845aca44e8a363729644a0823e1c48af Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Wed, 17 Jul 2019 11:29:53 +0200 Subject: [PATCH 2/6] GH-873: Introduce new PyTorch-Transformers changes. The following Transformer-based architectures are now supported via pytorch-transformers: - BertEmbeddings (Updated API) - OpenAIGPTEmbeddings (Updated API, various fixes) - OpenAIGPT2Embeddings (New) - TransformerXLEmbeddings (Updated API, tokenization fixes) - XLNetEmbeddings (New) - XLMEmbeddings (New) - RoBERTaEmbeddings (New, via torch.hub module) It also possible to use a scalar mix of specified layers from the Transformer-based models. Scalar mix is proposed by Liu et al. (2019). The scalar mix implementation is copied and slightly modified from the allennlp repo (Apache 2.0 license). --- flair/embeddings.py | 681 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 626 insertions(+), 55 deletions(-) diff --git a/flair/embeddings.py b/flair/embeddings.py index 87031c0227..5e5eaec1d2 100644 --- a/flair/embeddings.py +++ b/flair/embeddings.py @@ -11,23 +11,46 @@ import torch from bpemb import BPEmb from deprecated import deprecated +from torch.nn import ParameterList, Parameter -from pytorch_pretrained_bert import ( +from pytorch_transformers import ( BertTokenizer, BertModel, TransfoXLTokenizer, TransfoXLModel, OpenAIGPTModel, OpenAIGPTTokenizer, + GPT2Model, + GPT2Tokenizer, + XLNetTokenizer, + XLMTokenizer, + XLNetModel, + XLMModel, + PreTrainedTokenizer, + PreTrainedModel, ) -from pytorch_pretrained_bert.modeling_openai import ( - PRETRAINED_MODEL_ARCHIVE_MAP as OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, +from pytorch_transformers.modeling_openai import ( + OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP as OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, ) -from pytorch_pretrained_bert.modeling_transfo_xl import ( - PRETRAINED_MODEL_ARCHIVE_MAP as TRANSFORMER_XL_PRETRAINED_MODEL_ARCHIVE_MAP, +from pytorch_transformers.modeling_gpt2 import ( + GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP as OPENAI_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, ) + + +from pytorch_transformers.modeling_transfo_xl import ( + TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP as TRANSFORMER_XL_PRETRAINED_MODEL_ARCHIVE_MAP, +) + +from pytorch_transformers.modeling_xlnet import ( + XLNET_PRETRAINED_MODEL_ARCHIVE_MAP as XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, +) + +from pytorch_transformers.modeling_xlm import ( + XLM_PRETRAINED_MODEL_ARCHIVE_MAP as XLM_PRETRAINED_MODEL_ARCHIVE_MAP, +) + from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence import flair @@ -865,10 +888,292 @@ def __str__(self): return self.name +class ScalarMix(torch.nn.Module): + """ + Computes a parameterised scalar mixture of N tensors. + This method was proposed by Liu et al. 
(2019) in the paper: + "Linguistic Knowledge and Transferability of Contextual Representations" (https://arxiv.org/abs/1903.08855) + + The implementation is copied and slightly modified from the allennlp repository and is licensed under Apache 2.0. + It can be found under: + https://github.com/allenai/allennlp/blob/master/allennlp/modules/scalar_mix.py. + """ + + def __init__(self, mixture_size: int) -> None: + """ + Inits scalar mix implementation. + ``mixture = gamma * sum(s_k * tensor_k)`` where ``s = softmax(w)``, with ``w`` and ``gamma`` scalar parameters. + :param mixture_size: size of mixtures (usually the number of layers) + """ + super(ScalarMix, self).__init__() + self.mixture_size = mixture_size + + initial_scalar_parameters = [0.0] * mixture_size + + self.scalar_parameters = ParameterList( + [ + Parameter( + torch.FloatTensor([initial_scalar_parameters[i]]).to(flair.device), + requires_grad=False, + ) + for i in range(mixture_size) + ] + ) + self.gamma = Parameter( + torch.FloatTensor([1.0]).to(flair.device), requires_grad=False + ) + + def forward(self, tensors: List[torch.Tensor]) -> torch.Tensor: + """ + Computes a weighted average of the ``tensors``. The input tensors an be any shape + with at least two dimensions, but must all be the same shape. + :param tensors: list of input tensors + :return: computed weighted average of input tensors + """ + if len(tensors) != self.mixture_size: + log.error( + "{} tensors were passed, but the module was initialized to mix {} tensors.".format( + len(tensors), self.mixture_size + ) + ) + + normed_weights = torch.nn.functional.softmax( + torch.cat([parameter for parameter in self.scalar_parameters]), dim=0 + ) + normed_weights = torch.split(normed_weights, split_size_or_sections=1) + + pieces = [] + for weight, tensor in zip(normed_weights, tensors): + pieces.append(weight * tensor) + return self.gamma * sum(pieces) + + +def _extract_embeddings( + hidden_states: List[torch.FloatTensor], + layers: List[int], + pooling_operation: str, + subword_start_idx: int, + subword_end_idx: int, + use_scalar_mix: bool = False, +) -> List[torch.FloatTensor]: + """ + Extracts subword embeddings from specified layers from hidden states. 
+ :param hidden_states: list of hidden states from model + :param layers: list of layers + :param pooling_operation: pooling operation for subword embeddings (supported: first, last, first_last and mean) + :param subword_start_idx: defines start index for subword + :param subword_end_idx: defines end index for subword + :param use_scalar_mix: determines, if scalar mix should be used + :return: list of extracted subword embeddings + """ + subtoken_embeddings: List[torch.FloatTensor] = [] + + for layer in layers: + current_embeddings = hidden_states[layer][0][subword_start_idx:subword_end_idx] + + first_embedding: torch.FloatTensor = current_embeddings[0] + if pooling_operation == "first_last": + last_embedding: torch.FloatTensor = current_embeddings[-1] + final_embedding: torch.FloatTensor = torch.cat( + [first_embedding, last_embedding] + ) + elif pooling_operation == "last": + final_embedding: torch.FloatTensor = current_embeddings[-1] + elif pooling_operation == "mean": + all_embeddings: List[torch.FloatTensor] = [ + embedding.unsqueeze(0) for embedding in current_embeddings + ] + final_embedding: torch.FloatTensor = torch.mean( + torch.cat(all_embeddings, dim=0), dim=0 + ) + else: + final_embedding: torch.FloatTensor = first_embedding + + subtoken_embeddings.append(final_embedding) + + if use_scalar_mix: + sm = ScalarMix(mixture_size=len(subtoken_embeddings)) + sm_embeddings = sm(subtoken_embeddings) + + subtoken_embeddings = [sm_embeddings] + + return subtoken_embeddings + + +def _build_token_subwords_mapping( + sentence: Sentence, tokenizer: PreTrainedTokenizer +) -> Dict[int, int]: + """ Builds a dictionary that stores the following information: + Token index (key) and number of corresponding subwords (value) for a sentence. + + :param sentence: input sentence + :param tokenizer: PyTorch-Transformers tokenization object + :return: dictionary of token index to corresponding number of subwords + """ + token_subwords_mapping: Dict[int, int] = {} + + for token in sentence.tokens: + token_text = token.text + + subwords = tokenizer.tokenize(token_text) + + token_subwords_mapping[token.idx] = len(subwords) + + return token_subwords_mapping + + +def _build_token_subwords_mapping_roberta(sentence: Sentence, model) -> Dict[int, int]: + """ Builds a dictionary that stores the following information: + Token index (key) and number of corresponding subwords (value) for a sentence. + + :param sentence: input sentence + :param model: RoBERTa model + :return: dictionary of token index to corresponding number of subwords + """ + token_subwords_mapping: Dict[int, int] = {} + + for token in sentence.tokens: + token_text = token.text + + # Leading spaces are needed for GPT2 BPE tokenization in RoBERTa (except at BOS): + # ``roberta.encode(' world').tolist()`` -> ``[0, 232, 2]`` + # ``roberta.encode('world').tolist()`` -> ``[0, 8331, 2]`` + padding = "" if token.idx == 1 else " " + + current_subwords = model.encode(padding + token_text) + + # ``roberta.encode(' world').tolist()`` will result in ``[0, 232, 2]``: + # 0 and 2 are special symbols (`` and ``), so ignore them in subword length calculation + token_subwords_mapping[token.idx] = len(current_subwords) - 2 + + return token_subwords_mapping + + +def _build_token_subwords_mapping_gpt2( + sentence: Sentence, tokenizer: PreTrainedTokenizer +) -> Dict[int, int]: + """ Builds a dictionary that stores the following information: + Token index (key) and number of corresponding subwords (value) for a sentence. 
+ + :param sentence: input sentence + :param tokenizer: PyTorch-Transformers tokenization object + :return: dictionary of token index to corresponding number of subwords + """ + token_subwords_mapping: Dict[int, int] = {} + + for token in sentence.tokens: + # Dummy token is needed to get the actually token tokenized correctly with special ``Ġ`` symbol + + if token.idx == 1: + token_text = token.text + subwords = tokenizer.tokenize(token_text) + else: + token_text = "X " + token.text + subwords = tokenizer.tokenize(token_text)[1:] + + token_subwords_mapping[token.idx] = len(subwords) + + return token_subwords_mapping + + +def _get_transformer_sentence_embeddings( + sentences: List[Sentence], + tokenizer: PreTrainedTokenizer, + model: PreTrainedModel, + name: str, + layers: List[int], + pooling_operation: str, + use_scalar_mix: bool, + bos_token: str = None, + eos_token: str = None, +) -> List[Sentence]: + """ + Builds sentence embeddings for Transformer-based architectures. + :param sentences: input sentences + :param tokenizer: tokenization object + :param model: model object + :param name: name of the Transformer-based model + :param layers: list of layers + :param pooling_operation: defines pooling operation for subword extraction + :param use_scalar_mix: defines the usage of scalar mix for specified layer(s) + :param bos_token: defines begin of sentence token (used for left padding) + :param eos_token: defines end of sentence token (used for right padding) + :return: list of sentences (each token of a sentence is now embedded) + """ + with torch.no_grad(): + for sentence in sentences: + token_subwords_mapping: Dict[int, int] = {} + + if name.startswith("roberta"): + token_subwords_mapping = _build_token_subwords_mapping_roberta( + sentence=sentence, model=model + ) + elif name.startswith("gpt2"): + token_subwords_mapping = _build_token_subwords_mapping_gpt2( + sentence=sentence, tokenizer=tokenizer + ) + else: + token_subwords_mapping = _build_token_subwords_mapping( + sentence=sentence, tokenizer=tokenizer + ) + + if name.startswith("roberta"): + subwords = model.encode(sentence.to_tokenized_string()) + else: + subwords = tokenizer.tokenize(sentence.to_tokenized_string()) + + offset = 0 + + if bos_token: + subwords = [bos_token] + subwords + offset = 1 + + if eos_token: + subwords = subwords + [eos_token] + + if not name.startswith("roberta"): + indexed_tokens = tokenizer.convert_tokens_to_ids(subwords) + tokens_tensor = torch.tensor([indexed_tokens]) + tokens_tensor = tokens_tensor.to(flair.device) + + hidden_states = model(tokens_tensor)[-1] + else: + hidden_states = model.extract_features( + subwords, return_all_hiddens=True + ) + offset = 1 + + for token in sentence.tokens: + len_subwords = token_subwords_mapping[token.idx] + + subtoken_embeddings = _extract_embeddings( + hidden_states=hidden_states, + layers=layers, + pooling_operation=pooling_operation, + subword_start_idx=offset, + subword_end_idx=offset + len_subwords, + use_scalar_mix=use_scalar_mix, + ) + + offset += len_subwords + + final_subtoken_embedding = torch.cat(subtoken_embeddings) + token.set_embedding(name, final_subtoken_embedding) + + return sentences + + class TransformerXLEmbeddings(TokenEmbeddings): - def __init__(self, model: str = "transfo-xl-wt103"): + def __init__( + self, + model: str = "transfo-xl-wt103", + layers: str = "1,2,3", + use_scalar_mix: bool = False, + ): """Transformer-XL embeddings, as proposed in Dai et al., 2019. 
:param model: name of Transformer-XL model + :param layers: comma-separated list of layers + :param use_scalar_mix: defines the usage of scalar mix for specified layer(s) """ super().__init__() @@ -876,8 +1181,12 @@ def __init__(self, model: str = "transfo-xl-wt103"): raise ValueError("Provided Transformer-XL model is not available.") self.tokenizer = TransfoXLTokenizer.from_pretrained(model) - self.model = TransfoXLModel.from_pretrained(model) + self.model = TransfoXLModel.from_pretrained( + pretrained_model_name_or_path=model, output_hidden_states=True + ) self.name = model + self.layers: List[int] = [int(layer) for layer in layers.split(",")] + self.use_scalar_mix = use_scalar_mix self.static_embeddings = True dummy_sentence: Sentence = Sentence() @@ -895,20 +1204,147 @@ def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]: self.model.to(flair.device) self.model.eval() - with torch.no_grad(): - for sentence in sentences: - token_strings = [token.text for token in sentence.tokens] - indexed_tokens = self.tokenizer.convert_tokens_to_ids(token_strings) + sentences = _get_transformer_sentence_embeddings( + sentences=sentences, + tokenizer=self.tokenizer, + model=self.model, + name=self.name, + layers=self.layers, + pooling_operation="first", + use_scalar_mix=self.use_scalar_mix, + eos_token="", + ) - tokens_tensor = torch.tensor([indexed_tokens]) - tokens_tensor = tokens_tensor.to(flair.device) + return sentences + + def extra_repr(self): + return "model={}".format(self.name) + + def __str__(self): + return self.name - hidden_states, _ = self.model(tokens_tensor) - for token, token_idx in zip( - sentence.tokens, range(len(sentence.tokens)) - ): - token.set_embedding(self.name, hidden_states[0][token_idx]) +class XLNetEmbeddings(TokenEmbeddings): + def __init__( + self, + model: str = "xlnet-large-cased", + layers: str = "1", + pooling_operation: str = "first_last", + use_scalar_mix: bool = False, + ): + """XLNet embeddings, as proposed in Yang et al., 2019. 
+ :param model: name of XLNet model + :param layers: comma-separated list of layers + :param pooling_operation: defines pooling operation for subwords + :param use_scalar_mix: defines the usage of scalar mix for specified layer(s) + """ + super().__init__() + + if model not in XLNET_PRETRAINED_MODEL_ARCHIVE_MAP.keys(): + raise ValueError("Provided XLNet model is not available.") + + self.tokenizer = XLNetTokenizer.from_pretrained(model) + self.model = XLNetModel.from_pretrained( + pretrained_model_name_or_path=model, output_hidden_states=True + ) + self.name = model + self.layers: List[int] = [int(layer) for layer in layers.split(",")] + self.pooling_operation = pooling_operation + self.use_scalar_mix = use_scalar_mix + self.static_embeddings = True + + dummy_sentence: Sentence = Sentence() + dummy_sentence.add_token(Token("hello")) + embedded_dummy = self.embed(dummy_sentence) + self.__embedding_length: int = len( + embedded_dummy[0].get_token(1).get_embedding() + ) + + @property + def embedding_length(self) -> int: + return self.__embedding_length + + def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]: + self.model.to(flair.device) + self.model.eval() + + sentences = _get_transformer_sentence_embeddings( + sentences=sentences, + tokenizer=self.tokenizer, + model=self.model, + name=self.name, + layers=self.layers, + pooling_operation=self.pooling_operation, + use_scalar_mix=self.use_scalar_mix, + bos_token="", + eos_token="", + ) + + return sentences + + def extra_repr(self): + return "model={}".format(self.name) + + def __str__(self): + return self.name + + +class XLMEmbeddings(TokenEmbeddings): + def __init__( + self, + model: str = "xlm-mlm-en-2048", + layers: str = "1", + pooling_operation: str = "first_last", + use_scalar_mix: bool = False, + ): + """ + XLM embeddings, as proposed in Guillaume et al., 2019. 
+ :param model: name of XLM model + :param layers: comma-separated list of layers + :param pooling_operation: defines pooling operation for subwords + :param use_scalar_mix: defines the usage of scalar mix for specified layer(s) + """ + super().__init__() + + if model not in XLM_PRETRAINED_MODEL_ARCHIVE_MAP.keys(): + raise ValueError("Provided XLM model is not available.") + + self.tokenizer = XLMTokenizer.from_pretrained(model) + self.model = XLMModel.from_pretrained( + pretrained_model_name_or_path=model, output_hidden_states=True + ) + self.name = model + self.layers: List[int] = [int(layer) for layer in layers.split(",")] + self.pooling_operation = pooling_operation + self.use_scalar_mix = use_scalar_mix + self.static_embeddings = True + + dummy_sentence: Sentence = Sentence() + dummy_sentence.add_token(Token("hello")) + embedded_dummy = self.embed(dummy_sentence) + self.__embedding_length: int = len( + embedded_dummy[0].get_token(1).get_embedding() + ) + + @property + def embedding_length(self) -> int: + return self.__embedding_length + + def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]: + self.model.to(flair.device) + self.model.eval() + + sentences = _get_transformer_sentence_embeddings( + sentences=sentences, + tokenizer=self.tokenizer, + model=self.model, + name=self.name, + layers=self.layers, + pooling_operation=self.pooling_operation, + use_scalar_mix=self.use_scalar_mix, + bos_token="", + eos_token="", + ) return sentences @@ -921,11 +1357,17 @@ def __str__(self): class OpenAIGPTEmbeddings(TokenEmbeddings): def __init__( - self, model: str = "openai-gpt", pooling_operation: str = "first_last" + self, + model: str = "openai-gpt", + layers: str = "1", + pooling_operation: str = "first_last", + use_scalar_mix: bool = False, ): """OpenAI GPT embeddings, as proposed in Radford et al. 2018. 
:param model: name of OpenAI GPT model + :param layers: comma-separated list of layers :param pooling_operation: defines pooling operation for subwords + :param use_scalar_mix: defines the usage of scalar mix for specified layer(s) """ super().__init__() @@ -933,10 +1375,14 @@ def __init__( raise ValueError("Provided OpenAI GPT model is not available.") self.tokenizer = OpenAIGPTTokenizer.from_pretrained(model) - self.model = OpenAIGPTModel.from_pretrained(model) + self.model = OpenAIGPTModel.from_pretrained( + pretrained_model_name_or_path=model, output_hidden_states=True + ) self.name = model - self.static_embeddings = True + self.layers: List[int] = [int(layer) for layer in layers.split(",")] self.pooling_operation = pooling_operation + self.use_scalar_mix = use_scalar_mix + self.static_embeddings = True dummy_sentence: Sentence = Sentence() dummy_sentence.add_token(Token("hello")) @@ -953,37 +1399,15 @@ def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]: self.model.to(flair.device) self.model.eval() - with torch.no_grad(): - for sentence in sentences: - for token in sentence.tokens: - token_text = token.text - - subwords = self.tokenizer.tokenize(token_text) - indexed_tokens = self.tokenizer.convert_tokens_to_ids(subwords) - tokens_tensor = torch.tensor([indexed_tokens]) - tokens_tensor = tokens_tensor.to(flair.device) - - hidden_states = self.model(tokens_tensor) - - if self.pooling_operation == "first": - # Use embedding of first subword - token.set_embedding(self.name, hidden_states[0][0]) - elif self.pooling_operation == "last": - last_embedding = hidden_states[0][len(hidden_states[0]) - 1] - token.set_embedding(self.name, last_embedding) - elif self.pooling_operation == "first_last": - # Use embedding of first and last subword - first_embedding = hidden_states[0][0] - last_embedding = hidden_states[0][len(hidden_states[0]) - 1] - final_embedding = torch.cat([first_embedding, last_embedding]) - token.set_embedding(self.name, final_embedding) - else: - # Otherwise, use mean over all subwords in token - all_embeddings = [ - embedding.unsqueeze(0) for embedding in hidden_states[0] - ] - mean = torch.mean(torch.cat(all_embeddings, dim=0), dim=0) - token.set_embedding(self.name, mean) + sentences = _get_transformer_sentence_embeddings( + sentences=sentences, + tokenizer=self.tokenizer, + model=self.model, + name=self.name, + layers=self.layers, + pooling_operation=self.pooling_operation, + use_scalar_mix=self.use_scalar_mix, + ) return sentences @@ -994,6 +1418,140 @@ def __str__(self): return self.name +class OpenAIGPT2Embeddings(TokenEmbeddings): + def __init__( + self, + model: str = "gpt2-medium", + layers: str = "1", + pooling_operation: str = "first_last", + use_scalar_mix: bool = False, + ): + """OpenAI GPT-2 embeddings, as proposed in Radford et al. 2019. 
+ :param model: name of OpenAI GPT-2 model + :param layers: comma-separated list of layers + :param pooling_operation: defines pooling operation for subwords + :param use_scalar_mix: defines the usage of scalar mix for specified layer(s) + """ + super().__init__() + + if model not in OPENAI_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys(): + raise ValueError("Provided OpenAI GPT-2 model is not available.") + + self.tokenizer = GPT2Tokenizer.from_pretrained(model) + self.model = GPT2Model.from_pretrained( + pretrained_model_name_or_path=model, output_hidden_states=True + ) + self.name = model + self.layers: List[int] = [int(layer) for layer in layers.split(",")] + self.pooling_operation = pooling_operation + self.use_scalar_mix = use_scalar_mix + self.static_embeddings = True + + dummy_sentence: Sentence = Sentence() + dummy_sentence.add_token(Token("hello")) + embedded_dummy = self.embed(dummy_sentence) + self.__embedding_length: int = len( + embedded_dummy[0].get_token(1).get_embedding() + ) + + @property + def embedding_length(self) -> int: + return self.__embedding_length + + def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]: + self.model.to(flair.device) + self.model.eval() + + sentences = _get_transformer_sentence_embeddings( + sentences=sentences, + tokenizer=self.tokenizer, + model=self.model, + name=self.name, + layers=self.layers, + pooling_operation=self.pooling_operation, + use_scalar_mix=self.use_scalar_mix, + bos_token="<|endoftext|>", + eos_token="<|endoftext|>", + ) + + return sentences + + +class RoBERTaEmbeddings(TokenEmbeddings): + def __init__( + self, + model: str = "roberta.large", + layers: str = "-1", + pooling_operation: str = "first", + use_scalar_mix: bool = False, + ): + """RoBERTa, as proposed by Liu et al. 2019. + :param model: name of RoBERTa model + :param layers: comma-separated list of layers + :param pooling_operation: defines pooling operation for subwords + :param use_scalar_mix: defines the usage of scalar mix for specified layer(s) + """ + super().__init__() + + try: + self.model = torch.hub.load("pytorch/fairseq", model) + except: + log_line(log) + log.warning("ATTENTION! sacremoses and subword_nmt needs to be installed!") + log_line(log) + pass + + self.name = model + self.layers: List[int] = [int(layer) for layer in layers.split(",")] + self.pooling_operation = pooling_operation + self.use_scalar_mix = use_scalar_mix + self.static_embeddings = True + + dummy_sentence: Sentence = Sentence() + dummy_sentence.add_token(Token("hello")) + embedded_dummy = self.embed(dummy_sentence) + self.__embedding_length: int = len( + embedded_dummy[0].get_token(1).get_embedding() + ) + + @property + def embedding_length(self) -> int: + return self.__embedding_length + + def __getstate__(self): + # Copy the object's state from self.__dict__ which contains + # all our instance attributes. Always use the dict.copy() + # method to avoid modifying the original state. + state = self.__dict__.copy() + # Remove the unpicklable entries. 
+ state["model"] = None + state["_modules"] = None + + return state + + def __setstate__(self, d): + self.__dict__ = d + # Restore unpickable entries + super().__init__() + self.model = torch.hub.load("pytorch/fairseq", self.name) + + def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]: + self.model.to(flair.device) + self.model.eval() + + sentences = _get_transformer_sentence_embeddings( + sentences=sentences, + tokenizer=None, + model=self.model, + name=self.name, + layers=self.layers, + pooling_operation=self.pooling_operation, + use_scalar_mix=self.use_scalar_mix, + ) + + return sentences + + class CharacterEmbeddings(TokenEmbeddings): """Character embeddings of words, as proposed in Lample et al., 2016.""" @@ -1507,6 +2065,7 @@ def __init__( bert_model_or_path: str = "bert-base-uncased", layers: str = "-1,-2,-3,-4", pooling_operation: str = "first", + use_scalar_mix: bool = False, ): """ Bidirectional transformer embeddings of words, as proposed in Devlin et al., 2018. @@ -1519,9 +2078,12 @@ def __init__( super().__init__() self.tokenizer = BertTokenizer.from_pretrained(bert_model_or_path) - self.model = BertModel.from_pretrained(bert_model_or_path) + self.model = BertModel.from_pretrained( + pretrained_model_name_or_path=bert_model_or_path, output_hidden_states=True + ) self.layer_indexes = [int(x) for x in layers.split(",")] self.pooling_operation = pooling_operation + self.use_scalar_mix = use_scalar_mix self.name = str(bert_model_or_path) self.static_embeddings = True @@ -1627,7 +2189,7 @@ def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]: # put encoded batch through BERT model to get all hidden states of all encoder layers self.model.to(flair.device) self.model.eval() - all_encoder_layers, _ = self.model( + _, _, all_encoder_layers = self.model( all_input_ids, token_type_ids=None, attention_mask=all_input_masks ) @@ -1649,6 +2211,11 @@ def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]: ) all_layers.append(layer_output[token_index]) + if self.use_scalar_mix: + sm = ScalarMix(mixture_size=len(all_layers), trainable=False) + sm_embeddings = sm(all_layers) + all_layers = sm_embeddings + subtoken_embeddings.append(torch.cat(all_layers)) # get the current sentence object @@ -1680,7 +2247,11 @@ def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]: @abstractmethod def embedding_length(self) -> int: """Returns the length of the embedding vector.""" - return len(self.layer_indexes) * self.model.config.hidden_size + return ( + len(self.layer_indexes) * self.model.config.hidden_size + if not self.use_scalar_mix + else self.model.config.hidden_size + ) class CharLMEmbeddings(TokenEmbeddings): From d55e7e9a733e8c084f1d74ac5b529b39458b88bd Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Sun, 4 Aug 2019 13:42:24 +0200 Subject: [PATCH 3/6] GH-873: add extensive unit tests for all Transformer-based embeddings --- tests/test_transformer_embeddings.py | 860 +++++++++++++++++++++++++++ 1 file changed, 860 insertions(+) create mode 100644 tests/test_transformer_embeddings.py diff --git a/tests/test_transformer_embeddings.py b/tests/test_transformer_embeddings.py new file mode 100644 index 0000000000..6ee51a4db2 --- /dev/null +++ b/tests/test_transformer_embeddings.py @@ -0,0 +1,860 @@ +import flair +import torch +import pytest + +from flair.data import Sentence +from flair.embeddings import ( + RoBERTaEmbeddings, + OpenAIGPTEmbeddings, + OpenAIGPT2Embeddings, + XLNetEmbeddings, + 
TransformerXLEmbeddings, + XLMEmbeddings, +) + +from pytorch_transformers import ( + OpenAIGPTModel, + OpenAIGPTTokenizer, + GPT2Model, + GPT2Tokenizer, + XLNetModel, + XLNetTokenizer, + TransfoXLModel, + TransfoXLTokenizer, + XLMModel, + XLMTokenizer, +) + +from typing import List + + +def calculate_mean_embedding( + subword_embeddings: List[torch.FloatTensor] +) -> torch.FloatTensor: + all_embeddings: List[torch.FloatTensor] = [ + embedding.unsqueeze(0) for embedding in subword_embeddings + ] + return torch.mean(torch.cat(all_embeddings, dim=0), dim=0) + + +@pytest.mark.integration +def test_roberta_embeddings(): + roberta_model = "roberta.base" + + model = torch.hub.load("pytorch/fairseq", roberta_model) + model.to(flair.device) + model.eval() + + s: str = "Berlin and Munich have a lot of puppeteer to see ." + + with torch.no_grad(): + tokens = model.encode(s) + all_layers = model.extract_features(tokens, return_all_hiddens=True) + first_layer = all_layers[1][0] + + assert len(tokens) == len(first_layer) + + # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + # + # 0, 26795, 2614, 8, 10489, 33, 10, 319, 9, 32986, 9306, 254, 7, 192, 479, 2 + # \ / | | | | | | \ | / | | | + # Berlin and Munich have a lot of puppeteer to see . + # + # 0 1 2 3 4 5 6 7 8 9 10 + + def embed_sentence( + sentence: str, + pooling_operation, + layers: str = "1", + use_scalar_mix: bool = False, + ) -> Sentence: + embeddings = RoBERTaEmbeddings( + model=roberta_model, + layers=layers, + pooling_operation=pooling_operation, + use_scalar_mix=use_scalar_mix, + ) + flair_sentence = Sentence(sentence) + embeddings.embed(flair_sentence) + + return flair_sentence + + # First subword embedding + sentence_first_subword = embed_sentence(sentence=s, pooling_operation="first") + + first_token_embedding_ref = first_layer[1].tolist() + first_token_embedding_actual = sentence_first_subword.tokens[0].embedding.tolist() + + puppeteer_first_subword_embedding_ref = first_layer[9].tolist() + puppeteer_first_subword_embedding_actual = sentence_first_subword.tokens[ + 7 + ].embedding.tolist() + + assert first_token_embedding_ref == first_token_embedding_actual + assert ( + puppeteer_first_subword_embedding_ref + == puppeteer_first_subword_embedding_actual + ) + + # Last subword embedding + sentence_last_subword = embed_sentence(sentence=s, pooling_operation="last") + + # First token is splitted into two subwords. 
+ # As we use "last" as pooling operation, we consider the last subword as "first token" here + first_token_embedding_ref = first_layer[2].tolist() + first_token_embedding_actual = sentence_last_subword.tokens[0].embedding.tolist() + + puppeteer_last_subword_embedding_ref = first_layer[11].tolist() + puppeteer_last_subword_embedding_actual = sentence_last_subword.tokens[ + 7 + ].embedding.tolist() + + assert first_token_embedding_ref == first_token_embedding_actual + assert ( + puppeteer_last_subword_embedding_ref == puppeteer_last_subword_embedding_actual + ) + + # First and last subword embedding + sentence_first_last_subword = embed_sentence( + sentence=s, pooling_operation="first_last" + ) + + first_token_embedding_ref = torch.cat([first_layer[1], first_layer[2]]).tolist() + first_token_embedding_actual = sentence_first_last_subword.tokens[ + 0 + ].embedding.tolist() + + puppeteer_first_last_subword_embedding_ref = torch.cat( + [first_layer[9], first_layer[11]] + ).tolist() + puppeteer_first_last_subword_embedding_actual = sentence_first_last_subword.tokens[ + 7 + ].embedding.tolist() + + assert first_token_embedding_ref == first_token_embedding_actual + assert ( + puppeteer_first_last_subword_embedding_ref + == puppeteer_first_last_subword_embedding_actual + ) + + # Mean of all subword embeddings + sentence_mean_subword = embed_sentence(sentence=s, pooling_operation="mean") + + first_token_embedding_ref = calculate_mean_embedding( + [first_layer[1], first_layer[2]] + ).tolist() + first_token_embedding_actual = sentence_mean_subword.tokens[0].embedding.tolist() + + puppeteer_mean_subword_embedding_ref = calculate_mean_embedding( + [first_layer[9], first_layer[10], first_layer[11]] + ).tolist() + puppeteer_mean_subword_embedding_actual = sentence_mean_subword.tokens[ + 7 + ].embedding.tolist() + + assert first_token_embedding_ref == first_token_embedding_actual + assert ( + puppeteer_mean_subword_embedding_ref == puppeteer_mean_subword_embedding_actual + ) + + # Check embedding dimension when using multiple layers + sentence_mult_layers = embed_sentence( + sentence="Munich", pooling_operation="first", layers="1,2,3,4" + ) + + ref_embedding_size = 4 * 768 + actual_embedding_size = len(sentence_mult_layers.tokens[0].embedding) + + assert ref_embedding_size == actual_embedding_size + + # Check embedding dimension when using multiple layers and scalar mix + sentence_mult_layers_scalar_mix = embed_sentence( + sentence="Berlin", + pooling_operation="first", + layers="1,2,3,4", + use_scalar_mix=True, + ) + + ref_embedding_size = 1 * 768 + actual_embedding_size = len(sentence_mult_layers_scalar_mix.tokens[0].embedding) + + assert ref_embedding_size == actual_embedding_size + + +@pytest.mark.integration +def test_gpt_embeddings(): + gpt_model: str = "openai-gpt" + + tokenizer = OpenAIGPTTokenizer.from_pretrained(gpt_model) + model = OpenAIGPTModel.from_pretrained( + pretrained_model_name_or_path=gpt_model, output_hidden_states=True + ) + model.to(flair.device) + model.eval() + + s: str = "Berlin and Munich have a lot of puppeteer to see ." 
+ + with torch.no_grad(): + tokens = tokenizer.tokenize(s) + + indexed_tokens = tokenizer.convert_tokens_to_ids(tokens) + tokens_tensor = torch.tensor([indexed_tokens]) + tokens_tensor = tokens_tensor.to(flair.device) + + hidden_states = model(tokens_tensor)[-1] + + first_layer = hidden_states[1][0] + + assert len(first_layer) == len(tokens) + + # 0 1 2 3 4 5 6 7 8 9 10 11 12 + # + # 'berlin', 'and', 'munich', 'have', 'a', 'lot', 'of', 'pupp', 'ete', 'er', 'to', 'see', '.' + # | | | | | | | \ | / | | | + # Berlin and Munich have a lot of puppeteer to see . + # + # 0 1 2 3 4 5 6 7 8 9 10 + + def embed_sentence( + sentence: str, + pooling_operation, + layers: str = "1", + use_scalar_mix: bool = False, + ) -> Sentence: + embeddings = OpenAIGPTEmbeddings( + model=gpt_model, + layers=layers, + pooling_operation=pooling_operation, + use_scalar_mix=use_scalar_mix, + ) + flair_sentence = Sentence(sentence) + embeddings.embed(flair_sentence) + + return flair_sentence + + # First subword embedding + sentence_first_subword = embed_sentence(sentence=s, pooling_operation="first") + + first_token_embedding_ref = first_layer[0].tolist() + first_token_embedding_actual = sentence_first_subword.tokens[0].embedding.tolist() + + puppeteer_first_subword_embedding_ref = first_layer[7].tolist() + puppeteer_first_subword_embedding_actual = sentence_first_subword.tokens[ + 7 + ].embedding.tolist() + + assert first_token_embedding_ref == first_token_embedding_actual + assert ( + puppeteer_first_subword_embedding_ref + == puppeteer_first_subword_embedding_actual + ) + + # Last subword embedding + sentence_last_subword = embed_sentence(sentence=s, pooling_operation="last") + + first_token_embedding_ref = first_layer[0].tolist() + first_token_embedding_actual = sentence_last_subword.tokens[0].embedding.tolist() + + puppeteer_last_subword_embedding_ref = first_layer[9].tolist() + puppeteer_last_subword_embedding_actual = sentence_last_subword.tokens[ + 7 + ].embedding.tolist() + + assert first_token_embedding_ref == first_token_embedding_actual + assert ( + puppeteer_last_subword_embedding_ref == puppeteer_last_subword_embedding_actual + ) + + # First and last subword embedding + sentence_first_last_subword = embed_sentence( + sentence=s, pooling_operation="first_last" + ) + + first_token_embedding_ref = torch.cat([first_layer[0], first_layer[0]]).tolist() + first_token_embedding_actual = sentence_first_last_subword.tokens[ + 0 + ].embedding.tolist() + + puppeteer_first_last_subword_embedding_ref = torch.cat( + [first_layer[7], first_layer[9]] + ).tolist() + puppeteer_first_last_subword_embedding_actual = sentence_first_last_subword.tokens[ + 7 + ].embedding.tolist() + + assert first_token_embedding_ref == first_token_embedding_actual + assert ( + puppeteer_first_last_subword_embedding_ref + == puppeteer_first_last_subword_embedding_actual + ) + + # Mean of all subword embeddings + sentence_mean_subword = embed_sentence(sentence=s, pooling_operation="mean") + + first_token_embedding_ref = calculate_mean_embedding([first_layer[0]]).tolist() + first_token_embedding_actual = sentence_mean_subword.tokens[0].embedding.tolist() + + puppeteer_mean_subword_embedding_ref = calculate_mean_embedding( + [first_layer[7], first_layer[8], first_layer[9]] + ).tolist() + puppeteer_mean_subword_embedding_actual = sentence_mean_subword.tokens[ + 7 + ].embedding.tolist() + + assert first_token_embedding_ref == first_token_embedding_actual + assert ( + puppeteer_mean_subword_embedding_ref == puppeteer_mean_subword_embedding_actual + ) + + 
# Check embedding dimension when using multiple layers + sentence_mult_layers = embed_sentence( + sentence="Munich", pooling_operation="first", layers="1,2,3,4" + ) + + ref_embedding_size = 4 * 768 + actual_embedding_size = len(sentence_mult_layers.tokens[0].embedding) + + assert ref_embedding_size == actual_embedding_size + + # Check embedding dimension when using multiple layers and scalar mix + sentence_mult_layers_scalar_mix = embed_sentence( + sentence="Berlin", + pooling_operation="first", + layers="1,2,3,4", + use_scalar_mix=True, + ) + + ref_embedding_size = 1 * 768 + actual_embedding_size = len(sentence_mult_layers_scalar_mix.tokens[0].embedding) + + assert ref_embedding_size == actual_embedding_size + + +@pytest.mark.integration +def test_gpt2_embeddings(): + gpt_model: str = "gpt2-medium" + + tokenizer = GPT2Tokenizer.from_pretrained(gpt_model) + model = GPT2Model.from_pretrained( + pretrained_model_name_or_path=gpt_model, output_hidden_states=True + ) + model.to(flair.device) + model.eval() + + s: str = "Berlin and Munich have a lot of puppeteer to see ." + + with torch.no_grad(): + tokens = tokenizer.tokenize("<|endoftext|>" + s + "<|endoftext|>") + + indexed_tokens = tokenizer.convert_tokens_to_ids(tokens) + tokens_tensor = torch.tensor([indexed_tokens]) + tokens_tensor = tokens_tensor.to(flair.device) + + hidden_states = model(tokens_tensor)[-1] + + first_layer = hidden_states[1][0] + + assert len(first_layer) == len(tokens) + + # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + # + # '<|endoftext|>', 'Ber', 'lin', 'Ġand', 'ĠMunich', 'Ġhave', 'Ġa', 'Ġlot', 'Ġof', 'Ġpupp', 'ete', 'er', 'Ġto', 'Ġsee', 'Ġ.', '<|endoftext|>' + # \ / | | | | | | \ | / | | | + # Berlin and Munich have a lot of puppeteer to see . + # + # 0 1 2 3 4 5 6 7 8 9 10 + + def embed_sentence( + sentence: str, + pooling_operation, + layers: str = "1", + use_scalar_mix: bool = False, + ) -> Sentence: + embeddings = OpenAIGPT2Embeddings( + model=gpt_model, + layers=layers, + pooling_operation=pooling_operation, + use_scalar_mix=use_scalar_mix, + ) + flair_sentence = Sentence(sentence) + embeddings.embed(flair_sentence) + + return flair_sentence + + # First subword embedding + sentence_first_subword = embed_sentence(sentence=s, pooling_operation="first") + + first_token_embedding_ref = first_layer[1].tolist() + first_token_embedding_actual = sentence_first_subword.tokens[0].embedding.tolist() + + puppeteer_first_subword_embedding_ref = first_layer[9].tolist() + puppeteer_first_subword_embedding_actual = sentence_first_subword.tokens[ + 7 + ].embedding.tolist() + + assert first_token_embedding_ref == first_token_embedding_actual + assert ( + puppeteer_first_subword_embedding_ref + == puppeteer_first_subword_embedding_actual + ) + + # Last subword embedding + sentence_last_subword = embed_sentence(sentence=s, pooling_operation="last") + + # First token is splitted into two subwords. 
+ # As we use "last" as pooling operation, we consider the last subword as "first token" here + first_token_embedding_ref = first_layer[2].tolist() + first_token_embedding_actual = sentence_last_subword.tokens[0].embedding.tolist() + + puppeteer_last_subword_embedding_ref = first_layer[11].tolist() + puppeteer_last_subword_embedding_actual = sentence_last_subword.tokens[ + 7 + ].embedding.tolist() + + assert first_token_embedding_ref == first_token_embedding_actual + assert ( + puppeteer_last_subword_embedding_ref == puppeteer_last_subword_embedding_actual + ) + + # First and last subword embedding + sentence_first_last_subword = embed_sentence( + sentence=s, pooling_operation="first_last" + ) + + first_token_embedding_ref = torch.cat([first_layer[1], first_layer[2]]).tolist() + first_token_embedding_actual = sentence_first_last_subword.tokens[ + 0 + ].embedding.tolist() + + puppeteer_first_last_subword_embedding_ref = torch.cat( + [first_layer[9], first_layer[11]] + ).tolist() + puppeteer_first_last_subword_embedding_actual = sentence_first_last_subword.tokens[ + 7 + ].embedding.tolist() + + assert first_token_embedding_ref == first_token_embedding_actual + assert ( + puppeteer_first_last_subword_embedding_ref + == puppeteer_first_last_subword_embedding_actual + ) + + # Mean of all subword embeddings + sentence_mean_subword = embed_sentence(sentence=s, pooling_operation="mean") + + first_token_embedding_ref = calculate_mean_embedding( + [first_layer[1], first_layer[2]] + ).tolist() + first_token_embedding_actual = sentence_mean_subword.tokens[0].embedding.tolist() + + puppeteer_mean_subword_embedding_ref = calculate_mean_embedding( + [first_layer[9], first_layer[10], first_layer[11]] + ).tolist() + puppeteer_mean_subword_embedding_actual = sentence_mean_subword.tokens[ + 7 + ].embedding.tolist() + + assert first_token_embedding_ref == first_token_embedding_actual + assert ( + puppeteer_mean_subword_embedding_ref == puppeteer_mean_subword_embedding_actual + ) + + # Check embedding dimension when using multiple layers + sentence_mult_layers = embed_sentence( + sentence="Munich", pooling_operation="first", layers="1,2,3,4" + ) + + ref_embedding_size = 4 * 1024 + actual_embedding_size = len(sentence_mult_layers.tokens[0].embedding) + + assert ref_embedding_size == actual_embedding_size + + # Check embedding dimension when using multiple layers and scalar mix + sentence_mult_layers_scalar_mix = embed_sentence( + sentence="Berlin", + pooling_operation="first", + layers="1,2,3,4", + use_scalar_mix=True, + ) + + ref_embedding_size = 1 * 1024 + actual_embedding_size = len(sentence_mult_layers_scalar_mix.tokens[0].embedding) + + assert ref_embedding_size == actual_embedding_size + + +@pytest.mark.integration +def test_xlnet_embeddings(): + xlnet_model: str = "xlnet-large-cased" + + tokenizer = XLNetTokenizer.from_pretrained(xlnet_model) + model = XLNetModel.from_pretrained( + pretrained_model_name_or_path=xlnet_model, output_hidden_states=True + ) + model.to(flair.device) + model.eval() + + s: str = "Berlin and Munich have a lot of puppeteer to see ." 
+ + with torch.no_grad(): + tokens = tokenizer.tokenize("" + s + "") + + print(tokens) + + indexed_tokens = tokenizer.convert_tokens_to_ids(tokens) + tokens_tensor = torch.tensor([indexed_tokens]) + tokens_tensor = tokens_tensor.to(flair.device) + + hidden_states = model(tokens_tensor)[-1] + + first_layer = hidden_states[1][0] + + assert len(first_layer) == len(tokens) + + # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 + # + # '', '▁Berlin', '▁and', '▁Munich', '▁have', '▁a', '▁lot', '▁of', '▁puppet', 'eer', '▁to', '▁see', '▁', '.', '' + # | | | | | | | \ / | | \ / + # Berlin and Munich have a lot of puppeteer to see . + # + # 0 1 2 3 4 5 6 7 8 9 10 + + def embed_sentence( + sentence: str, + pooling_operation, + layers: str = "1", + use_scalar_mix: bool = False, + ) -> Sentence: + embeddings = XLNetEmbeddings( + model=xlnet_model, + layers=layers, + pooling_operation=pooling_operation, + use_scalar_mix=use_scalar_mix, + ) + flair_sentence = Sentence(sentence) + embeddings.embed(flair_sentence) + + return flair_sentence + + # First subword embedding + sentence_first_subword = embed_sentence(sentence=s, pooling_operation="first") + + first_token_embedding_ref = first_layer[1].tolist() + first_token_embedding_actual = sentence_first_subword.tokens[0].embedding.tolist() + + puppeteer_first_subword_embedding_ref = first_layer[8].tolist() + puppeteer_first_subword_embedding_actual = sentence_first_subword.tokens[ + 7 + ].embedding.tolist() + + assert first_token_embedding_ref == first_token_embedding_actual + assert ( + puppeteer_first_subword_embedding_ref + == puppeteer_first_subword_embedding_actual + ) + + # Last subword embedding + sentence_last_subword = embed_sentence(sentence=s, pooling_operation="last") + + first_token_embedding_ref = first_layer[1].tolist() + first_token_embedding_actual = sentence_last_subword.tokens[0].embedding.tolist() + + puppeteer_last_subword_embedding_ref = first_layer[9].tolist() + puppeteer_last_subword_embedding_actual = sentence_last_subword.tokens[ + 7 + ].embedding.tolist() + + assert first_token_embedding_ref == first_token_embedding_actual + assert ( + puppeteer_last_subword_embedding_ref == puppeteer_last_subword_embedding_actual + ) + + # First and last subword embedding + sentence_first_last_subword = embed_sentence( + sentence=s, pooling_operation="first_last" + ) + + first_token_embedding_ref = torch.cat([first_layer[1], first_layer[1]]).tolist() + first_token_embedding_actual = sentence_first_last_subword.tokens[ + 0 + ].embedding.tolist() + + puppeteer_first_last_subword_embedding_ref = torch.cat( + [first_layer[8], first_layer[9]] + ).tolist() + puppeteer_first_last_subword_embedding_actual = sentence_first_last_subword.tokens[ + 7 + ].embedding.tolist() + + assert first_token_embedding_ref == first_token_embedding_actual + assert ( + puppeteer_first_last_subword_embedding_ref + == puppeteer_first_last_subword_embedding_actual + ) + + # Mean of all subword embeddings + sentence_mean_subword = embed_sentence(sentence=s, pooling_operation="mean") + + first_token_embedding_ref = calculate_mean_embedding([first_layer[1]]).tolist() + first_token_embedding_actual = sentence_mean_subword.tokens[0].embedding.tolist() + + puppeteer_mean_subword_embedding_ref = calculate_mean_embedding( + [first_layer[8], first_layer[9]] + ).tolist() + puppeteer_mean_subword_embedding_actual = sentence_mean_subword.tokens[ + 7 + ].embedding.tolist() + + assert first_token_embedding_ref == first_token_embedding_actual + assert ( + puppeteer_mean_subword_embedding_ref == 
puppeteer_mean_subword_embedding_actual + ) + + # Check embedding dimension when using multiple layers + sentence_mult_layers = embed_sentence( + sentence="Munich", pooling_operation="first", layers="1,2,3,4" + ) + + ref_embedding_size = 4 * model.d_model + actual_embedding_size = len(sentence_mult_layers.tokens[0].embedding) + + assert ref_embedding_size == actual_embedding_size + + # Check embedding dimension when using multiple layers and scalar mix + sentence_mult_layers_scalar_mix = embed_sentence( + sentence="Berlin", + pooling_operation="first", + layers="1,2,3,4", + use_scalar_mix=True, + ) + + ref_embedding_size = 1 * model.d_model + actual_embedding_size = len(sentence_mult_layers_scalar_mix.tokens[0].embedding) + + assert ref_embedding_size == actual_embedding_size + + +@pytest.mark.integration +def test_transformer_xl_embeddings(): + transfo_model: str = "transfo-xl-wt103" + + tokenizer = TransfoXLTokenizer.from_pretrained(transfo_model) + model = TransfoXLModel.from_pretrained( + pretrained_model_name_or_path=transfo_model, output_hidden_states=True + ) + model.to(flair.device) + model.eval() + + s: str = "Berlin and Munich have a lot of puppeteer to see ." + + with torch.no_grad(): + tokens = tokenizer.tokenize(s + "") + + print(tokens) + + indexed_tokens = tokenizer.convert_tokens_to_ids(tokens) + tokens_tensor = torch.tensor([indexed_tokens]) + tokens_tensor = tokens_tensor.to(flair.device) + + hidden_states = model(tokens_tensor)[-1] + + first_layer = hidden_states[1][0] + + assert len(first_layer) == len(tokens) + + # 0 1 2 3 4 5 6 7 8 9 10 11 + # + # 'Berlin', 'and', 'Munich', 'have', 'a', 'lot', 'of', 'puppeteer', 'to', 'see', '.', '' + # | | | | | | | | | | | + # Berlin and Munich have a lot of puppeteer to see . + # + # 0 1 2 3 4 5 6 7 8 9 10 + + def embed_sentence( + sentence: str, layers: str = "1", use_scalar_mix: bool = False + ) -> Sentence: + embeddings = TransformerXLEmbeddings( + model=transfo_model, layers=layers, use_scalar_mix=use_scalar_mix + ) + flair_sentence = Sentence(sentence) + embeddings.embed(flair_sentence) + + return flair_sentence + + sentence = embed_sentence(sentence=s) + + first_token_embedding_ref = first_layer[0].tolist() + first_token_embedding_actual = sentence.tokens[0].embedding.tolist() + + puppeteer_embedding_ref = first_layer[7].tolist() + puppeteer_embedding_actual = sentence.tokens[7].embedding.tolist() + + assert first_token_embedding_ref == first_token_embedding_actual + assert puppeteer_embedding_ref == puppeteer_embedding_actual + + # Check embedding dimension when using multiple layers + sentence_mult_layers = embed_sentence(sentence="Munich", layers="1,2,3,4") + + ref_embedding_size = 4 * model.d_embed + actual_embedding_size = len(sentence_mult_layers.tokens[0].embedding) + + assert ref_embedding_size == actual_embedding_size + + # Check embedding dimension when using multiple layers and scalar mix + sentence_mult_layers_scalar_mix = embed_sentence( + sentence="Berlin", layers="1,2,3,4", use_scalar_mix=True + ) + + ref_embedding_size = 1 * model.d_embed + actual_embedding_size = len(sentence_mult_layers_scalar_mix.tokens[0].embedding) + + assert ref_embedding_size == actual_embedding_size + + +@pytest.mark.integration +def test_xlm_embeddings(): + xlm_model: str = "xlm-mlm-en-2048" + + tokenizer = XLMTokenizer.from_pretrained(xlm_model) + model = XLMModel.from_pretrained( + pretrained_model_name_or_path=xlm_model, output_hidden_states=True + ) + model.to(flair.device) + model.eval() + + s: str = "Berlin and Munich have a lot 
of puppeteer to see ." + + with torch.no_grad(): + tokens = tokenizer.tokenize("" + s + "") + + indexed_tokens = tokenizer.convert_tokens_to_ids(tokens) + tokens_tensor = torch.tensor([indexed_tokens]) + tokens_tensor = tokens_tensor.to(flair.device) + + hidden_states = model(tokens_tensor)[-1] + + first_layer = hidden_states[1][0] + + assert len(first_layer) == len(tokens) + + # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 + # + # 'berlin', 'and', 'munich', 'have', 'a', 'lot', 'of', 'pupp', 'ete', 'er', 'to', 'see', '.', ' + # | | | | | | | \ | / | | | + # Berlin and Munich have a lot of puppeteer to see . + # + # 0 1 2 3 4 5 6 7 8 9 10 + + def embed_sentence( + sentence: str, + pooling_operation, + layers: str = "1", + use_scalar_mix: bool = False, + ) -> Sentence: + embeddings = XLMEmbeddings( + model=xlm_model, + layers=layers, + pooling_operation=pooling_operation, + use_scalar_mix=use_scalar_mix, + ) + flair_sentence = Sentence(sentence) + embeddings.embed(flair_sentence) + + return flair_sentence + + # First subword embedding + sentence_first_subword = embed_sentence(sentence=s, pooling_operation="first") + + first_token_embedding_ref = first_layer[1].tolist() + first_token_embedding_actual = sentence_first_subword.tokens[0].embedding.tolist() + + puppeteer_first_subword_embedding_ref = first_layer[8].tolist() + puppeteer_first_subword_embedding_actual = sentence_first_subword.tokens[ + 7 + ].embedding.tolist() + + assert first_token_embedding_ref == first_token_embedding_actual + assert ( + puppeteer_first_subword_embedding_ref + == puppeteer_first_subword_embedding_actual + ) + + # Last subword embedding + sentence_last_subword = embed_sentence(sentence=s, pooling_operation="last") + + first_token_embedding_ref = first_layer[1].tolist() + first_token_embedding_actual = sentence_last_subword.tokens[0].embedding.tolist() + + puppeteer_last_subword_embedding_ref = first_layer[10].tolist() + puppeteer_last_subword_embedding_actual = sentence_last_subword.tokens[ + 7 + ].embedding.tolist() + + assert first_token_embedding_ref == first_token_embedding_actual + assert ( + puppeteer_last_subword_embedding_ref == puppeteer_last_subword_embedding_actual + ) + + # First and last subword embedding + sentence_first_last_subword = embed_sentence( + sentence=s, pooling_operation="first_last" + ) + + first_token_embedding_ref = torch.cat([first_layer[1], first_layer[1]]).tolist() + first_token_embedding_actual = sentence_first_last_subword.tokens[ + 0 + ].embedding.tolist() + + puppeteer_first_last_subword_embedding_ref = torch.cat( + [first_layer[8], first_layer[10]] + ).tolist() + puppeteer_first_last_subword_embedding_actual = sentence_first_last_subword.tokens[ + 7 + ].embedding.tolist() + + assert first_token_embedding_ref == first_token_embedding_actual + assert ( + puppeteer_first_last_subword_embedding_ref + == puppeteer_first_last_subword_embedding_actual + ) + + # Mean of all subword embeddings + sentence_mean_subword = embed_sentence(sentence=s, pooling_operation="mean") + + first_token_embedding_ref = calculate_mean_embedding([first_layer[1]]).tolist() + first_token_embedding_actual = sentence_mean_subword.tokens[0].embedding.tolist() + + puppeteer_mean_subword_embedding_ref = calculate_mean_embedding( + [first_layer[8], first_layer[9], first_layer[10]] + ).tolist() + puppeteer_mean_subword_embedding_actual = sentence_mean_subword.tokens[ + 7 + ].embedding.tolist() + + assert first_token_embedding_ref == first_token_embedding_actual + assert ( + puppeteer_mean_subword_embedding_ref == 
puppeteer_mean_subword_embedding_actual + ) + + # Check embedding dimension when using multiple layers + sentence_mult_layers = embed_sentence( + sentence="Munich", pooling_operation="first", layers="1,2,3,4" + ) + + ref_embedding_size = 4 * model.embeddings.embedding_dim + actual_embedding_size = len(sentence_mult_layers.tokens[0].embedding) + + assert ref_embedding_size == actual_embedding_size + + # Check embedding dimension when using multiple layers and scalar mix + sentence_mult_layers_scalar_mix = embed_sentence( + sentence="Berlin", + pooling_operation="first", + layers="1,2,3,4", + use_scalar_mix=True, + ) + + ref_embedding_size = 1 * model.embeddings.embedding_dim + actual_embedding_size = len(sentence_mult_layers_scalar_mix.tokens[0].embedding) + + assert ref_embedding_size == actual_embedding_size From d88b6dab843ff5c21234d25373a9590ac951403b Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Sun, 4 Aug 2019 22:23:21 +0200 Subject: [PATCH 4/6] =?UTF-8?q?GH-873:=20fix=20scalar=20mix=20calculation?= =?UTF-8?q?=20for=20BERT=20=F0=9F=98=85?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- flair/embeddings.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/flair/embeddings.py b/flair/embeddings.py index 5e5eaec1d2..2a756b76c3 100644 --- a/flair/embeddings.py +++ b/flair/embeddings.py @@ -2204,17 +2204,22 @@ def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]: for token_index, _ in enumerate(feature.tokens): all_layers = [] for layer_index in self.layer_indexes: - layer_output = ( - all_encoder_layers[int(layer_index)] - .detach() - .cpu()[sentence_index] - ) + if self.use_scalar_mix: + layer_output = all_encoder_layers[int(layer_index)][ + sentence_index + ] + else: + layer_output = ( + all_encoder_layers[int(layer_index)] + .detach() + .cpu()[sentence_index] + ) all_layers.append(layer_output[token_index]) if self.use_scalar_mix: - sm = ScalarMix(mixture_size=len(all_layers), trainable=False) + sm = ScalarMix(mixture_size=len(all_layers)) sm_embeddings = sm(all_layers) - all_layers = sm_embeddings + all_layers = [sm_embeddings] subtoken_embeddings.append(torch.cat(all_layers)) From 03878cb3577a39d5957a3198a0528d15abb86dba Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Mon, 5 Aug 2019 17:41:36 +0200 Subject: [PATCH 5/6] GH-873: extensive transformer embeddings tests should be run via pytest --runslow tests --- tests/test_transformer_embeddings.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/test_transformer_embeddings.py b/tests/test_transformer_embeddings.py index 6ee51a4db2..3df5e84f48 100644 --- a/tests/test_transformer_embeddings.py +++ b/tests/test_transformer_embeddings.py @@ -37,7 +37,7 @@ def calculate_mean_embedding( return torch.mean(torch.cat(all_embeddings, dim=0), dim=0) -@pytest.mark.integration +@pytest.mark.slow def test_roberta_embeddings(): roberta_model = "roberta.base" @@ -181,7 +181,7 @@ def embed_sentence( assert ref_embedding_size == actual_embedding_size -@pytest.mark.integration +@pytest.mark.slow def test_gpt_embeddings(): gpt_model: str = "openai-gpt" @@ -330,7 +330,7 @@ def embed_sentence( assert ref_embedding_size == actual_embedding_size -@pytest.mark.integration +@pytest.mark.slow def test_gpt2_embeddings(): gpt_model: str = "gpt2-medium" @@ -483,7 +483,7 @@ def embed_sentence( assert ref_embedding_size == actual_embedding_size -@pytest.mark.integration +@pytest.mark.slow def 
test_xlnet_embeddings(): xlnet_model: str = "xlnet-large-cased" @@ -634,7 +634,7 @@ def embed_sentence( assert ref_embedding_size == actual_embedding_size -@pytest.mark.integration +@pytest.mark.slow def test_transformer_xl_embeddings(): transfo_model: str = "transfo-xl-wt103" @@ -711,7 +711,7 @@ def embed_sentence( assert ref_embedding_size == actual_embedding_size -@pytest.mark.integration +@pytest.mark.slow def test_xlm_embeddings(): xlm_model: str = "xlm-mlm-en-2048" From 1f3f78595964d82309278c50ada7aeae6cf39d30 Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Wed, 7 Aug 2019 09:27:37 +0200 Subject: [PATCH 6/6] GH-873: fix log_line import and add fastBPE as new dependency (only needed when RoBERTa embeddings are used) --- flair/embeddings.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/flair/embeddings.py b/flair/embeddings.py index 2a756b76c3..194db632b2 100644 --- a/flair/embeddings.py +++ b/flair/embeddings.py @@ -58,6 +58,7 @@ from .nn import LockedDropout, WordDropout from .data import Dictionary, Token, Sentence from .file_utils import cached_path, open_inside_zip +from .training_utils import log_line log = logging.getLogger("flair") @@ -1497,7 +1498,7 @@ def __init__( self.model = torch.hub.load("pytorch/fairseq", model) except: log_line(log) - log.warning("ATTENTION! sacremoses and subword_nmt needs to be installed!") + log.warning("ATTENTION! fastBPE, sacremoses and subword_nmt needs to be installed!") log_line(log) pass
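
A minimal usage sketch of the new Transformer-based embedding classes introduced by this patch series (assuming flair is installed from this branch together with pytorch-transformers>=1.0.0; the model name, layer selection and example sentence below are illustrative only, taken from the unit tests above):

    from flair.data import Sentence
    from flair.embeddings import XLNetEmbeddings

    # select four layers (comma-separated string) and collapse them with a
    # scalar mix (Liu et al., 2019) instead of concatenating them
    embeddings = XLNetEmbeddings(
        model="xlnet-large-cased",
        layers="1,2,3,4",
        pooling_operation="first_last",
        use_scalar_mix=True,
    )

    sentence = Sentence("Berlin and Munich have a lot of puppeteer to see .")
    embeddings.embed(sentence)

    for token in sentence.tokens:
        # with use_scalar_mix=True the embedding length equals a single layer's
        # hidden size; without it, it is len(layers) * hidden size
        print(token.text, len(token.embedding))

The same constructor arguments (model, layers, pooling_operation, use_scalar_mix) apply to OpenAIGPTEmbeddings, OpenAIGPT2Embeddings, XLMEmbeddings and RoBERTaEmbeddings; TransformerXLEmbeddings omits pooling_operation because Transformer-XL uses word-level tokenization.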