diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py index ff40551da128..3ea0233024bc 100644 --- a/src/transformers/modeling_tf_utils.py +++ b/src/transformers/modeling_tf_utils.py @@ -45,6 +45,10 @@ logger = logging.get_logger(__name__) +TFModelInputType = Union[ + List[tf.Tensor], List[np.ndarray], Dict[str, tf.Tensor], Dict[str, np.ndarray], np.ndarray, tf.Tensor +] + class TFModelUtilsMixin: """ diff --git a/src/transformers/models/albert/modeling_tf_albert.py b/src/transformers/models/albert/modeling_tf_albert.py index 108f55dcf602..8a8a67127063 100644 --- a/src/transformers/models/albert/modeling_tf_albert.py +++ b/src/transformers/models/albert/modeling_tf_albert.py @@ -17,7 +17,7 @@ from dataclasses import dataclass -from typing import Optional, Tuple +from typing import Any, Dict, Optional, Tuple import tensorflow as tf @@ -82,16 +82,16 @@ def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, self.hidden_size = hidden_size self.initializer_range = initializer_range - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.weight = self.add_weight( name="weight", shape=[self.vocab_size, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), + initializer=get_initializer(self.initializer_range), ) - super().build(input_shape=input_shape) + super().build(input_shape) - def get_config(self): + def get_config(self) -> Dict[str, Any]: config = { "vocab_size": self.vocab_size, "hidden_size": self.hidden_size, @@ -101,14 +101,14 @@ def get_config(self): return dict(list(base_config.items()) + list(config.items())) - def call(self, input_ids): + def call(self, input_ids: tf.Tensor) -> tf.Tensor: flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1]) embeddings = tf.gather(params=self.weight, indices=flat_input_ids) embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0) + tensor=embeddings, shape=tf.concat(values=[shape_list(input_ids), [self.hidden_size]], axis=0) ) - embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size]) + embeddings.set_shape(input_ids.shape.as_list() + [self.hidden_size]) return embeddings @@ -122,16 +122,16 @@ def __init__(self, type_vocab_size: int, hidden_size: int, initializer_range: fl self.hidden_size = hidden_size self.initializer_range = initializer_range - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.token_type_embeddings = self.add_weight( name="embeddings", shape=[self.type_vocab_size, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), + initializer=get_initializer(self.initializer_range), ) - super().build(input_shape=input_shape) + super().build(input_shape) - def get_config(self): + def get_config(self) -> Dict[str, Any]: config = { "type_vocab_size": self.type_vocab_size, "hidden_size": self.hidden_size, @@ -141,15 +141,15 @@ def get_config(self): return dict(list(base_config.items()) + list(config.items())) - def call(self, token_type_ids): + def call(self, token_type_ids: tf.Tensor) -> tf.Tensor: flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1]) one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype) embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings) embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=token_type_ids), 
[self.hidden_size]], axis=0) + tensor=embeddings, shape=tf.concat(values=[shape_list(token_type_ids), [self.hidden_size]], axis=0) ) - embeddings.set_shape(shape=token_type_ids.shape.as_list() + [self.hidden_size]) + embeddings.set_shape(token_type_ids.shape.as_list() + [self.hidden_size]) return embeddings @@ -163,16 +163,16 @@ def __init__(self, max_position_embeddings: int, hidden_size: int, initializer_r self.hidden_size = hidden_size self.initializer_range = initializer_range - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.position_embeddings = self.add_weight( name="embeddings", shape=[self.max_position_embeddings, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), + initializer=get_initializer(self.initializer_range), ) super().build(input_shape) - def get_config(self): + def get_config(self) -> Dict[str, Any]: config = { "max_position_embeddings": self.max_position_embeddings, "hidden_size": self.hidden_size, @@ -182,8 +182,8 @@ def get_config(self): return dict(list(base_config.items()) + list(config.items())) - def call(self, position_ids): - input_shape = shape_list(tensor=position_ids) + def call(self, position_ids: tf.Tensor) -> tf.Tensor: + input_shape = shape_list(position_ids) position_embeddings = self.position_embeddings[: input_shape[1], :] return tf.broadcast_to(input=position_embeddings, shape=input_shape) @@ -218,7 +218,14 @@ def __init__(self, config, **kwargs): self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) # Copied from transformers.models.bert.modeling_tf_bert.TFBertEmbeddings.call - def call(self, input_ids=None, position_ids=None, token_type_ids=None, inputs_embeds=None, training=False): + def call( + self, + input_ids: tf.Tensor, + position_ids: tf.Tensor, + token_type_ids: tf.Tensor, + inputs_embeds: tf.Tensor, + training: bool = False, + ) -> tf.Tensor: """ Applies embedding based on inputs tensor. 
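# --- Editor's note, not part of the patch: the hunks above split ALBERT's embedding logic
# into dedicated word / token-type / position sub-layers. Below is a minimal, hedged sketch
# of the token-type variant (the one-hot + matmul lookup shown above) as a standalone Keras
# layer; the class name, the TruncatedNormal stand-in for get_initializer, and the use of
# tf.shape instead of shape_list are illustrative assumptions, not taken from the patch.
import tensorflow as tf

class DemoTokenTypeEmbedding(tf.keras.layers.Layer):
    def __init__(self, type_vocab_size: int, hidden_size: int, **kwargs):
        super().__init__(**kwargs)
        self.type_vocab_size = type_vocab_size
        self.hidden_size = hidden_size

    def build(self, input_shape: tf.TensorShape):
        # One trainable table of shape [type_vocab_size, hidden_size].
        self.token_type_embeddings = self.add_weight(
            name="embeddings",
            shape=[self.type_vocab_size, self.hidden_size],
            initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
        )
        super().build(input_shape)

    def call(self, token_type_ids: tf.Tensor) -> tf.Tensor:
        # Same lookup-by-one-hot trick as the patched layer: flatten, one-hot, matmul,
        # then reshape back to [..., hidden_size].
        flat_ids = tf.reshape(token_type_ids, shape=[-1])
        one_hot = tf.one_hot(flat_ids, depth=self.type_vocab_size, dtype=self._compute_dtype)
        embeddings = tf.matmul(one_hot, self.token_type_embeddings)
        return tf.reshape(embeddings, tf.concat([tf.shape(token_type_ids), [self.hidden_size]], axis=0))

# e.g. DemoTokenTypeEmbedding(2, 768)(tf.zeros((1, 8), dtype=tf.int32)) has shape (1, 8, 768)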
@@ -876,7 +883,7 @@ def call( return outputs # Copied from transformers.models.bert.modeling_tf_bert.TFBertModel.serving_output - def serving_output(self, output): + def serving_output(self, output: TFBaseModelOutputWithPooling) -> TFBaseModelOutputWithPooling: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1102,7 +1109,7 @@ def call( ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMaskedLM.serving_output - def serving_output(self, output): + def serving_output(self, output: TFMaskedLMOutput) -> TFMaskedLMOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1205,7 +1212,7 @@ def call( ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification.serving_output - def serving_output(self, output): + def serving_output(self, output: TFSequenceClassifierOutput) -> TFSequenceClassifierOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1307,7 +1314,7 @@ def call( ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForTokenClassification.serving_output - def serving_output(self, output): + def serving_output(self, output: TFTokenClassifierOutput) -> TFTokenClassifierOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1422,7 +1429,7 @@ def call( ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForQuestionAnswering.serving_output - def serving_output(self, output): + def serving_output(self, output: TFQuestionAnsweringModelOutput) -> TFQuestionAnsweringModelOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1569,13 +1576,14 @@ def call( } ] ) - def serving(self, inputs): - output = self.call(inputs) + # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMultipleChoice.serving + def serving(self, inputs: Dict[str, tf.Tensor]): + output = self.call(input_ids=inputs) return self.serving_output(output) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMultipleChoice.serving_output - def serving_output(self, output): + def serving_output(self, output: TFMultipleChoiceModelOutput) -> TFMultipleChoiceModelOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None diff --git a/src/transformers/models/bert/modeling_tf_bert.py b/src/transformers/models/bert/modeling_tf_bert.py index 01d51ddaa52d..bfd26ff10ef7 100644 --- a/src/transformers/models/bert/modeling_tf_bert.py +++ b/src/transformers/models/bert/modeling_tf_bert.py @@ -17,8 +17,9 @@ import warnings from dataclasses import dataclass -from typing import Optional, Tuple +from typing import Any, Dict, Optional, Tuple, Union +import numpy as np import tensorflow as tf from ...activations_tf import get_tf_activation @@ -44,6 +45,7 @@ from ...modeling_tf_utils import ( TFCausalLanguageModelingLoss, 
TFMaskedLanguageModelingLoss, + TFModelInputType, TFMultipleChoiceLoss, TFNextSentencePredictionLoss, TFPreTrainedModel, @@ -96,27 +98,31 @@ class TFBertPreTrainingLoss: computation. """ - def compute_loss(self, labels, logits): + def compute_loss(self, labels: tf.Tensor, logits: tf.Tensor) -> tf.Tensor: loss_fn = tf.keras.losses.SparseCategoricalCrossentropy( from_logits=True, reduction=tf.keras.losses.Reduction.NONE ) # make sure only labels that are not equal to -100 # are taken into account as loss - masked_lm_active_loss = tf.not_equal(tf.reshape(labels["labels"], (-1,)), -100) + masked_lm_active_loss = tf.not_equal(tf.reshape(tensor=labels["labels"], shape=(-1,)), -100) masked_lm_reduced_logits = tf.boolean_mask( - tf.reshape(logits[0], (-1, shape_list(logits[0])[2])), - masked_lm_active_loss, + tensor=tf.reshape(tensor=logits[0], shape=(-1, shape_list(logits[0])[2])), + mask=masked_lm_active_loss, + ) + masked_lm_labels = tf.boolean_mask( + tensor=tf.reshape(tensor=labels["labels"], shape=(-1,)), mask=masked_lm_active_loss + ) + next_sentence_active_loss = tf.not_equal(tf.reshape(tensor=labels["next_sentence_label"], shape=(-1,)), -100) + next_sentence_reduced_logits = tf.boolean_mask( + tensor=tf.reshape(tensor=logits[1], shape=(-1, 2)), mask=next_sentence_active_loss ) - masked_lm_labels = tf.boolean_mask(tf.reshape(labels["labels"], (-1,)), masked_lm_active_loss) - next_sentence_active_loss = tf.not_equal(tf.reshape(labels["next_sentence_label"], (-1,)), -100) - next_sentence_reduced_logits = tf.boolean_mask(tf.reshape(logits[1], (-1, 2)), next_sentence_active_loss) next_sentence_label = tf.boolean_mask( - tf.reshape(labels["next_sentence_label"], (-1,)), mask=next_sentence_active_loss + tensor=tf.reshape(tensor=labels["next_sentence_label"], shape=(-1,)), mask=next_sentence_active_loss ) - masked_lm_loss = loss_fn(masked_lm_labels, masked_lm_reduced_logits) - next_sentence_loss = loss_fn(next_sentence_label, next_sentence_reduced_logits) - masked_lm_loss = tf.reshape(masked_lm_loss, (-1, shape_list(next_sentence_loss)[0])) - masked_lm_loss = tf.reduce_mean(masked_lm_loss, 0) + masked_lm_loss = loss_fn(y_true=masked_lm_labels, y_pred=masked_lm_reduced_logits) + next_sentence_loss = loss_fn(y_true=next_sentence_label, y_pred=next_sentence_reduced_logits) + masked_lm_loss = tf.reshape(tensor=masked_lm_loss, shape=(-1, shape_list(next_sentence_loss)[0])) + masked_lm_loss = tf.reduce_mean(input_tensor=masked_lm_loss, axis=0) return masked_lm_loss + next_sentence_loss @@ -129,16 +135,16 @@ def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, self.hidden_size = hidden_size self.initializer_range = initializer_range - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.weight = self.add_weight( name="weight", shape=[self.vocab_size, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), + initializer=get_initializer(self.initializer_range), ) - super().build(input_shape=input_shape) + super().build(input_shape) - def get_config(self): + def get_config(self) -> Dict[str, Any]: config = { "vocab_size": self.vocab_size, "hidden_size": self.hidden_size, @@ -148,14 +154,14 @@ def get_config(self): return dict(list(base_config.items()) + list(config.items())) - def call(self, input_ids): + def call(self, input_ids: tf.Tensor) -> tf.Tensor: flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1]) embeddings = tf.gather(params=self.weight, indices=flat_input_ids) embeddings = tf.reshape( - 
tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0) + tensor=embeddings, shape=tf.concat(values=[shape_list(input_ids), [self.hidden_size]], axis=0) ) - embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size]) + embeddings.set_shape(input_ids.shape.as_list() + [self.hidden_size]) return embeddings @@ -168,16 +174,16 @@ def __init__(self, type_vocab_size: int, hidden_size: int, initializer_range: fl self.hidden_size = hidden_size self.initializer_range = initializer_range - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.token_type_embeddings = self.add_weight( name="embeddings", shape=[self.type_vocab_size, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), + initializer=get_initializer(self.initializer_range), ) - super().build(input_shape=input_shape) + super().build(input_shape) - def get_config(self): + def get_config(self) -> Dict[str, Any]: config = { "type_vocab_size": self.type_vocab_size, "hidden_size": self.hidden_size, @@ -187,15 +193,15 @@ def get_config(self): return dict(list(base_config.items()) + list(config.items())) - def call(self, token_type_ids): + def call(self, token_type_ids: tf.Tensor) -> tf.Tensor: flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1]) one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype) embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings) embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=token_type_ids), [self.hidden_size]], axis=0) + tensor=embeddings, shape=tf.concat(values=[shape_list(token_type_ids), [self.hidden_size]], axis=0) ) - embeddings.set_shape(shape=token_type_ids.shape.as_list() + [self.hidden_size]) + embeddings.set_shape(token_type_ids.shape.as_list() + [self.hidden_size]) return embeddings @@ -208,16 +214,16 @@ def __init__(self, max_position_embeddings: int, hidden_size: int, initializer_r self.hidden_size = hidden_size self.initializer_range = initializer_range - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.position_embeddings = self.add_weight( name="embeddings", shape=[self.max_position_embeddings, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), + initializer=get_initializer(self.initializer_range), ) super().build(input_shape) - def get_config(self): + def get_config(self) -> Dict[str, Any]: config = { "max_position_embeddings": self.max_position_embeddings, "hidden_size": self.hidden_size, @@ -227,8 +233,8 @@ def get_config(self): return dict(list(base_config.items()) + list(config.items())) - def call(self, position_ids): - input_shape = shape_list(tensor=position_ids) + def call(self, position_ids: tf.Tensor) -> tf.Tensor: + input_shape = shape_list(position_ids) position_embeddings = self.position_embeddings[: input_shape[1], :] return tf.broadcast_to(input=position_embeddings, shape=input_shape) @@ -237,7 +243,7 @@ def call(self, position_ids): class TFBertEmbeddings(tf.keras.layers.Layer): """Construct the embeddings from word, position and token_type embeddings.""" - def __init__(self, config, **kwargs): + def __init__(self, config: BertConfig, **kwargs): super().__init__(**kwargs) self.word_embeddings = TFBertWordEmbeddings( @@ -262,7 +268,14 @@ def __init__(self, config, **kwargs): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, 
name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - def call(self, input_ids=None, position_ids=None, token_type_ids=None, inputs_embeds=None, training=False): + def call( + self, + input_ids: tf.Tensor, + position_ids: tf.Tensor, + token_type_ids: tf.Tensor, + inputs_embeds: tf.Tensor, + training: bool = False, + ) -> tf.Tensor: """ Applies embedding based on inputs tensor. @@ -272,18 +285,18 @@ def call(self, input_ids=None, position_ids=None, token_type_ids=None, inputs_em assert not (input_ids is None and inputs_embeds is None) if input_ids is not None: - inputs_embeds = self.word_embeddings(input_ids=input_ids) + inputs_embeds = self.word_embeddings(input_ids) if token_type_ids is None: - input_shape = shape_list(tensor=inputs_embeds)[:-1] + input_shape = shape_list(inputs_embeds)[:-1] token_type_ids = tf.fill(dims=input_shape, value=0) if position_ids is None: - position_embeds = self.position_embeddings(position_ids=inputs_embeds) + position_embeds = self.position_embeddings(inputs_embeds) else: - position_embeds = self.position_embeddings(position_ids=position_ids) + position_embeds = self.position_embeddings(position_ids) - token_type_embeds = self.token_type_embeddings(token_type_ids=token_type_ids) + token_type_embeds = self.token_type_embeddings(token_type_ids) final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds, token_type_embeds]) final_embeddings = self.LayerNorm(inputs=final_embeddings) final_embeddings = self.dropout(inputs=final_embeddings, training=training) @@ -292,7 +305,7 @@ def call(self, input_ids=None, position_ids=None, token_type_ids=None, inputs_em class TFBertSelfAttention(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: BertConfig, **kwargs): super().__init__(**kwargs) if config.hidden_size % config.num_attention_heads != 0: @@ -307,50 +320,57 @@ def __init__(self, config, **kwargs): equation="abc,cde->abde", output_shape=(None, config.num_attention_heads, self.attention_head_size), bias_axes="de", - kernel_initializer=get_initializer(initializer_range=config.initializer_range), + kernel_initializer=get_initializer(config.initializer_range), name="query", ) self.key = tf.keras.layers.experimental.EinsumDense( equation="abc,cde->abde", output_shape=(None, config.num_attention_heads, self.attention_head_size), bias_axes="de", - kernel_initializer=get_initializer(initializer_range=config.initializer_range), + kernel_initializer=get_initializer(config.initializer_range), name="key", ) self.value = tf.keras.layers.experimental.EinsumDense( equation="abc,cde->abde", output_shape=(None, config.num_attention_heads, self.attention_head_size), bias_axes="de", - kernel_initializer=get_initializer(initializer_range=config.initializer_range), + kernel_initializer=get_initializer(config.initializer_range), name="value", ) self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) - def call(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False, training=False): + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + output_attentions: bool, + training: bool = False, + ) -> Tuple[tf.Tensor]: query_layer = self.query(inputs=hidden_states) key_layer = self.key(inputs=hidden_states) value_layer = self.value(inputs=hidden_states) # Take the dot product between "query" and "key" to get the raw # attention scores. 
- dk = tf.cast(x=self.attention_head_size, dtype=query_layer.dtype) - query_layer = tf.multiply(x=query_layer, y=tf.math.rsqrt(x=dk)) + dk = tf.cast(self.attention_head_size, dtype=query_layer.dtype) + query_layer = tf.multiply(query_layer, tf.math.rsqrt(dk)) attention_scores = tf.einsum("aecd,abcd->acbe", key_layer, query_layer) if attention_mask is not None: # Apply the attention mask is (precomputed for all layers in TFBertModel call() function) - attention_scores = attention_scores + attention_mask + attention_scores = tf.add(attention_scores, attention_mask) # Normalize the attention scores to probabilities. attention_probs = tf.nn.softmax(logits=attention_scores, axis=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. - attention_probs = self.dropout(attention_probs, training=training) + attention_probs = self.dropout(inputs=attention_probs, training=training) # Mask heads if we want to if head_mask is not None: - attention_scores = attention_scores * head_mask + attention_scores = tf.multiply(attention_scores, head_mask) attention_output = tf.einsum("acbe,aecd->abcd", attention_probs, value_layer) outputs = (attention_output, attention_probs) if output_attentions else (attention_output,) @@ -359,7 +379,7 @@ def call(self, hidden_states, attention_mask=None, head_mask=None, output_attent class TFBertSelfOutput(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: BertConfig, **kwargs): super().__init__(**kwargs) if config.hidden_size % config.num_attention_heads != 0: @@ -375,13 +395,13 @@ def __init__(self, config, **kwargs): equation="abcd,cde->abe", output_shape=(None, self.all_head_size), bias_axes="e", - kernel_initializer=get_initializer(initializer_range=config.initializer_range), + kernel_initializer=get_initializer(config.initializer_range), name="dense", ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - def call(self, hidden_states, input_tensor, training=False): + def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) hidden_states = self.dropout(inputs=hidden_states, training=training) hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) @@ -390,7 +410,7 @@ def call(self, hidden_states, input_tensor, training=False): class TFBertAttention(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: BertConfig, **kwargs): super().__init__(**kwargs) self.self_attention = TFBertSelfAttention(config, name="self") @@ -399,34 +419,47 @@ def __init__(self, config, **kwargs): def prune_heads(self, heads): raise NotImplementedError - def call(self, input_tensor, attention_mask, head_mask, output_attentions, training=False): + def call( + self, + input_tensor: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + output_attentions: bool, + training: bool = False, + ) -> Tuple[tf.Tensor]: self_outputs = self.self_attention( - input_tensor, attention_mask, head_mask, output_attentions, training=training + hidden_states=input_tensor, + attention_mask=attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + training=training, + ) + attention_output = self.dense_output( + hidden_states=self_outputs[0], input_tensor=input_tensor, training=training ) - 
attention_output = self.dense_output(self_outputs[0], input_tensor, training=training) outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them return outputs class TFBertIntermediate(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: BertConfig, **kwargs): super().__init__(**kwargs) self.dense = tf.keras.layers.experimental.EinsumDense( equation="abc,cd->abd", output_shape=(None, config.intermediate_size), bias_axes="d", - kernel_initializer=get_initializer(initializer_range=config.initializer_range), + kernel_initializer=get_initializer(config.initializer_range), name="dense", ) if isinstance(config.hidden_act, str): - self.intermediate_act_fn = get_tf_activation(activation_string=config.hidden_act) + self.intermediate_act_fn = get_tf_activation(config.hidden_act) else: self.intermediate_act_fn = config.hidden_act - def call(self, hidden_states): + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) @@ -434,7 +467,7 @@ def call(self, hidden_states): class TFBertOutput(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: BertConfig, **kwargs): super().__init__(**kwargs) self.dense = tf.keras.layers.experimental.EinsumDense( @@ -447,7 +480,7 @@ def __init__(self, config, **kwargs): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - def call(self, hidden_states, input_tensor, training=False): + def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) hidden_states = self.dropout(inputs=hidden_states, training=training) hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) @@ -456,41 +489,54 @@ def call(self, hidden_states, input_tensor, training=False): class TFBertLayer(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: BertConfig, **kwargs): super().__init__(**kwargs) self.attention = TFBertAttention(config, name="attention") self.intermediate = TFBertIntermediate(config, name="intermediate") self.bert_output = TFBertOutput(config, name="output") - def call(self, hidden_states, attention_mask, head_mask, output_attentions, training=False): + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + output_attentions: bool, + training: bool = False, + ) -> Tuple[tf.Tensor]: attention_outputs = self.attention( - hidden_states, attention_mask, head_mask, output_attentions, training=training + input_tensor=hidden_states, + attention_mask=attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + training=training, ) attention_output = attention_outputs[0] - intermediate_output = self.intermediate(attention_output) - layer_output = self.bert_output(intermediate_output, attention_output, training=training) + intermediate_output = self.intermediate(hidden_states=attention_output) + layer_output = self.bert_output( + hidden_states=intermediate_output, input_tensor=attention_output, training=training + ) outputs = (layer_output,) + attention_outputs[1:] # add attentions if we output them return outputs class TFBertEncoder(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: BertConfig, **kwargs): 
super().__init__(**kwargs) self.layer = [TFBertLayer(config, name="layer_._{}".format(i)) for i in range(config.num_hidden_layers)] def call( self, - hidden_states, - attention_mask, - head_mask, - output_attentions, - output_hidden_states, - return_dict, - training=False, - ): + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + output_attentions: bool, + output_hidden_states: bool, + return_dict: bool, + training: bool = False, + ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]: all_hidden_states = () if output_hidden_states else None all_attentions = () if output_attentions else None @@ -499,7 +545,11 @@ def call( all_hidden_states = all_hidden_states + (hidden_states,) layer_outputs = layer_module( - hidden_states, attention_mask, head_mask[i], output_attentions, training=training + hidden_states=hidden_states, + attention_mask=attention_mask, + head_mask=head_mask[i], + output_attentions=output_attentions, + training=training, ) hidden_states = layer_outputs[0] @@ -519,31 +569,33 @@ def call( class TFBertPooler(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: BertConfig, **kwargs): super().__init__(**kwargs) self.dense = tf.keras.layers.Dense( - config.hidden_size, + units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), activation="tanh", name="dense", ) - def call(self, hidden_states): + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: # We "pool" the model by simply taking the hidden state corresponding # to the first token. first_token_tensor = hidden_states[:, 0] - pooled_output = self.dense(first_token_tensor) + pooled_output = self.dense(inputs=first_token_tensor) return pooled_output class TFBertPredictionHeadTransform(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: BertConfig, **kwargs): super().__init__(**kwargs) self.dense = tf.keras.layers.Dense( - config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + units=config.hidden_size, + kernel_initializer=get_initializer(config.initializer_range), + name="dense", ) if isinstance(config.hidden_act, str): @@ -553,16 +605,16 @@ def __init__(self, config, **kwargs): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - def call(self, hidden_states): - hidden_states = self.dense(hidden_states) + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) hidden_states = self.transform_act_fn(hidden_states) - hidden_states = self.LayerNorm(hidden_states) + hidden_states = self.LayerNorm(inputs=hidden_states) return hidden_states class TFBertLMPredictionHead(tf.keras.layers.Layer): - def __init__(self, config, input_embeddings, **kwargs): + def __init__(self, config: BertConfig, input_embeddings: tf.keras.layers.Layer, **kwargs): super().__init__(**kwargs) self.vocab_size = config.vocab_size @@ -574,28 +626,28 @@ def __init__(self, config, input_embeddings, **kwargs): # an output-only bias for each token. 
self.input_embeddings = input_embeddings - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") super().build(input_shape) - def get_output_embeddings(self): + def get_output_embeddings(self) -> tf.keras.layers.Layer: return self.input_embeddings - def set_output_embeddings(self, value): + def set_output_embeddings(self, value: tf.Variable): self.input_embeddings.weight = value self.input_embeddings.vocab_size = shape_list(value)[0] - def get_bias(self): + def get_bias(self) -> Dict[str, tf.Variable]: return {"bias": self.bias} - def set_bias(self, value): + def set_bias(self, value: tf.Variable): self.bias = value["bias"] self.vocab_size = shape_list(value["bias"])[0] - def call(self, hidden_states): + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.transform(hidden_states=hidden_states) - seq_length = shape_list(tensor=hidden_states)[1] + seq_length = shape_list(hidden_states)[1] hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size]) hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True) hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size]) @@ -605,27 +657,29 @@ def call(self, hidden_states): class TFBertMLMHead(tf.keras.layers.Layer): - def __init__(self, config, input_embeddings, **kwargs): + def __init__(self, config: BertConfig, input_embeddings: tf.keras.layers.Layer, **kwargs): super().__init__(**kwargs) self.predictions = TFBertLMPredictionHead(config, input_embeddings, name="predictions") - def call(self, sequence_output): - prediction_scores = self.predictions(sequence_output) + def call(self, sequence_output: tf.Tensor) -> tf.Tensor: + prediction_scores = self.predictions(hidden_states=sequence_output) return prediction_scores class TFBertNSPHead(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: BertConfig, **kwargs): super().__init__(**kwargs) self.seq_relationship = tf.keras.layers.Dense( - 2, kernel_initializer=get_initializer(config.initializer_range), name="seq_relationship" + units=2, + kernel_initializer=get_initializer(config.initializer_range), + name="seq_relationship", ) - def call(self, pooled_output): - seq_relationship_score = self.seq_relationship(pooled_output) + def call(self, pooled_output: tf.Tensor) -> tf.Tensor: + seq_relationship_score = self.seq_relationship(inputs=pooled_output) return seq_relationship_score @@ -634,7 +688,7 @@ def call(self, pooled_output): class TFBertMainLayer(tf.keras.layers.Layer): config_class = BertConfig - def __init__(self, config, add_pooling_layer=True, **kwargs): + def __init__(self, config: BertConfig, add_pooling_layer: bool = True, **kwargs): super().__init__(**kwargs) self.config = config @@ -643,10 +697,10 @@ def __init__(self, config, add_pooling_layer=True, **kwargs): self.encoder = TFBertEncoder(config, name="encoder") self.pooler = TFBertPooler(config, name="pooler") if add_pooling_layer else None - def get_input_embeddings(self): + def get_input_embeddings(self) -> tf.keras.layers.Layer: return self.embeddings.word_embeddings - def set_input_embeddings(self, value): + def set_input_embeddings(self, value: tf.Variable): self.embeddings.word_embeddings.weight = value self.embeddings.word_embeddings.vocab_size = shape_list(value)[0] @@ -659,18 +713,18 @@ class PreTrainedModel def call( self, - input_ids=None, - 
attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - training=False, + input_ids: Optional[TFModelInputType] = None, + attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: bool = False, **kwargs, - ): + ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]: inputs = input_processing( func=self.call, config=self.config, @@ -697,16 +751,16 @@ def call( raise ValueError("You have to specify either input_ids or inputs_embeds") if inputs["attention_mask"] is None: - inputs["attention_mask"] = tf.fill(input_shape, 1) + inputs["attention_mask"] = tf.fill(dims=input_shape, value=1) if inputs["token_type_ids"] is None: - inputs["token_type_ids"] = tf.fill(input_shape, 0) + inputs["token_type_ids"] = tf.fill(dims=input_shape, value=0) embedding_output = self.embeddings( - inputs["input_ids"], - inputs["position_ids"], - inputs["token_type_ids"], - inputs["inputs_embeds"], + input_ids=inputs["input_ids"], + position_ids=inputs["position_ids"], + token_type_ids=inputs["token_type_ids"], + inputs_embeds=inputs["inputs_embeds"], training=inputs["training"], ) @@ -722,8 +776,8 @@ def call( # positions we want to attend and -10000.0 for masked positions. # Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. 
- extended_attention_mask = tf.cast(extended_attention_mask, embedding_output.dtype) - extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + extended_attention_mask = tf.cast(extended_attention_mask, dtype=embedding_output.dtype) + extended_attention_mask = tf.multiply(tf.subtract(1.0, extended_attention_mask), -10000.0) # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head @@ -736,17 +790,17 @@ def call( inputs["head_mask"] = [None] * self.config.num_hidden_layers encoder_outputs = self.encoder( - embedding_output, - extended_attention_mask, - inputs["head_mask"], - inputs["output_attentions"], - inputs["output_hidden_states"], - inputs["return_dict"], + hidden_states=embedding_output, + attention_mask=extended_attention_mask, + head_mask=inputs["head_mask"], + output_attentions=inputs["output_attentions"], + output_hidden_states=inputs["output_hidden_states"], + return_dict=inputs["return_dict"], training=inputs["training"], ) sequence_output = encoder_outputs[0] - pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + pooled_output = self.pooler(hidden_states=sequence_output) if self.pooler is not None else None if not inputs["return_dict"]: return ( @@ -799,8 +853,8 @@ class TFBertForPreTrainingOutput(ModelOutput): loss: Optional[tf.Tensor] = None prediction_logits: tf.Tensor = None seq_relationship_logits: tf.Tensor = None - hidden_states: Optional[Tuple[tf.Tensor]] = None - attentions: Optional[Tuple[tf.Tensor]] = None + hidden_states: Optional[Union[Tuple[tf.Tensor], tf.Tensor]] = None + attentions: Optional[Union[Tuple[tf.Tensor], tf.Tensor]] = None BERT_START_DOCSTRING = r""" @@ -841,7 +895,7 @@ class TFBertForPreTrainingOutput(ModelOutput): BERT_INPUTS_DOCSTRING = r""" Args: - input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`): + input_ids (:obj:`np.ndarray`, :obj:`tf.Tensor`, :obj:`List[tf.Tensor]` :obj:`Dict[str, tf.Tensor]` or :obj:`Dict[str, np.ndarray]` and each example must have the shape :obj:`({0})`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using :class:`~transformers.BertTokenizer`. See @@ -849,14 +903,14 @@ class TFBertForPreTrainingOutput(ModelOutput): details. `What are input IDs? <../glossary.html#input-ids>`__ - attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`): + attention_mask (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: - 1 for tokens that are **not masked**, - 0 for tokens that are **masked**. `What are attention masks? <../glossary.html#attention-mask>`__ - token_type_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`): + token_type_ids (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`): Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, 1]``: @@ -864,18 +918,18 @@ class TFBertForPreTrainingOutput(ModelOutput): - 1 corresponds to a `sentence B` token. `What are token type IDs? <../glossary.html#token-type-ids>`__ - position_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`): + position_ids (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`): Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, config.max_position_embeddings - 1]``. `What are position IDs? 
<../glossary.html#position-ids>`__ - head_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): + head_mask (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: - 1 indicates the head is **not masked**, - 0 indicates the head is **masked**. - inputs_embeds (:obj:`tf.Tensor` of shape :obj:`({0}, hidden_size)`, `optional`): + inputs_embeds (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`({0}, hidden_size)`, `optional`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert :obj:`input_ids` indices into associated vectors than the model's internal embedding lookup matrix. @@ -898,7 +952,7 @@ class TFBertForPreTrainingOutput(ModelOutput): BERT_START_DOCSTRING, ) class TFBertModel(TFBertPreTrainedModel): - def __init__(self, config, *inputs, **kwargs): + def __init__(self, config: BertConfig, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.bert = TFBertMainLayer(config, name="bert") @@ -912,18 +966,18 @@ def __init__(self, config, *inputs, **kwargs): ) def call( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - training=False, + input_ids: Optional[TFModelInputType] = None, + attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: Optional[bool] = False, **kwargs, - ): + ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]: inputs = input_processing( func=self.call, config=self.config, @@ -954,7 +1008,7 @@ def call( return outputs - def serving_output(self, output): + def serving_output(self, output: TFBaseModelOutputWithPooling) -> TFBaseModelOutputWithPooling: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -977,17 +1031,17 @@ class TFBertForPreTraining(TFBertPreTrainedModel, TFBertPreTrainingLoss): # names with a '.' 
represents the authorized unexpected/missing layers when a TF model is loaded from a PT model _keys_to_ignore_on_load_unexpected = [r"cls.predictions.decoder.weight"] - def __init__(self, config, *inputs, **kwargs): + def __init__(self, config: BertConfig, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.bert = TFBertMainLayer(config, name="bert") self.nsp = TFBertNSPHead(config, name="nsp___cls") - self.mlm = TFBertMLMHead(config, self.bert.embeddings.word_embeddings, name="mlm___cls") + self.mlm = TFBertMLMHead(config, input_embeddings=self.bert.embeddings.word_embeddings, name="mlm___cls") - def get_lm_head(self): + def get_lm_head(self) -> tf.keras.layers.Layer: return self.mlm.predictions - def get_prefix_bias_name(self): + def get_prefix_bias_name(self) -> str: warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning) return self.name + "/" + self.mlm.name + "/" + self.mlm.predictions.name @@ -995,20 +1049,20 @@ def get_prefix_bias_name(self): @replace_return_docstrings(output_type=TFBertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) def call( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - labels=None, - next_sentence_label=None, - training=False, + input_ids: Optional[TFModelInputType] = None, + attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[Union[np.ndarray, tf.Tensor]] = None, + next_sentence_label: Optional[Union[np.ndarray, tf.Tensor]] = None, + training: Optional[bool] = False, **kwargs, - ): + ) -> Union[TFBertForPreTrainingOutput, Tuple[tf.Tensor]]: r""" Return: @@ -1042,7 +1096,7 @@ def call( kwargs_call=kwargs, ) outputs = self.bert( - inputs["input_ids"], + input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], token_type_ids=inputs["token_type_ids"], position_ids=inputs["position_ids"], @@ -1054,8 +1108,8 @@ def call( training=inputs["training"], ) sequence_output, pooled_output = outputs[:2] - prediction_scores = self.mlm(sequence_output, training=inputs["training"]) - seq_relationship_score = self.nsp(pooled_output) + prediction_scores = self.mlm(sequence_output=sequence_output, training=inputs["training"]) + seq_relationship_score = self.nsp(pooled_output=pooled_output) total_loss = None if inputs["labels"] is not None and inputs["next_sentence_label"] is not None: @@ -1074,7 +1128,7 @@ def call( attentions=outputs.attentions, ) - def serving_output(self, output): + def serving_output(self, output: TFBertForPreTrainingOutput) -> TFBertForPreTrainingOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1096,7 +1150,7 @@ class TFBertForMaskedLM(TFBertPreTrainedModel, TFMaskedLanguageModelingLoss): r"nsp___cls", ] - def __init__(self, config, *inputs, **kwargs): + def __init__(self, config: BertConfig, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) if 
config.is_decoder: @@ -1106,12 +1160,12 @@ def __init__(self, config, *inputs, **kwargs): ) self.bert = TFBertMainLayer(config, add_pooling_layer=False, name="bert") - self.mlm = TFBertMLMHead(config, self.bert.embeddings.word_embeddings, name="mlm___cls") + self.mlm = TFBertMLMHead(config, input_embeddings=self.bert.embeddings.word_embeddings, name="mlm___cls") - def get_lm_head(self): + def get_lm_head(self) -> tf.keras.layers.Layer: return self.mlm.predictions - def get_prefix_bias_name(self): + def get_prefix_bias_name(self) -> str: warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning) return self.name + "/" + self.mlm.name + "/" + self.mlm.predictions.name @@ -1124,21 +1178,21 @@ def get_prefix_bias_name(self): ) def call( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - labels=None, - training=False, + input_ids: Optional[TFModelInputType] = None, + attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[Union[np.ndarray, tf.Tensor]] = None, + training: Optional[bool] = False, **kwargs, - ): + ) -> Union[TFMaskedLMOutput, Tuple[tf.Tensor]]: r""" - labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + labels (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`): Labels for computing the masked language modeling loss. 
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` @@ -1160,7 +1214,7 @@ def call( kwargs_call=kwargs, ) outputs = self.bert( - inputs["input_ids"], + input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], token_type_ids=inputs["token_type_ids"], position_ids=inputs["position_ids"], @@ -1172,8 +1226,10 @@ def call( training=inputs["training"], ) sequence_output = outputs[0] - prediction_scores = self.mlm(sequence_output, training=inputs["training"]) - loss = None if inputs["labels"] is None else self.compute_loss(inputs["labels"], prediction_scores) + prediction_scores = self.mlm(sequence_output=sequence_output, training=inputs["training"]) + loss = ( + None if inputs["labels"] is None else self.compute_loss(labels=inputs["labels"], logits=prediction_scores) + ) if not inputs["return_dict"]: output = (prediction_scores,) + outputs[2:] @@ -1186,7 +1242,7 @@ def call( attentions=outputs.attentions, ) - def serving_output(self, output): + def serving_output(self, output: TFMaskedLMOutput) -> TFMaskedLMOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1202,19 +1258,19 @@ class TFBertLMHeadModel(TFBertPreTrainedModel, TFCausalLanguageModelingLoss): r"nsp___cls", ] - def __init__(self, config, *inputs, **kwargs): + def __init__(self, config: BertConfig, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) if not config.is_decoder: logger.warning("If you want to use `TFBertLMHeadModel` as a standalone, add `is_decoder=True.`") self.bert = TFBertMainLayer(config, add_pooling_layer=False, name="bert") - self.mlm = TFBertMLMHead(config, self.bert.embeddings.word_embeddings, name="mlm___cls") + self.mlm = TFBertMLMHead(config, input_embeddings=self.bert.embeddings.word_embeddings, name="mlm___cls") - def get_lm_head(self): + def get_lm_head(self) -> tf.keras.layers.Layer: return self.mlm.predictions - def get_prefix_bias_name(self): + def get_prefix_bias_name(self) -> str: warnings.warn("The method get_prefix_bias_name is deprecated. 
Please use `get_bias` instead.", FutureWarning) return self.name + "/" + self.mlm.name + "/" + self.mlm.predictions.name @@ -1226,21 +1282,21 @@ def get_prefix_bias_name(self): ) def call( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - labels=None, - training=False, + input_ids: Optional[TFModelInputType] = None, + attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[Union[np.ndarray, tf.Tensor]] = None, + training: Optional[bool] = False, **kwargs, - ): + ) -> Union[TFCausalLMOutput, Tuple[tf.Tensor]]: r""" - labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + labels (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`): Labels for computing the cross entropy classification loss. Indices should be in ``[0, ..., config.vocab_size - 1]``. """ @@ -1261,7 +1317,7 @@ def call( kwargs_call=kwargs, ) outputs = self.bert( - inputs["input_ids"], + input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], token_type_ids=inputs["token_type_ids"], position_ids=inputs["position_ids"], @@ -1273,14 +1329,14 @@ def call( training=inputs["training"], ) sequence_output = outputs[0] - logits = self.mlm(sequence_output, training=inputs["training"]) + logits = self.mlm(sequence_output=sequence_output, training=inputs["training"]) loss = None if inputs["labels"] is not None: # shift labels to the left and cut last logit token logits = logits[:, :-1] labels = inputs["labels"][:, 1:] - loss = self.compute_loss(labels, logits) + loss = self.compute_loss(labels=labels, logits=logits) if not inputs["return_dict"]: output = (logits,) + outputs[2:] @@ -1293,7 +1349,7 @@ def call( attentions=outputs.attentions, ) - def serving_output(self, output): + def serving_output(self, output: TFCausalLMOutput) -> TFCausalLMOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1308,7 +1364,7 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel, TFNextSentencePredi # names with a '.' 
represents the authorized unexpected/missing layers when a TF model is loaded from a PT model _keys_to_ignore_on_load_unexpected = [r"mlm___cls", r"cls.predictions"] - def __init__(self, config, *inputs, **kwargs): + def __init__(self, config: BertConfig, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.bert = TFBertMainLayer(config, name="bert") @@ -1318,19 +1374,19 @@ def __init__(self, config, *inputs, **kwargs): @replace_return_docstrings(output_type=TFNextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC) def call( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - next_sentence_label=None, - training=False, + input_ids: Optional[TFModelInputType] = None, + attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + next_sentence_label: Optional[Union[np.ndarray, tf.Tensor]] = None, + training: Optional[bool] = False, **kwargs, - ): + ) -> Union[TFNextSentencePredictorOutput, Tuple[tf.Tensor]]: r""" Return: @@ -1366,7 +1422,7 @@ def call( kwargs_call=kwargs, ) outputs = self.bert( - inputs["input_ids"], + input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], token_type_ids=inputs["token_type_ids"], position_ids=inputs["position_ids"], @@ -1378,7 +1434,7 @@ def call( training=inputs["training"], ) pooled_output = outputs[1] - seq_relationship_scores = self.nsp(pooled_output) + seq_relationship_scores = self.nsp(pooled_output=pooled_output) next_sentence_loss = ( None if inputs["next_sentence_label"] is None @@ -1396,7 +1452,7 @@ def call( attentions=outputs.attentions, ) - def serving_output(self, output): + def serving_output(self, output: TFNextSentencePredictorOutput) -> TFNextSentencePredictorOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1415,14 +1471,17 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassific _keys_to_ignore_on_load_unexpected = [r"mlm___cls", r"nsp___cls", r"cls.predictions", r"cls.seq_relationship"] _keys_to_ignore_on_load_missing = [r"dropout"] - def __init__(self, config, *inputs, **kwargs): + def __init__(self, config: BertConfig, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels + self.bert = TFBertMainLayer(config, name="bert") - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) self.classifier = tf.keras.layers.Dense( - config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + units=config.num_labels, + kernel_initializer=get_initializer(config.initializer_range), + name="classifier", ) @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1434,21 +1493,21 @@ def __init__(self, config, *inputs, **kwargs): ) def call( self, - input_ids=None, - attention_mask=None, - 
token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - labels=None, - training=False, + input_ids: Optional[TFModelInputType] = None, + attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[Union[np.ndarray, tf.Tensor]] = None, + training: Optional[bool] = False, **kwargs, - ): + ) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]: r""" - labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`): + labels (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size,)`, `optional`): Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). @@ -1470,7 +1529,7 @@ def call( kwargs_call=kwargs, ) outputs = self.bert( - inputs["input_ids"], + input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], token_type_ids=inputs["token_type_ids"], position_ids=inputs["position_ids"], @@ -1482,9 +1541,9 @@ def call( training=inputs["training"], ) pooled_output = outputs[1] - pooled_output = self.dropout(pooled_output, training=inputs["training"]) - logits = self.classifier(pooled_output) - loss = None if inputs["labels"] is None else self.compute_loss(inputs["labels"], logits) + pooled_output = self.dropout(inputs=pooled_output, training=inputs["training"]) + logits = self.classifier(inputs=pooled_output) + loss = None if inputs["labels"] is None else self.compute_loss(labels=inputs["labels"], logits=logits) if not inputs["return_dict"]: output = (logits,) + outputs[2:] @@ -1497,7 +1556,7 @@ def call( attentions=outputs.attentions, ) - def serving_output(self, output): + def serving_output(self, output: TFSequenceClassifierOutput) -> TFSequenceClassifierOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1516,17 +1575,17 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss): _keys_to_ignore_on_load_unexpected = [r"mlm___cls", r"nsp___cls", r"cls.predictions", r"cls.seq_relationship"] _keys_to_ignore_on_load_missing = [r"dropout"] - def __init__(self, config, *inputs, **kwargs): + def __init__(self, config: BertConfig, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.bert = TFBertMainLayer(config, name="bert") - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) self.classifier = tf.keras.layers.Dense( - 1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + units=1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) @property - def dummy_inputs(self): + def dummy_inputs(self) -> Dict[str, tf.Tensor]: """ Dummy inputs to build the network. 
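# --- Editor's note, not part of the patch: the sequence-classification hunk above only
# switches to explicit keyword arguments; behaviour is unchanged. A minimal, hedged sketch
# of that head (dropout + dense on the pooled [batch, hidden] output); the class name and
# default dropout rate are illustrative assumptions, not from the patch.
import tensorflow as tf

class DemoClassificationHead(tf.keras.layers.Layer):
    def __init__(self, num_labels: int, hidden_dropout_prob: float = 0.1, **kwargs):
        super().__init__(**kwargs)
        self.dropout = tf.keras.layers.Dropout(rate=hidden_dropout_prob)
        self.classifier = tf.keras.layers.Dense(units=num_labels, name="classifier")

    def call(self, pooled_output: tf.Tensor, training: bool = False) -> tf.Tensor:
        # Dropout only fires when training=True, mirroring inputs["training"] in the hunk above.
        pooled_output = self.dropout(inputs=pooled_output, training=training)
        return self.classifier(inputs=pooled_output)

# e.g. DemoClassificationHead(num_labels=2)(tf.random.normal((4, 768))) -> logits of shape (4, 2)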
@@ -1544,21 +1603,21 @@ def dummy_inputs(self): ) def call( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - labels=None, - training=False, + input_ids: Optional[TFModelInputType] = None, + attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[Union[np.ndarray, tf.Tensor]] = None, + training: Optional[bool] = False, **kwargs, - ): + ) -> Union[TFMultipleChoiceModelOutput, Tuple[tf.Tensor]]: r""" - labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`): + labels (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size,)`, `optional`): Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., num_choices]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See :obj:`input_ids` above) @@ -1587,38 +1646,46 @@ def call( num_choices = shape_list(inputs["inputs_embeds"])[1] seq_length = shape_list(inputs["inputs_embeds"])[2] - flat_input_ids = tf.reshape(inputs["input_ids"], (-1, seq_length)) if inputs["input_ids"] is not None else None + flat_input_ids = ( + tf.reshape(tensor=inputs["input_ids"], shape=(-1, seq_length)) if inputs["input_ids"] is not None else None + ) flat_attention_mask = ( - tf.reshape(inputs["attention_mask"], (-1, seq_length)) if inputs["attention_mask"] is not None else None + tf.reshape(tensor=inputs["attention_mask"], shape=(-1, seq_length)) + if inputs["attention_mask"] is not None + else None ) flat_token_type_ids = ( - tf.reshape(inputs["token_type_ids"], (-1, seq_length)) if inputs["token_type_ids"] is not None else None + tf.reshape(tensor=inputs["token_type_ids"], shape=(-1, seq_length)) + if inputs["token_type_ids"] is not None + else None ) flat_position_ids = ( - tf.reshape(inputs["position_ids"], (-1, seq_length)) if inputs["position_ids"] is not None else None + tf.reshape(tensor=inputs["position_ids"], shape=(-1, seq_length)) + if inputs["position_ids"] is not None + else None ) flat_inputs_embeds = ( - tf.reshape(inputs["inputs_embeds"], (-1, seq_length, shape_list(inputs["inputs_embeds"])[3])) + tf.reshape(tensor=inputs["inputs_embeds"], shape=(-1, seq_length, shape_list(inputs["inputs_embeds"])[3])) if inputs["inputs_embeds"] is not None else None ) outputs = self.bert( - flat_input_ids, - flat_attention_mask, - flat_token_type_ids, - flat_position_ids, - inputs["head_mask"], - flat_inputs_embeds, - inputs["output_attentions"], - inputs["output_hidden_states"], + input_ids=flat_input_ids, + attention_mask=flat_attention_mask, + token_type_ids=flat_token_type_ids, + position_ids=flat_position_ids, + head_mask=inputs["head_mask"], + inputs_embeds=flat_inputs_embeds, + output_attentions=inputs["output_attentions"], + output_hidden_states=inputs["output_hidden_states"], return_dict=inputs["return_dict"], training=inputs["training"], ) pooled_output = outputs[1] - pooled_output = self.dropout(pooled_output, training=inputs["training"]) - logits = self.classifier(pooled_output) - reshaped_logits = tf.reshape(logits, 
(-1, num_choices)) - loss = None if inputs["labels"] is None else self.compute_loss(inputs["labels"], reshaped_logits) + pooled_output = self.dropout(inputs=pooled_output, training=inputs["training"]) + logits = self.classifier(inputs=pooled_output) + reshaped_logits = tf.reshape(tensor=logits, shape=(-1, num_choices)) + loss = None if inputs["labels"] is None else self.compute_loss(labels=inputs["labels"], logits=reshaped_logits) if not inputs["return_dict"]: output = (reshaped_logits,) + outputs[2:] @@ -1640,12 +1707,12 @@ def call( } ] ) - def serving(self, inputs): - output = self.call(inputs) + def serving(self, inputs: Dict[str, tf.Tensor]): + output = self.call(input_ids=inputs) return self.serving_output(output) - def serving_output(self, output): + def serving_output(self, output: TFMultipleChoiceModelOutput) -> TFMultipleChoiceModelOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1670,14 +1737,17 @@ class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationL ] _keys_to_ignore_on_load_missing = [r"dropout"] - def __init__(self, config, *inputs, **kwargs): + def __init__(self, config: BertConfig, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels + self.bert = TFBertMainLayer(config, add_pooling_layer=False, name="bert") - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) self.classifier = tf.keras.layers.Dense( - config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + units=config.num_labels, + kernel_initializer=get_initializer(config.initializer_range), + name="classifier", ) @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1689,21 +1759,21 @@ def __init__(self, config, *inputs, **kwargs): ) def call( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - labels=None, - training=False, + input_ids: Optional[TFModelInputType] = None, + attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[Union[np.ndarray, tf.Tensor]] = None, + training: Optional[bool] = False, **kwargs, - ): + ) -> Union[TFTokenClassifierOutput, Tuple[tf.Tensor]]: r""" - labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + labels (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`): Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - 1]``. 
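A shape-only sketch (toy sizes, random data — not library code) of the multiple-choice reshaping pattern spelled out in the hunk above: `(batch, num_choices, seq_len)` inputs are flattened into the batch dimension before the encoder, and the per-choice logits are folded back afterwards.

```python
import tensorflow as tf

batch, num_choices, seq_len = 2, 4, 8
input_ids = tf.random.uniform((batch, num_choices, seq_len), maxval=100, dtype=tf.int32)

# flatten choices into the batch dimension, as the head does before calling self.bert
flat_input_ids = tf.reshape(tensor=input_ids, shape=(-1, seq_len))      # (8, 8)

# stand-in for the classifier output on the pooled encoder states
logits = tf.random.uniform((batch * num_choices, 1))
reshaped_logits = tf.reshape(tensor=logits, shape=(-1, num_choices))    # (2, 4)
print(flat_input_ids.shape, reshaped_logits.shape)
```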
""" @@ -1724,7 +1794,7 @@ def call( kwargs_call=kwargs, ) outputs = self.bert( - inputs["input_ids"], + input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], token_type_ids=inputs["token_type_ids"], position_ids=inputs["position_ids"], @@ -1736,9 +1806,9 @@ def call( training=inputs["training"], ) sequence_output = outputs[0] - sequence_output = self.dropout(sequence_output, training=inputs["training"]) - logits = self.classifier(sequence_output) - loss = None if inputs["labels"] is None else self.compute_loss(inputs["labels"], logits) + sequence_output = self.dropout(inputs=sequence_output, training=inputs["training"]) + logits = self.classifier(inputs=sequence_output) + loss = None if inputs["labels"] is None else self.compute_loss(labels=inputs["labels"], logits=logits) if not inputs["return_dict"]: output = (logits,) + outputs[2:] @@ -1751,7 +1821,7 @@ def call( attentions=outputs.attentions, ) - def serving_output(self, output): + def serving_output(self, output: TFTokenClassifierOutput) -> TFTokenClassifierOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1775,13 +1845,16 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel, TFQuestionAnsweringLoss) r"cls.seq_relationship", ] - def __init__(self, config, *inputs, **kwargs): + def __init__(self, config: BertConfig, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels + self.bert = TFBertMainLayer(config, add_pooling_layer=False, name="bert") self.qa_outputs = tf.keras.layers.Dense( - config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" + units=config.num_labels, + kernel_initializer=get_initializer(config.initializer_range), + name="qa_outputs", ) @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1793,26 +1866,26 @@ def __init__(self, config, *inputs, **kwargs): ) def call( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - start_positions=None, - end_positions=None, - training=False, + input_ids: Optional[TFModelInputType] = None, + attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + start_positions: Optional[Union[np.ndarray, tf.Tensor]] = None, + end_positions: Optional[Union[np.ndarray, tf.Tensor]] = None, + training: Optional[bool] = False, **kwargs, - ): + ) -> Union[TFQuestionAnsweringModelOutput, Tuple[tf.Tensor]]: r""" - start_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`): + start_positions (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size,)`, `optional`): Labels for position (index) of the start of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. 
- end_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`): + end_positions (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size,)`, `optional`): Labels for position (index) of the end of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. @@ -1835,7 +1908,7 @@ def call( kwargs_call=kwargs, ) outputs = self.bert( - inputs["input_ids"], + input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], token_type_ids=inputs["token_type_ids"], position_ids=inputs["position_ids"], @@ -1847,16 +1920,16 @@ def call( training=inputs["training"], ) sequence_output = outputs[0] - logits = self.qa_outputs(sequence_output) - start_logits, end_logits = tf.split(logits, 2, axis=-1) - start_logits = tf.squeeze(start_logits, axis=-1) - end_logits = tf.squeeze(end_logits, axis=-1) + logits = self.qa_outputs(inputs=sequence_output) + start_logits, end_logits = tf.split(value=logits, num_or_size_splits=2, axis=-1) + start_logits = tf.squeeze(input=start_logits, axis=-1) + end_logits = tf.squeeze(input=end_logits, axis=-1) loss = None if inputs["start_positions"] is not None and inputs["end_positions"] is not None: labels = {"start_position": inputs["start_positions"]} labels["end_position"] = inputs["end_positions"] - loss = self.compute_loss(labels, (start_logits, end_logits)) + loss = self.compute_loss(labels=labels, logits=(start_logits, end_logits)) if not inputs["return_dict"]: output = (start_logits, end_logits) + outputs[2:] @@ -1870,7 +1943,7 @@ def call( attentions=outputs.attentions, ) - def serving_output(self, output): + def serving_output(self, output: TFQuestionAnsweringModelOutput) -> TFQuestionAnsweringModelOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None diff --git a/src/transformers/models/ctrl/modeling_tf_ctrl.py b/src/transformers/models/ctrl/modeling_tf_ctrl.py index 8b29c7f4bc99..257f67397322 100644 --- a/src/transformers/models/ctrl/modeling_tf_ctrl.py +++ b/src/transformers/models/ctrl/modeling_tf_ctrl.py @@ -916,7 +916,7 @@ def get_seq_element(sequence_position, input_batch): ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification.serving_output - def serving_output(self, output): + def serving_output(self, output: TFSequenceClassifierOutput) -> TFSequenceClassifierOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None diff --git a/src/transformers/models/distilbert/modeling_tf_distilbert.py b/src/transformers/models/distilbert/modeling_tf_distilbert.py index 64786f3ed96b..f187a79d3d2a 100644 --- a/src/transformers/models/distilbert/modeling_tf_distilbert.py +++ b/src/transformers/models/distilbert/modeling_tf_distilbert.py @@ -17,6 +17,7 @@ """ import warnings +from typing import Any, Dict import tensorflow as tf @@ -76,16 +77,16 @@ def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, self.hidden_size = hidden_size self.initializer_range = initializer_range - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.weight = self.add_weight( name="weight", shape=[self.vocab_size, self.hidden_size], - 
initializer=get_initializer(initializer_range=self.initializer_range), + initializer=get_initializer(self.initializer_range), ) - super().build(input_shape=input_shape) + super().build(input_shape) - def get_config(self): + def get_config(self) -> Dict[str, Any]: config = { "vocab_size": self.vocab_size, "hidden_size": self.hidden_size, @@ -95,14 +96,14 @@ def get_config(self): return dict(list(base_config.items()) + list(config.items())) - def call(self, input_ids): + def call(self, input_ids: tf.Tensor) -> tf.Tensor: flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1]) embeddings = tf.gather(params=self.weight, indices=flat_input_ids) embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0) + tensor=embeddings, shape=tf.concat(values=[shape_list(input_ids), [self.hidden_size]], axis=0) ) - embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size]) + embeddings.set_shape(input_ids.shape.as_list() + [self.hidden_size]) return embeddings @@ -116,16 +117,16 @@ def __init__(self, max_position_embeddings: int, hidden_size: int, initializer_r self.hidden_size = hidden_size self.initializer_range = initializer_range - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.position_embeddings = self.add_weight( name="embeddings", shape=[self.max_position_embeddings, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), + initializer=get_initializer(self.initializer_range), ) super().build(input_shape) - def get_config(self): + def get_config(self) -> Dict[str, Any]: config = { "max_position_embeddings": self.max_position_embeddings, "hidden_size": self.hidden_size, @@ -135,8 +136,8 @@ def get_config(self): return dict(list(base_config.items()) + list(config.items())) - def call(self, position_ids): - input_shape = shape_list(tensor=position_ids) + def call(self, position_ids: tf.Tensor) -> tf.Tensor: + input_shape = shape_list(position_ids) position_embeddings = self.position_embeddings[: input_shape[1], :] return tf.broadcast_to(input=position_embeddings, shape=input_shape) @@ -793,7 +794,7 @@ def call( ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMaskedLM.serving_output - def serving_output(self, output): + def serving_output(self, output: TFMaskedLMOutput) -> TFMaskedLMOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -894,7 +895,7 @@ def call( ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification.serving_output - def serving_output(self, output): + def serving_output(self, output: TFSequenceClassifierOutput) -> TFSequenceClassifierOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -985,7 +986,7 @@ def call( ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForTokenClassification.serving_output - def serving_output(self, output): + def serving_output(self, output: TFTokenClassifierOutput) -> TFTokenClassifierOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1128,7 +1129,7 @@ def serving(self, inputs): return self.serving_output(output) # 
Copied from transformers.models.bert.modeling_tf_bert.TFBertForMultipleChoice.serving_output - def serving_output(self, output): + def serving_output(self, output: TFMultipleChoiceModelOutput) -> TFMultipleChoiceModelOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1235,7 +1236,7 @@ def call( ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForQuestionAnswering.serving_output - def serving_output(self, output): + def serving_output(self, output: TFQuestionAnsweringModelOutput) -> TFQuestionAnsweringModelOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None diff --git a/src/transformers/models/electra/modeling_tf_electra.py b/src/transformers/models/electra/modeling_tf_electra.py index a75c40617009..732089014e09 100644 --- a/src/transformers/models/electra/modeling_tf_electra.py +++ b/src/transformers/models/electra/modeling_tf_electra.py @@ -16,7 +16,7 @@ import warnings from dataclasses import dataclass -from typing import Optional, Tuple +from typing import Any, Dict, Optional, Tuple, Union import tensorflow as tf @@ -79,16 +79,16 @@ def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, self.hidden_size = hidden_size self.initializer_range = initializer_range - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.weight = self.add_weight( name="weight", shape=[self.vocab_size, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), + initializer=get_initializer(self.initializer_range), ) - super().build(input_shape=input_shape) + super().build(input_shape) - def get_config(self): + def get_config(self) -> Dict[str, Any]: config = { "vocab_size": self.vocab_size, "hidden_size": self.hidden_size, @@ -98,14 +98,14 @@ def get_config(self): return dict(list(base_config.items()) + list(config.items())) - def call(self, input_ids): + def call(self, input_ids: tf.Tensor) -> tf.Tensor: flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1]) embeddings = tf.gather(params=self.weight, indices=flat_input_ids) embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0) + tensor=embeddings, shape=tf.concat(values=[shape_list(input_ids), [self.hidden_size]], axis=0) ) - embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size]) + embeddings.set_shape(input_ids.shape.as_list() + [self.hidden_size]) return embeddings @@ -119,16 +119,16 @@ def __init__(self, type_vocab_size: int, hidden_size: int, initializer_range: fl self.hidden_size = hidden_size self.initializer_range = initializer_range - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.token_type_embeddings = self.add_weight( name="embeddings", shape=[self.type_vocab_size, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), + initializer=get_initializer(self.initializer_range), ) - super().build(input_shape=input_shape) + super().build(input_shape) - def get_config(self): + def get_config(self) -> Dict[str, Any]: config = { "type_vocab_size": self.type_vocab_size, "hidden_size": self.hidden_size, @@ -138,15 +138,15 @@ def get_config(self): return dict(list(base_config.items()) + list(config.items())) - 
def call(self, token_type_ids): + def call(self, token_type_ids: tf.Tensor) -> tf.Tensor: flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1]) one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype) embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings) embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=token_type_ids), [self.hidden_size]], axis=0) + tensor=embeddings, shape=tf.concat(values=[shape_list(token_type_ids), [self.hidden_size]], axis=0) ) - embeddings.set_shape(shape=token_type_ids.shape.as_list() + [self.hidden_size]) + embeddings.set_shape(token_type_ids.shape.as_list() + [self.hidden_size]) return embeddings @@ -160,16 +160,16 @@ def __init__(self, max_position_embeddings: int, hidden_size: int, initializer_r self.hidden_size = hidden_size self.initializer_range = initializer_range - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.position_embeddings = self.add_weight( name="embeddings", shape=[self.max_position_embeddings, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), + initializer=get_initializer(self.initializer_range), ) super().build(input_shape) - def get_config(self): + def get_config(self) -> Dict[str, Any]: config = { "max_position_embeddings": self.max_position_embeddings, "hidden_size": self.hidden_size, @@ -179,8 +179,8 @@ def get_config(self): return dict(list(base_config.items()) + list(config.items())) - def call(self, position_ids): - input_shape = shape_list(tensor=position_ids) + def call(self, position_ids: tf.Tensor) -> tf.Tensor: + input_shape = shape_list(position_ids) position_embeddings = self.position_embeddings[: input_shape[1], :] return tf.broadcast_to(input=position_embeddings, shape=input_shape) @@ -188,7 +188,7 @@ def call(self, position_ids): # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->Electra class TFElectraSelfAttention(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: ElectraConfig, **kwargs): super().__init__(**kwargs) if config.hidden_size % config.num_attention_heads != 0: @@ -203,50 +203,57 @@ def __init__(self, config, **kwargs): equation="abc,cde->abde", output_shape=(None, config.num_attention_heads, self.attention_head_size), bias_axes="de", - kernel_initializer=get_initializer(initializer_range=config.initializer_range), + kernel_initializer=get_initializer(config.initializer_range), name="query", ) self.key = tf.keras.layers.experimental.EinsumDense( equation="abc,cde->abde", output_shape=(None, config.num_attention_heads, self.attention_head_size), bias_axes="de", - kernel_initializer=get_initializer(initializer_range=config.initializer_range), + kernel_initializer=get_initializer(config.initializer_range), name="key", ) self.value = tf.keras.layers.experimental.EinsumDense( equation="abc,cde->abde", output_shape=(None, config.num_attention_heads, self.attention_head_size), bias_axes="de", - kernel_initializer=get_initializer(initializer_range=config.initializer_range), + kernel_initializer=get_initializer(config.initializer_range), name="value", ) self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) - def call(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False, training=False): + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + 
output_attentions: bool, + training: bool = False, + ) -> Tuple[tf.Tensor]: query_layer = self.query(inputs=hidden_states) key_layer = self.key(inputs=hidden_states) value_layer = self.value(inputs=hidden_states) # Take the dot product between "query" and "key" to get the raw # attention scores. - dk = tf.cast(x=self.attention_head_size, dtype=query_layer.dtype) - query_layer = tf.multiply(x=query_layer, y=tf.math.rsqrt(x=dk)) + dk = tf.cast(self.attention_head_size, dtype=query_layer.dtype) + query_layer = tf.multiply(query_layer, tf.math.rsqrt(dk)) attention_scores = tf.einsum("aecd,abcd->acbe", key_layer, query_layer) if attention_mask is not None: # Apply the attention mask is (precomputed for all layers in TFElectraModel call() function) - attention_scores = attention_scores + attention_mask + attention_scores = tf.add(attention_scores, attention_mask) # Normalize the attention scores to probabilities. attention_probs = tf.nn.softmax(logits=attention_scores, axis=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. - attention_probs = self.dropout(attention_probs, training=training) + attention_probs = self.dropout(inputs=attention_probs, training=training) # Mask heads if we want to if head_mask is not None: - attention_scores = attention_scores * head_mask + attention_scores = tf.multiply(attention_scores, head_mask) attention_output = tf.einsum("acbe,aecd->abcd", attention_probs, value_layer) outputs = (attention_output, attention_probs) if output_attentions else (attention_output,) @@ -254,9 +261,9 @@ def call(self, hidden_states, attention_mask=None, head_mask=None, output_attent return outputs -# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput +# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->Electra class TFElectraSelfOutput(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: ElectraConfig, **kwargs): super().__init__(**kwargs) if config.hidden_size % config.num_attention_heads != 0: @@ -272,13 +279,13 @@ def __init__(self, config, **kwargs): equation="abcd,cde->abe", output_shape=(None, self.all_head_size), bias_axes="e", - kernel_initializer=get_initializer(initializer_range=config.initializer_range), + kernel_initializer=get_initializer(config.initializer_range), name="dense", ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - def call(self, hidden_states, input_tensor, training=False): + def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) hidden_states = self.dropout(inputs=hidden_states, training=training) hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) @@ -288,7 +295,7 @@ def call(self, hidden_states, input_tensor, training=False): # Copied from from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->Electra class TFElectraAttention(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: ElectraConfig, **kwargs): super().__init__(**kwargs) self.self_attention = TFElectraSelfAttention(config, name="self") @@ -297,44 +304,57 @@ def __init__(self, config, **kwargs): def prune_heads(self, heads): raise NotImplementedError - def call(self, input_tensor, attention_mask, head_mask, 
output_attentions, training=False): + def call( + self, + input_tensor: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + output_attentions: bool, + training: bool = False, + ) -> Tuple[tf.Tensor]: self_outputs = self.self_attention( - input_tensor, attention_mask, head_mask, output_attentions, training=training + hidden_states=input_tensor, + attention_mask=attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + training=training, + ) + attention_output = self.dense_output( + hidden_states=self_outputs[0], input_tensor=input_tensor, training=training ) - attention_output = self.dense_output(self_outputs[0], input_tensor, training=training) outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them return outputs -# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate +# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->Electra class TFElectraIntermediate(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: ElectraConfig, **kwargs): super().__init__(**kwargs) self.dense = tf.keras.layers.experimental.EinsumDense( equation="abc,cd->abd", output_shape=(None, config.intermediate_size), bias_axes="d", - kernel_initializer=get_initializer(initializer_range=config.initializer_range), + kernel_initializer=get_initializer(config.initializer_range), name="dense", ) if isinstance(config.hidden_act, str): - self.intermediate_act_fn = get_tf_activation(activation_string=config.hidden_act) + self.intermediate_act_fn = get_tf_activation(config.hidden_act) else: self.intermediate_act_fn = config.hidden_act - def call(self, hidden_states): + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) return hidden_states -# Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput +# Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->Electra class TFElectraOutput(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: ElectraConfig, **kwargs): super().__init__(**kwargs) self.dense = tf.keras.layers.experimental.EinsumDense( @@ -347,7 +367,7 @@ def __init__(self, config, **kwargs): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - def call(self, hidden_states, input_tensor, training=False): + def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) hidden_states = self.dropout(inputs=hidden_states, training=training) hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) @@ -357,20 +377,33 @@ def call(self, hidden_states, input_tensor, training=False): # Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->Electra class TFElectraLayer(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: ElectraConfig, **kwargs): super().__init__(**kwargs) self.attention = TFElectraAttention(config, name="attention") self.intermediate = TFElectraIntermediate(config, name="intermediate") self.bert_output = TFElectraOutput(config, name="output") - def call(self, hidden_states, attention_mask, head_mask, output_attentions, training=False): + def call( + self, + hidden_states: tf.Tensor, + attention_mask: 
tf.Tensor, + head_mask: tf.Tensor, + output_attentions: bool, + training: bool = False, + ) -> Tuple[tf.Tensor]: attention_outputs = self.attention( - hidden_states, attention_mask, head_mask, output_attentions, training=training + input_tensor=hidden_states, + attention_mask=attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + training=training, ) attention_output = attention_outputs[0] - intermediate_output = self.intermediate(attention_output) - layer_output = self.bert_output(intermediate_output, attention_output, training=training) + intermediate_output = self.intermediate(hidden_states=attention_output) + layer_output = self.bert_output( + hidden_states=intermediate_output, input_tensor=attention_output, training=training + ) outputs = (layer_output,) + attention_outputs[1:] # add attentions if we output them return outputs @@ -378,21 +411,21 @@ def call(self, hidden_states, attention_mask, head_mask, output_attentions, trai # Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->Electra class TFElectraEncoder(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: ElectraConfig, **kwargs): super().__init__(**kwargs) self.layer = [TFElectraLayer(config, name="layer_._{}".format(i)) for i in range(config.num_hidden_layers)] def call( self, - hidden_states, - attention_mask, - head_mask, - output_attentions, - output_hidden_states, - return_dict, - training=False, - ): + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + output_attentions: bool, + output_hidden_states: bool, + return_dict: bool, + training: bool = False, + ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]: all_hidden_states = () if output_hidden_states else None all_attentions = () if output_attentions else None @@ -401,7 +434,11 @@ def call( all_hidden_states = all_hidden_states + (hidden_states,) layer_outputs = layer_module( - hidden_states, attention_mask, head_mask[i], output_attentions, training=training + hidden_states=hidden_states, + attention_mask=attention_mask, + head_mask=head_mask[i], + output_attentions=output_attentions, + training=training, ) hidden_states = layer_outputs[0] @@ -420,27 +457,28 @@ def call( ) -# Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler +# Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->Electra class TFElectraPooler(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: ElectraConfig, **kwargs): super().__init__(**kwargs) self.dense = tf.keras.layers.Dense( - config.hidden_size, + units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), activation="tanh", name="dense", ) - def call(self, hidden_states): + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: # We "pool" the model by simply taking the hidden state corresponding # to the first token. 
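A shape-only sketch (toy sizes) of the `EinsumDense`-based self-attention these Electra hunks annotate: the `abc,cde->abde` equation projects into per-head subspaces without an explicit transpose, and the two `einsum` contractions below reproduce the score/context shapes. This illustrates the pattern only; it is not the library implementation.

```python
import tensorflow as tf

batch, seq, hidden, heads = 2, 5, 16, 4
head_size = hidden // heads
hidden_states = tf.random.normal((batch, seq, hidden))

projection = dict(equation="abc,cde->abde", output_shape=(None, heads, head_size), bias_axes="de")
query = tf.keras.layers.experimental.EinsumDense(**projection)
key = tf.keras.layers.experimental.EinsumDense(**projection)
value = tf.keras.layers.experimental.EinsumDense(**projection)

q, k, v = query(hidden_states), key(hidden_states), value(hidden_states)    # (batch, seq, heads, head_size)
dk = tf.cast(head_size, dtype=q.dtype)
scores = tf.einsum("aecd,abcd->acbe", k, tf.multiply(q, tf.math.rsqrt(dk)))  # (batch, heads, seq, seq)
probs = tf.nn.softmax(logits=scores, axis=-1)
context = tf.einsum("acbe,aecd->abcd", probs, v)                             # back to (batch, seq, heads, head_size)
print(context.shape)
```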
first_token_tensor = hidden_states[:, 0] - pooled_output = self.dense(first_token_tensor) + pooled_output = self.dense(inputs=first_token_tensor) return pooled_output +# Copied from transformers.models.albert.modeling_tf_albert.TFAlbertEmbeddings with Albert->Electra class TFElectraEmbeddings(tf.keras.layers.Layer): """Construct the embeddings from word, position and token_type embeddings.""" @@ -469,8 +507,15 @@ def __init__(self, config, **kwargs): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - # Copied from transformers.models.albert.modeling_tf_albert.TFAlbertEmbeddings.call with Albert->Electra - def call(self, input_ids=None, position_ids=None, token_type_ids=None, inputs_embeds=None, training=False): + # Copied from transformers.models.bert.modeling_tf_bert.TFBertEmbeddings.call + def call( + self, + input_ids: tf.Tensor, + position_ids: tf.Tensor, + token_type_ids: tf.Tensor, + inputs_embeds: tf.Tensor, + training: bool = False, + ) -> tf.Tensor: """ Applies embedding based on inputs tensor. @@ -1094,7 +1139,7 @@ def call( ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMaskedLM.serving_output - def serving_output(self, output): + def serving_output(self, output: TFMaskedLMOutput) -> TFMaskedLMOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1212,7 +1257,7 @@ def call( ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification.serving_output - def serving_output(self, output): + def serving_output(self, output: TFSequenceClassifierOutput) -> TFSequenceClassifierOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1353,13 +1398,14 @@ def call( } ] ) - def serving(self, inputs): - output = self.call(inputs) + # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMultipleChoice.serving + def serving(self, inputs: Dict[str, tf.Tensor]): + output = self.call(input_ids=inputs) return self.serving_output(output) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMultipleChoice.serving_output - def serving_output(self, output): + def serving_output(self, output: TFMultipleChoiceModelOutput) -> TFMultipleChoiceModelOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1457,7 +1503,7 @@ def call( ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForTokenClassification.serving_output - def serving_output(self, output): + def serving_output(self, output: TFTokenClassifierOutput) -> TFTokenClassifierOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1572,7 +1618,7 @@ def call( ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForQuestionAnswering.serving_output - def serving_output(self, output): + def serving_output(self, output: TFQuestionAnsweringModelOutput) -> TFQuestionAnsweringModelOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = 
tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None diff --git a/src/transformers/models/funnel/modeling_tf_funnel.py b/src/transformers/models/funnel/modeling_tf_funnel.py index 819b553d3fb4..f79962820801 100644 --- a/src/transformers/models/funnel/modeling_tf_funnel.py +++ b/src/transformers/models/funnel/modeling_tf_funnel.py @@ -16,7 +16,7 @@ import warnings from dataclasses import dataclass -from typing import Optional, Tuple +from typing import Any, Dict, Optional, Tuple import tensorflow as tf @@ -83,16 +83,16 @@ def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, self.hidden_size = hidden_size self.initializer_range = initializer_range - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.weight = self.add_weight( name="weight", shape=[self.vocab_size, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), + initializer=get_initializer(self.initializer_range), ) - super().build(input_shape=input_shape) + super().build(input_shape) - def get_config(self): + def get_config(self) -> Dict[str, Any]: config = { "vocab_size": self.vocab_size, "hidden_size": self.hidden_size, @@ -102,14 +102,14 @@ def get_config(self): return dict(list(base_config.items()) + list(config.items())) - def call(self, input_ids): + def call(self, input_ids: tf.Tensor) -> tf.Tensor: flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1]) embeddings = tf.gather(params=self.weight, indices=flat_input_ids) embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0) + tensor=embeddings, shape=tf.concat(values=[shape_list(input_ids), [self.hidden_size]], axis=0) ) - embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size]) + embeddings.set_shape(input_ids.shape.as_list() + [self.hidden_size]) return embeddings @@ -1433,7 +1433,7 @@ def call( ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMaskedLM.serving_output - def serving_output(self, output): + def serving_output(self, output: TFMaskedLMOutput) -> TFMaskedLMOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1523,7 +1523,7 @@ def call( ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification.serving_output - def serving_output(self, output): + def serving_output(self, output: TFSequenceClassifierOutput) -> TFSequenceClassifierOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1653,13 +1653,13 @@ def call( } ] ) - def serving(self, inputs): - output = self.call(inputs) + def serving(self, inputs: Dict[str, tf.Tensor]): + output = self.call(input_ids=inputs) - return self.serving_output(output) + return self.serving_output(output=output) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMultipleChoice.serving_output - def serving_output(self, output): + def serving_output(self, output: TFMultipleChoiceModelOutput) -> TFMultipleChoiceModelOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1752,7 +1752,7 @@ def call( ) # Copied from 
transformers.models.bert.modeling_tf_bert.TFBertForTokenClassification.serving_output - def serving_output(self, output): + def serving_output(self, output: TFTokenClassifierOutput) -> TFTokenClassifierOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1857,7 +1857,7 @@ def call( ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForQuestionAnswering.serving_output - def serving_output(self, output): + def serving_output(self, output: TFQuestionAnsweringModelOutput) -> TFQuestionAnsweringModelOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None diff --git a/src/transformers/models/longformer/modeling_tf_longformer.py b/src/transformers/models/longformer/modeling_tf_longformer.py index c9f2838fb2e5..dfc18f24f2cd 100644 --- a/src/transformers/models/longformer/modeling_tf_longformer.py +++ b/src/transformers/models/longformer/modeling_tf_longformer.py @@ -16,7 +16,7 @@ import warnings from dataclasses import dataclass -from typing import Optional, Tuple +from typing import Any, Dict, Optional, Tuple import tensorflow as tf @@ -424,16 +424,16 @@ def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, self.hidden_size = hidden_size self.initializer_range = initializer_range - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.weight = self.add_weight( name="weight", shape=[self.vocab_size, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), + initializer=get_initializer(self.initializer_range), ) - super().build(input_shape=input_shape) + super().build(input_shape) - def get_config(self): + def get_config(self) -> Dict[str, Any]: config = { "vocab_size": self.vocab_size, "hidden_size": self.hidden_size, @@ -443,14 +443,14 @@ def get_config(self): return dict(list(base_config.items()) + list(config.items())) - def call(self, input_ids): + def call(self, input_ids: tf.Tensor) -> tf.Tensor: flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1]) embeddings = tf.gather(params=self.weight, indices=flat_input_ids) embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0) + tensor=embeddings, shape=tf.concat(values=[shape_list(input_ids), [self.hidden_size]], axis=0) ) - embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size]) + embeddings.set_shape(input_ids.shape.as_list() + [self.hidden_size]) return embeddings @@ -464,16 +464,16 @@ def __init__(self, type_vocab_size: int, hidden_size: int, initializer_range: fl self.hidden_size = hidden_size self.initializer_range = initializer_range - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.token_type_embeddings = self.add_weight( name="embeddings", shape=[self.type_vocab_size, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), + initializer=get_initializer(self.initializer_range), ) - super().build(input_shape=input_shape) + super().build(input_shape) - def get_config(self): + def get_config(self) -> Dict[str, Any]: config = { "type_vocab_size": self.type_vocab_size, "hidden_size": self.hidden_size, @@ -483,15 +483,15 @@ def get_config(self): return dict(list(base_config.items()) + 
list(config.items())) - def call(self, token_type_ids): + def call(self, token_type_ids: tf.Tensor) -> tf.Tensor: flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1]) one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype) embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings) embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=token_type_ids), [self.hidden_size]], axis=0) + tensor=embeddings, shape=tf.concat(values=[shape_list(token_type_ids), [self.hidden_size]], axis=0) ) - embeddings.set_shape(shape=token_type_ids.shape.as_list() + [self.hidden_size]) + embeddings.set_shape(token_type_ids.shape.as_list() + [self.hidden_size]) return embeddings @@ -508,7 +508,7 @@ def build(self, input_shape): self.position_embeddings = self.add_weight( name="embeddings", shape=[self.max_position_embeddings, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), + initializer=get_initializer(self.initializer_range), ) super().build(input_shape) @@ -527,10 +527,10 @@ def call(self, position_ids): flat_position_ids = tf.reshape(tensor=position_ids, shape=[-1]) embeddings = tf.gather(params=self.position_embeddings, indices=flat_position_ids) embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=position_ids), [self.hidden_size]], axis=0) + tensor=embeddings, shape=tf.concat(values=[shape_list(position_ids), [self.hidden_size]], axis=0) ) - embeddings.set_shape(shape=position_ids.shape.as_list() + [self.hidden_size]) + embeddings.set_shape(position_ids.shape.as_list() + [self.hidden_size]) return embeddings @@ -638,8 +638,8 @@ def create_position_ids_from_input_ids(self, input_ids): tensor=input_ids, shape=(input_ids_shape[0] * input_ids_shape[1], input_ids_shape[2]) ) - mask = tf.cast(x=tf.math.not_equal(x=input_ids, y=self.padding_idx), dtype=input_ids.dtype) - incremental_indices = tf.math.cumsum(x=mask, axis=1) * mask + mask = tf.cast(tf.math.not_equal(input_ids, self.padding_idx), dtype=input_ids.dtype) + incremental_indices = tf.math.cumsum(mask, axis=1) * mask return incremental_indices + self.padding_idx @@ -689,34 +689,34 @@ def call(self, input_ids=None, position_ids=None, token_type_ids=None, inputs_em return final_embeddings -# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate +# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->Longformer class TFLongformerIntermediate(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: LongformerConfig, **kwargs): super().__init__(**kwargs) self.dense = tf.keras.layers.experimental.EinsumDense( equation="abc,cd->abd", output_shape=(None, config.intermediate_size), bias_axes="d", - kernel_initializer=get_initializer(initializer_range=config.initializer_range), + kernel_initializer=get_initializer(config.initializer_range), name="dense", ) if isinstance(config.hidden_act, str): - self.intermediate_act_fn = get_tf_activation(activation_string=config.hidden_act) + self.intermediate_act_fn = get_tf_activation(config.hidden_act) else: self.intermediate_act_fn = config.hidden_act - def call(self, hidden_states): + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) return hidden_states -# Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput +# Copied from 
transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->Longformer class TFLongformerOutput(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: LongformerConfig, **kwargs): super().__init__(**kwargs) self.dense = tf.keras.layers.experimental.EinsumDense( @@ -729,7 +729,7 @@ def __init__(self, config, **kwargs): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - def call(self, hidden_states, input_tensor, training=False): + def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) hidden_states = self.dropout(inputs=hidden_states, training=training) hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) @@ -737,23 +737,23 @@ def call(self, hidden_states, input_tensor, training=False): return hidden_states -# Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler +# Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->Longformer class TFLongformerPooler(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: LongformerConfig, **kwargs): super().__init__(**kwargs) self.dense = tf.keras.layers.Dense( - config.hidden_size, + units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), activation="tanh", name="dense", ) - def call(self, hidden_states): + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: # We "pool" the model by simply taking the hidden state corresponding # to the first token. first_token_tensor = hidden_states[:, 0] - pooled_output = self.dense(first_token_tensor) + pooled_output = self.dense(inputs=first_token_tensor) return pooled_output diff --git a/src/transformers/models/lxmert/modeling_tf_lxmert.py b/src/transformers/models/lxmert/modeling_tf_lxmert.py index 16b72f2466be..f86a1b2560ab 100644 --- a/src/transformers/models/lxmert/modeling_tf_lxmert.py +++ b/src/transformers/models/lxmert/modeling_tf_lxmert.py @@ -18,7 +18,7 @@ import warnings from dataclasses import dataclass -from typing import Dict, Optional, Tuple +from typing import Any, Dict, Optional, Tuple import tensorflow as tf @@ -186,16 +186,16 @@ def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, self.hidden_size = hidden_size self.initializer_range = initializer_range - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.weight = self.add_weight( name="weight", shape=[self.vocab_size, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), + initializer=get_initializer(self.initializer_range), ) - super().build(input_shape=input_shape) + super().build(input_shape) - def get_config(self): + def get_config(self) -> Dict[str, Any]: config = { "vocab_size": self.vocab_size, "hidden_size": self.hidden_size, @@ -205,14 +205,14 @@ def get_config(self): return dict(list(base_config.items()) + list(config.items())) - def call(self, input_ids): + def call(self, input_ids: tf.Tensor) -> tf.Tensor: flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1]) embeddings = tf.gather(params=self.weight, indices=flat_input_ids) embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0) + tensor=embeddings, shape=tf.concat(values=[shape_list(input_ids), [self.hidden_size]], axis=0) ) - 
embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size]) + embeddings.set_shape(input_ids.shape.as_list() + [self.hidden_size]) return embeddings @@ -226,16 +226,16 @@ def __init__(self, type_vocab_size: int, hidden_size: int, initializer_range: fl self.hidden_size = hidden_size self.initializer_range = initializer_range - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.token_type_embeddings = self.add_weight( name="embeddings", shape=[self.type_vocab_size, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), + initializer=get_initializer(self.initializer_range), ) - super().build(input_shape=input_shape) + super().build(input_shape) - def get_config(self): + def get_config(self) -> Dict[str, Any]: config = { "type_vocab_size": self.type_vocab_size, "hidden_size": self.hidden_size, @@ -245,15 +245,15 @@ def get_config(self): return dict(list(base_config.items()) + list(config.items())) - def call(self, token_type_ids): + def call(self, token_type_ids: tf.Tensor) -> tf.Tensor: flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1]) one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype) embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings) embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=token_type_ids), [self.hidden_size]], axis=0) + tensor=embeddings, shape=tf.concat(values=[shape_list(token_type_ids), [self.hidden_size]], axis=0) ) - embeddings.set_shape(shape=token_type_ids.shape.as_list() + [self.hidden_size]) + embeddings.set_shape(token_type_ids.shape.as_list() + [self.hidden_size]) return embeddings @@ -267,16 +267,16 @@ def __init__(self, max_position_embeddings: int, hidden_size: int, initializer_r self.hidden_size = hidden_size self.initializer_range = initializer_range - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.position_embeddings = self.add_weight( name="embeddings", shape=[self.max_position_embeddings, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), + initializer=get_initializer(self.initializer_range), ) super().build(input_shape) - def get_config(self): + def get_config(self) -> Dict[str, Any]: config = { "max_position_embeddings": self.max_position_embeddings, "hidden_size": self.hidden_size, @@ -286,8 +286,8 @@ def get_config(self): return dict(list(base_config.items()) + list(config.items())) - def call(self, position_ids): - input_shape = shape_list(tensor=position_ids) + def call(self, position_ids: tf.Tensor) -> tf.Tensor: + input_shape = shape_list(position_ids) position_embeddings = self.position_embeddings[: input_shape[1], :] return tf.broadcast_to(input=position_embeddings, shape=input_shape) @@ -1129,11 +1129,13 @@ def call(self, hidden_states): # Copied from transformers.models.bert.modeling_tf_bert.TFBertPredictionHeadTransform with Bert->Lxmert class TFLxmertPredictionHeadTransform(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: LxmertConfig, **kwargs): super().__init__(**kwargs) self.dense = tf.keras.layers.Dense( - config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + units=config.hidden_size, + kernel_initializer=get_initializer(config.initializer_range), + name="dense", ) if isinstance(config.hidden_act, str): @@ -1143,17 +1145,17 @@ def __init__(self, config, **kwargs): 
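Not part of the diff — a small sketch (random weights, toy sizes) of the weight-tied LM head pattern annotated in this file's Lxmert hunks: hidden states are multiplied against the transposed input-embedding matrix and a per-vocabulary bias is added, so no separate output projection is stored.

```python
import tensorflow as tf

batch, seq_len, hidden, vocab = 2, 4, 8, 30
hidden_states = tf.random.normal((batch, seq_len, hidden))
embedding_weight = tf.random.normal((vocab, hidden))  # shared with the input embedding layer
bias = tf.zeros((vocab,))

flat = tf.reshape(tensor=hidden_states, shape=[-1, hidden])
scores = tf.matmul(a=flat, b=embedding_weight, transpose_b=True)       # (batch * seq_len, vocab)
scores = tf.reshape(tensor=scores, shape=[-1, seq_len, vocab]) + bias  # (batch, seq_len, vocab)
print(scores.shape)
```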
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - def call(self, hidden_states): - hidden_states = self.dense(hidden_states) + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) hidden_states = self.transform_act_fn(hidden_states) - hidden_states = self.LayerNorm(hidden_states) + hidden_states = self.LayerNorm(inputs=hidden_states) return hidden_states # Copied from transformers.models.bert.modeling_tf_bert.TFBertLMPredictionHead with Bert->Lxmert class TFLxmertLMPredictionHead(tf.keras.layers.Layer): - def __init__(self, config, input_embeddings, **kwargs): + def __init__(self, config: LxmertConfig, input_embeddings: tf.keras.layers.Layer, **kwargs): super().__init__(**kwargs) self.vocab_size = config.vocab_size @@ -1165,28 +1167,28 @@ def __init__(self, config, input_embeddings, **kwargs): # an output-only bias for each token. self.input_embeddings = input_embeddings - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") super().build(input_shape) - def get_output_embeddings(self): + def get_output_embeddings(self) -> tf.keras.layers.Layer: return self.input_embeddings - def set_output_embeddings(self, value): + def set_output_embeddings(self, value: tf.Variable): self.input_embeddings.weight = value self.input_embeddings.vocab_size = shape_list(value)[0] - def get_bias(self): + def get_bias(self) -> Dict[str, tf.Variable]: return {"bias": self.bias} - def set_bias(self, value): + def set_bias(self, value: tf.Variable): self.bias = value["bias"] self.vocab_size = shape_list(value["bias"])[0] - def call(self, hidden_states): + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.transform(hidden_states=hidden_states) - seq_length = shape_list(tensor=hidden_states)[1] + seq_length = shape_list(hidden_states)[1] hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size]) hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True) hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size]) @@ -1197,13 +1199,13 @@ def call(self, hidden_states): # Copied from transformers.models.bert.modeling_tf_bert.TFBertMLMHead with Bert->Lxmert class TFLxmertMLMHead(tf.keras.layers.Layer): - def __init__(self, config, input_embeddings, **kwargs): + def __init__(self, config: LxmertConfig, input_embeddings: tf.keras.layers.Layer, **kwargs): super().__init__(**kwargs) self.predictions = TFLxmertLMPredictionHead(config, input_embeddings, name="predictions") - def call(self, sequence_output): - prediction_scores = self.predictions(sequence_output) + def call(self, sequence_output: tf.Tensor) -> tf.Tensor: + prediction_scores = self.predictions(hidden_states=sequence_output) return prediction_scores diff --git a/src/transformers/models/mobilebert/modeling_tf_mobilebert.py b/src/transformers/models/mobilebert/modeling_tf_mobilebert.py index 40351514057d..a0ca1b0addf8 100644 --- a/src/transformers/models/mobilebert/modeling_tf_mobilebert.py +++ b/src/transformers/models/mobilebert/modeling_tf_mobilebert.py @@ -17,7 +17,7 @@ import warnings from dataclasses import dataclass -from typing import Optional, Tuple +from typing import Any, Dict, Optional, Tuple import tensorflow as tf @@ -116,16 +116,16 @@ def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, 
self.hidden_size = hidden_size self.initializer_range = initializer_range - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.weight = self.add_weight( name="weight", shape=[self.vocab_size, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), + initializer=get_initializer(self.initializer_range), ) - super().build(input_shape=input_shape) + super().build(input_shape) - def get_config(self): + def get_config(self) -> Dict[str, Any]: config = { "vocab_size": self.vocab_size, "hidden_size": self.hidden_size, @@ -135,14 +135,14 @@ def get_config(self): return dict(list(base_config.items()) + list(config.items())) - def call(self, input_ids): + def call(self, input_ids: tf.Tensor) -> tf.Tensor: flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1]) embeddings = tf.gather(params=self.weight, indices=flat_input_ids) embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0) + tensor=embeddings, shape=tf.concat(values=[shape_list(input_ids), [self.hidden_size]], axis=0) ) - embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size]) + embeddings.set_shape(input_ids.shape.as_list() + [self.hidden_size]) return embeddings @@ -156,16 +156,16 @@ def __init__(self, type_vocab_size: int, hidden_size: int, initializer_range: fl self.hidden_size = hidden_size self.initializer_range = initializer_range - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.token_type_embeddings = self.add_weight( name="embeddings", shape=[self.type_vocab_size, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), + initializer=get_initializer(self.initializer_range), ) - super().build(input_shape=input_shape) + super().build(input_shape) - def get_config(self): + def get_config(self) -> Dict[str, Any]: config = { "type_vocab_size": self.type_vocab_size, "hidden_size": self.hidden_size, @@ -175,15 +175,15 @@ def get_config(self): return dict(list(base_config.items()) + list(config.items())) - def call(self, token_type_ids): + def call(self, token_type_ids: tf.Tensor) -> tf.Tensor: flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1]) one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype) embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings) embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=token_type_ids), [self.hidden_size]], axis=0) + tensor=embeddings, shape=tf.concat(values=[shape_list(token_type_ids), [self.hidden_size]], axis=0) ) - embeddings.set_shape(shape=token_type_ids.shape.as_list() + [self.hidden_size]) + embeddings.set_shape(token_type_ids.shape.as_list() + [self.hidden_size]) return embeddings @@ -197,16 +197,16 @@ def __init__(self, max_position_embeddings: int, hidden_size: int, initializer_r self.hidden_size = hidden_size self.initializer_range = initializer_range - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.position_embeddings = self.add_weight( name="embeddings", shape=[self.max_position_embeddings, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), + initializer=get_initializer(self.initializer_range), ) super().build(input_shape) - def get_config(self): + def get_config(self) -> Dict[str, Any]: config = { "max_position_embeddings": self.max_position_embeddings, "hidden_size": 
self.hidden_size, @@ -216,8 +216,8 @@ def get_config(self): return dict(list(base_config.items()) + list(config.items())) - def call(self, position_ids): - input_shape = shape_list(tensor=position_ids) + def call(self, position_ids: tf.Tensor) -> tf.Tensor: + input_shape = shape_list(position_ids) position_embeddings = self.position_embeddings[: input_shape[1], :] return tf.broadcast_to(input=position_embeddings, shape=input_shape) @@ -1082,7 +1082,7 @@ def call( return outputs # Copied from transformers.models.bert.modeling_tf_bert.TFBertModel.serving_output - def serving_output(self, output): + def serving_output(self, output: TFBaseModelOutputWithPooling) -> TFBaseModelOutputWithPooling: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1296,7 +1296,7 @@ def call( ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMaskedLM.serving_output - def serving_output(self, output): + def serving_output(self, output: TFMaskedLMOutput) -> TFMaskedLMOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1410,7 +1410,7 @@ def call( ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForNextSentencePrediction.serving_output - def serving_output(self, output): + def serving_output(self, output: TFNextSentencePredictorOutput) -> TFNextSentencePredictorOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1519,7 +1519,7 @@ def call( ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification.serving_output - def serving_output(self, output): + def serving_output(self, output: TFSequenceClassifierOutput) -> TFSequenceClassifierOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1640,7 +1640,7 @@ def call( ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForQuestionAnswering.serving_output - def serving_output(self, output): + def serving_output(self, output: TFQuestionAnsweringModelOutput) -> TFQuestionAnsweringModelOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1793,13 +1793,14 @@ def call( } ] ) - def serving(self, inputs): - output = self.call(inputs) + # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMultipleChoice.serving + def serving(self, inputs: Dict[str, tf.Tensor]): + output = self.call(input_ids=inputs) return self.serving_output(output) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMultipleChoice.serving_output - def serving_output(self, output): + def serving_output(self, output: TFMultipleChoiceModelOutput) -> TFMultipleChoiceModelOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1908,7 +1909,7 @@ def call( ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForTokenClassification.serving_output - def serving_output(self, 
output): + def serving_output(self, output: TFTokenClassifierOutput) -> TFTokenClassifierOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None diff --git a/src/transformers/models/mpnet/modeling_tf_mpnet.py b/src/transformers/models/mpnet/modeling_tf_mpnet.py index e1ff0ba7015a..f7b82cf98646 100644 --- a/src/transformers/models/mpnet/modeling_tf_mpnet.py +++ b/src/transformers/models/mpnet/modeling_tf_mpnet.py @@ -18,6 +18,7 @@ import math import warnings +from typing import Any, Dict import tensorflow as tf @@ -95,16 +96,16 @@ def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, self.hidden_size = hidden_size self.initializer_range = initializer_range - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.weight = self.add_weight( name="weight", shape=[self.vocab_size, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), + initializer=get_initializer(self.initializer_range), ) - super().build(input_shape=input_shape) + super().build(input_shape) - def get_config(self): + def get_config(self) -> Dict[str, Any]: config = { "vocab_size": self.vocab_size, "hidden_size": self.hidden_size, @@ -114,14 +115,14 @@ def get_config(self): return dict(list(base_config.items()) + list(config.items())) - def call(self, input_ids): + def call(self, input_ids: tf.Tensor) -> tf.Tensor: flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1]) embeddings = tf.gather(params=self.weight, indices=flat_input_ids) embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0) + tensor=embeddings, shape=tf.concat(values=[shape_list(input_ids), [self.hidden_size]], axis=0) ) - embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size]) + embeddings.set_shape(input_ids.shape.as_list() + [self.hidden_size]) return embeddings @@ -139,7 +140,7 @@ def build(self, input_shape): self.position_embeddings = self.add_weight( name="embeddings", shape=[self.max_position_embeddings, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), + initializer=get_initializer(self.initializer_range), ) super().build(input_shape) @@ -158,10 +159,10 @@ def call(self, position_ids): flat_position_ids = tf.reshape(tensor=position_ids, shape=[-1]) embeddings = tf.gather(params=self.position_embeddings, indices=flat_position_ids) embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=position_ids), [self.hidden_size]], axis=0) + tensor=embeddings, shape=tf.concat(values=[shape_list(position_ids), [self.hidden_size]], axis=0) ) - embeddings.set_shape(shape=position_ids.shape.as_list() + [self.hidden_size]) + embeddings.set_shape(position_ids.shape.as_list() + [self.hidden_size]) return embeddings @@ -207,8 +208,8 @@ def create_position_ids_from_input_ids(self, input_ids): tensor=input_ids, shape=(input_ids_shape[0] * input_ids_shape[1], input_ids_shape[2]) ) - mask = tf.cast(x=tf.math.not_equal(x=input_ids, y=self.padding_idx), dtype=input_ids.dtype) - incremental_indices = tf.math.cumsum(x=mask, axis=1) * mask + mask = tf.cast(tf.math.not_equal(input_ids, self.padding_idx), dtype=input_ids.dtype) + incremental_indices = tf.math.cumsum(mask, axis=1) * mask return incremental_indices + self.padding_idx @@ -253,23 +254,23 @@ def call(self, input_ids=None, 
position_ids=None, inputs_embeds=None, training=F return final_embeddings -# Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler +# Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->MPNet class TFMPNetPooler(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: MPNetConfig, **kwargs): super().__init__(**kwargs) self.dense = tf.keras.layers.Dense( - config.hidden_size, + units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), activation="tanh", name="dense", ) - def call(self, hidden_states): + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: # We "pool" the model by simply taking the hidden state corresponding # to the first token. first_token_tensor = hidden_states[:, 0] - pooled_output = self.dense(first_token_tensor) + pooled_output = self.dense(inputs=first_token_tensor) return pooled_output @@ -291,28 +292,28 @@ def __init__(self, config, **kwargs): equation="abc,cde->abde", output_shape=(None, config.num_attention_heads, self.attention_head_size), bias_axes="de", - kernel_initializer=get_initializer(initializer_range=config.initializer_range), + kernel_initializer=get_initializer(config.initializer_range), name="q", ) self.k = tf.keras.layers.experimental.EinsumDense( equation="abc,cde->abde", output_shape=(None, config.num_attention_heads, self.attention_head_size), bias_axes="de", - kernel_initializer=get_initializer(initializer_range=config.initializer_range), + kernel_initializer=get_initializer(config.initializer_range), name="k", ) self.v = tf.keras.layers.experimental.EinsumDense( equation="abc,cde->abde", output_shape=(None, config.num_attention_heads, self.attention_head_size), bias_axes="de", - kernel_initializer=get_initializer(initializer_range=config.initializer_range), + kernel_initializer=get_initializer(config.initializer_range), name="v", ) self.o = tf.keras.layers.experimental.EinsumDense( equation="abcd,cde->abe", output_shape=(None, self.all_head_size), bias_axes="e", - kernel_initializer=get_initializer(initializer_range=config.initializer_range), + kernel_initializer=get_initializer(config.initializer_range), name="o", ) self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob) @@ -322,8 +323,8 @@ def call(self, hidden_states, attention_mask, head_mask, output_attentions, posi k = self.k(hidden_states) v = self.v(hidden_states) - dk = tf.cast(x=self.attention_head_size, dtype=q.dtype) - q = tf.multiply(x=q, y=tf.math.rsqrt(x=dk)) + dk = tf.cast(self.attention_head_size, dtype=q.dtype) + q = tf.multiply(q, y=tf.math.rsqrt(dk)) attention_scores = tf.einsum("aecd,abcd->acbe", k, q) # Apply relative position embedding (precomputed in MPNetEncoder) if provided. 
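The hunks above only re-style the padding-aware position-id computation (redundant keyword arguments dropped); the behaviour is unchanged: real tokens count upward from padding_idx + 1 while padding positions stay at padding_idx. A minimal sketch of that logic, with the padding_idx value chosen purely for illustration:

import tensorflow as tf

padding_idx = 1  # illustrative value; the real layer reads it from the model config
input_ids = tf.constant([[0, 5, 7, padding_idx, padding_idx]])
mask = tf.cast(tf.math.not_equal(input_ids, padding_idx), dtype=input_ids.dtype)
incremental_indices = tf.math.cumsum(mask, axis=1) * mask
position_ids = incremental_indices + padding_idx
print(position_ids)  # [[2 3 4 1 1]] -> non-pad tokens get increasing ids, pads keep padding_idx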
@@ -368,34 +369,34 @@ def call(self, input_tensor, attention_mask, head_mask, output_attentions, posit return outputs -# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate +# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->MPNet class TFMPNetIntermediate(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: MPNetConfig, **kwargs): super().__init__(**kwargs) self.dense = tf.keras.layers.experimental.EinsumDense( equation="abc,cd->abd", output_shape=(None, config.intermediate_size), bias_axes="d", - kernel_initializer=get_initializer(initializer_range=config.initializer_range), + kernel_initializer=get_initializer(config.initializer_range), name="dense", ) if isinstance(config.hidden_act, str): - self.intermediate_act_fn = get_tf_activation(activation_string=config.hidden_act) + self.intermediate_act_fn = get_tf_activation(config.hidden_act) else: self.intermediate_act_fn = config.hidden_act - def call(self, hidden_states): + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) return hidden_states -# Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput +# Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->MPNet class TFMPNetOutput(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: MPNetConfig, **kwargs): super().__init__(**kwargs) self.dense = tf.keras.layers.experimental.EinsumDense( @@ -408,7 +409,7 @@ def __init__(self, config, **kwargs): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - def call(self, hidden_states, input_tensor, training=False): + def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) hidden_states = self.dropout(inputs=hidden_states, training=training) hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) @@ -563,11 +564,11 @@ def __init__(self, config, **kwargs): self.embeddings = TFMPNetEmbeddings(config, name="embeddings") # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.get_input_embeddings - def get_input_embeddings(self): + def get_input_embeddings(self) -> tf.keras.layers.Layer: return self.embeddings.word_embeddings # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.set_input_embeddings - def set_input_embeddings(self, value): + def set_input_embeddings(self, value: tf.Variable): self.embeddings.word_embeddings.weight = value self.embeddings.word_embeddings.vocab_size = shape_list(value)[0] @@ -817,7 +818,7 @@ def call( return outputs # Copied from transformers.models.bert.modeling_tf_bert.TFBertModel.serving_output - def serving_output(self, output): + def serving_output(self, output: TFBaseModelOutputWithPooling) -> TFBaseModelOutputWithPooling: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -970,7 +971,7 @@ def call( ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMaskedLM.serving_output - def serving_output(self, output): + def serving_output(self, output: TFMaskedLMOutput) -> TFMaskedLMOutput: hs = tf.convert_to_tensor(output.hidden_states) 
if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1092,7 +1093,7 @@ def call( ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification.serving_output - def serving_output(self, output): + def serving_output(self, output: TFSequenceClassifierOutput) -> TFSequenceClassifierOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1230,7 +1231,7 @@ def serving(self, inputs): return self.serving_output(output) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMultipleChoice.serving_output - def serving_output(self, output): + def serving_output(self, output: TFMultipleChoiceModelOutput) -> TFMultipleChoiceModelOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1330,7 +1331,7 @@ def call( ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForTokenClassification.serving_output - def serving_output(self, output): + def serving_output(self, output: TFTokenClassifierOutput) -> TFTokenClassifierOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1443,7 +1444,7 @@ def call( ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForQuestionAnswering.serving_output - def serving_output(self, output): + def serving_output(self, output: TFQuestionAnsweringModelOutput) -> TFQuestionAnsweringModelOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None diff --git a/src/transformers/models/openai/modeling_tf_openai.py b/src/transformers/models/openai/modeling_tf_openai.py index 6a725e116e26..0d968026d9f0 100644 --- a/src/transformers/models/openai/modeling_tf_openai.py +++ b/src/transformers/models/openai/modeling_tf_openai.py @@ -660,7 +660,7 @@ def call( ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertLMHeadModel.serving_output - def serving_output(self, output): + def serving_output(self, output: TFCausalLMOutput) -> TFCausalLMOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -962,7 +962,7 @@ def get_seq_element(sequence_position, input_batch): ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification.serving_output - def serving_output(self, output): + def serving_output(self, output: TFSequenceClassifierOutput) -> TFSequenceClassifierOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None diff --git a/src/transformers/models/roberta/modeling_tf_roberta.py b/src/transformers/models/roberta/modeling_tf_roberta.py index e8f3e18880f5..5d84fc6cebac 100644 --- a/src/transformers/models/roberta/modeling_tf_roberta.py +++ b/src/transformers/models/roberta/modeling_tf_roberta.py @@ -16,7 +16,9 @@ """ TF 2.0 RoBERTa model. 
""" import warnings +from typing import Any, Dict, Optional, Tuple, Union +import numpy as np import tensorflow as tf from ...activations_tf import get_tf_activation @@ -37,6 +39,7 @@ ) from ...modeling_tf_utils import ( TFMaskedLanguageModelingLoss, + TFModelInputType, TFMultipleChoiceLoss, TFPreTrainedModel, TFQuestionAnsweringLoss, @@ -74,16 +77,16 @@ def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, self.hidden_size = hidden_size self.initializer_range = initializer_range - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.weight = self.add_weight( name="weight", shape=[self.vocab_size, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), + initializer=get_initializer(self.initializer_range), ) - super().build(input_shape=input_shape) + super().build(input_shape) - def get_config(self): + def get_config(self) -> Dict[str, Any]: config = { "vocab_size": self.vocab_size, "hidden_size": self.hidden_size, @@ -93,14 +96,14 @@ def get_config(self): return dict(list(base_config.items()) + list(config.items())) - def call(self, input_ids): + def call(self, input_ids: tf.Tensor) -> tf.Tensor: flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1]) embeddings = tf.gather(params=self.weight, indices=flat_input_ids) embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0) + tensor=embeddings, shape=tf.concat(values=[shape_list(input_ids), [self.hidden_size]], axis=0) ) - embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size]) + embeddings.set_shape(input_ids.shape.as_list() + [self.hidden_size]) return embeddings @@ -114,16 +117,16 @@ def __init__(self, type_vocab_size: int, hidden_size: int, initializer_range: fl self.hidden_size = hidden_size self.initializer_range = initializer_range - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.token_type_embeddings = self.add_weight( name="embeddings", shape=[self.type_vocab_size, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), + initializer=get_initializer(self.initializer_range), ) - super().build(input_shape=input_shape) + super().build(input_shape) - def get_config(self): + def get_config(self) -> Dict[str, Any]: config = { "type_vocab_size": self.type_vocab_size, "hidden_size": self.hidden_size, @@ -133,15 +136,15 @@ def get_config(self): return dict(list(base_config.items()) + list(config.items())) - def call(self, token_type_ids): + def call(self, token_type_ids: tf.Tensor) -> tf.Tensor: flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1]) one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype) embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings) embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=token_type_ids), [self.hidden_size]], axis=0) + tensor=embeddings, shape=tf.concat(values=[shape_list(token_type_ids), [self.hidden_size]], axis=0) ) - embeddings.set_shape(shape=token_type_ids.shape.as_list() + [self.hidden_size]) + embeddings.set_shape(token_type_ids.shape.as_list() + [self.hidden_size]) return embeddings @@ -159,7 +162,7 @@ def build(self, input_shape): self.position_embeddings = self.add_weight( name="embeddings", shape=[self.max_position_embeddings, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), + 
initializer=get_initializer(self.initializer_range), ) super().build(input_shape) @@ -178,10 +181,10 @@ def call(self, position_ids): flat_position_ids = tf.reshape(tensor=position_ids, shape=[-1]) embeddings = tf.gather(params=self.position_embeddings, indices=flat_position_ids) embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=position_ids), [self.hidden_size]], axis=0) + tensor=embeddings, shape=tf.concat(values=[shape_list(position_ids), [self.hidden_size]], axis=0) ) - embeddings.set_shape(shape=position_ids.shape.as_list() + [self.hidden_size]) + embeddings.set_shape(position_ids.shape.as_list() + [self.hidden_size]) return embeddings @@ -235,8 +238,8 @@ def create_position_ids_from_input_ids(self, input_ids): tensor=input_ids, shape=(input_ids_shape[0] * input_ids_shape[1], input_ids_shape[2]) ) - mask = tf.cast(x=tf.math.not_equal(x=input_ids, y=self.padding_idx), dtype=input_ids.dtype) - incremental_indices = tf.math.cumsum(x=mask, axis=1) * mask + mask = tf.cast(tf.math.not_equal(input_ids, self.padding_idx), dtype=input_ids.dtype) + incremental_indices = tf.math.cumsum(mask, axis=1) * mask return incremental_indices + self.padding_idx @@ -286,30 +289,30 @@ def call(self, input_ids=None, position_ids=None, token_type_ids=None, inputs_em return final_embeddings -# Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler +# Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->Roberta class TFRobertaPooler(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: RobertaConfig, **kwargs): super().__init__(**kwargs) self.dense = tf.keras.layers.Dense( - config.hidden_size, + units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), activation="tanh", name="dense", ) - def call(self, hidden_states): + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: # We "pool" the model by simply taking the hidden state corresponding # to the first token. 
first_token_tensor = hidden_states[:, 0] - pooled_output = self.dense(first_token_tensor) + pooled_output = self.dense(inputs=first_token_tensor) return pooled_output # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->Roberta class TFRobertaSelfAttention(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: RobertaConfig, **kwargs): super().__init__(**kwargs) if config.hidden_size % config.num_attention_heads != 0: @@ -324,50 +327,57 @@ def __init__(self, config, **kwargs): equation="abc,cde->abde", output_shape=(None, config.num_attention_heads, self.attention_head_size), bias_axes="de", - kernel_initializer=get_initializer(initializer_range=config.initializer_range), + kernel_initializer=get_initializer(config.initializer_range), name="query", ) self.key = tf.keras.layers.experimental.EinsumDense( equation="abc,cde->abde", output_shape=(None, config.num_attention_heads, self.attention_head_size), bias_axes="de", - kernel_initializer=get_initializer(initializer_range=config.initializer_range), + kernel_initializer=get_initializer(config.initializer_range), name="key", ) self.value = tf.keras.layers.experimental.EinsumDense( equation="abc,cde->abde", output_shape=(None, config.num_attention_heads, self.attention_head_size), bias_axes="de", - kernel_initializer=get_initializer(initializer_range=config.initializer_range), + kernel_initializer=get_initializer(config.initializer_range), name="value", ) self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) - def call(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False, training=False): + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + output_attentions: bool, + training: bool = False, + ) -> Tuple[tf.Tensor]: query_layer = self.query(inputs=hidden_states) key_layer = self.key(inputs=hidden_states) value_layer = self.value(inputs=hidden_states) # Take the dot product between "query" and "key" to get the raw # attention scores. - dk = tf.cast(x=self.attention_head_size, dtype=query_layer.dtype) - query_layer = tf.multiply(x=query_layer, y=tf.math.rsqrt(x=dk)) + dk = tf.cast(self.attention_head_size, dtype=query_layer.dtype) + query_layer = tf.multiply(query_layer, tf.math.rsqrt(dk)) attention_scores = tf.einsum("aecd,abcd->acbe", key_layer, query_layer) if attention_mask is not None: # Apply the attention mask is (precomputed for all layers in TFRobertaModel call() function) - attention_scores = attention_scores + attention_mask + attention_scores = tf.add(attention_scores, attention_mask) # Normalize the attention scores to probabilities. attention_probs = tf.nn.softmax(logits=attention_scores, axis=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. 
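# A tiny numeric sketch (values illustrative only) of why the additive attention mask added
# above works: masked positions carry a large negative score and vanish after the softmax.
import tensorflow as tf
scores = tf.constant([[2.0, 1.0, 0.5]])
additive_mask = tf.constant([[0.0, 0.0, -10000.0]])  # 0.0 = attend, -10000.0 = masked
probs = tf.nn.softmax(tf.add(scores, additive_mask), axis=-1)
# probs ~ [[0.73, 0.27, 0.00]]: the masked position contributes essentially nothing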
- attention_probs = self.dropout(attention_probs, training=training) + attention_probs = self.dropout(inputs=attention_probs, training=training) # Mask heads if we want to if head_mask is not None: - attention_scores = attention_scores * head_mask + attention_scores = tf.multiply(attention_scores, head_mask) attention_output = tf.einsum("acbe,aecd->abcd", attention_probs, value_layer) outputs = (attention_output, attention_probs) if output_attentions else (attention_output,) @@ -375,9 +385,9 @@ def call(self, hidden_states, attention_mask=None, head_mask=None, output_attent return outputs -# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput +# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->Roberta class TFRobertaSelfOutput(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: RobertaConfig, **kwargs): super().__init__(**kwargs) if config.hidden_size % config.num_attention_heads != 0: @@ -393,13 +403,13 @@ def __init__(self, config, **kwargs): equation="abcd,cde->abe", output_shape=(None, self.all_head_size), bias_axes="e", - kernel_initializer=get_initializer(initializer_range=config.initializer_range), + kernel_initializer=get_initializer(config.initializer_range), name="dense", ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - def call(self, hidden_states, input_tensor, training=False): + def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) hidden_states = self.dropout(inputs=hidden_states, training=training) hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) @@ -409,7 +419,7 @@ def call(self, hidden_states, input_tensor, training=False): # Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->Roberta class TFRobertaAttention(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: RobertaConfig, **kwargs): super().__init__(**kwargs) self.self_attention = TFRobertaSelfAttention(config, name="self") @@ -418,44 +428,57 @@ def __init__(self, config, **kwargs): def prune_heads(self, heads): raise NotImplementedError - def call(self, input_tensor, attention_mask, head_mask, output_attentions, training=False): + def call( + self, + input_tensor: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + output_attentions: bool, + training: bool = False, + ) -> Tuple[tf.Tensor]: self_outputs = self.self_attention( - input_tensor, attention_mask, head_mask, output_attentions, training=training + hidden_states=input_tensor, + attention_mask=attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + training=training, + ) + attention_output = self.dense_output( + hidden_states=self_outputs[0], input_tensor=input_tensor, training=training ) - attention_output = self.dense_output(self_outputs[0], input_tensor, training=training) outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them return outputs -# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate +# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->Roberta class TFRobertaIntermediate(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: RobertaConfig, **kwargs): super().__init__(**kwargs) self.dense = 
tf.keras.layers.experimental.EinsumDense( equation="abc,cd->abd", output_shape=(None, config.intermediate_size), bias_axes="d", - kernel_initializer=get_initializer(initializer_range=config.initializer_range), + kernel_initializer=get_initializer(config.initializer_range), name="dense", ) if isinstance(config.hidden_act, str): - self.intermediate_act_fn = get_tf_activation(activation_string=config.hidden_act) + self.intermediate_act_fn = get_tf_activation(config.hidden_act) else: self.intermediate_act_fn = config.hidden_act - def call(self, hidden_states): + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) return hidden_states -# Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput +# Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->Roberta class TFRobertaOutput(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: RobertaConfig, **kwargs): super().__init__(**kwargs) self.dense = tf.keras.layers.experimental.EinsumDense( @@ -468,7 +491,7 @@ def __init__(self, config, **kwargs): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - def call(self, hidden_states, input_tensor, training=False): + def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) hidden_states = self.dropout(inputs=hidden_states, training=training) hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) @@ -478,20 +501,33 @@ def call(self, hidden_states, input_tensor, training=False): # Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->Roberta class TFRobertaLayer(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: RobertaConfig, **kwargs): super().__init__(**kwargs) self.attention = TFRobertaAttention(config, name="attention") self.intermediate = TFRobertaIntermediate(config, name="intermediate") self.bert_output = TFRobertaOutput(config, name="output") - def call(self, hidden_states, attention_mask, head_mask, output_attentions, training=False): + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + output_attentions: bool, + training: bool = False, + ) -> Tuple[tf.Tensor]: attention_outputs = self.attention( - hidden_states, attention_mask, head_mask, output_attentions, training=training + input_tensor=hidden_states, + attention_mask=attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + training=training, ) attention_output = attention_outputs[0] - intermediate_output = self.intermediate(attention_output) - layer_output = self.bert_output(intermediate_output, attention_output, training=training) + intermediate_output = self.intermediate(hidden_states=attention_output) + layer_output = self.bert_output( + hidden_states=intermediate_output, input_tensor=attention_output, training=training + ) outputs = (layer_output,) + attention_outputs[1:] # add attentions if we output them return outputs @@ -499,21 +535,21 @@ def call(self, hidden_states, attention_mask, head_mask, output_attentions, trai # Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->Roberta class TFRobertaEncoder(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + 
def __init__(self, config: RobertaConfig, **kwargs): super().__init__(**kwargs) self.layer = [TFRobertaLayer(config, name="layer_._{}".format(i)) for i in range(config.num_hidden_layers)] def call( self, - hidden_states, - attention_mask, - head_mask, - output_attentions, - output_hidden_states, - return_dict, - training=False, - ): + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + output_attentions: bool, + output_hidden_states: bool, + return_dict: bool, + training: bool = False, + ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]: all_hidden_states = () if output_hidden_states else None all_attentions = () if output_attentions else None @@ -522,7 +558,11 @@ def call( all_hidden_states = all_hidden_states + (hidden_states,) layer_outputs = layer_module( - hidden_states, attention_mask, head_mask[i], output_attentions, training=training + hidden_states=hidden_states, + attention_mask=attention_mask, + head_mask=head_mask[i], + output_attentions=output_attentions, + training=training, ) hidden_states = layer_outputs[0] @@ -560,11 +600,11 @@ def __init__(self, config, add_pooling_layer=True, **kwargs): self.embeddings = TFRobertaEmbeddings(config, name="embeddings") # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.get_input_embeddings - def get_input_embeddings(self): + def get_input_embeddings(self) -> tf.keras.layers.Layer: return self.embeddings.word_embeddings # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.set_input_embeddings - def set_input_embeddings(self, value): + def set_input_embeddings(self, value: tf.Variable): self.embeddings.word_embeddings.weight = value self.embeddings.word_embeddings.vocab_size = shape_list(value)[0] @@ -579,18 +619,18 @@ class PreTrainedModel # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.call def call( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - training=False, + input_ids: Optional[TFModelInputType] = None, + attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: bool = False, **kwargs, - ): + ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]: inputs = input_processing( func=self.call, config=self.config, @@ -610,23 +650,23 @@ def call( if inputs["input_ids"] is not None and inputs["inputs_embeds"] is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif inputs["input_ids"] is not None: - input_shape = shape_list(inputs["input_ids"]) + input_shape = shape_list(tensor=inputs["input_ids"]) elif inputs["inputs_embeds"] is not None: - input_shape = shape_list(inputs["inputs_embeds"])[:-1] + input_shape = shape_list(tensor=inputs["inputs_embeds"])[:-1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") if inputs["attention_mask"] is None: - inputs["attention_mask"] = tf.fill(input_shape, 1) + inputs["attention_mask"] = tf.fill(dims=input_shape, value=1) if inputs["token_type_ids"] is None: - inputs["token_type_ids"] 
= tf.fill(input_shape, 0) + inputs["token_type_ids"] = tf.fill(dims=input_shape, value=0) embedding_output = self.embeddings( - inputs["input_ids"], - inputs["position_ids"], - inputs["token_type_ids"], - inputs["inputs_embeds"], + input_ids=inputs["input_ids"], + position_ids=inputs["position_ids"], + token_type_ids=inputs["token_type_ids"], + inputs_embeds=inputs["inputs_embeds"], training=inputs["training"], ) @@ -642,8 +682,8 @@ def call( # positions we want to attend and -10000.0 for masked positions. # Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. - extended_attention_mask = tf.cast(extended_attention_mask, embedding_output.dtype) - extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + extended_attention_mask = tf.cast(extended_attention_mask, dtype=embedding_output.dtype) + extended_attention_mask = tf.multiply(tf.subtract(1.0, extended_attention_mask), -10000.0) # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head @@ -653,21 +693,20 @@ def call( if inputs["head_mask"] is not None: raise NotImplementedError else: - inputs["head_mask"] = [None] * self.num_hidden_layers - # head_mask = tf.constant([0] * self.num_hidden_layers) + inputs["head_mask"] = [None] * self.config.num_hidden_layers encoder_outputs = self.encoder( - embedding_output, - extended_attention_mask, - inputs["head_mask"], - inputs["output_attentions"], - inputs["output_hidden_states"], - inputs["return_dict"], + hidden_states=embedding_output, + attention_mask=extended_attention_mask, + head_mask=inputs["head_mask"], + output_attentions=inputs["output_attentions"], + output_hidden_states=inputs["output_hidden_states"], + return_dict=inputs["return_dict"], training=inputs["training"], ) sequence_output = encoder_outputs[0] - pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + pooled_output = self.pooler(hidden_states=sequence_output) if self.pooler is not None else None if not inputs["return_dict"]: return ( @@ -857,7 +896,7 @@ def call( return outputs # Copied from transformers.models.bert.modeling_tf_bert.TFBertModel.serving_output - def serving_output(self, output): + def serving_output(self, output: TFBaseModelOutputWithPooling) -> TFBaseModelOutputWithPooling: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1013,7 +1052,7 @@ def call( ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMaskedLM.serving_output - def serving_output(self, output): + def serving_output(self, output: TFMaskedLMOutput) -> TFMaskedLMOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1136,7 +1175,7 @@ def call( ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification.serving_output - def serving_output(self, output): + def serving_output(self, output: TFSequenceClassifierOutput) -> TFSequenceClassifierOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1280,7 +1319,7 @@ def serving(self, inputs): return self.serving_output(output) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMultipleChoice.serving_output - 
def serving_output(self, output): + def serving_output(self, output: TFMultipleChoiceModelOutput) -> TFMultipleChoiceModelOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1383,7 +1422,7 @@ def call( ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForTokenClassification.serving_output - def serving_output(self, output): + def serving_output(self, output: TFTokenClassifierOutput) -> TFTokenClassifierOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1498,7 +1537,7 @@ def call( ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForQuestionAnswering.serving_output - def serving_output(self, output): + def serving_output(self, output: TFQuestionAnsweringModelOutput) -> TFQuestionAnsweringModelOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None diff --git a/src/transformers/models/xlm/modeling_tf_xlm.py b/src/transformers/models/xlm/modeling_tf_xlm.py index 8cd3c7ef4814..203de846a904 100644 --- a/src/transformers/models/xlm/modeling_tf_xlm.py +++ b/src/transformers/models/xlm/modeling_tf_xlm.py @@ -19,7 +19,7 @@ import itertools import warnings from dataclasses import dataclass -from typing import Optional, Tuple +from typing import Dict, Optional, Tuple import numpy as np import tensorflow as tf @@ -1016,7 +1016,7 @@ def call( ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification.serving_output - def serving_output(self, output): + def serving_output(self, output: TFSequenceClassifierOutput) -> TFSequenceClassifierOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1177,13 +1177,14 @@ def call( } ] ) - def serving(self, inputs): - output = self.call(inputs) + # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMultipleChoice.serving + def serving(self, inputs: Dict[str, tf.Tensor]): + output = self.call(input_ids=inputs) return self.serving_output(output) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMultipleChoice.serving_output - def serving_output(self, output): + def serving_output(self, output: TFMultipleChoiceModelOutput) -> TFMultipleChoiceModelOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1291,7 +1292,7 @@ def call( ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForTokenClassification.serving_output - def serving_output(self, output): + def serving_output(self, output: TFTokenClassifierOutput) -> TFTokenClassifierOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1410,7 +1411,7 @@ def call( ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForQuestionAnswering.serving_output - def serving_output(self, output): + def serving_output(self, output: TFQuestionAnsweringModelOutput) -> 
TFQuestionAnsweringModelOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py index 49c31a5c3b89..001e43278069 100644 --- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py +++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py @@ -16,11 +16,10 @@ {% if cookiecutter.is_encoder_decoder_model == "False" %} +from typing import Any, Dict, Optional, Tuple, Union +import numpy as np import tensorflow as tf -from tensorflow.keras import layers - -from transformers.modeling_tf_outputs import TFCausalLMOutput from ...activations_tf import get_tf_activation from ...file_utils import ( @@ -32,6 +31,7 @@ from ...modeling_tf_outputs import ( TFBaseModelOutput, TFBaseModelOutputWithPooling, + TFCausalLMOutput, TFMaskedLMOutput, TFMultipleChoiceModelOutput, TFQuestionAnsweringModelOutput, @@ -41,6 +41,7 @@ from ...modeling_tf_utils import ( TFCausalLanguageModelingLoss, TFMaskedLanguageModelingLoss, + TFModelInputType, TFMultipleChoiceLoss, TFPreTrainedModel, TFQuestionAnsweringLoss, @@ -76,16 +77,16 @@ def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float, self.hidden_size = hidden_size self.initializer_range = initializer_range - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.weight = self.add_weight( name="weight", shape=[self.vocab_size, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), + initializer=get_initializer(self.initializer_range), ) - super().build(input_shape=input_shape) + super().build(input_shape) - def get_config(self): + def get_config(self) -> Dict[str, Any]: config = { "vocab_size": self.vocab_size, "hidden_size": self.hidden_size, @@ -95,14 +96,14 @@ def get_config(self): return dict(list(base_config.items()) + list(config.items())) - def call(self, input_ids): + def call(self, input_ids: tf.Tensor) -> tf.Tensor: flat_input_ids = tf.reshape(tensor=input_ids, shape=[-1]) embeddings = tf.gather(params=self.weight, indices=flat_input_ids) embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=input_ids), [self.hidden_size]], axis=0) + tensor=embeddings, shape=tf.concat(values=[shape_list(input_ids), [self.hidden_size]], axis=0) ) - embeddings.set_shape(shape=input_ids.shape.as_list() + [self.hidden_size]) + embeddings.set_shape(input_ids.shape.as_list() + [self.hidden_size]) return embeddings @@ -116,16 +117,16 @@ def __init__(self, type_vocab_size: int, hidden_size: int, initializer_range: fl self.hidden_size = hidden_size self.initializer_range = initializer_range - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.token_type_embeddings = self.add_weight( name="embeddings", shape=[self.type_vocab_size, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), + initializer=get_initializer(self.initializer_range), ) - super().build(input_shape=input_shape) + super().build(input_shape) - def get_config(self): + def 
get_config(self) -> Dict[str, Any]: config = { "type_vocab_size": self.type_vocab_size, "hidden_size": self.hidden_size, @@ -135,15 +136,15 @@ def get_config(self): return dict(list(base_config.items()) + list(config.items())) - def call(self, token_type_ids): + def call(self, token_type_ids: tf.Tensor) -> tf.Tensor: flat_token_type_ids = tf.reshape(tensor=token_type_ids, shape=[-1]) one_hot_data = tf.one_hot(indices=flat_token_type_ids, depth=self.type_vocab_size, dtype=self._compute_dtype) embeddings = tf.matmul(a=one_hot_data, b=self.token_type_embeddings) embeddings = tf.reshape( - tensor=embeddings, shape=tf.concat(values=[shape_list(tensor=token_type_ids), [self.hidden_size]], axis=0) + tensor=embeddings, shape=tf.concat(values=[shape_list(token_type_ids), [self.hidden_size]], axis=0) ) - embeddings.set_shape(shape=token_type_ids.shape.as_list() + [self.hidden_size]) + embeddings.set_shape(token_type_ids.shape.as_list() + [self.hidden_size]) return embeddings @@ -157,16 +158,16 @@ def __init__(self, max_position_embeddings: int, hidden_size: int, initializer_r self.hidden_size = hidden_size self.initializer_range = initializer_range - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.position_embeddings = self.add_weight( name="embeddings", shape=[self.max_position_embeddings, self.hidden_size], - initializer=get_initializer(initializer_range=self.initializer_range), + initializer=get_initializer(self.initializer_range), ) super().build(input_shape) - def get_config(self): + def get_config(self) -> Dict[str, Any]: config = { "max_position_embeddings": self.max_position_embeddings, "hidden_size": self.hidden_size, @@ -176,8 +177,8 @@ def get_config(self): return dict(list(base_config.items()) + list(config.items())) - def call(self, position_ids): - input_shape = shape_list(tensor=position_ids) + def call(self, position_ids: tf.Tensor) -> tf.Tensor: + input_shape = shape_list(position_ids) position_embeddings = self.position_embeddings[: input_shape[1], :] return tf.broadcast_to(input=position_embeddings, shape=input_shape) @@ -187,7 +188,7 @@ def call(self, position_ids): class TF{{cookiecutter.camelcase_modelname}}Embeddings(tf.keras.layers.Layer): """Construct the embeddings from word, position and token_type embeddings.""" - def __init__(self, config, **kwargs): + def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs): super().__init__(**kwargs) self.word_embeddings = TF{{cookiecutter.camelcase_modelname}}WordEmbeddings( @@ -212,7 +213,14 @@ def __init__(self, config, **kwargs): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - def call(self, input_ids=None, position_ids=None, token_type_ids=None, inputs_embeds=None, training=False): + def call( + self, + input_ids: tf.Tensor, + position_ids: tf.Tensor, + token_type_ids: tf.Tensor, + inputs_embeds: tf.Tensor, + training: bool = False, + ) -> tf.Tensor: """ Applies embedding based on inputs tensor. 
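The get_config() -> Dict[str, Any] pattern typed throughout these files merges the layer's own constructor arguments into the base Layer config so the custom embedding layers round-trip through Keras serialization. A minimal stand-in sketch (the class below is illustrative, not the library code, and assumes get_initializer amounts to a truncated-normal initializer):

from typing import Any, Dict

import tensorflow as tf


class WordEmbeddings(tf.keras.layers.Layer):
    """Simplified stand-in mirroring the word-embedding layers in this diff."""

    def __init__(self, vocab_size: int, hidden_size: int, initializer_range: float = 0.02, **kwargs):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.initializer_range = initializer_range

    def build(self, input_shape: tf.TensorShape):
        self.weight = self.add_weight(
            name="weight",
            shape=[self.vocab_size, self.hidden_size],
            initializer=tf.keras.initializers.TruncatedNormal(stddev=self.initializer_range),
        )
        super().build(input_shape)

    def get_config(self) -> Dict[str, Any]:
        # layer-specific arguments merged with the base Layer config (name, dtype, ...)
        config = {
            "vocab_size": self.vocab_size,
            "hidden_size": self.hidden_size,
            "initializer_range": self.initializer_range,
        }
        base_config = super().get_config()
        return dict(list(base_config.items()) + list(config.items()))

    def call(self, input_ids: tf.Tensor) -> tf.Tensor:
        flat_input_ids = tf.reshape(input_ids, shape=[-1])
        embeddings = tf.gather(params=self.weight, indices=flat_input_ids)
        return tf.reshape(embeddings, shape=tf.concat([tf.shape(input_ids), [self.hidden_size]], axis=0))


layer = WordEmbeddings(vocab_size=10, hidden_size=4)
outputs = layer(tf.constant([[2, 5, 9]]))                   # shape (1, 3, 4)
restored = WordEmbeddings.from_config(layer.get_config())   # rebuilds the layer from its config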
@@ -222,18 +230,18 @@ def call(self, input_ids=None, position_ids=None, token_type_ids=None, inputs_em assert not (input_ids is None and inputs_embeds is None) if input_ids is not None: - inputs_embeds = self.word_embeddings(input_ids=input_ids) + inputs_embeds = self.word_embeddings(input_ids) if token_type_ids is None: - input_shape = shape_list(tensor=inputs_embeds)[:-1] + input_shape = shape_list(inputs_embeds)[:-1] token_type_ids = tf.fill(dims=input_shape, value=0) if position_ids is None: - position_embeds = self.position_embeddings(position_ids=inputs_embeds) + position_embeds = self.position_embeddings(inputs_embeds) else: - position_embeds = self.position_embeddings(position_ids=position_ids) + position_embeds = self.position_embeddings(position_ids) - token_type_embeds = self.token_type_embeddings(token_type_ids=token_type_ids) + token_type_embeds = self.token_type_embeddings(token_type_ids) final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds, token_type_embeds]) final_embeddings = self.LayerNorm(inputs=final_embeddings) final_embeddings = self.dropout(inputs=final_embeddings, training=training) @@ -244,7 +252,7 @@ def call(self, input_ids=None, position_ids=None, token_type_ids=None, inputs_em # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->{{cookiecutter.camelcase_modelname}} class TF{{cookiecutter.camelcase_modelname}}SelfAttention(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs): super().__init__(**kwargs) if config.hidden_size % config.num_attention_heads != 0: @@ -259,50 +267,57 @@ def __init__(self, config, **kwargs): equation="abc,cde->abde", output_shape=(None, config.num_attention_heads, self.attention_head_size), bias_axes="de", - kernel_initializer=get_initializer(initializer_range=config.initializer_range), + kernel_initializer=get_initializer(config.initializer_range), name="query", ) self.key = tf.keras.layers.experimental.EinsumDense( equation="abc,cde->abde", output_shape=(None, config.num_attention_heads, self.attention_head_size), bias_axes="de", - kernel_initializer=get_initializer(initializer_range=config.initializer_range), + kernel_initializer=get_initializer(config.initializer_range), name="key", ) self.value = tf.keras.layers.experimental.EinsumDense( equation="abc,cde->abde", output_shape=(None, config.num_attention_heads, self.attention_head_size), bias_axes="de", - kernel_initializer=get_initializer(initializer_range=config.initializer_range), + kernel_initializer=get_initializer(config.initializer_range), name="value", ) self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) - def call(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False, training=False): + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + output_attentions: bool, + training: bool = False, + ) -> Tuple[tf.Tensor]: query_layer = self.query(inputs=hidden_states) key_layer = self.key(inputs=hidden_states) value_layer = self.value(inputs=hidden_states) # Take the dot product between "query" and "key" to get the raw # attention scores. 
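# A quick equivalence check for the scaling re-styled just below (behaviour unchanged):
# multiplying the query by rsqrt(d_k) before the dot product gives the same attention
# scores as dividing q.k^T by sqrt(d_k). Shapes and values are illustrative only.
import tensorflow as tf
d_k = 64
q = tf.random.normal((1, 8, d_k))  # (batch, seq_len, attention_head_size)
k = tf.random.normal((1, 8, d_k))
dk = tf.cast(d_k, dtype=q.dtype)
scaled_query = tf.matmul(tf.multiply(q, tf.math.rsqrt(dk)), k, transpose_b=True)
scaled_scores = tf.matmul(q, k, transpose_b=True) / tf.math.sqrt(dk)
tf.debugging.assert_near(scaled_query, scaled_scores)  # equal up to float rounding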
- dk = tf.cast(x=self.attention_head_size, dtype=query_layer.dtype) - query_layer = tf.multiply(x=query_layer, y=tf.math.rsqrt(x=dk)) + dk = tf.cast(self.attention_head_size, dtype=query_layer.dtype) + query_layer = tf.multiply(query_layer, tf.math.rsqrt(dk)) attention_scores = tf.einsum("aecd,abcd->acbe", key_layer, query_layer) if attention_mask is not None: # Apply the attention mask is (precomputed for all layers in TF{{cookiecutter.camelcase_modelname}}Model call() function) - attention_scores = attention_scores + attention_mask + attention_scores = tf.add(attention_scores, attention_mask) # Normalize the attention scores to probabilities. attention_probs = tf.nn.softmax(logits=attention_scores, axis=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. - attention_probs = self.dropout(attention_probs, training=training) + attention_probs = self.dropout(inputs=attention_probs, training=training) # Mask heads if we want to if head_mask is not None: - attention_scores = attention_scores * head_mask + attention_scores = tf.multiply(attention_scores, head_mask) attention_output = tf.einsum("acbe,aecd->abcd", attention_probs, value_layer) outputs = (attention_output, attention_probs) if output_attentions else (attention_output,) @@ -310,9 +325,9 @@ def call(self, hidden_states, attention_mask=None, head_mask=None, output_attent return outputs -# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput +# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->{{cookiecutter.camelcase_modelname}} class TF{{cookiecutter.camelcase_modelname}}SelfOutput(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs): super().__init__(**kwargs) if config.hidden_size % config.num_attention_heads != 0: @@ -328,13 +343,13 @@ def __init__(self, config, **kwargs): equation="abcd,cde->abe", output_shape=(None, self.all_head_size), bias_axes="e", - kernel_initializer=get_initializer(initializer_range=config.initializer_range), + kernel_initializer=get_initializer(config.initializer_range), name="dense", ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - def call(self, hidden_states, input_tensor, training=False): + def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) hidden_states = self.dropout(inputs=hidden_states, training=training) hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) @@ -344,7 +359,7 @@ def call(self, hidden_states, input_tensor, training=False): # Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->{{cookiecutter.camelcase_modelname}} class TF{{cookiecutter.camelcase_modelname}}Attention(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs): super().__init__(**kwargs) self.self_attention = TF{{cookiecutter.camelcase_modelname}}SelfAttention(config, name="self") @@ -353,11 +368,24 @@ def __init__(self, config, **kwargs): def prune_heads(self, heads): raise NotImplementedError - def call(self, input_tensor, attention_mask, head_mask, output_attentions, training=False): + def call( + self, + input_tensor: 
tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + output_attentions: bool, + training: bool = False, + ) -> Tuple[tf.Tensor]: self_outputs = self.self_attention( - input_tensor, attention_mask, head_mask, output_attentions, training=training + hidden_states=input_tensor, + attention_mask=attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + training=training, + ) + attention_output = self.dense_output( + hidden_states=self_outputs[0], input_tensor=input_tensor, training=training ) - attention_output = self.dense_output(self_outputs[0], input_tensor, training=training) outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them return outputs @@ -365,23 +393,23 @@ def call(self, input_tensor, attention_mask, head_mask, output_attentions, train # Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->{{cookiecutter.camelcase_modelname}} class TF{{cookiecutter.camelcase_modelname}}Intermediate(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs): super().__init__(**kwargs) self.dense = tf.keras.layers.experimental.EinsumDense( equation="abc,cd->abd", output_shape=(None, config.intermediate_size), bias_axes="d", - kernel_initializer=get_initializer(initializer_range=config.initializer_range), + kernel_initializer=get_initializer(config.initializer_range), name="dense", ) if isinstance(config.hidden_act, str): - self.intermediate_act_fn = get_tf_activation(activation_string=config.hidden_act) + self.intermediate_act_fn = get_tf_activation(config.hidden_act) else: self.intermediate_act_fn = config.hidden_act - def call(self, hidden_states): + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) @@ -390,7 +418,7 @@ def call(self, hidden_states): # Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->{{cookiecutter.camelcase_modelname}} class TF{{cookiecutter.camelcase_modelname}}Output(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs): super().__init__(**kwargs) self.dense = tf.keras.layers.experimental.EinsumDense( @@ -403,7 +431,7 @@ def __init__(self, config, **kwargs): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - def call(self, hidden_states, input_tensor, training=False): + def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) hidden_states = self.dropout(inputs=hidden_states, training=training) hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) @@ -412,42 +440,52 @@ def call(self, hidden_states, input_tensor, training=False): class TF{{cookiecutter.camelcase_modelname}}Layer(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs): super().__init__(**kwargs) self.attention = TF{{cookiecutter.camelcase_modelname}}Attention(config, name="attention") self.intermediate = TF{{cookiecutter.camelcase_modelname}}Intermediate(config, name="intermediate") - self.{{cookiecutter.lowercase_modelname}}_output = TF{{cookiecutter.camelcase_modelname}}Output(config, 
name="output") + self.bert_output = TF{{cookiecutter.camelcase_modelname}}Output(config, name="output") - # Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer.call with bert->{{cookiecutter.lowercase_modelname}} - def call(self, hidden_states, attention_mask, head_mask, output_attentions, training=False): + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + output_attentions: bool, + training: bool = False, + ) -> Tuple[tf.Tensor]: attention_outputs = self.attention( - hidden_states, attention_mask, head_mask, output_attentions, training=training + input_tensor=hidden_states, + attention_mask=attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + training=training, ) attention_output = attention_outputs[0] - intermediate_output = self.intermediate(attention_output) - layer_output = self.{{cookiecutter.lowercase_modelname}}_output(intermediate_output, attention_output, training=training) + intermediate_output = self.intermediate(hidden_states=attention_output) + layer_output = self.bert_output(hidden_states=intermediate_output, input_tensor=attention_output, training=training) outputs = (layer_output,) + attention_outputs[1:] # add attentions if we output them return outputs # Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->{{cookiecutter.camelcase_modelname}} class TF{{cookiecutter.camelcase_modelname}}Encoder(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs): super().__init__(**kwargs) self.layer = [TF{{cookiecutter.camelcase_modelname}}Layer(config, name="layer_._{}".format(i)) for i in range(config.num_hidden_layers)] def call( self, - hidden_states, - attention_mask, - head_mask, - output_attentions, - output_hidden_states, - return_dict, - training=False, - ): + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + output_attentions: bool, + output_hidden_states: bool, + return_dict: bool, + training: bool = False, + ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]: all_hidden_states = () if output_hidden_states else None all_attentions = () if output_attentions else None @@ -456,7 +494,11 @@ def call( all_hidden_states = all_hidden_states + (hidden_states,) layer_outputs = layer_module( - hidden_states, attention_mask, head_mask[i], output_attentions, training=training + hidden_states=hidden_states, + attention_mask=attention_mask, + head_mask=head_mask[i], + output_attentions=output_attentions, + training=training, ) hidden_states = layer_outputs[0] @@ -477,11 +519,13 @@ def call( # Copied from transformers.models.bert.modeling_tf_bert.TFBertPredictionHead with Bert->{{cookiecutter.camelcase_modelname}} class TF{{cookiecutter.camelcase_modelname}}PredictionHeadTransform(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs): super().__init__(**kwargs) self.dense = tf.keras.layers.Dense( - config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + units=config.hidden_size, + kernel_initializer=get_initializer(config.initializer_range), + name="dense", ) if isinstance(config.hidden_act, str): @@ -491,50 +535,50 @@ def __init__(self, config, **kwargs): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - def call(self, hidden_states): - hidden_states = 
self.dense(hidden_states) + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) hidden_states = self.transform_act_fn(hidden_states) - hidden_states = self.LayerNorm(hidden_states) + hidden_states = self.LayerNorm(inputs=hidden_states) return hidden_states # Copied from transformers.models.bert.modeling_tf_bert.TFBertLMPredictionHead with Bert->{{cookiecutter.camelcase_modelname}} class TF{{cookiecutter.camelcase_modelname}}LMPredictionHead(tf.keras.layers.Layer): - def __init__(self, config, input_embeddings, **kwargs): + def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, input_embeddings: tf.keras.layers.Layer, **kwargs): super().__init__(**kwargs) self.vocab_size = config.vocab_size self.hidden_size = config.hidden_size - + self.transform = TF{{cookiecutter.camelcase_modelname}}PredictionHeadTransform(config, name="transform") # The output weights are the same as the input embeddings, but there is # an output-only bias for each token. self.input_embeddings = input_embeddings - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") - super().build(input_shape) - - def get_output_embeddings(self): + super().build(input_shape=input_shape) + + def get_output_embeddings(self) -> tf.keras.layers.Layer: return self.input_embeddings - def set_output_embeddings(self, value): + def set_output_embeddings(self, value: tf.Variable): self.input_embeddings.weight = value self.input_embeddings.vocab_size = shape_list(value)[0] - def get_bias(self): + def get_bias(self) -> Dict[str, tf.Variable]: return {"bias": self.bias} - def set_bias(self, value): + def set_bias(self, value: tf.Variable): self.bias = value["bias"] self.vocab_size = shape_list(value["bias"])[0] - def call(self, hidden_states): + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.transform(hidden_states=hidden_states) - seq_length = shape_list(tensor=hidden_states)[1] + seq_length = shape_list(hidden_states)[1] hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size]) hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True) hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size]) @@ -545,13 +589,13 @@ def call(self, hidden_states): # Copied from transformers.models.bert.modeling_tf_bert.TFBertMLMHead with Bert->{{cookiecutter.camelcase_modelname}} class TF{{cookiecutter.camelcase_modelname}}MLMHead(tf.keras.layers.Layer): - def __init__(self, config, input_embeddings, **kwargs): + def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, input_embeddings: tf.keras.layers.Layer, **kwargs): super().__init__(**kwargs) self.predictions = TF{{cookiecutter.camelcase_modelname}}LMPredictionHead(config, input_embeddings, name="predictions") - def call(self, sequence_output): - prediction_scores = self.predictions(sequence_output) + def call(self, sequence_output: tf.Tensor) -> tf.Tensor: + prediction_scores = self.predictions(hidden_states=sequence_output) return prediction_scores @@ -560,47 +604,45 @@ def call(self, sequence_output): class TF{{cookiecutter.camelcase_modelname}}MainLayer(tf.keras.layers.Layer): config_class = {{cookiecutter.camelcase_modelname}}Config - def __init__(self, config, **kwargs): + def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, add_pooling_layer: bool = True, 
**kwargs): super().__init__(**kwargs) self.config = config - self.num_hidden_layers = config.num_hidden_layers - self.initializer_range = config.initializer_range - self.output_attentions = config.output_attentions - self.output_hidden_states = config.output_hidden_states - self.return_dict = config.use_return_dict + self.embeddings = TF{{cookiecutter.camelcase_modelname}}Embeddings(config, name="embeddings") self.encoder = TF{{cookiecutter.camelcase_modelname}}Encoder(config, name="encoder") - self.config = config - def get_input_embeddings(self): + # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.get_input_embeddings + def get_input_embeddings(self) -> tf.keras.layers.Layer: return self.embeddings.word_embeddings - def set_input_embeddings(self, value): + # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.set_input_embeddings + def set_input_embeddings(self, value: tf.Variable): self.embeddings.word_embeddings.weight = value self.embeddings.word_embeddings.vocab_size = shape_list(value)[0] + # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer._prune_heads def _prune_heads(self, heads_to_prune): - """Prunes heads of the model. - heads_to_prune: dict of {layer_num: list of heads to prune in this layer} - See base class PreTrainedModel + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel """ raise NotImplementedError def call( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - training=False, + input_ids: Optional[TFModelInputType] = None, + attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: bool = False, **kwargs, - ): + ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]: inputs = input_processing( func=self.call, config=self.config, @@ -616,7 +658,7 @@ def call( training=training, kwargs_call=kwargs, ) - + if inputs["input_ids"] is not None and inputs["inputs_embeds"] is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif inputs["input_ids"] is not None: @@ -627,16 +669,16 @@ def call( raise ValueError("You have to specify either input_ids or inputs_embeds") if inputs["attention_mask"] is None: - inputs["attention_mask"] = tf.fill(input_shape, 1) + inputs["attention_mask"] = tf.fill(dims=input_shape, value=1) if inputs["token_type_ids"] is None: - inputs["token_type_ids"] = tf.fill(input_shape, 0) + inputs["token_type_ids"] = tf.fill(dims=input_shape, value=0) embedding_output = self.embeddings( - inputs["input_ids"], - inputs["position_ids"], - inputs["token_type_ids"], - inputs["inputs_embeds"], + input_ids=inputs["input_ids"], + position_ids=inputs["position_ids"], + token_type_ids=inputs["token_type_ids"], + inputs_embeds=inputs["inputs_embeds"], training=inputs["training"], ) @@ -652,8 +694,8 @@ def call( # positions we want to attend and -10000.0 for masked positions. 
# Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. - extended_attention_mask = tf.cast(extended_attention_mask, embedding_output.dtype) - extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + extended_attention_mask = tf.cast(extended_attention_mask, dtype=embedding_output.dtype) + extended_attention_mask = tf.multiply(tf.subtract(1.0, extended_attention_mask), -10000.0) # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head @@ -663,21 +705,21 @@ def call( if inputs["head_mask"] is not None: raise NotImplementedError else: - inputs["head_mask"] = [None] * self.num_hidden_layers + inputs["head_mask"] = [None] * self.config.num_hidden_layers encoder_outputs = self.encoder( - embedding_output, - extended_attention_mask, - inputs["head_mask"], - inputs["output_attentions"], - inputs["output_hidden_states"], - inputs["return_dict"], + hidden_states=embedding_output, + attention_mask=extended_attention_mask, + head_mask=inputs["head_mask"], + output_attentions=inputs["output_attentions"], + output_hidden_states=inputs["output_hidden_states"], + return_dict=inputs["return_dict"], training=inputs["training"], ) sequence_output = encoder_outputs[0] - if not return_dict: + if not inputs["return_dict"]: return ( sequence_output, ) + encoder_outputs[1:] @@ -736,43 +778,41 @@ class TF{{cookiecutter.camelcase_modelname}}PreTrainedModel(TFPreTrainedModel): {{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING = r""" Args: - input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`): + input_ids (:obj:`np.ndarray`, :obj:`tf.Tensor`, :obj:`List[tf.Tensor]` :obj:`Dict[str, tf.Tensor]` or :obj:`Dict[str, np.ndarray]` and each example must have the shape :obj:`({0})`): Indices of input sequence tokens in the vocabulary. - Indices can be obtained using :class:`~transformers.{{cookiecutter.camelcase_modelname}}Tokenizer`. - See :func:`transformers.PreTrainedTokenizer.__call__` and - :func:`transformers.PreTrainedTokenizer.encode` for details. + Indices can be obtained using :class:`~transformers.BertTokenizer`. See + :func:`transformers.PreTrainedTokenizer.__call__` and :func:`transformers.PreTrainedTokenizer.encode` for + details. `What are input IDs? <../glossary.html#input-ids>`__ - attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`): - Mask to avoid performing attention on padding token indices. - Mask values selected in ``[0, 1]``: + attention_mask (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: - 1 for tokens that are **not masked**, - - 0 for tokens that are **maked**. + - 0 for tokens that are **masked**. `What are attention masks? <../glossary.html#attention-mask>`__ - token_type_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`): - Segment token indices to indicate first and second portions of the inputs. - Indices are selected in ``[0, 1]``: + token_type_ids (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, + 1]``: - 0 corresponds to a `sentence A` token, - 1 corresponds to a `sentence B` token. `What are token type IDs? 
<../glossary.html#token-type-ids>`__ - position_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`): - Indices of positions of each input sequence tokens in the position embeddings. - Selected in the range ``[0, config.max_position_embeddings - 1]``. + position_ids (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + config.max_position_embeddings - 1]``. `What are position IDs? <../glossary.html#position-ids>`__ - head_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): - Mask to nullify selected heads of the self-attention modules. - Mask values selected in ``[0, 1]``: + head_mask (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: - 1 indicates the head is **not masked**, - 0 indicates the head is **masked**. - inputs_embeds (:obj:`tf.Tensor` of shape :obj:`({0}, hidden_size)`, `optional`): + inputs_embeds (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`({0}, hidden_size)`, `optional`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert :obj:`input_ids` indices into associated vectors than the model's internal embedding lookup matrix. @@ -795,7 +835,7 @@ class TF{{cookiecutter.camelcase_modelname}}PreTrainedModel(TFPreTrainedModel): {{cookiecutter.uppercase_modelname}}_START_DOCSTRING, ) class TF{{cookiecutter.camelcase_modelname}}Model(TF{{cookiecutter.camelcase_modelname}}PreTrainedModel): - def __init__(self, config, *inputs, **kwargs): + def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.{{cookiecutter.lowercase_modelname}} = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="{{cookiecutter.lowercase_modelname}}") @@ -809,18 +849,18 @@ def __init__(self, config, *inputs, **kwargs): ) def call( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - training=False, + input_ids: Optional[TFModelInputType] = None, + attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: Optional[bool] = False, **kwargs, - ): + ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]: inputs = input_processing( func=self.call, config=self.config, @@ -852,17 +892,19 @@ def call( return outputs # Copied from transformers.models.distilbert.modeling_tf_distilbert.TFDistilBertModel.serving_output - def serving_output(self, output): + def serving_output(self, output: TFBaseModelOutput) -> TFBaseModelOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if 
self.config.output_attentions else None - return TFBaseModelOutput(last_hidden_state=output.last_hidden_state, hidden_states=hs, attentions=attns) + return TFBaseModelOutput( + last_hidden_state=output.last_hidden_state, hidden_states=hs, attentions=attns, + ) @add_start_docstrings("""{{cookiecutter.modelname}} Model with a `language modeling` head on top. """, {{cookiecutter.uppercase_modelname}}_START_DOCSTRING) class TF{{cookiecutter.camelcase_modelname}}ForMaskedLM(TF{{cookiecutter.camelcase_modelname}}PreTrainedModel, TFMaskedLanguageModelingLoss): - def __init__(self, config, *inputs, **kwargs): + def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) if config.is_decoder: @@ -872,9 +914,9 @@ def __init__(self, config, *inputs, **kwargs): ) self.{{cookiecutter.lowercase_modelname}} = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="{{cookiecutter.lowercase_modelname}}") - self.mlm = TF{{cookiecutter.camelcase_modelname}}MLMHead(config, self.{{cookiecutter.lowercase_modelname}}.embeddings.word_embeddings, name="mlm___cls") + self.mlm = TF{{cookiecutter.camelcase_modelname}}MLMHead(config, inputs_embeddings=self.{{cookiecutter.lowercase_modelname}}.embeddings.word_embeddings, name="mlm___cls") - def get_lm_head(self): + def get_lm_head(self) -> tf.keras.layers.Layer: return self.mlm.predictions @add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -886,25 +928,24 @@ def get_lm_head(self): ) def call( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - labels=None, - training=False, + input_ids: Optional[TFModelInputType] = None, + attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[Union[np.ndarray, tf.Tensor]] = None, + training: Optional[bool] = False, **kwargs, - ): + ) -> Union[TFMaskedLMOutput, Tuple[tf.Tensor]]: r""" - labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): - Labels for computing the masked language modeling loss. - Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) - Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels - in ``[0, ..., config.vocab_size]`` + labels (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the masked language modeling loss. 
Indices should be in ``[-100, 0, ..., + config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored + (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` """ inputs = input_processing( func=self.call, @@ -923,7 +964,7 @@ def call( kwargs_call=kwargs, ) outputs = self.{{cookiecutter.lowercase_modelname}}( - inputs["input_ids"], + input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], token_type_ids=inputs["token_type_ids"], position_ids=inputs["position_ids"], @@ -934,13 +975,14 @@ def call( return_dict=inputs["return_dict"], training=inputs["training"], ) - sequence_output = outputs[0] - prediction_scores = self.mlm(sequence_output, training=inputs["training"]) - loss = None if inputs["labels"] is None else self.compute_loss(inputs["labels"], prediction_scores) + prediction_scores = self.mlm(sequence_output=sequence_output, training=inputs["training"]) + loss = ( + None if inputs["labels"] is None else self.compute_loss(labels=inputs["labels"], logits=prediction_scores) + ) if not inputs["return_dict"]: - output = (prediction_scores,) + outputs[1:] + output = (prediction_scores,) + outputs[2:] return ((loss,) + output) if loss is not None else output return TFMaskedLMOutput( @@ -951,7 +993,7 @@ def call( ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMaskedLM.serving_output - def serving_output(self, output): + def serving_output(self, output: TFMaskedLMOutput) -> TFMaskedLMOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -962,16 +1004,16 @@ def serving_output(self, output): ) class TF{{cookiecutter.camelcase_modelname}}ForCausalLM(TF{{cookiecutter.camelcase_modelname}}PreTrainedModel, TFCausalLanguageModelingLoss): - def __init__(self, config, *inputs, **kwargs): + def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) if not config.is_decoder: logger.warning("If you want to use `TF{{cookiecutter.camelcase_modelname}}ForCausalLM` as a standalone, add `is_decoder=True.`") self.{{cookiecutter.lowercase_modelname}} = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="{{cookiecutter.lowercase_modelname}}") - self.mlm = TF{{cookiecutter.camelcase_modelname}}MLMHead(config, self.{{cookiecutter.lowercase_modelname}}.embeddings.word_embeddings, name="mlm___cls") + self.mlm = TF{{cookiecutter.camelcase_modelname}}MLMHead(config, inputs_embeddings=self.{{cookiecutter.lowercase_modelname}}.embeddings.word_embeddings, name="mlm___cls") - def get_lm_head(self): + def get_lm_head(self) -> tf.keras.layers.Layer: return self.mlm.predictions @add_code_sample_docstrings( @@ -982,21 +1024,21 @@ def get_lm_head(self): ) def call( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - labels=None, - training=False, + input_ids: Optional[TFModelInputType] = None, + attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None, + output_attentions: 
Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[Union[np.ndarray, tf.Tensor]] = None, + training: Optional[bool] = False, **kwargs, - ): + ) -> Union[TFCausalLMOutput, Tuple[tf.Tensor]]: r""" - labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + labels (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`): Labels for computing the cross entropy classification loss. Indices should be in ``[0, ..., config.vocab_size - 1]``. """ @@ -1017,7 +1059,7 @@ def call( kwargs_call=kwargs, ) outputs = self.{{cookiecutter.lowercase_modelname}}( - inputs["input_ids"], + input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], token_type_ids=inputs["token_type_ids"], position_ids=inputs["position_ids"], @@ -1029,17 +1071,17 @@ def call( training=inputs["training"], ) sequence_output = outputs[0] - logits = self.mlm(sequence_output, training=inputs["training"]) + logits = self.mlm(sequence_output=sequence_output, training=inputs["training"]) loss = None if inputs["labels"] is not None: # shift labels to the left and cut last logit token logits = logits[:, :-1] labels = inputs["labels"][:, 1:] - loss = self.compute_loss(labels, logits) + loss = self.compute_loss(labels=labels, logits=logits) if not inputs["return_dict"]: - output = (logits,) + outputs[1:] + output = (logits,) + outputs[2:] return ((loss,) + output) if loss is not None else output return TFCausalLMOutput( @@ -1050,37 +1092,41 @@ def call( ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertLMHeadModel.serving_output - def serving_output(self, output): + def serving_output(self, output: TFCausalLMOutput) -> TFCausalLMOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None return TFCausalLMOutput(logits=output.logits, hidden_states=hs, attentions=attns) + class TF{{cookiecutter.camelcase_modelname}}ClassificationHead(tf.keras.layers.Layer): """Head for sentence-level classification tasks.""" - def __init__(self, config, **kwargs): - super().__init__(**kwargs) + def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) self.dense = tf.keras.layers.Dense( - config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) self.out_proj = tf.keras.layers.Dense( - config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj" + units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj" ) - self.config = config + if isinstance(config.hidden_act, str): + self.classifier_act_fn = get_tf_activation(config.hidden_act) + else: + self.classifier_act_fn = config.hidden_act - def call(self, inputs, **kwargs): - x = inputs[:, 0, :] # take token (equiv. 
to [CLS]) - x = self.dropout(x) - x = self.dense(x) - x = get_tf_activation(self.config.hidden_act)(x) - x = self.dropout(x) - x = self.out_proj(x) + def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: + hidden_states = hidden_states[:, 0, :] # take token (equiv. to [CLS]) + hidden_states = self.dropout(inputs=hidden_states, training=training) + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.classifier_act_fn(hidden_states) + hidden_states = self.dropout(inputs=hidden_states, training=training) + hidden_states = self.out_proj(hidden_states) - return x + return hidden_states @add_start_docstrings( @@ -1089,9 +1135,11 @@ def call(self, inputs, **kwargs): {{cookiecutter.uppercase_modelname}}_START_DOCSTRING, ) class TF{{cookiecutter.camelcase_modelname}}ForSequenceClassification(TF{{cookiecutter.camelcase_modelname}}PreTrainedModel, TFSequenceClassificationLoss): - def __init__(self, config, *inputs, **kwargs): + def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) + self.num_labels = config.num_labels + self.{{cookiecutter.lowercase_modelname}} = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="{{cookiecutter.lowercase_modelname}}") self.classifier = TF{{cookiecutter.camelcase_modelname}}ClassificationHead(config, name="classifier") @@ -1104,24 +1152,23 @@ def __init__(self, config, *inputs, **kwargs): ) def call( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - labels=None, - training=False, + input_ids: Optional[TFModelInputType] = None, + attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[Union[np.ndarray, tf.Tensor]] = None, + training: Optional[bool] = False, **kwargs, - ): + ) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]: r""" - labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`): - Labels for computing the sequence classification/regression loss. - Indices should be in :obj:`[0, ..., config.num_labels - 1]`. - If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), + labels (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., + config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
""" inputs = input_processing( @@ -1141,7 +1188,7 @@ def call( kwargs_call=kwargs, ) outputs = self.{{cookiecutter.lowercase_modelname}}( - inputs["input_ids"], + input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], token_type_ids=inputs["token_type_ids"], position_ids=inputs["position_ids"], @@ -1152,8 +1199,8 @@ def call( return_dict=inputs["return_dict"], training=inputs["training"], ) - logits = self.classifier(outputs[0], training=inputs["training"]) - loss = None if inputs["labels"] is None else self.compute_loss(inputs["labels"], logits) + logits = self.classifier(hidden_states=outputs[0], training=inputs["training"]) + loss = None if inputs["labels"] is None else self.compute_loss(labels=inputs["labels"], logits=logits) if not inputs["return_dict"]: output = (logits,) + outputs[1:] @@ -1168,7 +1215,7 @@ def call( ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification.serving_output - def serving_output(self, output): + def serving_output(self, output: TFSequenceClassifierOutput) -> TFSequenceClassifierOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1181,26 +1228,26 @@ def serving_output(self, output): {{cookiecutter.uppercase_modelname}}_START_DOCSTRING, ) class TF{{cookiecutter.camelcase_modelname}}ForMultipleChoice(TF{{cookiecutter.camelcase_modelname}}PreTrainedModel, TFMultipleChoiceLoss): - def __init__(self, config, *inputs, **kwargs): + def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.{{cookiecutter.lowercase_modelname}} = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="{{cookiecutter.lowercase_modelname}}") self.sequence_summary = TFSequenceSummary( - config, initializer_range=config.initializer_range, name="sequence_summary" + config, config.initializer_range, name="sequence_summary" ) self.classifier = tf.keras.layers.Dense( - 1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + units=1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) @property - def dummy_inputs(self): + def dummy_inputs(self) -> Dict[str, tf.Tensor]: """ Dummy inputs to build the network. 
Returns: tf.Tensor with dummy inputs """ - return {"input_ids": tf.convert_to_tensor(MULTIPLE_CHOICE_DUMMY_INPUTS)} + return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)} @add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) @add_code_sample_docstrings( @@ -1211,24 +1258,24 @@ def dummy_inputs(self): ) def call( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - labels=None, - training=False, + input_ids: Optional[TFModelInputType] = None, + attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[Union[np.ndarray, tf.Tensor]] = None, + training: Optional[bool] = False, **kwargs, - ): + ) -> Union[TFMultipleChoiceModelOutput, Tuple[tf.Tensor]]: r""" - labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`): - Labels for computing the multiple choice classification loss. - Indices should be in ``[0, ..., num_choices]`` where :obj:`num_choices` is the size of the second dimension - of the input tensors. (See :obj:`input_ids` above) + labels (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., + num_choices]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. 
(See + :obj:`input_ids` above) """ inputs = input_processing( func=self.call, @@ -1246,7 +1293,7 @@ def call( training=training, kwargs_call=kwargs, ) - + if inputs["input_ids"] is not None: num_choices = shape_list(inputs["input_ids"])[1] seq_length = shape_list(inputs["input_ids"])[2] @@ -1254,37 +1301,47 @@ def call( num_choices = shape_list(inputs["inputs_embeds"])[1] seq_length = shape_list(inputs["inputs_embeds"])[2] - flat_input_ids = tf.reshape(inputs["input_ids"], (-1, seq_length)) if inputs["input_ids"] is not None else None + flat_input_ids = ( + tf.reshape(tensor=inputs["input_ids"], shape=(-1, seq_length)) if inputs["input_ids"] is not None else None + ) flat_attention_mask = ( - tf.reshape(inputs["attention_mask"], (-1, seq_length)) if inputs["attention_mask"] is not None else None + tf.reshape(tensor=inputs["attention_mask"], shape=(-1, seq_length)) + if inputs["attention_mask"] is not None + else None ) flat_token_type_ids = ( - tf.reshape(inputs["token_type_ids"], (-1, seq_length)) if inputs["token_type_ids"] is not None else None + tf.reshape(tensor=inputs["token_type_ids"], shape=(-1, seq_length)) + if inputs["token_type_ids"] is not None + else None ) flat_position_ids = ( - tf.reshape(inputs["position_ids"], (-1, seq_length)) if inputs["position_ids"] is not None else None + tf.reshape(tensor=inputs["position_ids"], shape=(-1, seq_length)) + if inputs["position_ids"] is not None + else None ) flat_inputs_embeds = ( - tf.reshape(inputs["inputs_embeds"], (-1, seq_length, shape_list(inputs["inputs_embeds"])[3])) + tf.reshape( + tensor=inputs["inputs_embeds"], shape=(-1, seq_length, shape_list(inputs["inputs_embeds"])[3]) + ) if inputs["inputs_embeds"] is not None else None ) outputs = self.{{cookiecutter.lowercase_modelname}}( - flat_input_ids, - flat_attention_mask, - flat_token_type_ids, - flat_position_ids, - inputs["head_mask"], - flat_inputs_embeds, - inputs["output_attentions"], - inputs["output_hidden_states"], + input_ids=flat_input_ids, + attention_mask=flat_attention_mask, + token_type_ids=flat_token_type_ids, + position_ids=flat_position_ids, + head_mask=inputs["head_mask"], + inputs_embeds=flat_inputs_embeds, + output_attentions=inputs["output_attentions"], + output_hidden_states=inputs["output_hidden_states"], return_dict=inputs["return_dict"], training=inputs["training"], ) - logits = self.sequence_summary(outputs[0], training=inputs["training"]) - logits = self.classifier(logits) - reshaped_logits = tf.reshape(logits, (-1, num_choices)) - loss = None if inputs["labels"] is None else self.compute_loss(inputs["labels"], reshaped_logits) + logits = self.sequence_summary(inputs=outputs[0], training=inputs["training"]) + logits = self.classifier(inputs=logits) + reshaped_logits = tf.reshape(tensor=logits, shape=(-1, num_choices)) + loss = None if inputs["labels"] is None else self.compute_loss(labels=inputs["labels"], logits=reshaped_logits) if not inputs["return_dict"]: output = (reshaped_logits,) + outputs[1:] @@ -1303,13 +1360,14 @@ def call( "attention_mask": tf.TensorSpec((None, None, None), tf.int32, name="attention_mask"), "token_type_ids": tf.TensorSpec((None, None, None), tf.int32, name="token_type_ids"), }]) - def serving(self, inputs): - output = self.call(inputs) - + # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMultipleChoice.serving + def serving(self, inputs: Dict[str, tf.Tensor]): + output = self.call(input_ids=inputs) + return self.serving_output(output) # Copied from 
transformers.models.bert.modeling_tf_bert.TFBertForMultipleChoice.serving_output - def serving_output(self, output): + def serving_output(self, output: TFMultipleChoiceModelOutput) -> TFMultipleChoiceModelOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1323,14 +1381,15 @@ def serving_output(self, output): ) class TF{{cookiecutter.camelcase_modelname}}ForTokenClassification(TF{{cookiecutter.camelcase_modelname}}PreTrainedModel, TFTokenClassificationLoss): - def __init__(self, config, *inputs, **kwargs): + def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels + self.{{cookiecutter.lowercase_modelname}} = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="{{cookiecutter.lowercase_modelname}}") - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) self.classifier = tf.keras.layers.Dense( - config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) @add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1342,23 +1401,23 @@ def __init__(self, config, *inputs, **kwargs): ) def call( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - labels=None, - training=False, + input_ids: Optional[TFModelInputType] = None, + attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[Union[np.ndarray, tf.Tensor]] = None, + training: Optional[bool] = False, **kwargs, - ): + ) -> Union[TFTokenClassifierOutput, Tuple[tf.Tensor]]: r""" - labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): - Labels for computing the token classification loss. - Indices should be in ``[0, ..., config.num_labels - 1]``. + labels (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - + 1]``. 
""" inputs = input_processing( func=self.call, @@ -1377,7 +1436,7 @@ def call( kwargs_call=kwargs, ) outputs = self.{{cookiecutter.lowercase_modelname}}( - inputs["input_ids"], + input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], token_type_ids=inputs["token_type_ids"], position_ids=inputs["position_ids"], @@ -1389,9 +1448,9 @@ def call( training=inputs["training"], ) sequence_output = outputs[0] - sequence_output = self.dropout(sequence_output, training=inputs["training"]) - logits = self.classifier(sequence_output) - loss = None if inputs["labels"] is None else self.compute_loss(inputs["labels"], logits) + sequence_output = self.dropout(inputs=sequence_output, training=inputs["training"]) + logits = self.classifier(inputs=sequence_output) + loss = None if inputs["labels"] is None else self.compute_loss(labels=inputs["labels"], logits=logits) if not inputs["return_dict"]: output = (logits,) + outputs[1:] @@ -1405,7 +1464,7 @@ def call( ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForTokenClassification.serving_output - def serving_output(self, output): + def serving_output(self, output: TFTokenClassifierOutput) -> TFTokenClassifierOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -1419,13 +1478,14 @@ def serving_output(self, output): ) class TF{{cookiecutter.camelcase_modelname}}ForQuestionAnswering(TF{{cookiecutter.camelcase_modelname}}PreTrainedModel, TFQuestionAnsweringLoss): - def __init__(self, config, *inputs, **kwargs): + def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels + self.{{cookiecutter.lowercase_modelname}} = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="{{cookiecutter.lowercase_modelname}}") self.qa_outputs = tf.keras.layers.Dense( - config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" + units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) @add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1437,29 +1497,29 @@ def __init__(self, config, *inputs, **kwargs): ) def call( self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - start_positions=None, - end_positions=None, - training=False, + input_ids: Optional[TFModelInputType] = None, + attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + start_positions: Optional[Union[np.ndarray, tf.Tensor]] = None, + end_positions: Optional[Union[np.ndarray, tf.Tensor]] = None, + training: Optional[bool] = False, **kwargs, - ): + ) -> Union[TFQuestionAnsweringModelOutput, Tuple[tf.Tensor]]: r""" - start_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`): + start_positions (:obj:`tf.Tensor` 
or :obj:`np.ndarray` of shape :obj:`(batch_size,)`, `optional`): Labels for position (index) of the start of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (:obj:`sequence_length`). - Position outside of the sequence are not taken into account for computing the loss. - end_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`): + Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the + sequence are not taken into account for computing the loss. + end_positions (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size,)`, `optional`): Labels for position (index) of the end of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (:obj:`sequence_length`). - Position outside of the sequence are not taken into account for computing the loss. + Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the + sequence are not taken into account for computing the loss. """ inputs = input_processing( func=self.call, @@ -1479,7 +1539,7 @@ def call( kwargs_call=kwargs, ) outputs = self.{{cookiecutter.lowercase_modelname}}( - inputs["input_ids"], + input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], token_type_ids=inputs["token_type_ids"], position_ids=inputs["position_ids"], @@ -1491,19 +1551,19 @@ def call( training=inputs["training"], ) sequence_output = outputs[0] - logits = self.qa_outputs(sequence_output) - start_logits, end_logits = tf.split(logits, 2, axis=-1) - start_logits = tf.squeeze(start_logits, axis=-1) - end_logits = tf.squeeze(end_logits, axis=-1) + logits = self.qa_outputs(inputs=sequence_output) + start_logits, end_logits = tf.split(value=logits, num_or_size_splits=2, axis=-1) + start_logits = tf.squeeze(input=start_logits, axis=-1) + end_logits = tf.squeeze(input=end_logits, axis=-1) loss = None if inputs["start_positions"] is not None and inputs["end_positions"] is not None: labels = {"start_position": inputs["start_positions"]} labels["end_position"] = inputs["end_positions"] - loss = self.compute_loss(labels, (start_logits, end_logits)) + loss = self.compute_loss(labels=labels, logits=(start_logits, end_logits)) if not inputs["return_dict"]: - output = (start_logits, end_logits) + outputs[1:] + output = (start_logits, end_logits) + outputs[2:] return ((loss,) + output) if loss is not None else output return TFQuestionAnsweringModelOutput( @@ -1515,10 +1575,10 @@ def call( ) # Copied from transformers.models.bert.modeling_tf_bert.TFBertForQuestionAnswering.serving_output - def serving_output(self, output): + def serving_output(self, output: TFQuestionAnsweringModelOutput) -> TFQuestionAnsweringModelOutput: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None - + return TFQuestionAnsweringModelOutput( start_logits=output.start_logits, end_logits=output.end_logits, hidden_states=hs, attentions=attns )
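
The TF{{cookiecutter.camelcase_modelname}}SelfAttention rewrite above projects hidden states per head with EinsumDense ("abc,cde->abde"), scales the queries by rsqrt(head_size), and adds the attention mask to the raw scores before the softmax. A minimal standalone sketch of that computation, with toy sizes chosen purely for illustration (batch 2, sequence 4, hidden size 8, 2 heads):

import tensorflow as tf

batch, seq_len, hidden, heads = 2, 4, 8, 2
head_size = hidden // heads


def make_projection(name):
    # Same layout as the query/key/value layers in the template: (b, s, hidden) -> (b, s, heads, head_size).
    return tf.keras.layers.experimental.EinsumDense(
        equation="abc,cde->abde",
        output_shape=(None, heads, head_size),
        bias_axes="de",
        name=name,
    )


query, key, value = make_projection("query"), make_projection("key"), make_projection("value")

hidden_states = tf.random.normal((batch, seq_len, hidden))
# Additive mask in the form produced by the main layer: 0.0 where a key may be attended, -10000.0 on padding.
attention_mask = tf.constant([[0.0, 0.0, 0.0, -10000.0],
                              [0.0, 0.0, -10000.0, -10000.0]])[:, None, None, :]

query_layer = query(hidden_states) * tf.math.rsqrt(tf.cast(head_size, tf.float32))  # pre-scale the queries
key_layer = key(hidden_states)
value_layer = value(hidden_states)

attention_scores = tf.einsum("aecd,abcd->acbe", key_layer, query_layer)  # (batch, heads, query_pos, key_pos)
attention_scores = tf.add(attention_scores, attention_mask)              # masked keys pushed towards -inf
attention_probs = tf.nn.softmax(logits=attention_scores, axis=-1)
attention_output = tf.einsum("acbe,aecd->abcd", attention_probs, value_layer)  # (batch, seq, heads, head_size)
print(attention_output.shape)  # (2, 4, 2, 4)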
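
The extended attention mask built in TF{{cookiecutter.camelcase_modelname}}MainLayer.call turns a 2D padding mask into that additive form with tf.multiply(tf.subtract(1.0, mask), -10000.0). A small sketch of the arithmetic; the (batch, 1, 1, seq_len) reshape is assumed here so the mask broadcasts over heads and query positions:

import tensorflow as tf

attention_mask = tf.constant([[1, 1, 1, 0],
                              [1, 1, 0, 0]])                    # 1 = real token, 0 = padding
extended_attention_mask = tf.reshape(attention_mask, (2, 1, 1, 4))
extended_attention_mask = tf.cast(extended_attention_mask, dtype=tf.float32)
extended_attention_mask = tf.multiply(tf.subtract(1.0, extended_attention_mask), -10000.0)
print(extended_attention_mask[0, 0, 0].numpy())  # [-0. -0. -0. -10000.] -- only padded keys are penalised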
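
The reworked TF{{cookiecutter.camelcase_modelname}}ClassificationHead pools the first token of the sequence output, then applies dropout, a dense projection, the configured activation, dropout again, and the output projection. A sketch with made-up sizes, and gelu standing in for get_tf_activation(config.hidden_act):

import tensorflow as tf

hidden_size, num_labels = 16, 3
dense = tf.keras.layers.Dense(units=hidden_size, name="dense")
dropout = tf.keras.layers.Dropout(rate=0.1)
out_proj = tf.keras.layers.Dense(units=num_labels, name="out_proj")
classifier_act_fn = tf.keras.activations.gelu   # stand-in for the configured hidden activation

sequence_output = tf.random.normal((2, 7, hidden_size))
hidden_states = sequence_output[:, 0, :]        # take the first token (equivalent to [CLS])
hidden_states = dropout(hidden_states, training=True)
hidden_states = dense(hidden_states)
hidden_states = classifier_act_fn(hidden_states)
hidden_states = dropout(hidden_states, training=True)
logits = out_proj(hidden_states)
print(logits.shape)  # (2, 3)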
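
The ForMaskedLM docstring above states that labels set to -100 are ignored. The loss itself comes from TFMaskedLanguageModelingLoss.compute_loss, which is not part of this diff; the sketch below only illustrates the usual masking idea with plain sparse cross-entropy, so treat it as an assumption about the behaviour rather than the library's implementation:

import tensorflow as tf

logits = tf.random.normal((1, 4, 10))           # (batch, seq_len, vocab_size)
labels = tf.constant([[-100, 3, -100, 7]])      # only positions 1 and 3 carry a real label

active = tf.not_equal(labels, -100)
active_logits = tf.boolean_mask(logits, active)  # (2, vocab_size)
active_labels = tf.boolean_mask(labels, active)  # (2,)

loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction=tf.keras.losses.Reduction.NONE
)
print(loss_fn(active_labels, active_logits).shape)  # (2,) -- one loss term per non-ignored token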
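
TF{{cookiecutter.camelcase_modelname}}ForCausalLM shifts the labels left and drops the last logit before computing the loss, so the logit at position t is scored against the token at position t + 1. A toy illustration, with plain sparse cross-entropy used here in place of TFCausalLanguageModelingLoss.compute_loss:

import tensorflow as tf

vocab_size = 6
logits = tf.random.normal((1, 5, vocab_size))
labels = tf.constant([[2, 3, 1, 4, 5]])

shifted_logits = logits[:, :-1]   # (1, 4, vocab_size): the last position has no target
shifted_labels = labels[:, 1:]    # (1, 4): the first token is never predicted

loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
print(float(loss_fn(shifted_labels, shifted_logits)))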
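
TF{{cookiecutter.camelcase_modelname}}ForMultipleChoice folds the choice dimension into the batch before calling the main layer and unfolds it again on the single logit produced per sequence. The reshaping on its own, with toy shapes:

import tensorflow as tf

batch, num_choices, seq_len = 2, 3, 5
input_ids = tf.random.uniform((batch, num_choices, seq_len), maxval=100, dtype=tf.int32)

flat_input_ids = tf.reshape(tensor=input_ids, shape=(-1, seq_len))
print(flat_input_ids.shape)  # (6, 5): every choice becomes its own row for the encoder

# Pretend the classifier produced one score per flattened sequence.
logits = tf.random.normal((batch * num_choices, 1))
reshaped_logits = tf.reshape(tensor=logits, shape=(-1, num_choices))
print(reshaped_logits.shape)  # (2, 3): rows of choice scores, ready for the multiple-choice loss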
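
Finally, TF{{cookiecutter.camelcase_modelname}}ForQuestionAnswering projects every token to two logits and separates them into start and end scores with tf.split and tf.squeeze, in the same keyword-argument form used above. With random hidden states standing in for the encoder output:

import tensorflow as tf

batch, seq_len, hidden = 2, 7, 16
sequence_output = tf.random.normal((batch, seq_len, hidden))

qa_outputs = tf.keras.layers.Dense(units=2, name="qa_outputs")
logits = qa_outputs(inputs=sequence_output)                                   # (2, 7, 2)

start_logits, end_logits = tf.split(value=logits, num_or_size_splits=2, axis=-1)
start_logits = tf.squeeze(input=start_logits, axis=-1)
end_logits = tf.squeeze(input=end_logits, axis=-1)
print(start_logits.shape, end_logits.shape)  # (2, 7) (2, 7)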