From f8d7f457ad9d22305bc54fc1cdcd71878d643e7b Mon Sep 17 00:00:00 2001 From: Sylvain Gugger Date: Mon, 1 Feb 2021 13:11:06 -0500 Subject: [PATCH 1/2] Initial work --- docs/source/main_classes/tokenizer.rst | 4 ++++ src/transformers/models/dpr/modeling_dpr.py | 8 +++++--- src/transformers/models/t5/modeling_t5.py | 2 ++ src/transformers/models/t5/modeling_tf_t5.py | 4 +++- 4 files changed, 14 insertions(+), 4 deletions(-) diff --git a/docs/source/main_classes/tokenizer.rst b/docs/source/main_classes/tokenizer.rst index fd82e8f97c98..a676b6081d70 100644 --- a/docs/source/main_classes/tokenizer.rst +++ b/docs/source/main_classes/tokenizer.rst @@ -56,6 +56,8 @@ PreTrainedTokenizer :special-members: __call__ :members: + .. automethod:: encode + PreTrainedTokenizerFast ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -64,6 +66,8 @@ PreTrainedTokenizerFast :special-members: __call__ :members: + .. automethod:: encode + BatchEncoding ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/src/transformers/models/dpr/modeling_dpr.py b/src/transformers/models/dpr/modeling_dpr.py index 0bc032baf473..5b855bed075c 100644 --- a/src/transformers/models/dpr/modeling_dpr.py +++ b/src/transformers/models/dpr/modeling_dpr.py @@ -364,9 +364,11 @@ def init_weights(self): Indices can be obtained using :class:`~transformers.DPRTokenizer`. See :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for - details. attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, - `optional`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, - 1]``: + details. + + `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`torch.FloatTensor` of shape + :obj:`(batch_size, sequence_length)`, `optional`): Mask to avoid performing attention on padding token + indices. Mask values selected in ``[0, 1]``: - 1 for tokens that are **not masked**, - 0 for tokens that are **masked**. diff --git a/src/transformers/models/t5/modeling_t5.py b/src/transformers/models/t5/modeling_t5.py index bd05cf00d11d..d0f5e5d1a7a4 100644 --- a/src/transformers/models/t5/modeling_t5.py +++ b/src/transformers/models/t5/modeling_t5.py @@ -1041,6 +1041,8 @@ def forward( :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for detail. + `What are input IDs? <../glossary.html#input-ids>`__ + To know more on how to prepare :obj:`input_ids` for pretraining take a look a `T5 Training <./t5.html#training>`__. attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): diff --git a/src/transformers/models/t5/modeling_tf_t5.py b/src/transformers/models/t5/modeling_tf_t5.py index 9e6b16bfc10c..db58a10af469 100644 --- a/src/transformers/models/t5/modeling_tf_t5.py +++ b/src/transformers/models/t5/modeling_tf_t5.py @@ -929,7 +929,7 @@ def _shift_right(self, input_ids): T5_INPUTS_DOCSTRING = r""" Args: - inputs (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`): + input_ids (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary. T5 is a model with relative position embeddings so you should be able to pad the inputs on the right or the left. 
@@ -937,6 +937,8 @@ def _shift_right(self, input_ids): :func:`transformers.PreTrainedTokenizer.__call__` and :func:`transformers.PreTrainedTokenizer.encode` for details. + `What are input IDs? <../glossary.html#input-ids>`__ + To know more on how to prepare :obj:`inputs` for pretraining take a look at `T5 Training <./t5.html#training>`__. decoder_input_ids (:obj:`tf.Tensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`): From f33bb4f5d9cb074808b0fe4bfabc72d222e47265 Mon Sep 17 00:00:00 2001 From: Sylvain Gugger Date: Mon, 1 Feb 2021 16:21:54 -0500 Subject: [PATCH 2/2] Fix doc styler and other models --- src/transformers/models/dpr/modeling_dpr.py | 35 +++++++++++-------- .../models/dpr/modeling_tf_dpr.py | 12 ++++--- src/transformers/models/rag/modeling_rag.py | 2 ++ utils/style_doc.py | 21 ++++++++++- 4 files changed, 51 insertions(+), 19 deletions(-) diff --git a/src/transformers/models/dpr/modeling_dpr.py b/src/transformers/models/dpr/modeling_dpr.py index 5b855bed075c..cb98c8fa81a0 100644 --- a/src/transformers/models/dpr/modeling_dpr.py +++ b/src/transformers/models/dpr/modeling_dpr.py @@ -366,28 +366,33 @@ def init_weights(self): :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for details. - `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`torch.FloatTensor` of shape - :obj:`(batch_size, sequence_length)`, `optional`): Mask to avoid performing attention on padding token - indices. Mask values selected in ``[0, 1]``: + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: - 1 for tokens that are **not masked**, - 0 for tokens that are **masked**. - `What are attention masks? <../glossary.html#attention-mask>`__ token_type_ids (:obj:`torch.LongTensor` of - shape :obj:`(batch_size, sequence_length)`, `optional`): Segment token indices to indicate first and second - portions of the inputs. Indices are selected in ``[0, 1]``: + `What are attention masks? <../glossary.html#attention-mask>`__ + token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, + 1]``: - 0 corresponds to a `sentence A` token, - 1 corresponds to a `sentence B` token. - `What are token type IDs? <../glossary.html#token-type-ids>`_ inputs_embeds (:obj:`torch.FloatTensor` of - shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): Optionally, instead of passing - :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want - more control over how to convert :obj:`input_ids` indices into associated vectors than the model's internal - embedding lookup matrix. output_attentions (:obj:`bool`, `optional`): Whether or not to return the - attentions tensors of all attention layers. See ``attentions`` under returned tensors for more detail. - output_hidden_states (:obj:`bool`, `optional`): Whether or not to return the hidden states of all layers. - See ``hidden_states`` under returned tensors for more detail. return_dict (:obj:`bool`, `optional`): + `What are token type IDs? 
<../glossary.html#token-type-ids>`_ + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. + return_dict (:obj:`bool`, `optional`): Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. """ @@ -405,6 +410,8 @@ def init_weights(self): Indices can be obtained using :class:`~transformers.DPRReaderTokenizer`. See this class documentation for more details. + + `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(n_passages, sequence_length)`, `optional`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: diff --git a/src/transformers/models/dpr/modeling_tf_dpr.py b/src/transformers/models/dpr/modeling_tf_dpr.py index cc595b85cda3..b060fbb28618 100644 --- a/src/transformers/models/dpr/modeling_tf_dpr.py +++ b/src/transformers/models/dpr/modeling_tf_dpr.py @@ -486,15 +486,17 @@ def serving(self, inputs): (a) For sequence pairs (for a pair title+text for example): - ``tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]`` + :: - ``token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1`` + tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] + token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 (b) For single sequences (for a question for example): - ``tokens: [CLS] the dog is hairy . [SEP]`` + :: - ``token_type_ids: 0 0 0 0 0 0 0`` + tokens: [CLS] the dog is hairy . [SEP] + token_type_ids: 0 0 0 0 0 0 0 DPR is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than the left. @@ -502,6 +504,8 @@ def serving(self, inputs): Indices can be obtained using :class:`~transformers.DPRTokenizer`. See :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for details. + + `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: diff --git a/src/transformers/models/rag/modeling_rag.py b/src/transformers/models/rag/modeling_rag.py index fbb9ca330a68..3501720060a4 100644 --- a/src/transformers/models/rag/modeling_rag.py +++ b/src/transformers/models/rag/modeling_rag.py @@ -412,6 +412,8 @@ def from_pretrained_question_encoder_generator( Indices of input sequence tokens in the vocabulary. :class:`~transformers.RagConfig`, used to initialize the model, specifies which generator to use, it also specifies a compatible generator tokenizer. Use that tokenizer class to obtain the indices. + + `What are input IDs? 
<../glossary.html#input-ids>`__ attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: diff --git a/utils/style_doc.py b/utils/style_doc.py index 45ec776e7655..57179e6347e9 100644 --- a/utils/style_doc.py +++ b/utils/style_doc.py @@ -135,6 +135,14 @@ def init_in_block(self, text): """ return SpecialBlock.NOT_SPECIAL + def end_of_special_style(self, line): + """ + Sets back the `in_block` attribute to `NOT_SPECIAL`. + + Useful for some docstrings where we may have to go back to `ARG_LIST` instead. + """ + self.in_block = SpecialBlock.NOT_SPECIAL + def style_paragraph(self, paragraph, max_len, no_style=False, min_indent=None): """ Style `paragraph` (a list of lines) by making sure no line goes over `max_len`, except if the `no_style` flag @@ -220,6 +228,7 @@ def style(self, text, max_len=119, min_indent=None): new_lines = [] paragraph = [] self.current_indent = "" + self.previous_indent = None # If one of those is True, the paragraph should not be touched (code samples, lists...) no_style = False no_style_next = False @@ -251,7 +260,7 @@ def style(self, text, max_len=119, min_indent=None): self.current_indent = indent elif not indent.startswith(self.current_indent): # If not, we are leaving the block when we unindent. - self.in_block = SpecialBlock.NOT_SPECIAL + self.end_of_special_style(paragraph[0]) if self.is_special_block(paragraph[0]): # Maybe we are starting a special block. @@ -326,6 +335,8 @@ def is_comment_or_textual_block(self, line): def is_special_block(self, line): if self.is_no_style_block(line): + if self.previous_indent is None and self.in_block == SpecialBlock.ARG_LIST: + self.previous_indent = self.current_indent self.in_block = SpecialBlock.NO_STYLE return True if _re_arg_def.search(line) is not None: @@ -333,6 +344,14 @@ def is_special_block(self, line): return True return False + def end_of_special_style(self, line): + if self.previous_indent is not None and line.startswith(self.previous_indent): + self.in_block = SpecialBlock.ARG_LIST + self.current_indent = self.previous_indent + else: + self.in_block = SpecialBlock.NOT_SPECIAL + self.previous_indent = None + def init_in_block(self, text): lines = text.split("\n") while len(lines) > 0 and len(lines[0]) == 0:
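
The docstring changes above repeatedly point readers at :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` as the way to obtain the :obj:`input_ids` and :obj:`attention_mask` being documented. As a quick illustration of that relationship (not part of the patch; the ``bert-base-uncased`` checkpoint is only an arbitrary, illustrative choice), a minimal sketch::

    from transformers import AutoTokenizer

    # Illustrative checkpoint; any BERT-like tokenizer behaves the same way here.
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    # `encode` (the method these docs now surface) returns only the list of token ids,
    # with special tokens such as [CLS]/[SEP] added by default.
    input_ids = tokenizer.encode("Hello world")
    print(tokenizer.convert_ids_to_tokens(input_ids))

    # `__call__` returns a BatchEncoding holding everything the docstrings above describe:
    # `input_ids`, `attention_mask` (1 = not masked, 0 = padding) and, for BERT-like
    # tokenizers, `token_type_ids`.
    batch = tokenizer(["Hello world", "A somewhat longer second sentence"], padding=True)
    print(batch["input_ids"])
    print(batch["attention_mask"])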
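
Similarly, the sequence-pair example reformatted as a ``::`` literal block in ``modeling_tf_dpr.py`` describes how :obj:`token_type_ids` split the two segments. A hedged sketch of how a BERT-style tokenizer produces them (again assuming an illustrative checkpoint, not something prescribed by the patch)::

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # illustrative checkpoint
    pair = tokenizer("is this jacksonville ?", "no it is not .")

    # Expected to reproduce the `tokens:` line from the docstring:
    # [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
    print(tokenizer.convert_ids_to_tokens(pair["input_ids"]))

    # 0 for the first segment (including [CLS] and the first [SEP]), 1 for the second,
    # matching the `token_type_ids:` line in the docstring.
    print(pair["token_type_ids"])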