Tokenizers should be framework agnostic #8599

Merged
merged 5 commits on Nov 17, 2020
Changes from 3 commits
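In practice, the diff below makes `prepare_seq2seq_batch` framework agnostic: the tokenizers no longer default to PyTorch (or NumPy) tensors, so callers choose the return type explicitly. A minimal sketch of the intended call pattern after this change (the checkpoint and input text are illustrative):

```python
from transformers import MarianTokenizer

tok = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de")
src_texts = ["I am a small frog."]

# Default (return_tensors=None): plain Python lists, usable from any framework.
batch_lists = tok.prepare_seq2seq_batch(src_texts)

# Framework-specific tensors are now always requested explicitly.
batch_pt = tok.prepare_seq2seq_batch(src_texts, return_tensors="pt")  # torch.Tensor values
batch_tf = tok.prepare_seq2seq_batch(src_texts, return_tensors="tf")  # tf.constant values (needs TensorFlow)
```

The documentation and example updates below simply add `return_tensors="pt"` wherever the batch is fed straight into a PyTorch model.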
8 changes: 4 additions & 4 deletions .github/workflows/self-push.yml
@@ -16,7 +16,7 @@ on:

jobs:
run_tests_torch_gpu:
runs-on: [self-hosted, single-gpu]
runs-on: [self-hosted, gpu, single-gpu]
steps:
- uses: actions/checkout@v2
- name: Python version
@@ -86,7 +86,7 @@ jobs:


run_tests_tf_gpu:
runs-on: [self-hosted, single-gpu]
runs-on: [self-hosted, gpu, single-gpu]
steps:
- uses: actions/checkout@v2
- name: Python version
@@ -154,7 +154,7 @@ jobs:
path: reports

run_tests_torch_multi_gpu:
runs-on: [self-hosted, multi-gpu]
runs-on: [self-hosted, gpu, multi-gpu]
steps:
- uses: actions/checkout@v2
- name: Python version
@@ -213,7 +213,7 @@ jobs:
path: reports

run_tests_tf_multi_gpu:
runs-on: [self-hosted, multi-gpu]
runs-on: [self-hosted, gpu, multi-gpu]
steps:
- uses: actions/checkout@v2
- name: Python version
9 changes: 5 additions & 4 deletions .github/workflows/self-scheduled.yml
@@ -9,13 +9,14 @@ on:
push:
branches:
- ci_*
- framework-agnostic-tokenizers
repository_dispatch:
schedule:
- cron: "0 0 * * *"

jobs:
run_all_tests_torch_gpu:
runs-on: [self-hosted, single-gpu]
runs-on: [self-hosted, gpu, single-gpu]
steps:
- uses: actions/checkout@v2

@@ -109,7 +110,7 @@ jobs:


run_all_tests_tf_gpu:
runs-on: [self-hosted, single-gpu]
runs-on: [self-hosted, gpu, single-gpu]
steps:
- uses: actions/checkout@v2

@@ -188,7 +189,7 @@ jobs:
path: reports

run_all_tests_torch_multi_gpu:
runs-on: [self-hosted, multi-gpu]
runs-on: [self-hosted, gpu, multi-gpu]
steps:
- uses: actions/checkout@v2

@@ -279,7 +280,7 @@ jobs:
path: reports

run_all_tests_tf_multi_gpu:
runs-on: [self-hosted, multi-gpu]
runs-on: [self-hosted, gpu, multi-gpu]
steps:
- uses: actions/checkout@v2

4 changes: 2 additions & 2 deletions docs/source/model_doc/marian.rst
@@ -78,7 +78,7 @@ require 3 character language codes:
tokenizer = MarianTokenizer.from_pretrained(model_name)
print(tokenizer.supported_language_codes)
model = MarianMTModel.from_pretrained(model_name)
translated = model.generate(**tokenizer.prepare_seq2seq_batch(src_text))
translated = model.generate(**tokenizer.prepare_seq2seq_batch(src_text, return_tensors="pt"))
tgt_text = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
# ["c'est une phrase en anglais que nous voulons traduire en français",
# 'Isto deve ir para o português.',
@@ -150,7 +150,7 @@ Example of translating english to many romance languages, using old-style 2 char
print(tokenizer.supported_language_codes)

model = MarianMTModel.from_pretrained(model_name)
translated = model.generate(**tokenizer.prepare_seq2seq_batch(src_text))
translated = model.generate(**tokenizer.prepare_seq2seq_batch(src_text, return_tensors="pt"))
tgt_text = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
# ["c'est une phrase en anglais que nous voulons traduire en français", 'Isto deve ir para o português.', 'Y esto al español']

4 changes: 2 additions & 2 deletions docs/source/model_doc/mbart.rst
@@ -44,7 +44,7 @@ the sequences for sequence-to-sequence fine-tuning.

example_english_phrase = "UN Chief Says There Is No Military Solution in Syria"
expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
batch = tokenizer.prepare_seq2seq_batch(example_english_phrase, src_lang="en_XX", tgt_lang="ro_RO", tgt_texts=expected_translation_romanian)
batch = tokenizer.prepare_seq2seq_batch(example_english_phrase, src_lang="en_XX", tgt_lang="ro_RO", tgt_texts=expected_translation_romanian, return_tensors="pt")
model(input_ids=batch['input_ids'], labels=batch['labels']) # forward pass

- Generation
@@ -58,7 +58,7 @@ the sequences for sequence-to-sequence fine-tuning.
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-en-ro")
tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro")
article = "UN Chief Says There Is No Military Solution in Syria"
batch = tokenizer.prepare_seq2seq_batch(src_texts=[article], src_lang="en_XX")
batch = tokenizer.prepare_seq2seq_batch(src_texts=[article], src_lang="en_XX", return_tensors="pt")
translated_tokens = model.generate(**batch, decoder_start_token_id=tokenizer.lang_code_to_id["ro_RO"])
translation = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
assert translation == "Şeful ONU declară că nu există o soluţie militară în Siria"
2 changes: 1 addition & 1 deletion docs/source/model_doc/pegasus.rst
@@ -78,7 +78,7 @@ Usage Example
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)
batch = tokenizer.prepare_seq2seq_batch(src_text, truncation=True, padding='longest').to(torch_device)
batch = tokenizer.prepare_seq2seq_batch(src_text, truncation=True, padding='longest', return_tensors="pt").to(torch_device)
translated = model.generate(**batch)
tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
assert tgt_text[0] == "California's largest electricity provider has turned off power to hundreds of thousands of customers."
2 changes: 1 addition & 1 deletion model_cards/tuner007/pegasus_paraphrase/README.md
@@ -11,7 +11,7 @@ tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

def get_response(input_text,num_return_sequences):
batch = tokenizer.prepare_seq2seq_batch([input_text],truncation=True,padding='longest',max_length=60).to(torch_device)
batch = tokenizer.prepare_seq2seq_batch([input_text],truncation=True,padding='longest',max_length=60, return_tensors="pt").to(torch_device)
translated = model.generate(**batch,max_length=60,num_beams=10, num_return_sequences=num_return_sequences, temperature=1.5)
tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
return tgt_text
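For illustration only (this call is not part of the model card diff), the `get_response` helper above would be used roughly like this, with the input sentence and the number of returned sequences chosen by the caller:

```python
# Hypothetical usage of the get_response helper defined above.
paraphrases = get_response("The ultimate test of your knowledge is your capacity to convey it to another.", 5)
print(paraphrases)  # list of 5 candidate paraphrases
```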
2 changes: 1 addition & 1 deletion model_cards/tuner007/pegasus_qa/README.md
@@ -12,7 +12,7 @@ model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_dev

def get_answer(question, context):
input_text = "question: %s text: %s" % (question,context)
batch = tokenizer.prepare_seq2seq_batch([input_text], truncation=True, padding='longest').to(torch_device)
batch = tokenizer.prepare_seq2seq_batch([input_text], truncation=True, padding='longest', return_tensors="pt").to(torch_device)
translated = model.generate(**batch)
tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
return tgt_text[0]
2 changes: 1 addition & 1 deletion scripts/fsmt/fsmt-make-super-tiny-model.py
@@ -58,7 +58,7 @@
print(f"num of params {tiny_model.num_parameters()}")

# Test
batch = tokenizer.prepare_seq2seq_batch(["Making tiny model"])
batch = tokenizer.prepare_seq2seq_batch(["Making tiny model"], return_tensors="pt")
outputs = tiny_model(**batch)

print("test output:", len(outputs.logits[0]))
2 changes: 1 addition & 1 deletion scripts/fsmt/fsmt-make-tiny-model.py
@@ -29,7 +29,7 @@
print(f"num of params {tiny_model.num_parameters()}")

# Test
batch = tokenizer.prepare_seq2seq_batch(["Making tiny model"])
batch = tokenizer.prepare_seq2seq_batch(["Making tiny model"], return_tensors="pt")
outputs = tiny_model(**batch)

print("test output:", len(outputs.logits[0]))
4 changes: 2 additions & 2 deletions src/transformers/models/bart/tokenization_bart.py
@@ -61,7 +61,7 @@ def prepare_seq2seq_batch(
max_length: Optional[int] = None,
max_target_length: Optional[int] = None,
padding: str = "longest",
return_tensors: str = "None",
return_tensors: str = None,
truncation=True,
**kwargs,
) -> BatchEncoding:
@@ -91,7 +91,7 @@ def prepare_seq2seq_batch(
maximum acceptable input length for the model if that argument is not provided.
* :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
different lengths).
return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`, defaults to "pt"):
return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`):
If set, will return tensors instead of list of python integers. Acceptable values are:

* :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
4 changes: 2 additions & 2 deletions src/transformers/models/bart/tokenization_bart_fast.py
@@ -56,7 +56,7 @@ def prepare_seq2seq_batch(
max_length: Optional[int] = None,
max_target_length: Optional[int] = None,
padding: str = "longest",
return_tensors: str = "None",
return_tensors: str = None,
truncation=True,
**kwargs,
) -> BatchEncoding:
@@ -86,7 +86,7 @@ def prepare_seq2seq_batch(
maximum acceptable input length for the model if that argument is not provided.
* :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
different lengths).
return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`, defaults to "pt"):
return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`):
If set, will return tensors instead of list of python integers. Acceptable values are:

* :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
2 changes: 1 addition & 1 deletion src/transformers/models/fsmt/tokenization_fsmt.py
@@ -491,7 +491,7 @@ def prepare_seq2seq_batch(
tgt_texts: Optional[List[str]] = None,
max_length: Optional[int] = None,
max_target_length: Optional[int] = None,
return_tensors: str = "pt",
return_tensors: str = None,
truncation=True,
padding="longest",
**unused,
2 changes: 1 addition & 1 deletion src/transformers/models/marian/modeling_marian.py
@@ -41,7 +41,7 @@ class MarianMTModel(BartForConditionalGeneration):

>>> model = MarianMTModel.from_pretrained(mname)
>>> tok = MarianTokenizer.from_pretrained(mname)
>>> batch = tok.prepare_seq2seq_batch(src_texts=[sample_text]) # don't need tgt_text for inference
>>> batch = tok.prepare_seq2seq_batch(src_texts=[sample_text], return_tensors="pt") # don't need tgt_text for inference
>>> gen = model.generate(**batch) # for forward pass: model(**batch)
>>> words: List[str] = tok.batch_decode(gen, skip_special_tokens=True) # returns "Where is the bus stop ?"

4 changes: 2 additions & 2 deletions src/transformers/models/marian/tokenization_marian.py
@@ -70,7 +70,7 @@ class MarianTokenizer(PreTrainedTokenizer):
>>> tok = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-de')
>>> src_texts = [ "I am a small frog.", "Tom asked his teacher for advice."]
>>> tgt_texts = ["Ich bin ein kleiner Frosch.", "Tom bat seinen Lehrer um Rat."] # optional
>>> batch_enc: BatchEncoding = tok.prepare_seq2seq_batch(src_texts, tgt_texts=tgt_texts)
>>> batch_enc: BatchEncoding = tok.prepare_seq2seq_batch(src_texts, tgt_texts=tgt_texts, return_tensors="pt")
>>> # keys [input_ids, attention_mask, labels].
>>> # model(**batch) should work
"""
@@ -175,7 +175,7 @@ def prepare_seq2seq_batch(
tgt_texts: Optional[List[str]] = None,
max_length: Optional[int] = None,
max_target_length: Optional[int] = None,
return_tensors: str = "pt",
return_tensors: str = None,
truncation=True,
padding="longest",
**unused,
2 changes: 1 addition & 1 deletion src/transformers/models/mbart/modeling_mbart.py
@@ -22,7 +22,7 @@ class MBartForConditionalGeneration(BartForConditionalGeneration):
>>> model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-en-ro")
>>> tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro")
>>> article = "UN Chief Says There Is No Military Solution in Syria"
>>> batch = tokenizer.prepare_seq2seq_batch(src_texts=[article])
>>> batch = tokenizer.prepare_seq2seq_batch(src_texts=[article], return_tensors="pt")
>>> translated_tokens = model.generate(**batch)
>>> translation = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
>>> assert translation == "Şeful ONU declară că nu există o soluţie militară în Siria"
4 changes: 2 additions & 2 deletions src/transformers/models/mbart/tokenization_mbart.py
@@ -81,7 +81,7 @@ class MBartTokenizer(XLMRobertaTokenizer):
>>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
>>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
>>> batch: dict = tokenizer.prepare_seq2seq_batch(
... example_english_phrase, src_lang="en_XX", tgt_lang="ro_RO", tgt_texts=expected_translation_romanian
... example_english_phrase, src_lang="en_XX", tgt_lang="ro_RO", tgt_texts=expected_translation_romanian, return_tensors="pt"
... )

"""
@@ -183,7 +183,7 @@ def prepare_seq2seq_batch(
max_target_length: Optional[int] = None,
truncation: bool = True,
padding: str = "longest",
return_tensors: str = "pt",
return_tensors: str = None,
add_prefix_space: bool = False, # ignored
**kwargs,
) -> BatchEncoding:
4 changes: 2 additions & 2 deletions src/transformers/models/mbart/tokenization_mbart_fast.py
@@ -89,7 +89,7 @@ class MBartTokenizerFast(XLMRobertaTokenizerFast):
>>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
>>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
>>> batch: dict = tokenizer.prepare_seq2seq_batch(
... example_english_phrase, src_lang="en_XX", tgt_lang="ro_RO", tgt_texts=expected_translation_romanian
... example_english_phrase, src_lang="en_XX", tgt_lang="ro_RO", tgt_texts=expected_translation_romanian, return_tensors="pt"
... )
"""

@@ -181,7 +181,7 @@ def prepare_seq2seq_batch(
max_target_length: Optional[int] = None,
truncation: bool = True,
padding: str = "longest",
return_tensors: str = "pt",
return_tensors: str = None,
**kwargs,
) -> BatchEncoding:
if max_length is None:
2 changes: 1 addition & 1 deletion src/transformers/models/pegasus/modeling_pegasus.py
@@ -38,7 +38,7 @@ class PegasusForConditionalGeneration(BartForConditionalGeneration):

>>> model = PegasusForConditionalGeneration.from_pretrained(mname)
>>> tok = PegasusTokenizer.from_pretrained(mname)
>>> batch = tok.prepare_seq2seq_batch(src_texts=[PGE_ARTICLE]) # don't need tgt_text for inference
>>> batch = tok.prepare_seq2seq_batch(src_texts=[PGE_ARTICLE], return_tensors="pt") # don't need tgt_text for inference
>>> gen = model.generate(**batch) # for forward pass: model(**batch)
>>> summary: List[str] = tok.batch_decode(gen, skip_special_tokens=True)
>>> assert summary == "California's largest electricity provider has turned off power to tens of thousands of customers."
2 changes: 1 addition & 1 deletion src/transformers/models/pegasus/tokenization_pegasus.py
@@ -134,7 +134,7 @@ def prepare_seq2seq_batch(
tgt_texts: Optional[List[str]] = None,
max_length: Optional[int] = None,
max_target_length: Optional[int] = None,
return_tensors: str = "pt",
return_tensors: str = None,
truncation=True,
padding="longest",
**unused,
@@ -95,7 +95,7 @@ def prepare_seq2seq_batch(
tgt_texts: Optional[List[str]] = None,
max_length: Optional[int] = None,
max_target_length: Optional[int] = None,
return_tensors: str = "pt",
return_tensors: str = None,
truncation=True,
padding="longest",
**unused,
2 changes: 1 addition & 1 deletion src/transformers/models/rag/tokenization_rag.py
@@ -71,7 +71,7 @@ def prepare_seq2seq_batch(
max_length: Optional[int] = None,
max_target_length: Optional[int] = None,
padding: str = "longest",
return_tensors: str = "np",
return_tensors: str = None,
truncation=True,
**kwargs,
) -> BatchEncoding:
2 changes: 1 addition & 1 deletion src/transformers/tokenization_utils.py
@@ -797,7 +797,7 @@ def prepare_seq2seq_batch(
maximum acceptable input length for the model if that argument is not provided.
* :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
different lengths).
return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`, defaults to "pt"):
return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`):
If set, will return tensors instead of list of python integers. Acceptable values are:

* :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
2 changes: 1 addition & 1 deletion src/transformers/tokenization_utils_base.py
@@ -1455,7 +1455,7 @@ def all_special_ids(self) -> List[int]:
maximum acceptable input length for the model if that argument is not provided.
* :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
different lengths).
return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`, defaults to "pt"):
return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`):
If set, will return tensors instead of list of python integers. Acceptable values are:

* :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
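Beyond the string form, the docstring above notes that the same parameter accepts a `TensorType` value. A hedged sketch of that spelling (assuming `TensorType` is importable from the top-level `transformers` package, and using an illustrative checkpoint):

```python
from transformers import BartTokenizer, TensorType

tok = BartTokenizer.from_pretrained("facebook/bart-large")
batch = tok.prepare_seq2seq_batch(
    src_texts=["UN Chief Says There Is No Military Solution in Syria"],
    return_tensors=TensorType.PYTORCH,  # equivalent to passing the string "pt"
)
print(type(batch["input_ids"]))  # torch.Tensor
```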
20 changes: 13 additions & 7 deletions tests/test_modeling_marian.py
@@ -132,9 +132,9 @@ def _assert_generated_batch_equal_expected(self, **tokenizer_kwargs):
self.assertListEqual(self.expected_text, generated_words)

def translate_src_text(self, **tokenizer_kwargs):
model_inputs = self.tokenizer.prepare_seq2seq_batch(src_texts=self.src_text, **tokenizer_kwargs).to(
torch_device
)
model_inputs = self.tokenizer.prepare_seq2seq_batch(
src_texts=self.src_text, return_tensors="pt", **tokenizer_kwargs
).to(torch_device)
self.assertEqual(self.model.device, model_inputs.input_ids.device)
generated_ids = self.model.generate(
model_inputs.input_ids, attention_mask=model_inputs.attention_mask, num_beams=2, max_length=128
@@ -151,7 +151,9 @@ def test_forward(self):
src, tgt = ["I am a small frog"], ["Ich bin ein kleiner Frosch."]
expected_ids = [38, 121, 14, 697, 38848, 0]

model_inputs: dict = self.tokenizer.prepare_seq2seq_batch(src, tgt_texts=tgt).to(torch_device)
model_inputs: dict = self.tokenizer.prepare_seq2seq_batch(src, tgt_texts=tgt, return_tensors="pt").to(
torch_device
)

self.assertListEqual(expected_ids, model_inputs.input_ids[0].tolist())

@@ -171,12 +173,16 @@ def test_forward(self):

def test_unk_support(self):
t = self.tokenizer
ids = t.prepare_seq2seq_batch(["||"]).to(torch_device).input_ids[0].tolist()
ids = t.prepare_seq2seq_batch(["||"], return_tensors="pt").to(torch_device).input_ids[0].tolist()
expected = [t.unk_token_id, t.unk_token_id, t.eos_token_id]
self.assertEqual(expected, ids)

def test_pad_not_split(self):
input_ids_w_pad = self.tokenizer.prepare_seq2seq_batch(["I am a small frog <pad>"]).input_ids[0].tolist()
input_ids_w_pad = (
self.tokenizer.prepare_seq2seq_batch(["I am a small frog <pad>"], return_tensors="pt")
.input_ids[0]
.tolist()
)
expected_w_pad = [38, 121, 14, 697, 38848, self.tokenizer.pad_token_id, 0] # pad
self.assertListEqual(expected_w_pad, input_ids_w_pad)

@@ -294,7 +300,7 @@ def test_tokenizer_handles_empty(self):
normalized = self.tokenizer.normalize("")
self.assertIsInstance(normalized, str)
with self.assertRaises(ValueError):
self.tokenizer.prepare_seq2seq_batch([""])
self.tokenizer.prepare_seq2seq_batch([""], return_tensors="pt")

@slow
def test_pipeline(self):