Small fixes (#1215)
Summary: Pull Request resolved: fairinternal/fairseq-py#1215

Reviewed By: ngoyal2707, msbaines

Differential Revision: D22514719

Pulled By: myleott

fbshipit-source-id: 5f15ba501fd66af1eb49b5702aff940f06c3d91f
myleott authored and facebook-github-bot committed Jul 14, 2020
1 parent 5d88d37 commit ffecb4e
Showing 26 changed files with 121 additions and 92 deletions.
12 changes: 6 additions & 6 deletions docs/command_line_tools.rst
@@ -17,7 +17,7 @@ Fairseq provides several command-line tools for training and evaluating models:
 
 fairseq-preprocess
 ~~~~~~~~~~~~~~~~~~
-.. automodule:: preprocess
+.. automodule:: fairseq_cli.preprocess
 
 .. argparse::
     :module: fairseq.options
@@ -29,7 +29,7 @@ fairseq-preprocess
 
 fairseq-train
 ~~~~~~~~~~~~~
-.. automodule:: train
+.. automodule:: fairseq_cli.train
 
 .. argparse::
     :module: fairseq.options
@@ -41,7 +41,7 @@ fairseq-train
 
 fairseq-generate
 ~~~~~~~~~~~~~~~~
-.. automodule:: generate
+.. automodule:: fairseq_cli.generate
 
 .. argparse::
     :module: fairseq.options
@@ -53,7 +53,7 @@ fairseq-generate
 
 fairseq-interactive
 ~~~~~~~~~~~~~~~~~~~
-.. automodule:: interactive
+.. automodule:: fairseq_cli.interactive
 
 .. argparse::
     :module: fairseq.options
@@ -65,7 +65,7 @@ fairseq-interactive
 
 fairseq-score
 ~~~~~~~~~~~~~
-.. automodule:: score
+.. automodule:: fairseq_cli.score
 
 .. argparse::
     :module: fairseq_cli.score
@@ -77,7 +77,7 @@ fairseq-score
 
 fairseq-eval-lm
 ~~~~~~~~~~~~~~~
-.. automodule:: eval_lm
+.. automodule:: fairseq_cli.eval_lm
 
 .. argparse::
     :module: fairseq.options
6 changes: 3 additions & 3 deletions examples/byte_level_bpe/README.md
@@ -38,10 +38,10 @@ fairseq-train "data/bin_${VOCAB}" --task translation --user-dir examples/byte_le
 # BPE=--bpe bytes
 # BPE=--bpe characters
 BPE=--bpe byte_bpe --sentencepiece-model-path data/spm_bbpe2048.model
-# BPE=--bpe sentencepiece --sentencepiece-vocab data/spm_bpe2048.model
+# BPE=--bpe sentencepiece --sentencepiece-model data/spm_bpe2048.model
 # BPE=--bpe byte_bpe --sentencepiece-model-path data/spm_bbpe4096.model
-# BPE=--bpe sentencepiece --sentencepiece-vocab data/spm_bpe4096.model
-# BPE=--bpe sentencepiece --sentencepiece-vocab data/spm_bpe16384.model
+# BPE=--bpe sentencepiece --sentencepiece-model data/spm_bpe4096.model
+# BPE=--bpe sentencepiece --sentencepiece-model data/spm_bpe16384.model
 ```
 
 ```bash
4 changes: 2 additions & 2 deletions examples/byte_level_bpe/get_bitext.py
@@ -95,8 +95,8 @@ def _apply_bbpe(model_path: str, in_path: str, out_path: str):
 
 
 def _apply_bpe(model_path: str, in_path: str, out_path: str):
-    Args = namedtuple('Args', ['sentencepiece_vocab'])
-    args = Args(sentencepiece_vocab=model_path)
+    Args = namedtuple('Args', ['sentencepiece_model'])
+    args = Args(sentencepiece_model=model_path)
     tokenizer = SentencepieceBPE(args)
     with open(in_path) as f, open(out_path, 'w') as f_o:
         for s in f:
2 changes: 1 addition & 1 deletion examples/mbart/README.md
@@ -98,7 +98,7 @@ fairseq-generate path_2_data \
   --task translation_from_pretrained_bart \
   --gen-subset test \
   -t ro_RO -s en_XX \
-  --bpe 'sentencepiece' --sentencepiece-vocab sentence.bpe.model \
+  --bpe 'sentencepiece' --sentencepiece-model sentence.bpe.model \
   --sacrebleu --remove-bpe 'sentencepiece'\
   --max-sentences 32 --langs $langs > en_ro
28 changes: 24 additions & 4 deletions examples/scaling_nmt/README.md
@@ -70,16 +70,36 @@ good, but you may need to adjust this depending on how long you've trained:
 ```bash
 python scripts/average_checkpoints \
     --inputs /path/to/checkpoints \
-    --num-epoch-checkpoints 5 \
-    --output checkpoint.avg5.pt
+    --num-epoch-checkpoints 10 \
+    --output checkpoint.avg10.pt
 ```
 
 Next, generate translations using a beam width of 4 and length penalty of 0.6:
 ```bash
 fairseq-generate \
     data-bin/wmt16_en_de_bpe32k \
-    --path checkpoint.avg5.pt \
-    --beam 4 --lenpen 0.6 --remove-bpe
+    --path checkpoint.avg10.pt \
+    --beam 4 --lenpen 0.6 --remove-bpe > gen.out
 ```
 
+Finally, we apply the ["compound splitting" script](/scripts/compound_split_bleu.sh) to
+add spaces around dashes. For example "Café-Liebhaber" would become three tokens:
+"Café - Liebhaber". This typically results in larger BLEU scores, but it is not
+appropriate to compare these inflated scores to work which does not include this trick.
+This trick was used in the [original AIAYN code](https://github.com/tensorflow/tensor2tensor/blob/fc9335c0203685cbbfe2b30c92db4352d8f60779/tensor2tensor/utils/get_ende_bleu.sh),
+so we used it in the Scaling NMT paper as well. That said, it's strongly advised to
+report [sacrebleu](https://github.com/mjpost/sacrebleu) scores instead.
+
+To compute "compound split" tokenized BLEU (not recommended!):
+```bash
+bash scripts/compound_split_bleu.sh gen.out
+# BLEU4 = 29.29, 60.3/35.0/22.8/15.3 (BP=1.000, ratio=1.004, syslen=64763, reflen=64496)
+```
+
+To compute detokenized BLEU with sacrebleu (preferred):
+```bash
+bash scripts/sacrebleu.sh wmt14/full en de gen.out
+# BLEU+case.mixed+lang.en-de+numrefs.1+smooth.exp+test.wmt14/full+tok.13a+version.1.4.3 = 28.6 59.3/34.3/22.1/14.9 (BP = 1.000 ratio = 1.016 hyp_len = 63666 ref_len = 62688)
+```
+
 ## Citation
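
Note: the compound splitting itself happens inside the linked shell script. As a rough Python rendering of the dash rule the README text describes (a sketch of the idea only, not the actual script; the function name and sample sentence are invented), it amounts to:

```python
import re

def split_compounds(line: str) -> str:
    # "Café-Liebhaber" -> "Café - Liebhaber", as described in the README text
    return re.sub(r'(\S)-(\S)', r'\1 - \2', line)

print(split_compounds("ein Café-Liebhaber aus Berlin"))
# ein Café - Liebhaber aus Berlin
```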
2 changes: 1 addition & 1 deletion examples/wav2vec/README.md
@@ -30,7 +30,7 @@ Given a directory containing wav files to be used for pretraining (we recommend
 ### Prepare training data manifest:
 
 ```
-$ python scripts/wav2vec_manifest.py /path/to/waves --dest /manifest/path --ext wav
+$ python examples/wav2vec/wav2vec_manifest.py /path/to/waves --dest /manifest/path --ext wav
 ```
 
 ### Train a wav2vec model:
17 changes: 17 additions & 0 deletions fairseq/data/base_wrapper_dataset.py
@@ -46,6 +46,23 @@ def supports_prefetch(self):
     def prefetch(self, indices):
         self.dataset.prefetch(indices)
 
+    def get_batch_shapes(self):
+        return self.dataset.get_batch_shapes()
+
+    def batch_by_size(
+        self,
+        indices,
+        max_tokens=None,
+        max_sentences=None,
+        required_batch_size_multiple=1,
+    ):
+        return self.dataset.batch_by_size(
+            indices,
+            max_tokens=max_tokens,
+            max_sentences=max_sentences,
+            required_batch_size_multiple=required_batch_size_multiple,
+        )
+
     def set_epoch(self, epoch):
         super().set_epoch(epoch)
        if hasattr(self.dataset, 'set_epoch'):
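
Note: both new methods simply forward to the wrapped dataset, so any subclass of BaseWrapperDataset transparently inherits the inner dataset's batch shapes and batching behavior. A minimal sketch of the delegation pattern (the NoopWrapper class is invented for illustration):

```python
from fairseq.data import BaseWrapperDataset


class NoopWrapper(BaseWrapperDataset):
    """Wraps a FairseqDataset without altering its samples; calls such as
    get_batch_shapes() and batch_by_size() fall through to self.dataset."""

    def __getitem__(self, index):
        return self.dataset[index]
```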
9 changes: 3 additions & 6 deletions fairseq/data/encoders/hf_bert_bpe.py
@@ -21,12 +21,10 @@ def add_args(parser):
 
     def __init__(self, args):
         try:
-            from pytorch_transformers import BertTokenizer
-            from pytorch_transformers.tokenization_utils import clean_up_tokenization
+            from transformers import BertTokenizer
         except ImportError:
             raise ImportError(
-                'Please install 1.0.0 version of pytorch_transformers'
-                'with: pip install pytorch-transformers'
+                'Please install transformers with: pip install transformers'
             )
 
         if 'bpe_vocab_file' in args:
@@ -37,13 +35,12 @@ def __init__(self, args):
         else:
             vocab_file_name = 'bert-base-cased' if args.bpe_cased else 'bert-base-uncased'
             self.bert_tokenizer = BertTokenizer.from_pretrained(vocab_file_name)
-        self.clean_up_tokenization = clean_up_tokenization
 
     def encode(self, x: str) -> str:
         return ' '.join(self.bert_tokenizer.tokenize(x))
 
     def decode(self, x: str) -> str:
-        return self.clean_up_tokenization(
+        return self.bert_tokenizer.clean_up_tokenization(
             self.bert_tokenizer.convert_tokens_to_string(x.split(' '))
         )
 
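
Note: with the switch from pytorch_transformers to transformers, the clean-up helper is taken from the tokenizer object itself. A rough round trip of what this encoder does, assuming the transformers package is installed (the sample sentence is arbitrary):

```python
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# encode: raw text -> space-separated WordPiece tokens
tokens = ' '.join(tokenizer.tokenize("Hello, world!"))

# decode: tokens -> text, with spacing around punctuation tidied up
text = tokenizer.clean_up_tokenization(
    tokenizer.convert_tokens_to_string(tokens.split(' '))
)
print(tokens, '->', text)
```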
8 changes: 4 additions & 4 deletions fairseq/data/encoders/sentencepiece_bpe.py
@@ -13,16 +13,16 @@ class SentencepieceBPE(object):
     @staticmethod
     def add_args(parser):
         # fmt: off
-        parser.add_argument('--sentencepiece-vocab', type=str,
-                            help='path to sentencepiece vocab')
+        parser.add_argument('--sentencepiece-model', type=str,
+                            help='path to sentencepiece model')
         # fmt: on
 
     def __init__(self, args):
-        vocab = file_utils.cached_path(args.sentencepiece_vocab)
+        sentencepiece_model = file_utils.cached_path(args.sentencepiece_model)
         try:
             import sentencepiece as spm
             self.sp = spm.SentencePieceProcessor()
-            self.sp.Load(vocab)
+            self.sp.Load(sentencepiece_model)
         except ImportError:
             raise ImportError('Please install sentencepiece with: pip install sentencepiece')
 
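
Note: after the rename, callers supply the trained SentencePiece model file via sentencepiece_model rather than sentencepiece_vocab. A small sketch of driving the class directly from Python (the Namespace stand-in mirrors what fairseq's option parser would produce; the model path is a placeholder):

```python
from argparse import Namespace

from fairseq.data.encoders.sentencepiece_bpe import SentencepieceBPE

# 'sentencepiece_model' replaces the old 'sentencepiece_vocab' attribute
bpe = SentencepieceBPE(Namespace(sentencepiece_model='/path/to/spm.model'))

pieces = bpe.encode('Hello world')  # space-joined SentencePiece pieces
print(bpe.decode(pieces))           # back to plain text
```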
4 changes: 2 additions & 2 deletions fairseq/data/fairseq_dataset.py
@@ -9,9 +9,9 @@
 
 class EpochListening:
     """Mixin for receiving updates whenever the epoch increments."""
+
     def set_epoch(self, epoch):
-        """Will receive the updated epoch number at the beginning of the epoch.
-        """
+        """Will receive the updated epoch number at the beginning of the epoch."""
         pass
 
 
2 changes: 1 addition & 1 deletion fairseq/data/iterators.py
@@ -489,7 +489,7 @@ def __next__(self):
             self._create_consumer()
 
         # Notify the user if there is a data loading bottleneck
-        if self._queue.qsize() < max(1, self._queue.maxsize // 2):
+        if self._queue.qsize() < min(2, max(1, self._queue.maxsize // 2)):
             if time.time() - self.start_time > 5 * 60:
                 if self.warning_time is None or time.time() - self.warning_time > 15 * 60:
                     logger.info(
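
Note: the added min(2, ...) caps the "queue looks starved" threshold at two elements, so a large buffer that is merely half-full no longer triggers the bottleneck warning. A quick sketch of the arithmetic for a few buffer sizes (the helper name is just for illustration):

```python
def starvation_threshold(maxsize: int) -> int:
    # new check: warn only when fewer than this many batches are buffered
    return min(2, max(1, maxsize // 2))

for maxsize in (1, 2, 4, 8, 64):
    print(maxsize, '->', starvation_threshold(maxsize))
# the old rule, max(1, maxsize // 2), would have used 32 for maxsize=64
```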
4 changes: 3 additions & 1 deletion fairseq/data/language_pair_dataset.py
@@ -73,7 +73,9 @@ def compute_alignment_weights(alignments):
         ]).index_select(0, sort_order)
         ntokens = tgt_lengths.sum().item()
 
-        if input_feeding:
+        if samples[0].get('prev_output_tokens', None) is not None:
+            prev_output_tokens = merge('prev_output_tokens', left_pad=left_pad_target)
+        elif input_feeding:
             # we create a shifted version of targets for feeding the
             # previous output token(s) into the next decoder step
             prev_output_tokens = merge(
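
Note: prev_output_tokens is the teacher-forcing input to the decoder, i.e. the target sequence with its EOS rotated to the front. With this change, a precomputed 'prev_output_tokens' field in the samples takes priority; otherwise input feeding still derives it from the target. A toy sketch of that shift (token IDs are made up):

```python
import torch

eos = 2
target = torch.tensor([15, 27, 8, eos])

# what input feeding builds when no precomputed field is present:
# the EOS moves to the front and everything else shifts right
prev_output_tokens = torch.roll(target, shifts=1)
print(prev_output_tokens)  # tensor([ 2, 15, 27,  8])
```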
2 changes: 1 addition & 1 deletion fairseq/data/legacy/masked_lm_dictionary.py
@@ -42,7 +42,7 @@ def __init__(
         cls='<cls>',
         sep='<sep>'
     ):
-        super().__init__(pad=pad, eos=eos, unk=unk)
+        super().__init__(pad=pad, eos=eos, unk=unk, mask=mask)
         self.cls_word = cls
         self.sep_word = sep
         self.cls_index = self.add_symbol(cls)
2 changes: 1 addition & 1 deletion fairseq/hub_utils.py
@@ -59,7 +59,7 @@ def from_pretrained(
     for file, arg in {
         'code': 'bpe_codes',
         'bpecodes': 'bpe_codes',
-        'sentencepiece.bpe.model': 'sentencepiece_vocab',
+        'sentencepiece.bpe.model': 'sentencepiece_model',
     }.items():
         path = os.path.join(model_path, file)
         if os.path.exists(path):
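
Note: this keeps the from_pretrained path consistent with the renamed flag, so a sentencepiece.bpe.model file found next to the checkpoint is forwarded as sentencepiece_model. A rough sketch of loading such a model through the hub interface (all paths and file names below are placeholders):

```python
from fairseq.models.transformer import TransformerModel

# if '/path/to/checkpoints' contains 'sentencepiece.bpe.model', hub_utils now
# passes it through as the bpe encoder's 'sentencepiece_model' argument
model = TransformerModel.from_pretrained(
    '/path/to/checkpoints',
    checkpoint_file='model.pt',
    data_name_or_path='/path/to/data-bin',
    bpe='sentencepiece',
)
print(model.translate('Hello world!'))
```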
10 changes: 8 additions & 2 deletions fairseq/logging/progress_bar.py
@@ -275,7 +275,12 @@ class TqdmProgressBar(BaseProgressBar):
     def __init__(self, iterable, epoch=None, prefix=None):
         super().__init__(iterable, epoch, prefix)
         from tqdm import tqdm
-        self.tqdm = tqdm(iterable, self.prefix, leave=False)
+        self.tqdm = tqdm(
+            iterable,
+            self.prefix,
+            leave=False,
+            disable=(logger.getEffectiveLevel() > logging.INFO),
+        )
 
     def __iter__(self):
         return iter(self.tqdm)
@@ -287,7 +292,8 @@ def log(self, stats, tag=None, step=None):
     def print(self, stats, tag=None, step=None):
         """Print end-of-epoch stats."""
         postfix = self._str_pipes(self._format_stats(stats))
-        self.tqdm.write('{} | {}'.format(self.tqdm.desc, postfix))
+        with rename_logger(logger, tag):
+            logger.info('{} | {}'.format(self.prefix, postfix))
 
 
 try:
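
Note: the disable flag means the bar is skipped whenever logging is set quieter than INFO (as it often is for non-interactive jobs), and end-of-epoch stats now go through the logger instead of tqdm.write. A small sketch of the disable behaviour, assuming only the tqdm package (logger name and loop are invented):

```python
import logging

from tqdm import tqdm

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger('fairseq_demo')

# with the effective level above INFO, tqdm just yields the iterable
# without drawing a progress bar
for _ in tqdm(range(3), 'demo', leave=False,
              disable=(logger.getEffectiveLevel() > logging.INFO)):
    pass
```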
6 changes: 0 additions & 6 deletions fairseq/model_parallel/models/transformer_lm.py
@@ -3,22 +3,16 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 
-import torch
-import torch.nn as nn
-
-from fairseq import utils
 from fairseq.models import register_model, register_model_architecture
 from fairseq.models.transformer_lm import (
     base_lm_architecture,
     TransformerLanguageModel,
 )
 from fairseq.model_parallel.models.transformer import ModelParallelTransformerDecoder
 try:
     from fairseq.model_parallel.megatron.mpu import get_model_parallel_group
     from fairseq.model_parallel.megatron.mpu import get_model_parallel_rank
     from fairseq.model_parallel.megatron.mpu import get_model_parallel_world_size
-    from fairseq.model_parallel.megatron.mpu import VocabParallelEmbedding
-    from fairseq.model_parallel.megatron.mpu.utils import VocabUtility
     has_megatron_submodule = True
 except (ImportError, ModuleNotFoundError):
     has_megatron_submodule = False
9 changes: 5 additions & 4 deletions fairseq/models/transformer.py
@@ -331,6 +331,11 @@ def __init__(self, args, dictionary, embed_tokens):
             else None
         )
 
+        if getattr(args, "layernorm_embedding", False):
+            self.layernorm_embedding = LayerNorm(embed_dim)
+        else:
+            self.layernorm_embedding = None
+
         if not args.adaptive_input and args.quant_noise_pq > 0:
             self.quant_noise = apply_quant_noise_(
                 nn.Linear(embed_dim, embed_dim, bias=False),
@@ -353,10 +358,6 @@ def __init__(self, args, dictionary, embed_tokens):
             self.layer_norm = LayerNorm(embed_dim)
         else:
             self.layer_norm = None
-        if getattr(args, "layernorm_embedding", False):
-            self.layernorm_embedding = LayerNorm(embed_dim)
-        else:
-            self.layernorm_embedding = None
 
     def build_encoder_layer(self, args):
         return TransformerEncoderLayer(args)