huggingface · LysandreJik · Jan 5, 2021 · Dec 29, 2020 · Dec 29, 2020 · Jan 4, 2021
diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
@@ -241,6 +241,14 @@ def __init__(
         if model is None and model_init is not None:
             model = self.call_model_init()
 
+        if self.args.model_parallel:
+            # XXX: ideally this registry should be maintained elsewhere, so that the trainer could simply check
+            # `if model.model_parallel_is_supported()` instead of hard-coding the list of supported model types here
+            mp_supported = ["gpt2", "t5"]
+            assert (
+                model.config.model_type in mp_supported
+            ), f"{model.config.model_type} implementation currently doesn't support model parallelism, therefore --model_parallel cl arg cannot be used"
+
         # Model parallel
         if model is not None and not self.args.model_parallel:
             model = model.to(args.device)

diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py
@@ -207,8 +207,8 @@ class TrainingArguments:
               :obj:`"eval_loss"`.
             - :obj:`False` if :obj:`metric_for_best_model` is not set, or set to :obj:`"loss"` or :obj:`"eval_loss"`.
         model_parallel (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            If there is more than one device, whether to use model parallelism to distribute the model's modules across
-            devices or not.
+            If the model supports model parallelism and there is more than one device, whether to use model parallelism
+            to distribute the model's modules across devices or not.
         ignore_skip_data (:obj:`bool`, `optional`, defaults to :obj:`False`):
             When resuming training, whether or not to skip the epochs and batches to get the data loading at the same
             stage as in the previous training. If set to :obj:`True`, the training will begin faster (as that skipping