Commit

Push to hub save (#15327)
* Adapt doc and push at every save

* style
sgugger authored Jan 27, 2022
1 parent 9f831bd commit da5ef25
Showing 2 changed files with 21 additions and 7 deletions.
13 changes: 9 additions & 4 deletions src/transformers/trainer.py
@@ -966,7 +966,7 @@ def _tune_save_checkpoint(self):
             return
         with tune.checkpoint_dir(step=self.state.global_step) as checkpoint_dir:
             output_dir = os.path.join(checkpoint_dir, f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}")
-            self.save_model(output_dir)
+            self.save_model(output_dir, _internal_call=True)
             if self.args.should_save:
                 self.state.save_to_json(os.path.join(output_dir, TRAINER_STATE_NAME))
                 torch.save(self.optimizer.state_dict(), os.path.join(output_dir, OPTIMIZER_NAME))
@@ -1634,7 +1634,7 @@ def _save_checkpoint(self, model, trial, metrics=None):
         self.store_flos()
 
         output_dir = os.path.join(run_dir, checkpoint_folder)
-        self.save_model(output_dir)
+        self.save_model(output_dir, _internal_call=True)
         if self.deepspeed:
             # under zero3 model file itself doesn't get saved since it's bogus! Unless deepspeed
             # config `stage3_gather_fp16_weights_on_model_save` is True
@@ -2002,7 +2002,7 @@ def is_world_process_zero(self) -> bool:
         else:
             return self.args.process_index == 0
 
-    def save_model(self, output_dir: Optional[str] = None):
+    def save_model(self, output_dir: Optional[str] = None, _internal_call: bool = False):
         """
         Will save the model, so you can reload it using `from_pretrained()`.
 
@@ -2051,6 +2051,10 @@ def save_model(self, output_dir: Optional[str] = None):
         elif self.args.should_save:
             self._save(output_dir)
 
+        # Push to the Hub when `save_model` is called by the user.
+        if self.args.push_to_hub and not _internal_call:
+            self.push_to_hub(commit_message="Model save")
+
     def _save_tpu(self, output_dir: Optional[str] = None):
         output_dir = output_dir if output_dir is not None else self.args.output_dir
         logger.info(f"Saving model checkpoint to {output_dir}")
@@ -2768,9 +2772,10 @@ def push_to_hub(self, commit_message: Optional[str] = "End of training", blockin
             model_name = Path(self.args.output_dir).name
         else:
             model_name = self.args.hub_model_id.split("/")[-1]
+
         # Needs to be executed on all processes for TPU training, but will only save on the processed determined by
         # self.args.should_save.
-        self.save_model()
+        self.save_model(_internal_call=True)
 
         # Only push from one node.
         if not self.is_world_process_zero():
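For context, a minimal sketch of how the new flag behaves from user code. The model name, output directory, and the idea of calling `save_model` outside of training are illustrative assumptions, not part of the commit; pushing also requires being logged in to the Hub.

from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

# Placeholder model and output directory; any model works the same way.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased")
args = TrainingArguments(output_dir="my-model", push_to_hub=True)
trainer = Trainer(model=model, args=args)

# A user-initiated save: because push_to_hub is set and _internal_call defaults
# to False, this call now also pushes to the Hub (commit message "Model save").
trainer.save_model()

# Checkpoint saves made during training, and the save inside push_to_hub itself,
# pass _internal_call=True, so they skip this extra push and follow hub_strategy.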
15 changes: 12 additions & 3 deletions src/transformers/training_args.py
@@ -365,9 +365,18 @@ class TrainingArguments:
             Whether to skip adding of memory profiler reports to metrics. This is skipped by default because it slows
             down the training and evaluation speed.
         push_to_hub (`bool`, *optional*, defaults to `False`):
-            Whether or not to upload the trained model to the hub after training. If this is activated, and
-            `output_dir` exists, it needs to be a local clone of the repository to which the [`Trainer`] will be
+            Whether or not to push the model to the Hub every time the model is saved. If this is activated,
+            `output_dir` will become a git directory synced with the repo (determined by `hub_model_id`) and the
+            content will be pushed each time a save is triggered (depending on your `save_strategy`). Calling
+            [`~Trainer.save_model`] will also trigger a push.
+
+            <Tip warning={true}>
+
+            If `output_dir` exists, it needs to be a local clone of the repository to which the [`Trainer`] will be
             pushed.
+
+            </Tip>
+
         resume_from_checkpoint (`str`, *optional*):
             The path to a folder with a valid checkpoint for your model. This argument is not directly used by
             [`Trainer`], it's intended to be used by your training/evaluation scripts instead. See the [example
@@ -384,7 +393,7 @@
             Defines the scope of what is pushed to the Hub and when. Possible values are:
 
             - `"end"`: push the model, its configuration, the tokenizer (if passed along to the [`Trainer`]) and a
-              draft of a model card at the end of training.
+              draft of a model card when the [`~Trainer.save_model`] method is called.
             - `"every_save"`: push the model, its configuration, the tokenizer (if passed along to the [`Trainer`]) and
               a draft of a model card each time there is a model save. The pushes are asynchronous to not block
               training, and in case the save are very frequent, a new push is only attempted if the previous one is
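To tie the documentation changes together, a hedged configuration example of the arguments they describe; the repo id and the save schedule below are arbitrary choices, not values required by the commit.

from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="my-model",             # if it already exists, it must be a local clone of the Hub repo
    push_to_hub=True,                  # keep output_dir synced with the Hub repo and push on saves
    hub_model_id="username/my-model",  # placeholder repo id; derived from output_dir when omitted
    hub_strategy="every_save",         # push at each save; "end" pushes when save_model() is called
    save_strategy="steps",
    save_steps=500,                    # with the settings above, a push is attempted every 500 steps
)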
