diff --git a/src/super_gradients/common/sg_loggers/base_sg_logger.py b/src/super_gradients/common/sg_loggers/base_sg_logger.py index f9e1335e62..b00b64f32e 100644 --- a/src/super_gradients/common/sg_loggers/base_sg_logger.py +++ b/src/super_gradients/common/sg_loggers/base_sg_logger.py @@ -169,7 +169,7 @@ def add_scalars(self, tag_scalar_dict: dict, global_step: int = None): self.tensorboard_writer.flush() # WRITE THE EPOCH RESULTS TO LOG FILE - log_line = f"\nEpoch ({global_step}/{self.max_global_steps}) - " + log_line = f"\nEpoch {global_step} ({global_step + 1 if global_step is not None else global_step}/{self.max_global_steps}) - " for tag, value in tag_scalar_dict.items(): if isinstance(value, torch.Tensor): value = value.item() @@ -249,6 +249,9 @@ def add_file(self, file_name: str = None): @multi_process_safe def upload(self): + """Upload the local tensorboard and log files to the remote system.""" + self.flush() + if self.save_tensorboard_remote: self.model_checkpoints_data_interface.save_remote_tensorboard_event_files(self.experiment_name, self._local_dir) @@ -259,12 +262,16 @@ def upload(self): @multi_process_safe def flush(self): self.tensorboard_writer.flush() + ConsoleSink.flush() @multi_process_safe def close(self): + self.upload() + if self.system_monitor is not None: self.system_monitor.close() logger.info("[CLEANUP] - Successfully stopped system monitoring process") + self.tensorboard_writer.close() if self.tensor_board_process is not None: try: diff --git a/src/super_gradients/training/sg_trainer/sg_trainer.py b/src/super_gradients/training/sg_trainer/sg_trainer.py index 675a4b0222..e1c4139b2c 100755 --- a/src/super_gradients/training/sg_trainer/sg_trainer.py +++ b/src/super_gradients/training/sg_trainer/sg_trainer.py @@ -468,9 +468,6 @@ def _train_epoch(self, epoch: int, silent_mode: bool = False) -> tuple: ): break - if not self.ddp_silent_mode: - self.sg_logger.upload() - self.train_monitored_values = sg_trainer_utils.update_monitored_values_dict( 
monitored_values_dict=self.train_monitored_values, new_values_dict=pbar_message_dict ) @@ -1315,6 +1312,7 @@ def forward(self, inputs, targets): if not self.ddp_silent_mode: # SAVING AND LOGGING OCCURS ONLY IN THE MAIN PROCESS (IN CASES THERE ARE SEVERAL PROCESSES - DDP) self._write_to_disk_operations(train_metrics_tuple, validation_results_tuple, inf_time, epoch, context) + self.sg_logger.upload() # Evaluating the average model and removing snapshot averaging file if training is completed if self.training_params.average_best_models: