Skip to content

Commit f1ebda5

Browse files
authored
Feature/sg 644 upload big files properly on end (#671)
* wip * wip * wip * remove raise to debug * fix * undo unwanted change * improve display of experiment_log epoch index
1 parent e876dd7 commit f1ebda5

File tree

2 files changed

+9
-4
lines changed

2 files changed

+9
-4
lines changed

src/super_gradients/common/sg_loggers/base_sg_logger.py

+8-1
Original file line number | Diff line number | Diff line change
@@ -169,7 +169,7 @@ def add_scalars(self, tag_scalar_dict: dict, global_step: int = None):
169169
self.tensorboard_writer.flush()
170170

171171
# WRITE THE EPOCH RESULTS TO LOG FILE
172-
log_line = f"\nEpoch ({global_step}/{self.max_global_steps}) - "
172+
log_line = f"\nEpoch {global_step} ({global_step+1}/{self.max_global_steps}) - "
173173
for tag, value in tag_scalar_dict.items():
174174
if isinstance(value, torch.Tensor):
175175
value = value.item()
@@ -249,6 +249,9 @@ def add_file(self, file_name: str = None):
249249

250250
@multi_process_safe
251251
def upload(self):
252+
"""Upload the local tensorboard and log files to remote system."""
253+
self.flush()
254+
252255
if self.save_tensorboard_remote:
253256
self.model_checkpoints_data_interface.save_remote_tensorboard_event_files(self.experiment_name, self._local_dir)
254257

@@ -259,12 +262,16 @@ def upload(self):
259262
@multi_process_safe
260263
def flush(self):
261264
self.tensorboard_writer.flush()
265+
ConsoleSink.flush()
262266

263267
@multi_process_safe
264268
def close(self):
269+
self.upload()
270+
265271
if self.system_monitor is not None:
266272
self.system_monitor.close()
267273
logger.info("[CLEANUP] - Successfully stopped system monitoring process")
274+
268275
self.tensorboard_writer.close()
269276
if self.tensor_board_process is not None:
270277
try:

src/super_gradients/training/sg_trainer/sg_trainer.py

+1-3
Original file line number | Diff line number | Diff line change
@@ -468,9 +468,6 @@ def _train_epoch(self, epoch: int, silent_mode: bool = False) -> tuple:
468468
):
469469
break
470470

471-
if not self.ddp_silent_mode:
472-
self.sg_logger.upload()
473-
474471
self.train_monitored_values = sg_trainer_utils.update_monitored_values_dict(
475472
monitored_values_dict=self.train_monitored_values, new_values_dict=pbar_message_dict
476473
)
@@ -1315,6 +1312,7 @@ def forward(self, inputs, targets):
13151312
if not self.ddp_silent_mode:
13161313
# SAVING AND LOGGING OCCURS ONLY IN THE MAIN PROCESS (IN CASES THERE ARE SEVERAL PROCESSES - DDP)
13171314
self._write_to_disk_operations(train_metrics_tuple, validation_results_tuple, inf_time, epoch, context)
1315+
self.sg_logger.upload()
13181316

13191317
# Evaluating the average model and removing snapshot averaging file if training is completed
13201318
if self.training_params.average_best_models:

0 commit comments

Comments
 (0)