From f738bac538467fb445b1bd0739bf503d19cd933c Mon Sep 17 00:00:00 2001 From: Feilei Du Date: Fri, 19 May 2023 07:56:55 +0000 Subject: [PATCH 01/24] init --- training/benchmarks/efficientnet/README.md | 24 +++ .../efficientnet/pytorch/config/__init__.py | 2 + .../efficientnet/pytorch/config/_base.py | 76 +++++++ .../pytorch/config/mutable_params.py | 7 + .../pytorch/dataloaders/__init__.py | 2 + .../pytorch/dataloaders/dataloader.py | 99 +++++++++ .../efficientnet/pytorch/model/__init__.py | 7 + .../efficientnet/pytorch/run_pretraining.py | 119 +++++++++++ .../pytorch/schedulers/__init__.py | 10 + .../efficientnet/pytorch/train/__init__.py | 0 .../efficientnet/pytorch/train/evaluator.py | 44 ++++ .../efficientnet/pytorch/train/trainer.py | 200 ++++++++++++++++++ .../pytorch/train/trainer_adapter.py | 47 ++++ .../pytorch/train/training_state.py | 74 +++++++ .../kunlunxin/efficientnet-pytorch/README.md | 33 +++ .../config/config_R300x1x1.py | 11 + .../config/config_R300x1x2.py | 6 + .../config/config_R300x1x4.py | 6 + .../config/config_R300x1x8.py | 6 + .../config/config_common.py | 3 + .../extern/trainer_adapter.py | 53 +++++ .../nvidia/efficientnet-pytorch/README.md | 30 +++ .../config/config_A100x1x1.py | 4 + .../config/config_A100x1x2.py | 6 + .../config/config_A100x1x4.py | 6 + .../config/config_A100x1x8.py | 6 + .../config/config_common.py | 3 + 27 files changed, 884 insertions(+) create mode 100644 training/benchmarks/efficientnet/README.md create mode 100644 training/benchmarks/efficientnet/pytorch/config/__init__.py create mode 100644 training/benchmarks/efficientnet/pytorch/config/_base.py create mode 100644 training/benchmarks/efficientnet/pytorch/config/mutable_params.py create mode 100644 training/benchmarks/efficientnet/pytorch/dataloaders/__init__.py create mode 100644 training/benchmarks/efficientnet/pytorch/dataloaders/dataloader.py create mode 100644 training/benchmarks/efficientnet/pytorch/model/__init__.py create mode 100755 training/benchmarks/efficientnet/pytorch/run_pretraining.py create mode 100644 training/benchmarks/efficientnet/pytorch/schedulers/__init__.py create mode 100644 training/benchmarks/efficientnet/pytorch/train/__init__.py create mode 100644 training/benchmarks/efficientnet/pytorch/train/evaluator.py create mode 100644 training/benchmarks/efficientnet/pytorch/train/trainer.py create mode 100644 training/benchmarks/efficientnet/pytorch/train/trainer_adapter.py create mode 100644 training/benchmarks/efficientnet/pytorch/train/training_state.py create mode 100644 training/kunlunxin/efficientnet-pytorch/README.md create mode 100644 training/kunlunxin/efficientnet-pytorch/config/config_R300x1x1.py create mode 100644 training/kunlunxin/efficientnet-pytorch/config/config_R300x1x2.py create mode 100644 training/kunlunxin/efficientnet-pytorch/config/config_R300x1x4.py create mode 100644 training/kunlunxin/efficientnet-pytorch/config/config_R300x1x8.py create mode 100644 training/kunlunxin/efficientnet-pytorch/config/config_common.py create mode 100644 training/kunlunxin/efficientnet-pytorch/extern/trainer_adapter.py create mode 100644 training/nvidia/efficientnet-pytorch/README.md create mode 100644 training/nvidia/efficientnet-pytorch/config/config_A100x1x1.py create mode 100644 training/nvidia/efficientnet-pytorch/config/config_A100x1x2.py create mode 100644 training/nvidia/efficientnet-pytorch/config/config_A100x1x4.py create mode 100644 training/nvidia/efficientnet-pytorch/config/config_A100x1x8.py create mode 100644 
training/nvidia/efficientnet-pytorch/config/config_common.py
diff --git a/training/benchmarks/efficientnet/README.md b/training/benchmarks/efficientnet/README.md
new file mode 100644
index 000000000..ce707c0e4
--- /dev/null
+++ b/training/benchmarks/efficientnet/README.md
@@ -0,0 +1,24 @@
+### Model information
+- Model introduction
+>EfficientNetV2 is a family of convolutional neural networks developed with a combination of training-aware neural architecture search and compound scaling of depth, width and input resolution. This benchmark trains the EfficientNetV2-S variant. A pretrained version of the network, trained on more than a million images from the ImageNet database, can classify images into 1000 object categories, such as keyboard, mouse, pencil, and many animals.
+>Refer to Tan, M. and Le, Q. "EfficientNetV2: Smaller Models and Faster Training." In Proceedings of the 38th International Conference on Machine Learning (ICML), 2021.
+
+- Model code source
+> https://github.com/pytorch/vision/blob/main/torchvision/models/efficientnet.py
+
+### Dataset
+- Dataset download address
+> `https://image-net.org/download.php` (Imagenet2012 1K)
+
+- Preprocessing
+> No offline preprocessing is required
+
+
+### Framework and accelerator support
+|            | Pytorch | Paddle | TensorFlow2 |
+| ---- | ---- | ---- | ---- |
+| Nvidia GPU | ✅ | N/A | N/A |
+| Kunlunxin XPU | ✅ | N/A | N/A |
+
+
+
diff --git a/training/benchmarks/efficientnet/pytorch/config/__init__.py b/training/benchmarks/efficientnet/pytorch/config/__init__.py
new file mode 100644
index 000000000..96e0aae70
--- /dev/null
+++ b/training/benchmarks/efficientnet/pytorch/config/__init__.py
@@ -0,0 +1,2 @@
+from ._base import *
+from .mutable_params import mutable_params
diff --git a/training/benchmarks/efficientnet/pytorch/config/_base.py b/training/benchmarks/efficientnet/pytorch/config/_base.py
new file mode 100644
index 000000000..24c1bcf8d
--- /dev/null
+++ b/training/benchmarks/efficientnet/pytorch/config/_base.py
@@ -0,0 +1,76 @@
+from typing import ClassVar
+#from train.event.base import BaseTrainingEventInterface
+
+# case info
+# chip vendor: nvidia, kunlunxin, iluvatar, cambricon etc. key vendor is required.
+vendor: str = None
+# model name
+name: str = "MobileNetV2"
+
+do_train = True
+fp16 = True
+# =========================================================
+# data
+# =========================================================
+data_dir: str = None
+train_data: str = "train"
+eval_data: str = "val"
+output_dir: str = ""
+init_checkpoint: str = ""
+
+# =========================================================
+# train && evaluate
+# =========================================================
+train_batch_size: int = 8
+eval_batch_size: int = 8
+dist_backend: str = 'nccl'
+
+lr: float = 0.045
+lr_step_size: int = 1
+lr_gamma: float = 0.98
+
+weight_decay: float = 0.00004
+gradient_accumulation_steps: int = 1
+momentum: float = 0.9
+
+max_steps: int = 5005 * 300  # 300 epoch
+seed: int = 41
+
+# Stop training after reaching this accuracy
+target_acc1: float = 70.634
+
+# Sample to begin performing eval.
+eval_iter_start_samples: int = 100
+
+# If set to -1, disable eval, else evaluate every eval_iter_samples during training
+eval_interval_samples: int = 5005 * 256 * 1  # 1 epoch
+
+# Total number of training samples to run.
+max_samples_termination: float = 5005 * 256 * 300  # 300 epoch
+
+# number workers for dataloader
+num_workers: int = 16
+
+# local_rank for distributed training on gpus
+local_rank: int = 0
+# Whether to read local rank from ENVVAR
+use_env: bool = True
+
+# Number of epochs to plan seeds for. Same set across all workers.
+num_epochs_to_generate_seeds_for: int = 2
+
+# frequency of logging loss. If not positive, no logging is provided for training loss
+log_freq: int = 10
+
+# Whether to resume training from checkpoint.
+# If set, precedes init_checkpoint/init_tf_checkpoint
+resume_from_checkpoint: bool = False
+
+# A object to provide some core components in training
+#training_event: ClassVar[BaseTrainingEventInterface] = None
+
+#training_event_instance: BaseTrainingEventInterface = None
+
+# device
+device: str = None
+n_device: int = 1
diff --git a/training/benchmarks/efficientnet/pytorch/config/mutable_params.py b/training/benchmarks/efficientnet/pytorch/config/mutable_params.py
new file mode 100644
index 000000000..79d06e120
--- /dev/null
+++ b/training/benchmarks/efficientnet/pytorch/config/mutable_params.py
@@ -0,0 +1,7 @@
+mutable_params = [
+    'train_data', 'eval_data', 'init_checkpoint', 'train_batch_size',
+    'eval_batch_size', 'dist_backend', 'lr', 'weight_decay',
+    'gradient_accumulation_steps', 'max_samples_termination', "vendor"
+]
+
+mutable_params += ["local_rank", "do_train", "data_dir", "log_freq"]
diff --git a/training/benchmarks/efficientnet/pytorch/dataloaders/__init__.py b/training/benchmarks/efficientnet/pytorch/dataloaders/__init__.py
new file mode 100644
index 000000000..af39fa5fb
--- /dev/null
+++ b/training/benchmarks/efficientnet/pytorch/dataloaders/__init__.py
@@ -0,0 +1,2 @@
+from .dataloader import (build_train_dataset, build_eval_dataset,
+                         build_train_dataloader, build_eval_dataloader)
diff --git a/training/benchmarks/efficientnet/pytorch/dataloaders/dataloader.py b/training/benchmarks/efficientnet/pytorch/dataloaders/dataloader.py
new file mode 100644
index 000000000..0e9edf515
--- /dev/null
+++ b/training/benchmarks/efficientnet/pytorch/dataloaders/dataloader.py
@@ -0,0 +1,99 @@
+# coding=utf-8
+
+import os
+import sys
+import random
+import numpy as np
+import torch
+from torch.utils.data import Dataset
+from torchvision import datasets, models, transforms
+import torch.distributed as dist
+from torch.utils.data.dataloader import default_collate
+
+CURR_PATH = os.path.abspath(os.path.dirname(__file__))
+sys.path.append(os.path.abspath(os.path.join(CURR_PATH, "../../../")))
+from driver import dist_pytorch
+
+
+def build_train_dataset(args):
+    traindir = os.path.join(args.data_dir, args.train_data)
+    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
+                                     std=[0.229, 0.224, 0.225])
+
+    train_dataset = datasets.ImageFolder(
+        traindir,
+        transforms.Compose([
+            transforms.RandomResizedCrop(224),
+            transforms.RandomHorizontalFlip(),
+            transforms.ToTensor(),
+            normalize,
+        ]))
+    return train_dataset
+
+
+def build_eval_dataset(args):
+    valdir = os.path.join(args.data_dir, args.eval_data)
+    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
+                                     std=[0.229, 0.224, 0.225])
+
+    val_dataset = datasets.ImageFolder(
+        valdir,
+        transforms.Compose([
+            transforms.Resize(256),
+            transforms.CenterCrop(224),
+            transforms.ToTensor(),
+            normalize,
+        ]))
+
+    return val_dataset
+
+
+def build_train_dataloader(train_dataset, args):
+    """Training dataloader."""
+    dist_pytorch.main_proc_print('building train dataloaders ...')
+
+    if torch.distributed.is_available() and torch.distributed.is_initialized():
+        train_sampler = torch.utils.data.distributed.DistributedSampler(
+            train_dataset)
+        dist_pytorch.main_proc_print(
+            f"use sampler: DistributedSampler, num_replicas:{args.n_device}")
+    else:
+        train_sampler = None
+
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset,
+        batch_size=args.train_batch_size,
+        shuffle=(train_sampler is None),
+        num_workers=args.num_workers,
+        pin_memory=True,
+        sampler=train_sampler)
+
+    dist_pytorch.main_proc_print(
+        f'train samples:{len(train_dataset)}, batch size:{args.train_batch_size}'
+    )
+    return train_dataloader
+
+
+def build_eval_dataloader(eval_dataset, args):
+    """Evaluation dataloader."""
+    dist_pytorch.main_proc_print('building eval dataloaders ...')
+
+    if torch.distributed.is_available() and torch.distributed.is_initialized():
+        val_sampler = torch.utils.data.distributed.DistributedSampler(
+            eval_dataset, shuffle=False, drop_last=True)
+        dist_pytorch.main_proc_print(
+            f"use sampler: DistributedSampler, num_replicas:{args.n_device}")
+    else:
+        val_sampler = None
+
+    eval_dataloader = torch.utils.data.DataLoader(
+        eval_dataset,
+        batch_size=args.eval_batch_size,
+        shuffle=False,
+        num_workers=args.num_workers,
+        pin_memory=True,
+        sampler=val_sampler)
+
+    dist_pytorch.main_proc_print(
+        f'eval samples:{len(eval_dataset)}, batch size:{args.eval_batch_size}')
+    return eval_dataloader
diff --git a/training/benchmarks/efficientnet/pytorch/model/__init__.py b/training/benchmarks/efficientnet/pytorch/model/__init__.py
new file mode 100644
index 000000000..76acf3d3d
--- /dev/null
+++ b/training/benchmarks/efficientnet/pytorch/model/__init__.py
@@ -0,0 +1,7 @@
+import torch
+import torchvision
+
+
+def create_model(config):
+    model = torchvision.models.mobilenet_v2()
+    return model
diff --git a/training/benchmarks/efficientnet/pytorch/run_pretraining.py b/training/benchmarks/efficientnet/pytorch/run_pretraining.py
new file mode 100755
index 000000000..ef89d98b3
--- /dev/null
+++ b/training/benchmarks/efficientnet/pytorch/run_pretraining.py
@@ -0,0 +1,119 @@
+"""Mobilenet V2 Pretraining"""
+
+import os
+import sys
+import time
+from typing import Any, Tuple
+
+CURR_PATH = os.path.abspath(os.path.dirname(__file__))
+sys.path.append(os.path.abspath(os.path.join(CURR_PATH, "../../")))
+import config
+from driver import Event, dist_pytorch
+from driver.helper import InitHelper
+from train import trainer_adapter
+from train.evaluator import Evaluator
+from train.trainer import Trainer
+from train.training_state import TrainingState
+from dataloaders.dataloader import build_train_dataset, \
+    build_eval_dataset, build_train_dataloader, build_eval_dataloader
+
+logger = None
+
+
+def main() -> Tuple[Any, Any]:
+    global logger
+    global config
+    init_helper = InitHelper(config)
+    model_driver = init_helper.init_driver(globals(), locals())
+    config = model_driver.config
+    dist_pytorch.init_dist_training_env(config)
+    dist_pytorch.barrier(config.vendor)
+    model_driver.event(Event.INIT_START)
+
+    logger = model_driver.logger
+    init_start_time = logger.previous_log_time
+
+    init_helper.set_seed(config.seed, config.vendor)
+
+    train_dataset = build_train_dataset(config)
+    eval_dataset = build_eval_dataset(config)
+    train_dataloader = build_train_dataloader(train_dataset, config)
+    eval_dataloader = build_eval_dataloader(eval_dataset, config)
+
+    evaluator = Evaluator(config, eval_dataloader)
+
+    training_state = TrainingState()
+
+    trainer = Trainer(driver=model_driver,
+                      adapter=trainer_adapter,
+                      evaluator=evaluator,
+                      training_state=training_state,
+                      device=config.device,
+                      config=config)
+    training_state._trainer = trainer
+
+    dist_pytorch.barrier(config.vendor)
+    trainer.init()
+    dist_pytorch.barrier(config.vendor)
+
+    init_evaluation_start = time.time()
+    training_state.eval_loss, training_state.eval_acc1, training_state.eval_acc5 = evaluator.evaluate(
+        trainer)
+
+
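+    # This first evaluation runs before any training step, so the reported
+    # eval_acc1/eval_acc5 form a baseline for the randomly initialized (or
+    # checkpoint-restored) model.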
init_evaluation_end = time.time() + init_evaluation_info = dict(eval_acc1=training_state.eval_acc1, + eval_acc5=training_state.eval_acc5, + time=init_evaluation_end - + init_evaluation_start) + model_driver.event(Event.INIT_EVALUATION, init_evaluation_info) + + if not config.do_train: + return config, training_state + + model_driver.event(Event.INIT_END) + init_end_time = logger.previous_log_time + training_state.init_time = (init_end_time - init_start_time) / 1e+3 + + dist_pytorch.barrier(config.vendor) + model_driver.event(Event.TRAIN_START) + raw_train_start_time = logger.previous_log_time + + epoch = -1 + while training_state.global_steps < config.max_steps and \ + not training_state.end_training: + epoch += 1 + training_state.epoch = epoch + trainer.train_one_epoch(train_dataloader) + + model_driver.event(Event.TRAIN_END) + raw_train_end_time = logger.previous_log_time + + training_state.raw_train_time = (raw_train_end_time - + raw_train_start_time) / 1e+3 + + return config, training_state + + +if __name__ == "__main__": + start = time.time() + config_update, state = main() + if not dist_pytorch.is_main_process(): + sys.exit(0) + + global_batch_size = dist_pytorch.global_batch_size(config_update) + e2e_time = time.time() - start + finished_info = {"e2e_time": e2e_time} + if config_update.do_train: + training_perf = (global_batch_size * + state.global_steps) / state.raw_train_time + finished_info = { + "e2e_time": e2e_time, + "training_images_per_second": training_perf, + "converged": state.converged, + "final_loss": state.eval_loss, + "final_acc1": state.eval_acc1, + "final_acc5": state.eval_acc5, + "raw_train_time": state.raw_train_time, + "init_time": state.init_time, + } + logger.log(Event.FINISHED, message=finished_info, stacklevel=0) diff --git a/training/benchmarks/efficientnet/pytorch/schedulers/__init__.py b/training/benchmarks/efficientnet/pytorch/schedulers/__init__.py new file mode 100644 index 000000000..5421acb42 --- /dev/null +++ b/training/benchmarks/efficientnet/pytorch/schedulers/__init__.py @@ -0,0 +1,10 @@ +from torch.optim.lr_scheduler import StepLR + + +def create_scheduler(optimizer, args): + """Build the learning rate scheduler.""" + + lr_scheduler = StepLR(optimizer, + step_size=args.lr_step_size, + gamma=args.lr_gamma) + return lr_scheduler diff --git a/training/benchmarks/efficientnet/pytorch/train/__init__.py b/training/benchmarks/efficientnet/pytorch/train/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/training/benchmarks/efficientnet/pytorch/train/evaluator.py b/training/benchmarks/efficientnet/pytorch/train/evaluator.py new file mode 100644 index 000000000..5dc3f0417 --- /dev/null +++ b/training/benchmarks/efficientnet/pytorch/train/evaluator.py @@ -0,0 +1,44 @@ +import torch +import torch.distributed as dist + + +class Evaluator: + + def __init__(self, args, dataloader): + self.dataloader = dataloader + self.args = args + self.total_loss = 0.0 + self.total_acc1 = 0.0 + self.total_acc5 = 0.0 + self.total_batch = 0 + + def __update(self, loss, acc1, acc5): + self.total_loss += loss + self.total_acc1 += acc1 + self.total_acc5 += acc5 + self.total_batch += 1 + + def evaluate(self, trainer): + self.total_loss, self.total_acc1, self.total_acc5 = 0.0, 0.0, 0.0 + self.total_batch = 0 + with torch.no_grad(): + for i, batch in enumerate(self.dataloader): + batch = trainer.process_batch(batch, self.args.device) + loss, acc1, acc5 = trainer.inference(batch) + self.__update(loss.item(), acc1.item(), acc5.item()) + + if dist.is_available() 
and dist.is_initialized(): + total = torch.tensor([ + self.total_loss, self.total_acc1, self.total_acc5, + self.total_batch + ], + dtype=torch.float32, + device=self.args.device) + dist.all_reduce(total, dist.ReduceOp.SUM, async_op=False) + self.total_loss, self.total_acc1, self.total_acc5, self.total_batch = total.tolist( + ) + + loss = self.total_loss / self.total_batch + acc1 = self.total_acc1 / self.total_batch + acc5 = self.total_acc5 / self.total_batch + return loss, acc1, acc5 diff --git a/training/benchmarks/efficientnet/pytorch/train/trainer.py b/training/benchmarks/efficientnet/pytorch/train/trainer.py new file mode 100644 index 000000000..3402c5b83 --- /dev/null +++ b/training/benchmarks/efficientnet/pytorch/train/trainer.py @@ -0,0 +1,200 @@ +import torch +from torch.types import Device +import torch.distributed as dist +import os +import sys +import time +import math + +from model import create_model +from schedulers import create_scheduler + +from train.evaluator import Evaluator +from train.training_state import TrainingState + +import config + +CURR_PATH = os.path.abspath(os.path.dirname(__file__)) +sys.path.append(os.path.abspath(os.path.join(CURR_PATH, "../../"))) +from driver import Driver, Event, dist_pytorch + + +def accuracy(output, target, topk=(1, )): + """Computes the accuracy over the k top predictions for the specified values of k""" + with torch.no_grad(): + maxk = max(topk) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + + res = [] + for k in topk: + correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True) + res.append(correct_k.mul_(100.0 / batch_size)) + return res + + +class Trainer: + + def __init__(self, driver: Driver, adapter, evaluator: Evaluator, + training_state: TrainingState, device: Device, config): + super(Trainer, self).__init__() + self.driver = driver + self.adapter = adapter + self.training_state = training_state + self.grad_scaler = None + + self.device = device + self.optimizer = None + self.config = config + self.model = None + self.evaluator = evaluator + self.lr_scheduler = None + self.global_batch_size = None + self.overflow_buf = None + + def init(self): + self.model = create_model(config) + self.model = self._init_model(self.model, self.config, self.device) + self.model = self.adapter.convert_model(self.model) + self.model = self.adapter.model_to_fp16(self.model) + self.optimizer = self.adapter.create_optimizer(self.model, self.config) + self.model = self.adapter.model_to_ddp(self.model) + self.lr_scheduler = create_scheduler(self.optimizer, self.config) + self.grad_scaler = self.adapter.create_grad_scaler() + + def _init_model(self, model, args, device): + checkpoint_name = config.init_checkpoint + if os.path.isfile(checkpoint_name): + print('checkpoint_name', checkpoint_name) + print('global rank {} is loading pretrained model {}'.format( + dist_pytorch.get_rank(), checkpoint_name)) + # Load the checkpoint. 
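+            # map_location='cpu' keeps the restored weights in host memory;
+            # the model is moved to the target device afterwards, avoiding a
+            # transient full copy of the state_dict on the accelerator.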
+ checkpoint = torch.load(checkpoint_name, map_location='cpu') + model.load_state_dict(checkpoint['state_dict']) + + model = model.to(device) + return model + + def train_one_epoch(self, dataloader): + state = self.training_state + driver = self.driver + driver.event(Event.EPOCH_BEGIN, state.epoch) + + step_start_time = time.time() + epoch_start_num_sample = state.num_trained_samples + + for batch_idx, batch in enumerate(dataloader): + + state.global_steps += 1 + # TODO: Maybe we should update num_trained_samples after all epochs. + state.num_trained_samples = state.global_steps * \ + dist_pytorch.global_batch_size(self.config) + + driver.event(Event.STEP_BEGIN, step=state.global_steps) + self.train_one_step(batch) + + other_state = dict() + if state.global_steps % self.config.gradient_accumulation_steps == 0: + step_end_time = time.time() + step_total_time = step_end_time - step_start_time + step_start_time = step_end_time + images_per_second = ( + dist_pytorch.global_batch_size(self.config) * + self.config.gradient_accumulation_steps) / step_total_time + other_state["img/s"] = images_per_second + if hasattr(self.optimizer, 'loss_scaler'): + loss_scale = self.optimizer.loss_scaler.loss_scale + other_state['loss_scale'] = loss_scale + + eval_result = None + if self.can_do_eval(state): + eval_start = time.time() + state.eval_loss, state.eval_acc1, state.eval_acc5 = self.evaluator.evaluate( + self) + eval_end = time.time() + eval_result = dict(global_steps=state.global_steps, + eval_loss=state.eval_loss, + eval_acc1=state.eval_acc1, + eval_acc5=state.eval_acc5, + time=eval_end - eval_start) + + end_training = self.detect_training_status(state) + step_info = state.to_dict(**other_state) + driver.event(Event.STEP_END, + message=step_info, + step=state.global_steps, + loss=state.loss) + + if eval_result is not None: + driver.event(Event.EVALUATE, eval_result) + + if end_training: + break + + epoch_start_num_sample += len(dataloader.dataset) + state.num_trained_samples = epoch_start_num_sample + + self.lr_scheduler.step() + driver.event(Event.EPOCH_END, state.epoch) + + def train_one_step(self, batch): + # move data to the same device as model + batch = self.process_batch(batch, self.config.device) + state = self.training_state + self.model.train() + state.loss, state.acc1, state.acc5 = self.forward(batch) + self.adapter.backward(state.global_steps, state.loss, self.optimizer) + if dist.is_available() and dist.is_initialized(): + total = torch.tensor([state.loss, state.acc1, state.acc5], + dtype=torch.float32, + device=self.config.device) + dist.all_reduce(total, dist.ReduceOp.SUM, async_op=False) + total = total / dist.get_world_size() + state.loss, state.acc1, state.acc5 = total.tolist() + self.driver.event(Event.BACKWARD, state.global_steps, state.loss, + self.optimizer, self.grad_scaler) + + def detect_training_status(self, state): + config = self.config + if state.eval_acc1 >= config.target_acc1: + state.converged_success() + + if state.num_trained_samples > config.max_samples_termination: + state.end_training = True + + return state.end_training + + def can_do_eval(self, state): + config = self.config + do_eval = all([ + config.eval_data is not None, + state.num_trained_samples >= config.eval_iter_start_samples, + state.global_steps % + math.ceil(config.eval_interval_samples / + dist_pytorch.global_batch_size(config)) == 0, + config.eval_interval_samples > 0, + state.global_steps > 1, + ]) + + return do_eval or state.num_trained_samples >= config.max_samples_termination + + def 
forward(self, batch): + images, target = batch + output = self.model(images) + criterion = torch.nn.CrossEntropyLoss() + loss = criterion(output, target) + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + return loss, acc1, acc5 + + def inference(self, batch): + self.model.eval() + output = self.forward(batch) + return output + + def process_batch(self, batch, device): + """Process batch and produce inputs for the model.""" + batch = tuple(t.to(device, non_blocking=True) for t in batch) + return batch diff --git a/training/benchmarks/efficientnet/pytorch/train/trainer_adapter.py b/training/benchmarks/efficientnet/pytorch/train/trainer_adapter.py new file mode 100644 index 000000000..11e1ba5e7 --- /dev/null +++ b/training/benchmarks/efficientnet/pytorch/train/trainer_adapter.py @@ -0,0 +1,47 @@ +import torch +import torch.distributed as dist +from torch.optim import Optimizer +import config + +from torch import nn, Tensor +from driver.dist_pytorch import main_proc_print +from typing import Tuple +from torch.nn.parallel import DistributedDataParallel as DDP + + +def convert_model(model: nn.Module) -> nn.Module: + return model + + +def create_optimizer(model, args): + optimizer = torch.optim.SGD(model.parameters(), + lr=args.lr, + momentum=args.momentum, + weight_decay=args.weight_decay) + return optimizer + + +def model_to_fp16(model): + # To prevent OOM for model sizes that cannot fit in GPU memory in full precision + if config.fp16: + main_proc_print(" > use fp16...") + model.half() + return model + + +def model_to_ddp(model: nn.Module) -> nn.Module: + if dist.is_available() and dist.is_initialized(): + model = DDP(model, device_ids=[config.local_rank]) + return model + + +def create_grad_scaler(): + return None + + +def backward(step: int, loss: torch.Tensor, optimizer: Optimizer): + loss.backward() + update_step = step % config.gradient_accumulation_steps == 0 + if update_step: + optimizer.step() + optimizer.zero_grad() diff --git a/training/benchmarks/efficientnet/pytorch/train/training_state.py b/training/benchmarks/efficientnet/pytorch/train/training_state.py new file mode 100644 index 000000000..2e5a1fca8 --- /dev/null +++ b/training/benchmarks/efficientnet/pytorch/train/training_state.py @@ -0,0 +1,74 @@ +from dataclasses import dataclass +import inspect +import torch + + +@dataclass +class TrainingState: + _trainer = None + _status = 'aborted' # later set to 'success' if termination criteria met + + global_steps = 0 + skipped_steps = 0 + iter_dataloader_idx = 0 + + loss: float = 0.0 + acc1: float = 0.0 + acc5: float = 0.0 + + eval_loss: float = 0.0 + eval_acc1: float = 0.0 + eval_acc5: float = 0.0 + + epoch: int = 1 + num_trained_samples = 0 + end_training: bool = False + converged: bool = False + + init_time = 0 + raw_train_time = 0 + + def status(self): + if self.converged: + self._status = "success" + return self._status + + def converged_success(self): + self.end_training = True + self.converged = True + + def _is_property(self, value): + status = [ + not callable(value), not inspect.isclass(value), + not inspect.ismodule(value), not inspect.ismethod(value), + not inspect.isfunction(value), not inspect.isbuiltin(value), + "classmethod object" not in str(value) + ] + return all(status) + + def to_dict(self, **kwargs): + state_dict = dict() + + for var_name, value in self.__dict__.items(): + if not var_name.startswith("_") and self._is_property(value): + state_dict[var_name] = value + + lr = self._trainer.lr_scheduler.get_last_lr() + if isinstance(lr, (tuple, list)): + lr = 
lr[0]
+        state_dict["learning_rate"] = lr
+        exclude = [
+            "eval_loss", "eval_acc1", "eval_acc5", "skipped_steps",
+            "converged", "init_time", "raw_train_time"
+        ]
+        for exkey in exclude:
+            if exkey in state_dict:
+                state_dict.pop(exkey)
+
+        state_dict.update(kwargs)
+
+        for k in state_dict.keys():
+            if torch.is_tensor(state_dict[k]):
+                state_dict[k] = state_dict[k].item()
+
+        return state_dict
diff --git a/training/kunlunxin/efficientnet-pytorch/README.md b/training/kunlunxin/efficientnet-pytorch/README.md
new file mode 100644
index 000000000..a547c7d3a
--- /dev/null
+++ b/training/kunlunxin/efficientnet-pytorch/README.md
@@ -0,0 +1,33 @@
+### Model checkpoint download
+[Model checkpoint download](../../benchmarks/mobilenetv2/README.md#模型checkpoint)
+### Test dataset download
+[Test dataset download](../../benchmarks/mobilenetv2/README.md#数据集)
+
+### Kunlunxin XPU configuration and run information
+#### Environment
+- ##### Hardware
+  - Machine model: Kunlunxin AI accelerator group R480-X8
+  - Accelerator card model: Kunlunxin AI accelerator card R300
+  - Multi-node network type and bandwidth: InfiniBand, 200Gb/s
+
+- ##### Software
+  - OS version: Ubuntu 20.04
+  - OS kernel version: 5.4.0-26-generic
+  - Accelerator driver version: 4.0.25
+  - Docker image and version: pytorch1.12.1-cpu-ubuntu18.04:v0.04
+  - Training framework version: xmlir+e70db8f6
+  - Dependency software versions: pytorch-1.12.1+cpu
+
+
+### Results
+| Training resources | Config file | Runtime (s) | Target acc1 | Converged acc1 | Steps | Throughput (samples/s) |
+| -------- | --------------- | ----------- | -------- | -------- | ------- | ---------------- |
+| 1 node, 1 card | config_R300x1x1 | | | | | |
+| 1 node, 2 cards | config_R300x1x2 | | | | | |
+| 1 node, 4 cards | config_R300x1x4 | | | | | |
+| 1 node, 8 cards | config_R300x1x8 | | 70.634 | 69.549 | 1501500 | |
+| 2 nodes, 8 cards | config_R300x2x8 | | | | | |
+
+### License
+
+Apache 2.0 license.
diff --git a/training/kunlunxin/efficientnet-pytorch/config/config_R300x1x1.py b/training/kunlunxin/efficientnet-pytorch/config/config_R300x1x1.py
new file mode 100644
index 000000000..461017de5
--- /dev/null
+++ b/training/kunlunxin/efficientnet-pytorch/config/config_R300x1x1.py
@@ -0,0 +1,11 @@
+from config_common import *
+
+train_batch_size = 256
+eval_batch_size = 128
+
+lr = 0.1
+gradient_accumulation_steps = 1
+warmup = 0.1
+lr_decay_ratio = 0.1
+lr_decay_iters = 4338
+log_freq = 10
diff --git a/training/kunlunxin/efficientnet-pytorch/config/config_R300x1x2.py b/training/kunlunxin/efficientnet-pytorch/config/config_R300x1x2.py
new file mode 100644
index 000000000..fb4a39e51
--- /dev/null
+++ b/training/kunlunxin/efficientnet-pytorch/config/config_R300x1x2.py
@@ -0,0 +1,6 @@
+from config_common import *
+
+train_batch_size = 128
+eval_batch_size = 128
+
+gradient_accumulation_steps = 1
diff --git a/training/kunlunxin/efficientnet-pytorch/config/config_R300x1x4.py b/training/kunlunxin/efficientnet-pytorch/config/config_R300x1x4.py
new file mode 100644
index 000000000..0b08a1cf4
--- /dev/null
+++ b/training/kunlunxin/efficientnet-pytorch/config/config_R300x1x4.py
@@ -0,0 +1,6 @@
+from config_common import *
+
+train_batch_size = 64
+eval_batch_size = 32
+
+gradient_accumulation_steps = 1
diff --git a/training/kunlunxin/efficientnet-pytorch/config/config_R300x1x8.py b/training/kunlunxin/efficientnet-pytorch/config/config_R300x1x8.py
new file mode 100644
index 000000000..6fb81b718
--- /dev/null
+++ b/training/kunlunxin/efficientnet-pytorch/config/config_R300x1x8.py
@@ -0,0 +1,6 @@
+from config_common import *
+
+train_batch_size = 32
+eval_batch_size = 16
+
+gradient_accumulation_steps = 1
diff --git a/training/kunlunxin/efficientnet-pytorch/config/config_common.py b/training/kunlunxin/efficientnet-pytorch/config/config_common.py
new file mode 100644
index 000000000..012668a0a
--- /dev/null
+++ b/training/kunlunxin/efficientnet-pytorch/config/config_common.py
@@ -0,0 +1,3 @@
+vendor = "kunlunxin"
+dist_backend = "xccl"
+fp16 = False
\ No newline at end of file
diff --git a/training/kunlunxin/efficientnet-pytorch/extern/trainer_adapter.py b/training/kunlunxin/efficientnet-pytorch/extern/trainer_adapter.py
new file mode 100644
index 000000000..96fd7c783
--- /dev/null
+++ b/training/kunlunxin/efficientnet-pytorch/extern/trainer_adapter.py
@@ -0,0 +1,52 @@
+import os
+import torch
+import torch.distributed as dist
+from torch.optim import Optimizer
+from torch.optim.lr_scheduler import _LRScheduler
+import config
+
+from torch import nn, Tensor
+from driver.dist_pytorch import main_proc_print
+from typing import Tuple
+
+from torch_xmlir.optimizer import SGD
+import torch_xmlir.core.xpu_model as xm
+
+
+def convert_model(model: nn.Module) -> nn.Module:
+    return model
+
+
+def create_optimizer(model, args):
+    optimizer = SGD(model.parameters(),
+                    lr=args.lr,
+                    momentum=args.momentum,
+                    weight_decay=args.weight_decay)
+    return optimizer
+
+
+def model_to_fp16(model):
+    # To prevent OOM for model sizes that cannot fit in GPU memory in full precision
+    if config.fp16:
+        main_proc_print(" > use fp16...")
+        model.half()
+    return model
+
+
+def model_to_ddp(model: nn.Module) -> nn.Module:
+    if dist.is_available() and dist.is_initialized():
+        from torch.nn.parallel import DistributedDataParallel as DDP
+        model = DDP(model)
+    return model
+
+
+def create_grad_scaler():
+    return None
+
+
+def backward(step: int, loss: torch.Tensor, optimizer: Optimizer):
+    loss.backward()
+    update_step = step % config.gradient_accumulation_steps == 0
+    if update_step:
+        optimizer.step()
+        optimizer.zero_grad()
diff --git a/training/nvidia/efficientnet-pytorch/README.md b/training/nvidia/efficientnet-pytorch/README.md
new file mode 100644
index 000000000..af9acab76
--- /dev/null
+++ b/training/nvidia/efficientnet-pytorch/README.md
@@ -0,0 +1,30 @@
+### Test dataset download
+[Test dataset download](../../benchmarks/mobilenetv2/README.md#数据集)
+
+### Nvidia GPU configuration and run information
+#### Environment
+- ##### Hardware
+  - Machine and accelerator model: NVIDIA_A100-SXM4-40GB
+  - Multi-node network type and bandwidth: InfiniBand, 200Gb/s
+- ##### Software
+  - OS version: Ubuntu 20.04
+  - OS kernel version: 5.4.0-113-generic
+  - Accelerator driver version: 470.129.06
+  - Docker version: 20.10.16
+  - Training framework version: pytorch-1.8.0a0+52ea372
+  - Dependency software versions: none
+
+
+### Results
+| Training resources | Config file | Runtime (s) | Target acc1 | Converged acc1 | Steps | Throughput (samples/s) |
+| -------- | --------------- | ----------- | -------- | -------- | ------- | ---------------- |
+| 1 node, 1 card | config_A100x1x1 | | | | | |
+| 1 node, 2 cards | config_A100x1x2 | | | | | |
+| 1 node, 4 cards | config_A100x1x4 | | | | | |
+| 1 node, 8 cards | config_A100x1x8 | 94208.62 | 70.634 | 70.634 | 1501500 | 4081.72 |
+| 2 nodes, 8 cards | config_A100x2x8 | | | | | |
+
+### License
+
+
+This project is based on the Apache 2.0 license.
diff --git a/training/nvidia/efficientnet-pytorch/config/config_A100x1x1.py b/training/nvidia/efficientnet-pytorch/config/config_A100x1x1.py
new file mode 100644
index 000000000..d6f1e735e
--- /dev/null
+++ b/training/nvidia/efficientnet-pytorch/config/config_A100x1x1.py
@@ -0,0 +1,4 @@
+from config_common import *
+
+train_batch_size = 256
+eval_batch_size = 128
diff --git a/training/nvidia/efficientnet-pytorch/config/config_A100x1x2.py b/training/nvidia/efficientnet-pytorch/config/config_A100x1x2.py
new file mode 100644
index 000000000..fb4a39e51
--- /dev/null
+++ b/training/nvidia/efficientnet-pytorch/config/config_A100x1x2.py
@@ -0,0 +1,6 @@
+from config_common import *
+
+train_batch_size = 128
+eval_batch_size = 128
+
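+# Per-card batch sizes are halved as the card count doubles, keeping the
+# global train batch at 256 across the config_A100x1xN variants.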
+gradient_accumulation_steps = 1 diff --git a/training/nvidia/efficientnet-pytorch/config/config_A100x1x4.py b/training/nvidia/efficientnet-pytorch/config/config_A100x1x4.py new file mode 100644 index 000000000..0b08a1cf4 --- /dev/null +++ b/training/nvidia/efficientnet-pytorch/config/config_A100x1x4.py @@ -0,0 +1,6 @@ +from config_common import * + +train_batch_size = 64 +eval_batch_size = 32 + +gradient_accumulation_steps = 1 diff --git a/training/nvidia/efficientnet-pytorch/config/config_A100x1x8.py b/training/nvidia/efficientnet-pytorch/config/config_A100x1x8.py new file mode 100644 index 000000000..6fb81b718 --- /dev/null +++ b/training/nvidia/efficientnet-pytorch/config/config_A100x1x8.py @@ -0,0 +1,6 @@ +from config_common import * + +train_batch_size = 32 +eval_batch_size = 16 + +gradient_accumulation_steps = 1 diff --git a/training/nvidia/efficientnet-pytorch/config/config_common.py b/training/nvidia/efficientnet-pytorch/config/config_common.py new file mode 100644 index 000000000..af56e1cb3 --- /dev/null +++ b/training/nvidia/efficientnet-pytorch/config/config_common.py @@ -0,0 +1,3 @@ +vendor = "nvidia" +dist_backend = "nccl" +fp16 = False \ No newline at end of file From b494ba5d9d964d323e03db35405f418907917eb8 Mon Sep 17 00:00:00 2001 From: Feilei Du Date: Wed, 24 May 2023 04:17:30 +0000 Subject: [PATCH 02/24] add efficientnet --- .../efficientnet/pytorch/config/_base.py | 129 ++++- .../pytorch/config/mutable_params.py | 3 +- .../pytorch/dataloaders/dataloader.py | 96 ++-- .../pytorch/dataloaders/presets.py | 71 +++ .../pytorch/dataloaders/sampler.py | 62 +++ .../pytorch/dataloaders/transforms.py | 183 +++++++ .../efficientnet/pytorch/model/__init__.py | 2 +- .../efficientnet/pytorch/run_pretraining.py | 13 +- .../pytorch/schedulers/__init__.py | 40 +- .../efficientnet/pytorch/train/evaluator.py | 27 +- .../efficientnet/pytorch/train/trainer.py | 56 +-- .../pytorch/train/trainer_adapter.py | 77 ++- .../efficientnet/pytorch/train/utils.py | 465 ++++++++++++++++++ .../config/config_R300x1x1.py | 11 +- .../config/config_R300x1x8.py | 4 +- .../config/config_common.py | 25 +- .../extern/trainer_adapter.py | 48 +- .../config/config_common.py | 25 +- 18 files changed, 1131 insertions(+), 206 deletions(-) create mode 100644 training/benchmarks/efficientnet/pytorch/dataloaders/presets.py create mode 100644 training/benchmarks/efficientnet/pytorch/dataloaders/sampler.py create mode 100644 training/benchmarks/efficientnet/pytorch/dataloaders/transforms.py create mode 100644 training/benchmarks/efficientnet/pytorch/train/utils.py diff --git a/training/benchmarks/efficientnet/pytorch/config/_base.py b/training/benchmarks/efficientnet/pytorch/config/_base.py index 24c1bcf8d..9319137f1 100644 --- a/training/benchmarks/efficientnet/pytorch/config/_base.py +++ b/training/benchmarks/efficientnet/pytorch/config/_base.py @@ -1,11 +1,8 @@ -from typing import ClassVar -#from train.event.base import BaseTrainingEventInterface - # case info # chip vendor: nvidia, kunlunxin, iluvatar, cambricon etc. key vendor is required. 
vendor: str = None
 # model name
-name: str = "MobileNetV2"
+name: str = "EfficientNet"
 
 do_train = True
 fp16 = True
@@ -20,36 +17,121 @@
 
 # =========================================================
 # train && evaluate
+# reference to https://github.com/pytorch/vision/tree/main/references/classification
 # =========================================================
-train_batch_size: int = 8
-eval_batch_size: int = 8
+train_batch_size: int = 256
+eval_batch_size: int = 256
 dist_backend: str = 'nccl'
 
-lr: float = 0.045
-lr_step_size: int = 1
-lr_gamma: float = 0.98
+# number of total epochs to run
+epochs: int = 90
+
+# number workers for dataloader
+num_workers: int = 16
+
+# optimizer
+opt: str = 'sgd'
 
-weight_decay: float = 0.00004
-gradient_accumulation_steps: int = 1
+# initial learning rate
+lr: float = 0.1
+
+# momentum
 momentum: float = 0.9
 
-max_steps: int = 5005 * 300  # 300 epoch
+# weight decay
+weight_decay: float = 1e-4
+
+# weight decay for Normalization layers (default: None, same value as weight_decay)
+norm_weight_decay: float = None
+
+# weight decay for bias parameters of all layers (default: None, same value as weight_decay)
+bias_weight_decay: float = None
+
+# weight decay for embedding parameters for vision transformer models (default: None, same value as weight_decay)
+transformer_embedding_decay: float = None
+
+# label smoothing
+label_smoothing: float = 0.0
+
+# mixup alpha
+mixup_alpha: float = 0.0
+
+# cutmix alpha
+cutmix_alpha: float = 0.0
+
+# the lr scheduler
+lr_scheduler: str = "steplr"
+
+# the number of epochs to warmup
+lr_warmup_epochs: int = 0
+
+# the warmup method
+lr_warmup_method: str = "constant"
+
+# the decay for lr
+lr_warmup_decay: float = 0.01
+
+# decrease lr every step-size epochs
+lr_step_size: int = 30
+
+# decrease lr by a factor of lr_gamma
+lr_gamma: float = 0.1
+
+# minimum lr of lr schedule
+lr_min: float = 0.0
+
+# Use sync batch norm
+sync_bn: bool = False
+
+# auto augment policy
+auto_augment: str = None
+
+# magnitude of auto augment policy
+ra_magnitude: int = 9
+
+# severity of augmix policy
+augmix_severity: int = 3
+
+# random erasing probability
+random_erase: float = 0.0
+
+# Use torch.cuda.amp for mixed precision training
+amp: bool = False
+
+# the interpolation method
+interpolation: str = "bilinear"
+
+# the resize size used for validation
+val_resize_size: int = 256
+
+# the central crop size used for validation
+val_crop_size: int = 224
+
+# the random crop size used for training
+train_crop_size: int = 224
+
+# the maximum gradient norm
+clip_grad_norm: float = None
+
+# whether to use Repeated Augmentation in training
+ra_sampler: bool = False
+
+# number of repetitions for Repeated Augmentation
+ra_reps: int = 3
+
 seed: int = 41
 
-# Stop training after reaching this accuracy
-target_acc1: float = 70.634
+# Stop training after reaching this accuracy TODO
+target_acc1: float = 84.228
 
 # Sample to begin performing eval.
 eval_iter_start_samples: int = 100
 
-# If set to -1, disable eval, else evaluate every eval_iter_samples during training
+# If set to -1, disable eval, else evaluate every eval_iter_samples during training TODO
 eval_interval_samples: int = 5005 * 256 * 1  # 1 epoch
 
-# Total number of training samples to run.
-max_samples_termination: float = 5005 * 256 * 300  # 300 epoch
-
-# number workers for dataloader
-num_workers: int = 16
+# Total number of training samples to run.
TODO +max_samples_termination: float = 5005 * 256 * 600 # 600 epoch # local_rank for distributed training on gpus local_rank: int = 0 @@ -57,7 +139,7 @@ use_env: bool = True # Number of epochs to plan seeds for. Same set across all workers. -num_epochs_to_generate_seeds_for: int = 2 +num_epochs_to_generate_seeds_for: int = 600 # frequency of logging loss. If not positive, no logging is provided for training loss log_freq: int = 10 @@ -66,10 +148,7 @@ # If set, precedes init_checkpoint/init_tf_checkpoint resume_from_checkpoint: bool = False -# A object to provide some core components in training -#training_event: ClassVar[BaseTrainingEventInterface] = None - -#training_event_instance: BaseTrainingEventInterface = None +gradient_accumulation_steps = 1 # device device: str = None diff --git a/training/benchmarks/efficientnet/pytorch/config/mutable_params.py b/training/benchmarks/efficientnet/pytorch/config/mutable_params.py index 79d06e120..c0bd33461 100644 --- a/training/benchmarks/efficientnet/pytorch/config/mutable_params.py +++ b/training/benchmarks/efficientnet/pytorch/config/mutable_params.py @@ -1,7 +1,6 @@ mutable_params = [ 'train_data', 'eval_data', 'init_checkpoint', 'train_batch_size', - 'eval_batch_size', 'dist_backend', 'lr', 'weight_decay', - 'gradient_accumulation_steps', 'max_samples_termination', "vendor" + 'eval_batch_size', 'dist_backend', 'lr', 'weight_decay', "vendor" ] mutable_params += ["local_rank", "do_train", "data_dir", "log_freq"] diff --git a/training/benchmarks/efficientnet/pytorch/dataloaders/dataloader.py b/training/benchmarks/efficientnet/pytorch/dataloaders/dataloader.py index 0e9edf515..40783d56c 100644 --- a/training/benchmarks/efficientnet/pytorch/dataloaders/dataloader.py +++ b/training/benchmarks/efficientnet/pytorch/dataloaders/dataloader.py @@ -2,13 +2,13 @@ import os import sys -import random -import numpy as np import torch -from torch.utils.data import Dataset -from torchvision import datasets, models, transforms -import torch.distributed as dist from torch.utils.data.dataloader import default_collate +from dataloaders import transforms, presets +from dataloaders.sampler import RASampler +from torchvision.transforms.functional import InterpolationMode +import torchvision + CURR_PATH = os.path.abspath(os.path.dirname(__file__)) sys.path.append(os.path.abspath(os.path.join(CURR_PATH, "../../../"))) @@ -16,34 +16,40 @@ def build_train_dataset(args): + dist_pytorch.main_proc_print('building train dataset ...') traindir = os.path.join(args.data_dir, args.train_data) - normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], - std=[0.229, 0.224, 0.225]) - - train_dataset = datasets.ImageFolder( + interpolation = InterpolationMode(args.interpolation) + auto_augment_policy = getattr(args, "auto_augment", None) + random_erase_prob = getattr(args, "random_erase", 0.0) + ra_magnitude = getattr(args, "ra_magnitude", None) + augmix_severity = getattr(args, "augmix_severity", None) + train_dataset = torchvision.datasets.ImageFolder( traindir, - transforms.Compose([ - transforms.RandomResizedCrop(224), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - normalize, - ])) + presets.ClassificationPresetTrain( + crop_size=args.train_crop_size, + interpolation=interpolation, + auto_augment_policy=auto_augment_policy, + random_erase_prob=random_erase_prob, + ra_magnitude=ra_magnitude, + augmix_severity=augmix_severity, + ), + ) + return train_dataset def build_eval_dataset(args): + dist_pytorch.main_proc_print('building eval dataset ...') valdir = 
os.path.join(args.data_dir, args.eval_data) - normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], - std=[0.229, 0.224, 0.225]) - - val_dataset = datasets.ImageFolder( - valdir, - transforms.Compose([ - transforms.Resize(256), - transforms.CenterCrop(224), - transforms.ToTensor(), - normalize, - ])) + interpolation = InterpolationMode(args.interpolation) + preprocessing = presets.ClassificationPresetEval( + crop_size=args.val_crop_size, resize_size=args.val_resize_size, interpolation=interpolation + ) + + val_dataset = torchvision.datasets.ImageFolder( + valdir, + preprocessing, + ) return val_dataset @@ -53,20 +59,36 @@ def build_train_dataloader(train_dataset, args): dist_pytorch.main_proc_print('building train dataloaders ...') if torch.distributed.is_available() and torch.distributed.is_initialized(): - train_sampler = torch.utils.data.distributed.DistributedSampler( - train_dataset) + if hasattr(args, "ra_sampler") and args.ra_sampler: + train_sampler = RASampler(train_dataset, shuffle=True, repetitions=args.ra_reps) + else: + train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) dist_pytorch.main_proc_print( f"use sampler: DistributedSampler, num_replicas:{args.n_device}") else: - train_sampler = None + train_sampler = torch.utils.data.RandomSampler(train_dataset) + + collate_fn = None + num_classes = len(train_dataset.classes) + mixup_transforms = [] + if args.mixup_alpha > 0.0: + mixup_transforms.append(transforms.RandomMixup(num_classes, p=1.0, alpha=args.mixup_alpha)) + if args.cutmix_alpha > 0.0: + mixup_transforms.append(transforms.RandomCutmix(num_classes, p=1.0, alpha=args.cutmix_alpha)) + if mixup_transforms: + mixupcutmix = torchvision.transforms.RandomChoice(mixup_transforms) + + def collate_fn(batch): + return mixupcutmix(*default_collate(batch)) train_dataloader = torch.utils.data.DataLoader( train_dataset, batch_size=args.train_batch_size, - shuffle=(train_sampler is None), + sampler=train_sampler, num_workers=args.num_workers, pin_memory=True, - sampler=train_sampler) + collate_fn=collate_fn, + ) dist_pytorch.main_proc_print( f'train samples:{len(train_dataset)}, batch size:{args.train_batch_size}' @@ -84,15 +106,11 @@ def build_eval_dataloader(eval_dataset, args): dist_pytorch.main_proc_print( f"use sampler: DistributedSampler, num_replicas:{args.n_device}") else: - val_sampler = None - + val_sampler = torch.utils.data.SequentialSampler(eval_dataset) + eval_dataloader = torch.utils.data.DataLoader( - eval_dataset, - batch_size=args.eval_batch_size, - shuffle=False, - num_workers=args.num_workers, - pin_memory=True, - sampler=val_sampler) + eval_dataset, batch_size=args.eval_batch_size, sampler=val_sampler, num_workers=args.num_workers, pin_memory=True + ) dist_pytorch.main_proc_print( f'eval samples:{len(eval_dataset)}, batch size:{args.eval_batch_size}') diff --git a/training/benchmarks/efficientnet/pytorch/dataloaders/presets.py b/training/benchmarks/efficientnet/pytorch/dataloaders/presets.py new file mode 100644 index 000000000..5d1bf1cc7 --- /dev/null +++ b/training/benchmarks/efficientnet/pytorch/dataloaders/presets.py @@ -0,0 +1,71 @@ +import torch +from torchvision.transforms import autoaugment, transforms +from torchvision.transforms.functional import InterpolationMode + + +class ClassificationPresetTrain: + def __init__( + self, + *, + crop_size, + mean=(0.485, 0.456, 0.406), + std=(0.229, 0.224, 0.225), + interpolation=InterpolationMode.BILINEAR, + hflip_prob=0.5, + auto_augment_policy=None, + ra_magnitude=9, + 
augmix_severity=3,
+        random_erase_prob=0.0,
+    ):
+        trans = [transforms.RandomResizedCrop(crop_size, interpolation=interpolation)]
+        if hflip_prob > 0:
+            trans.append(transforms.RandomHorizontalFlip(hflip_prob))
+        if auto_augment_policy is not None:
+            if auto_augment_policy == "ra":
+                trans.append(autoaugment.RandAugment(interpolation=interpolation, magnitude=ra_magnitude))
+            elif auto_augment_policy == "ta_wide":
+                trans.append(autoaugment.TrivialAugmentWide(interpolation=interpolation))
+            elif auto_augment_policy == "augmix":
+                trans.append(autoaugment.AugMix(interpolation=interpolation, severity=augmix_severity))
+            else:
+                aa_policy = autoaugment.AutoAugmentPolicy(auto_augment_policy)
+                trans.append(autoaugment.AutoAugment(policy=aa_policy, interpolation=interpolation))
+        trans.extend(
+            [
+                transforms.PILToTensor(),
+                transforms.ConvertImageDtype(torch.float),
+                transforms.Normalize(mean=mean, std=std),
+            ]
+        )
+        if random_erase_prob > 0:
+            trans.append(transforms.RandomErasing(p=random_erase_prob))
+
+        self.transforms = transforms.Compose(trans)
+
+    def __call__(self, img):
+        return self.transforms(img)
+
+
+class ClassificationPresetEval:
+    def __init__(
+        self,
+        *,
+        crop_size,
+        resize_size=256,
+        mean=(0.485, 0.456, 0.406),
+        std=(0.229, 0.224, 0.225),
+        interpolation=InterpolationMode.BILINEAR,
+    ):
+
+        self.transforms = transforms.Compose(
+            [
+                transforms.Resize(resize_size, interpolation=interpolation),
+                transforms.CenterCrop(crop_size),
+                transforms.PILToTensor(),
+                transforms.ConvertImageDtype(torch.float),
+                transforms.Normalize(mean=mean, std=std),
+            ]
+        )
+
+    def __call__(self, img):
+        return self.transforms(img)
diff --git a/training/benchmarks/efficientnet/pytorch/dataloaders/sampler.py b/training/benchmarks/efficientnet/pytorch/dataloaders/sampler.py
new file mode 100644
index 000000000..e9dc1735a
--- /dev/null
+++ b/training/benchmarks/efficientnet/pytorch/dataloaders/sampler.py
@@ -0,0 +1,62 @@
+import math
+
+import torch
+import torch.distributed as dist
+
+
+class RASampler(torch.utils.data.Sampler):
+    """Sampler that restricts data loading to a subset of the dataset for distributed,
+    with repeated augmentation.
+    It ensures that each augmented version of a sample will be visible to a
+    different process (GPU).
+    Heavily based on 'torch.utils.data.DistributedSampler'.
+ + This is borrowed from the DeiT Repo: + https://github.com/facebookresearch/deit/blob/main/samplers.py + """ + + def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True, seed=0, repetitions=3): + if num_replicas is None: + if not dist.is_available(): + raise RuntimeError("Requires distributed package to be available!") + num_replicas = dist.get_world_size() + if rank is None: + if not dist.is_available(): + raise RuntimeError("Requires distributed package to be available!") + rank = dist.get_rank() + self.dataset = dataset + self.num_replicas = num_replicas + self.rank = rank + self.epoch = 0 + self.num_samples = int(math.ceil(len(self.dataset) * float(repetitions) / self.num_replicas)) + self.total_size = self.num_samples * self.num_replicas + self.num_selected_samples = int(math.floor(len(self.dataset) // 256 * 256 / self.num_replicas)) + self.shuffle = shuffle + self.seed = seed + self.repetitions = repetitions + + def __iter__(self): + if self.shuffle: + # Deterministically shuffle based on epoch + g = torch.Generator() + g.manual_seed(self.seed + self.epoch) + indices = torch.randperm(len(self.dataset), generator=g).tolist() + else: + indices = list(range(len(self.dataset))) + + # Add extra samples to make it evenly divisible + indices = [ele for ele in indices for i in range(self.repetitions)] + indices += indices[: (self.total_size - len(indices))] + assert len(indices) == self.total_size + + # Subsample + indices = indices[self.rank : self.total_size : self.num_replicas] + assert len(indices) == self.num_samples + + return iter(indices[: self.num_selected_samples]) + + def __len__(self): + return self.num_selected_samples + + def set_epoch(self, epoch): + self.epoch = epoch diff --git a/training/benchmarks/efficientnet/pytorch/dataloaders/transforms.py b/training/benchmarks/efficientnet/pytorch/dataloaders/transforms.py new file mode 100644 index 000000000..9a8ef7877 --- /dev/null +++ b/training/benchmarks/efficientnet/pytorch/dataloaders/transforms.py @@ -0,0 +1,183 @@ +import math +from typing import Tuple + +import torch +from torch import Tensor +from torchvision.transforms import functional as F + + +class RandomMixup(torch.nn.Module): + """Randomly apply Mixup to the provided batch and targets. + The class implements the data augmentations as described in the paper + `"mixup: Beyond Empirical Risk Minimization" `_. + + Args: + num_classes (int): number of classes used for one-hot encoding. + p (float): probability of the batch being transformed. Default value is 0.5. + alpha (float): hyperparameter of the Beta distribution used for mixup. + Default value is 1.0. + inplace (bool): boolean to make this transform inplace. Default set to False. + """ + + def __init__(self, num_classes: int, p: float = 0.5, alpha: float = 1.0, inplace: bool = False) -> None: + super().__init__() + + if num_classes < 1: + raise ValueError( + f"Please provide a valid positive value for the num_classes. Got num_classes={num_classes}" + ) + + if alpha <= 0: + raise ValueError("Alpha param can't be zero.") + + self.num_classes = num_classes + self.p = p + self.alpha = alpha + self.inplace = inplace + + def forward(self, batch: Tensor, target: Tensor) -> Tuple[Tensor, Tensor]: + """ + Args: + batch (Tensor): Float tensor of size (B, C, H, W) + target (Tensor): Integer tensor of size (B, ) + + Returns: + Tensor: Randomly transformed batch. + """ + if batch.ndim != 4: + raise ValueError(f"Batch ndim should be 4. 
Got {batch.ndim}") + if target.ndim != 1: + raise ValueError(f"Target ndim should be 1. Got {target.ndim}") + if not batch.is_floating_point(): + raise TypeError(f"Batch dtype should be a float tensor. Got {batch.dtype}.") + if target.dtype != torch.int64: + raise TypeError(f"Target dtype should be torch.int64. Got {target.dtype}") + + if not self.inplace: + batch = batch.clone() + target = target.clone() + + if target.ndim == 1: + target = torch.nn.functional.one_hot(target, num_classes=self.num_classes).to(dtype=batch.dtype) + + if torch.rand(1).item() >= self.p: + return batch, target + + # It's faster to roll the batch by one instead of shuffling it to create image pairs + batch_rolled = batch.roll(1, 0) + target_rolled = target.roll(1, 0) + + # Implemented as on mixup paper, page 3. + lambda_param = float(torch._sample_dirichlet(torch.tensor([self.alpha, self.alpha]))[0]) + batch_rolled.mul_(1.0 - lambda_param) + batch.mul_(lambda_param).add_(batch_rolled) + + target_rolled.mul_(1.0 - lambda_param) + target.mul_(lambda_param).add_(target_rolled) + + return batch, target + + def __repr__(self) -> str: + s = ( + f"{self.__class__.__name__}(" + f"num_classes={self.num_classes}" + f", p={self.p}" + f", alpha={self.alpha}" + f", inplace={self.inplace}" + f")" + ) + return s + + +class RandomCutmix(torch.nn.Module): + """Randomly apply Cutmix to the provided batch and targets. + The class implements the data augmentations as described in the paper + `"CutMix: Regularization Strategy to Train Strong Classifiers with Localizable Features" + `_. + + Args: + num_classes (int): number of classes used for one-hot encoding. + p (float): probability of the batch being transformed. Default value is 0.5. + alpha (float): hyperparameter of the Beta distribution used for cutmix. + Default value is 1.0. + inplace (bool): boolean to make this transform inplace. Default set to False. + """ + + def __init__(self, num_classes: int, p: float = 0.5, alpha: float = 1.0, inplace: bool = False) -> None: + super().__init__() + if num_classes < 1: + raise ValueError("Please provide a valid positive value for the num_classes.") + if alpha <= 0: + raise ValueError("Alpha param can't be zero.") + + self.num_classes = num_classes + self.p = p + self.alpha = alpha + self.inplace = inplace + + def forward(self, batch: Tensor, target: Tensor) -> Tuple[Tensor, Tensor]: + """ + Args: + batch (Tensor): Float tensor of size (B, C, H, W) + target (Tensor): Integer tensor of size (B, ) + + Returns: + Tensor: Randomly transformed batch. + """ + if batch.ndim != 4: + raise ValueError(f"Batch ndim should be 4. Got {batch.ndim}") + if target.ndim != 1: + raise ValueError(f"Target ndim should be 1. Got {target.ndim}") + if not batch.is_floating_point(): + raise TypeError(f"Batch dtype should be a float tensor. Got {batch.dtype}.") + if target.dtype != torch.int64: + raise TypeError(f"Target dtype should be torch.int64. Got {target.dtype}") + + if not self.inplace: + batch = batch.clone() + target = target.clone() + + if target.ndim == 1: + target = torch.nn.functional.one_hot(target, num_classes=self.num_classes).to(dtype=batch.dtype) + + if torch.rand(1).item() >= self.p: + return batch, target + + # It's faster to roll the batch by one instead of shuffling it to create image pairs + batch_rolled = batch.roll(1, 0) + target_rolled = target.roll(1, 0) + + # Implemented as on cutmix paper, page 12 (with minor corrections on typos). 
+ lambda_param = float(torch._sample_dirichlet(torch.tensor([self.alpha, self.alpha]))[0]) + _, H, W = F.get_dimensions(batch) + + r_x = torch.randint(W, (1,)) + r_y = torch.randint(H, (1,)) + + r = 0.5 * math.sqrt(1.0 - lambda_param) + r_w_half = int(r * W) + r_h_half = int(r * H) + + x1 = int(torch.clamp(r_x - r_w_half, min=0)) + y1 = int(torch.clamp(r_y - r_h_half, min=0)) + x2 = int(torch.clamp(r_x + r_w_half, max=W)) + y2 = int(torch.clamp(r_y + r_h_half, max=H)) + + batch[:, :, y1:y2, x1:x2] = batch_rolled[:, :, y1:y2, x1:x2] + lambda_param = float(1.0 - (x2 - x1) * (y2 - y1) / (W * H)) + + target_rolled.mul_(1.0 - lambda_param) + target.mul_(lambda_param).add_(target_rolled) + + return batch, target + + def __repr__(self) -> str: + s = ( + f"{self.__class__.__name__}(" + f"num_classes={self.num_classes}" + f", p={self.p}" + f", alpha={self.alpha}" + f", inplace={self.inplace}" + f")" + ) + return s diff --git a/training/benchmarks/efficientnet/pytorch/model/__init__.py b/training/benchmarks/efficientnet/pytorch/model/__init__.py index 76acf3d3d..8b4f4e667 100644 --- a/training/benchmarks/efficientnet/pytorch/model/__init__.py +++ b/training/benchmarks/efficientnet/pytorch/model/__init__.py @@ -3,5 +3,5 @@ def create_model(config): - model = torchvision.models.mobilenet_v2() + model = torchvision.models.efficientnet_v2_s() return model diff --git a/training/benchmarks/efficientnet/pytorch/run_pretraining.py b/training/benchmarks/efficientnet/pytorch/run_pretraining.py index ef89d98b3..e5afe6be8 100755 --- a/training/benchmarks/efficientnet/pytorch/run_pretraining.py +++ b/training/benchmarks/efficientnet/pytorch/run_pretraining.py @@ -1,4 +1,4 @@ -"""Mobilenet V2 Pretraining""" +"""EfficientNet Pretraining""" import os import sys @@ -57,9 +57,8 @@ def main() -> Tuple[Any, Any]: dist_pytorch.barrier(config.vendor) init_evaluation_start = time.time() - training_state.eval_loss, training_state.eval_acc1, training_state.eval_acc5 = evaluator.evaluate( - trainer) - + #training_state.eval_loss, training_state.eval_acc1, training_state.eval_acc5 = evaluator.evaluate(trainer) + training_state.eval_loss, training_state.eval_acc1, training_state.eval_acc5 = 0.0, 0.0, 0.0 init_evaluation_end = time.time() init_evaluation_info = dict(eval_acc1=training_state.eval_acc1, eval_acc5=training_state.eval_acc5, @@ -78,12 +77,12 @@ def main() -> Tuple[Any, Any]: model_driver.event(Event.TRAIN_START) raw_train_start_time = logger.previous_log_time - epoch = -1 - while training_state.global_steps < config.max_steps and \ + epoch = 0 + while training_state.epoch < config.epochs and \ not training_state.end_training: - epoch += 1 training_state.epoch = epoch trainer.train_one_epoch(train_dataloader) + epoch += 1 model_driver.event(Event.TRAIN_END) raw_train_end_time = logger.previous_log_time diff --git a/training/benchmarks/efficientnet/pytorch/schedulers/__init__.py b/training/benchmarks/efficientnet/pytorch/schedulers/__init__.py index 5421acb42..aff4fce9f 100644 --- a/training/benchmarks/efficientnet/pytorch/schedulers/__init__.py +++ b/training/benchmarks/efficientnet/pytorch/schedulers/__init__.py @@ -1,10 +1,40 @@ -from torch.optim.lr_scheduler import StepLR +import torch -def create_scheduler(optimizer, args): +def create_scheduler(args, optimizer): """Build the learning rate scheduler.""" - lr_scheduler = StepLR(optimizer, - step_size=args.lr_step_size, - gamma=args.lr_gamma) + args.lr_scheduler = args.lr_scheduler.lower() + if args.lr_scheduler == "steplr": + main_lr_scheduler = 
torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.lr_step_size, gamma=args.lr_gamma) + elif args.lr_scheduler == "cosineannealinglr": + main_lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( + optimizer, T_max=args.epochs - args.lr_warmup_epochs, eta_min=args.lr_min + ) + elif args.lr_scheduler == "exponentiallr": + main_lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=args.lr_gamma) + else: + raise RuntimeError( + f"Invalid lr scheduler '{args.lr_scheduler}'. Only StepLR, CosineAnnealingLR and ExponentialLR " + "are supported." + ) + + if args.lr_warmup_epochs > 0: + if args.lr_warmup_method == "linear": + warmup_lr_scheduler = torch.optim.lr_scheduler.LinearLR( + optimizer, start_factor=args.lr_warmup_decay, total_iters=args.lr_warmup_epochs + ) + elif args.lr_warmup_method == "constant": + warmup_lr_scheduler = torch.optim.lr_scheduler.ConstantLR( + optimizer, factor=args.lr_warmup_decay, total_iters=args.lr_warmup_epochs + ) + else: + raise RuntimeError( + f"Invalid warmup lr method '{args.lr_warmup_method}'. Only linear and constant are supported." + ) + lr_scheduler = torch.optim.lr_scheduler.SequentialLR( + optimizer, schedulers=[warmup_lr_scheduler, main_lr_scheduler], milestones=[args.lr_warmup_epochs] + ) + else: + lr_scheduler = main_lr_scheduler return lr_scheduler diff --git a/training/benchmarks/efficientnet/pytorch/train/evaluator.py b/training/benchmarks/efficientnet/pytorch/train/evaluator.py index 5dc3f0417..619bf1f29 100644 --- a/training/benchmarks/efficientnet/pytorch/train/evaluator.py +++ b/training/benchmarks/efficientnet/pytorch/train/evaluator.py @@ -10,27 +10,28 @@ def __init__(self, args, dataloader): self.total_loss = 0.0 self.total_acc1 = 0.0 self.total_acc5 = 0.0 - self.total_batch = 0 + self.total_size = 0 - def __update(self, loss, acc1, acc5): - self.total_loss += loss - self.total_acc1 += acc1 - self.total_acc5 += acc5 - self.total_batch += 1 + def __update(self, loss, acc1, acc5, n): + self.total_loss += loss * n + self.total_acc1 += acc1 * n + self.total_acc5 += acc5 * n + self.total_size += n def evaluate(self, trainer): self.total_loss, self.total_acc1, self.total_acc5 = 0.0, 0.0, 0.0 - self.total_batch = 0 - with torch.no_grad(): + self.total_size = 0 + with torch.inference_mode(): + #with torch.no_grad(): for i, batch in enumerate(self.dataloader): batch = trainer.process_batch(batch, self.args.device) loss, acc1, acc5 = trainer.inference(batch) - self.__update(loss.item(), acc1.item(), acc5.item()) + self.__update(loss.item(), acc1.item(), acc5.item(), batch[0].shape[0]) if dist.is_available() and dist.is_initialized(): total = torch.tensor([ self.total_loss, self.total_acc1, self.total_acc5, - self.total_batch + self.total_size ], dtype=torch.float32, device=self.args.device) @@ -38,7 +39,7 @@ def evaluate(self, trainer): self.total_loss, self.total_acc1, self.total_acc5, self.total_batch = total.tolist( ) - loss = self.total_loss / self.total_batch - acc1 = self.total_acc1 / self.total_batch - acc5 = self.total_acc5 / self.total_batch + loss = self.total_loss / self.total_size + acc1 = self.total_acc1 / self.total_size + acc5 = self.total_acc5 / self.total_size return loss, acc1, acc5 diff --git a/training/benchmarks/efficientnet/pytorch/train/trainer.py b/training/benchmarks/efficientnet/pytorch/train/trainer.py index 3402c5b83..df03905b6 100644 --- a/training/benchmarks/efficientnet/pytorch/train/trainer.py +++ b/training/benchmarks/efficientnet/pytorch/train/trainer.py @@ -11,31 +11,13 @@ from 
train.evaluator import Evaluator from train.training_state import TrainingState +from train.utils import accuracy import config CURR_PATH = os.path.abspath(os.path.dirname(__file__)) sys.path.append(os.path.abspath(os.path.join(CURR_PATH, "../../"))) from driver import Driver, Event, dist_pytorch - - -def accuracy(output, target, topk=(1, )): - """Computes the accuracy over the k top predictions for the specified values of k""" - with torch.no_grad(): - maxk = max(topk) - batch_size = target.size(0) - - _, pred = output.topk(maxk, 1, True, True) - pred = pred.t() - correct = pred.eq(target.view(1, -1).expand_as(pred)) - - res = [] - for k in topk: - correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True) - res.append(correct_k.mul_(100.0 / batch_size)) - return res - - class Trainer: def __init__(self, driver: Driver, adapter, evaluator: Evaluator, @@ -53,17 +35,18 @@ def __init__(self, driver: Driver, adapter, evaluator: Evaluator, self.evaluator = evaluator self.lr_scheduler = None self.global_batch_size = None - self.overflow_buf = None def init(self): self.model = create_model(config) self.model = self._init_model(self.model, self.config, self.device) - self.model = self.adapter.convert_model(self.model) - self.model = self.adapter.model_to_fp16(self.model) - self.optimizer = self.adapter.create_optimizer(self.model, self.config) - self.model = self.adapter.model_to_ddp(self.model) - self.lr_scheduler = create_scheduler(self.optimizer, self.config) - self.grad_scaler = self.adapter.create_grad_scaler() + self.model = self.adapter.convert_model(self.config, self.model) + self.model = self.adapter.model_to_fp16(self.config, self.model) + self.optimizer = self.adapter.create_optimizer(self.config, self.model) + self.model = self.adapter.model_to_ddp(self.config, self.model) + + self.lr_scheduler = create_scheduler(self.config, self.optimizer) + self.grad_scaler = self.adapter.create_grad_scaler(self.config) + self.criterion = torch.nn.CrossEntropyLoss(label_smoothing=self.config.label_smoothing) def _init_model(self, model, args, device): checkpoint_name = config.init_checkpoint @@ -86,6 +69,8 @@ def train_one_epoch(self, dataloader): step_start_time = time.time() epoch_start_num_sample = state.num_trained_samples + if dist.is_available() and dist.is_initialized(): + dataloader.sampler.set_epoch(state.epoch) for batch_idx, batch in enumerate(dataloader): state.global_steps += 1 @@ -97,14 +82,12 @@ def train_one_epoch(self, dataloader): self.train_one_step(batch) other_state = dict() - if state.global_steps % self.config.gradient_accumulation_steps == 0: - step_end_time = time.time() - step_total_time = step_end_time - step_start_time - step_start_time = step_end_time - images_per_second = ( - dist_pytorch.global_batch_size(self.config) * - self.config.gradient_accumulation_steps) / step_total_time - other_state["img/s"] = images_per_second + + step_end_time = time.time() + step_total_time = step_end_time - step_start_time + step_start_time = step_end_time + images_per_second = dist_pytorch.global_batch_size(self.config) / step_total_time + other_state["img/s"] = images_per_second if hasattr(self.optimizer, 'loss_scaler'): loss_scale = self.optimizer.loss_scaler.loss_scale other_state['loss_scale'] = loss_scale @@ -146,7 +129,7 @@ def train_one_step(self, batch): state = self.training_state self.model.train() state.loss, state.acc1, state.acc5 = self.forward(batch) - self.adapter.backward(state.global_steps, state.loss, self.optimizer) + self.adapter.backward(self.config, 
state.global_steps, state.epoch, state.loss, self.model, self.optimizer, self.grad_scaler) if dist.is_available() and dist.is_initialized(): total = torch.tensor([state.loss, state.acc1, state.acc5], dtype=torch.float32, @@ -184,8 +167,7 @@ def can_do_eval(self, state): def forward(self, batch): images, target = batch output = self.model(images) - criterion = torch.nn.CrossEntropyLoss() - loss = criterion(output, target) + loss = self.criterion(output, target) acc1, acc5 = accuracy(output, target, topk=(1, 5)) return loss, acc1, acc5 diff --git a/training/benchmarks/efficientnet/pytorch/train/trainer_adapter.py b/training/benchmarks/efficientnet/pytorch/train/trainer_adapter.py index 11e1ba5e7..501e9e25f 100644 --- a/training/benchmarks/efficientnet/pytorch/train/trainer_adapter.py +++ b/training/benchmarks/efficientnet/pytorch/train/trainer_adapter.py @@ -1,47 +1,82 @@ import torch import torch.distributed as dist from torch.optim import Optimizer -import config from torch import nn, Tensor from driver.dist_pytorch import main_proc_print from typing import Tuple from torch.nn.parallel import DistributedDataParallel as DDP +from train import utils - -def convert_model(model: nn.Module) -> nn.Module: +def convert_model(args, model: nn.Module) -> nn.Module: + if dist.is_available() and dist.is_initialized() and args.sync_bn: + model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) return model -def create_optimizer(model, args): - optimizer = torch.optim.SGD(model.parameters(), - lr=args.lr, - momentum=args.momentum, - weight_decay=args.weight_decay) +def create_optimizer(args, model): + custom_keys_weight_decay = [] + if args.bias_weight_decay is not None: + custom_keys_weight_decay.append(("bias", args.bias_weight_decay)) + if args.transformer_embedding_decay is not None: + for key in ["class_token", "position_embedding", "relative_position_bias_table"]: + custom_keys_weight_decay.append((key, args.transformer_embedding_decay)) + parameters = utils.set_weight_decay( + model, + args.weight_decay, + norm_weight_decay=args.norm_weight_decay, + custom_keys_weight_decay=custom_keys_weight_decay if len(custom_keys_weight_decay) > 0 else None, + ) + + opt_name = args.opt.lower() + if opt_name.startswith("sgd"): + optimizer = torch.optim.SGD( + parameters, + lr=args.lr, + momentum=args.momentum, + weight_decay=args.weight_decay, + nesterov="nesterov" in opt_name, + ) + elif opt_name == "rmsprop": + optimizer = torch.optim.RMSprop( + parameters, lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay, eps=0.0316, alpha=0.9 + ) + elif opt_name == "adamw": + optimizer = torch.optim.AdamW(parameters, lr=args.lr, weight_decay=args.weight_decay) + else: + raise RuntimeError(f"Invalid optimizer {args.opt}. 
Only SGD, RMSprop and AdamW are supported.") return optimizer -def model_to_fp16(model): +def model_to_fp16(args, model): # To prevent OOM for model sizes that cannot fit in GPU memory in full precision - if config.fp16: + if args.fp16: main_proc_print(" > use fp16...") model.half() return model -def model_to_ddp(model: nn.Module) -> nn.Module: +def model_to_ddp(args, model: nn.Module) -> nn.Module: if dist.is_available() and dist.is_initialized(): - model = DDP(model, device_ids=[config.local_rank]) + model = DDP(model, device_ids=[args.local_rank]) return model +def create_grad_scaler(args): + scaler = torch.cuda.amp.GradScaler() if args.amp else None + return scaler -def create_grad_scaler(): - return None - - -def backward(step: int, loss: torch.Tensor, optimizer: Optimizer): - loss.backward() - update_step = step % config.gradient_accumulation_steps == 0 - if update_step: +def backward(args, step: int, epoch: int, loss: torch.Tensor, model: nn.Module, optimizer: Optimizer, scaler): + optimizer.zero_grad() + if scaler is not None: + scaler.scale(loss).backward() + if args.clip_grad_norm is not None: + # we should unscale the gradients of optimizer's assigned params if do gradient clipping + scaler.unscale_(optimizer) + nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad_norm) + scaler.step(optimizer) + scaler.update() + else: + loss.backward() + if args.clip_grad_norm is not None: + nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad_norm) optimizer.step() - optimizer.zero_grad() diff --git a/training/benchmarks/efficientnet/pytorch/train/utils.py b/training/benchmarks/efficientnet/pytorch/train/utils.py new file mode 100644 index 000000000..3e6c2e89e --- /dev/null +++ b/training/benchmarks/efficientnet/pytorch/train/utils.py @@ -0,0 +1,465 @@ +import copy +import datetime +import errno +import hashlib +import os +import time +from collections import defaultdict, deque, OrderedDict +from typing import List, Optional, Tuple + +import torch +import torch.distributed as dist + + +class SmoothedValue: + """Track a series of values and provide access to smoothed values over a + window or the global series average. + """ + + def __init__(self, window_size=20, fmt=None): + if fmt is None: + fmt = "{median:.4f} ({global_avg:.4f})" + self.deque = deque(maxlen=window_size) + self.total = 0.0 + self.count = 0 + self.fmt = fmt + + def update(self, value, n=1): + self.deque.append(value) + self.count += n + self.total += value * n + + def synchronize_between_processes(self): + """ + Warning: does not synchronize the deque! 
+ """ + t = reduce_across_processes([self.count, self.total]) + t = t.tolist() + self.count = int(t[0]) + self.total = t[1] + + @property + def median(self): + d = torch.tensor(list(self.deque)) + return d.median().item() + + @property + def avg(self): + d = torch.tensor(list(self.deque), dtype=torch.float32) + return d.mean().item() + + @property + def global_avg(self): + return self.total / self.count + + @property + def max(self): + return max(self.deque) + + @property + def value(self): + return self.deque[-1] + + def __str__(self): + return self.fmt.format( + median=self.median, avg=self.avg, global_avg=self.global_avg, max=self.max, value=self.value + ) + + +class MetricLogger: + def __init__(self, delimiter="\t"): + self.meters = defaultdict(SmoothedValue) + self.delimiter = delimiter + + def update(self, **kwargs): + for k, v in kwargs.items(): + if isinstance(v, torch.Tensor): + v = v.item() + assert isinstance(v, (float, int)) + self.meters[k].update(v) + + def __getattr__(self, attr): + if attr in self.meters: + return self.meters[attr] + if attr in self.__dict__: + return self.__dict__[attr] + raise AttributeError(f"'{type(self).__name__}' object has no attribute '{attr}'") + + def __str__(self): + loss_str = [] + for name, meter in self.meters.items(): + loss_str.append(f"{name}: {str(meter)}") + return self.delimiter.join(loss_str) + + def synchronize_between_processes(self): + for meter in self.meters.values(): + meter.synchronize_between_processes() + + def add_meter(self, name, meter): + self.meters[name] = meter + + def log_every(self, iterable, print_freq, header=None): + i = 0 + if not header: + header = "" + start_time = time.time() + end = time.time() + iter_time = SmoothedValue(fmt="{avg:.4f}") + data_time = SmoothedValue(fmt="{avg:.4f}") + space_fmt = ":" + str(len(str(len(iterable)))) + "d" + if torch.cuda.is_available(): + log_msg = self.delimiter.join( + [ + header, + "[{0" + space_fmt + "}/{1}]", + "eta: {eta}", + "{meters}", + "time: {time}", + "data: {data}", + "max mem: {memory:.0f}", + ] + ) + else: + log_msg = self.delimiter.join( + [header, "[{0" + space_fmt + "}/{1}]", "eta: {eta}", "{meters}", "time: {time}", "data: {data}"] + ) + MB = 1024.0 * 1024.0 + for obj in iterable: + data_time.update(time.time() - end) + yield obj + iter_time.update(time.time() - end) + if i % print_freq == 0: + eta_seconds = iter_time.global_avg * (len(iterable) - i) + eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) + if torch.cuda.is_available(): + print( + log_msg.format( + i, + len(iterable), + eta=eta_string, + meters=str(self), + time=str(iter_time), + data=str(data_time), + memory=torch.cuda.max_memory_allocated() / MB, + ) + ) + else: + print( + log_msg.format( + i, len(iterable), eta=eta_string, meters=str(self), time=str(iter_time), data=str(data_time) + ) + ) + i += 1 + end = time.time() + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + print(f"{header} Total time: {total_time_str}") + + +class ExponentialMovingAverage(torch.optim.swa_utils.AveragedModel): + """Maintains moving averages of model parameters using an exponential decay. + ``ema_avg = decay * avg_model_param + (1 - decay) * model_param`` + `torch.optim.swa_utils.AveragedModel `_ + is used to compute the EMA. 
+ """ + + def __init__(self, model, decay, device="cpu"): + def ema_avg(avg_model_param, model_param, num_averaged): + return decay * avg_model_param + (1 - decay) * model_param + + super().__init__(model, device, ema_avg, use_buffers=True) + + +def accuracy(output, target, topk=(1,)): + """Computes the accuracy over the k top predictions for the specified values of k""" + with torch.inference_mode(): + maxk = max(topk) + batch_size = target.size(0) + if target.ndim == 2: + target = target.max(dim=1)[1] + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target[None]) + + res = [] + for k in topk: + correct_k = correct[:k].flatten().sum(dtype=torch.float32) + res.append(correct_k * (100.0 / batch_size)) + return res + + +def mkdir(path): + try: + os.makedirs(path) + except OSError as e: + if e.errno != errno.EEXIST: + raise + + +def setup_for_distributed(is_master): + """ + This function disables printing when not in master process + """ + import builtins as __builtin__ + + builtin_print = __builtin__.print + + def print(*args, **kwargs): + force = kwargs.pop("force", False) + if is_master or force: + builtin_print(*args, **kwargs) + + __builtin__.print = print + + +def is_dist_avail_and_initialized(): + if not dist.is_available(): + return False + if not dist.is_initialized(): + return False + return True + + +def get_world_size(): + if not is_dist_avail_and_initialized(): + return 1 + return dist.get_world_size() + + +def get_rank(): + if not is_dist_avail_and_initialized(): + return 0 + return dist.get_rank() + + +def is_main_process(): + return get_rank() == 0 + + +def save_on_master(*args, **kwargs): + if is_main_process(): + torch.save(*args, **kwargs) + + +def init_distributed_mode(args): + if "RANK" in os.environ and "WORLD_SIZE" in os.environ: + args.rank = int(os.environ["RANK"]) + args.world_size = int(os.environ["WORLD_SIZE"]) + args.gpu = int(os.environ["LOCAL_RANK"]) + elif "SLURM_PROCID" in os.environ: + args.rank = int(os.environ["SLURM_PROCID"]) + args.gpu = args.rank % torch.cuda.device_count() + elif hasattr(args, "rank"): + pass + else: + print("Not using distributed mode") + args.distributed = False + return + + args.distributed = True + + torch.cuda.set_device(args.gpu) + args.dist_backend = "nccl" + print(f"| distributed init (rank {args.rank}): {args.dist_url}", flush=True) + torch.distributed.init_process_group( + backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size, rank=args.rank + ) + torch.distributed.barrier() + setup_for_distributed(args.rank == 0) + + +def average_checkpoints(inputs): + """Loads checkpoints from inputs and returns a model with averaged weights. Original implementation taken from: + https://github.com/pytorch/fairseq/blob/a48f235636557b8d3bc4922a6fa90f3a0fa57955/scripts/average_checkpoints.py#L16 + + Args: + inputs (List[str]): An iterable of string paths of checkpoints to load from. + Returns: + A dict of string keys mapping to various values. The 'model' key + from the returned dict should correspond to an OrderedDict mapping + string parameter names to torch Tensors. 
+ """ + params_dict = OrderedDict() + params_keys = None + new_state = None + num_models = len(inputs) + for fpath in inputs: + with open(fpath, "rb") as f: + state = torch.load( + f, + map_location=(lambda s, _: torch.serialization.default_restore_location(s, "cpu")), + ) + # Copies over the settings from the first checkpoint + if new_state is None: + new_state = state + model_params = state["model"] + model_params_keys = list(model_params.keys()) + if params_keys is None: + params_keys = model_params_keys + elif params_keys != model_params_keys: + raise KeyError( + f"For checkpoint {f}, expected list of params: {params_keys}, but found: {model_params_keys}" + ) + for k in params_keys: + p = model_params[k] + if isinstance(p, torch.HalfTensor): + p = p.float() + if k not in params_dict: + params_dict[k] = p.clone() + # NOTE: clone() is needed in case of p is a shared parameter + else: + params_dict[k] += p + averaged_params = OrderedDict() + for k, v in params_dict.items(): + averaged_params[k] = v + if averaged_params[k].is_floating_point(): + averaged_params[k].div_(num_models) + else: + averaged_params[k] //= num_models + new_state["model"] = averaged_params + return new_state + + +def store_model_weights(model, checkpoint_path, checkpoint_key="model", strict=True): + """ + This method can be used to prepare weights files for new models. It receives as + input a model architecture and a checkpoint from the training script and produces + a file with the weights ready for release. + + Examples: + from torchvision import models as M + + # Classification + model = M.mobilenet_v3_large(weights=None) + print(store_model_weights(model, './class.pth')) + + # Quantized Classification + model = M.quantization.mobilenet_v3_large(weights=None, quantize=False) + model.fuse_model(is_qat=True) + model.qconfig = torch.ao.quantization.get_default_qat_qconfig('qnnpack') + _ = torch.ao.quantization.prepare_qat(model, inplace=True) + print(store_model_weights(model, './qat.pth')) + + # Object Detection + model = M.detection.fasterrcnn_mobilenet_v3_large_fpn(weights=None, weights_backbone=None) + print(store_model_weights(model, './obj.pth')) + + # Segmentation + model = M.segmentation.deeplabv3_mobilenet_v3_large(weights=None, weights_backbone=None, aux_loss=True) + print(store_model_weights(model, './segm.pth', strict=False)) + + Args: + model (pytorch.nn.Module): The model on which the weights will be loaded for validation purposes. + checkpoint_path (str): The path of the checkpoint we will load. + checkpoint_key (str, optional): The key of the checkpoint where the model weights are stored. + Default: "model". + strict (bool): whether to strictly enforce that the keys + in :attr:`state_dict` match the keys returned by this module's + :meth:`~torch.nn.Module.state_dict` function. Default: ``True`` + + Returns: + output_path (str): The location where the weights are saved. + """ + # Store the new model next to the checkpoint_path + checkpoint_path = os.path.abspath(checkpoint_path) + output_dir = os.path.dirname(checkpoint_path) + + # Deep copy to avoid side effects on the model object. + model = copy.deepcopy(model) + checkpoint = torch.load(checkpoint_path, map_location="cpu") + + # Load the weights to the model to validate that everything works + # and remove unnecessary weights (such as auxiliaries, etc.) 
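
[Editor's note] The model_ema branch just below exists because the ExponentialMovingAverage wrapper defined earlier in this file subclasses AveragedModel: it registers an extra "n_averaged" buffer and nests every weight under a "module." prefix, and both must be stripped before the state dict fits a plain model. A sketch of how the wrapper is typically driven, assuming it runs in the same module as the class (the model and loop body are placeholders):

```python
import torch
from torch import nn

model = nn.Linear(8, 2)  # stand-in for the real network
ema_model = ExponentialMovingAverage(model, decay=0.999, device="cpu")

for _ in range(10):
    # ... forward / backward / optimizer.step() on `model` ...
    ema_model.update_parameters(model)  # fold current weights into the EMA

# Subclassing AveragedModel means the state dict carries an "n_averaged"
# buffer and "module."-prefixed keys -- exactly what the code below deletes
# and strips before calling load_state_dict() on a bare model.
assert "n_averaged" in ema_model.state_dict()
```
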
+ if checkpoint_key == "model_ema": + del checkpoint[checkpoint_key]["n_averaged"] + torch.nn.modules.utils.consume_prefix_in_state_dict_if_present(checkpoint[checkpoint_key], "module.") + model.load_state_dict(checkpoint[checkpoint_key], strict=strict) + + tmp_path = os.path.join(output_dir, str(model.__hash__())) + torch.save(model.state_dict(), tmp_path) + + sha256_hash = hashlib.sha256() + with open(tmp_path, "rb") as f: + # Read and update hash string value in blocks of 4K + for byte_block in iter(lambda: f.read(4096), b""): + sha256_hash.update(byte_block) + hh = sha256_hash.hexdigest() + + output_path = os.path.join(output_dir, "weights-" + str(hh[:8]) + ".pth") + os.replace(tmp_path, output_path) + + return output_path + + +def reduce_across_processes(val): + if not is_dist_avail_and_initialized(): + # nothing to sync, but we still convert to tensor for consistency with the distributed case. + return torch.tensor(val) + + t = torch.tensor(val, device="cuda") + dist.barrier() + dist.all_reduce(t) + return t + + +def set_weight_decay( + model: torch.nn.Module, + weight_decay: float, + norm_weight_decay: Optional[float] = None, + norm_classes: Optional[List[type]] = None, + custom_keys_weight_decay: Optional[List[Tuple[str, float]]] = None, +): + if not norm_classes: + norm_classes = [ + torch.nn.modules.batchnorm._BatchNorm, + torch.nn.LayerNorm, + torch.nn.GroupNorm, + torch.nn.modules.instancenorm._InstanceNorm, + torch.nn.LocalResponseNorm, + ] + norm_classes = tuple(norm_classes) + + params = { + "other": [], + "norm": [], + } + params_weight_decay = { + "other": weight_decay, + "norm": norm_weight_decay, + } + custom_keys = [] + if custom_keys_weight_decay is not None: + for key, weight_decay in custom_keys_weight_decay: + params[key] = [] + params_weight_decay[key] = weight_decay + custom_keys.append(key) + + def _add_params(module, prefix=""): + for name, p in module.named_parameters(recurse=False): + if not p.requires_grad: + continue + is_custom_key = False + for key in custom_keys: + target_name = f"{prefix}.{name}" if prefix != "" and "." 
in key else name + if key == target_name: + params[key].append(p) + is_custom_key = True + break + if not is_custom_key: + if norm_weight_decay is not None and isinstance(module, norm_classes): + params["norm"].append(p) + else: + params["other"].append(p) + + for child_name, child_module in module.named_children(): + child_prefix = f"{prefix}.{child_name}" if prefix != "" else child_name + _add_params(child_module, prefix=child_prefix) + + _add_params(model) + + param_groups = [] + for key in params: + if len(params[key]) > 0: + param_groups.append({"params": params[key], "weight_decay": params_weight_decay[key]}) + return param_groups diff --git a/training/kunlunxin/efficientnet-pytorch/config/config_R300x1x1.py b/training/kunlunxin/efficientnet-pytorch/config/config_R300x1x1.py index 461017de5..fbaea0e5d 100644 --- a/training/kunlunxin/efficientnet-pytorch/config/config_R300x1x1.py +++ b/training/kunlunxin/efficientnet-pytorch/config/config_R300x1x1.py @@ -1,11 +1,4 @@ from config_common import * -train_batch_size = 256 -eval_batch_size = 128 - -lr = 0.1 -gradient_accumulation_steps = 1 -warmup = 0.1 -lr_decay_ratio = 0.1 -lr_decay_iters = 4338 -log_freq = 10 +train_batch_size = 64 +eval_batch_size = 64 diff --git a/training/kunlunxin/efficientnet-pytorch/config/config_R300x1x8.py b/training/kunlunxin/efficientnet-pytorch/config/config_R300x1x8.py index 6fb81b718..fb4a39e51 100644 --- a/training/kunlunxin/efficientnet-pytorch/config/config_R300x1x8.py +++ b/training/kunlunxin/efficientnet-pytorch/config/config_R300x1x8.py @@ -1,6 +1,6 @@ from config_common import * -train_batch_size = 32 -eval_batch_size = 16 +train_batch_size = 128 +eval_batch_size = 128 gradient_accumulation_steps = 1 diff --git a/training/kunlunxin/efficientnet-pytorch/config/config_common.py b/training/kunlunxin/efficientnet-pytorch/config/config_common.py index 012668a0a..6ee3fc54c 100644 --- a/training/kunlunxin/efficientnet-pytorch/config/config_common.py +++ b/training/kunlunxin/efficientnet-pytorch/config/config_common.py @@ -1,3 +1,26 @@ vendor = "kunlunxin" dist_backend = "xccl" -fp16 = False \ No newline at end of file +fp16 = False + +lr = 0.5 +lr_scheduler = "cosineannealinglr" +lr_warmup_epochs = 5 +lr_warmup_method = "linear" +auto_augment = "ta_wide" +random_erase = 0.1 +label_smoothing = 0.1 +mixup_alpha = 0.2 +cutmix_alpha = 1.0 +weight_decay = 0.00002 +norm_weight_decay = 0.0 +model_ema = True +ra_sampler = True +ra_reps = 4 +epochs = 600 + +# efficientnet_v2_s +TRAIN_SIZE = 300 +train_crop_size = TRAIN_SIZE +EVAL_SIZE = 384 +val_crop_size = EVAL_SIZE +val_resize_size = EVAL_SIZE \ No newline at end of file diff --git a/training/kunlunxin/efficientnet-pytorch/extern/trainer_adapter.py b/training/kunlunxin/efficientnet-pytorch/extern/trainer_adapter.py index 96fd7c783..4b1aede50 100644 --- a/training/kunlunxin/efficientnet-pytorch/extern/trainer_adapter.py +++ b/training/kunlunxin/efficientnet-pytorch/extern/trainer_adapter.py @@ -1,53 +1,15 @@ -import os -from port_for import is_available -import torch -import torch.distributed as dist -from torch.optim import Optimizer -from torch.optim.lr_scheduler import _LRScheduler -import config - -from torch import nn, Tensor +from torch import nn from driver.dist_pytorch import main_proc_print -from typing import Tuple - -from torch_xmlir.optimizer import SGD -import torch_xmlir.core.xpu_model as xm -def convert_model(model: nn.Module) -> nn.Module: +def convert_model(args, model: nn.Module) -> nn.Module: return model - -def create_optimizer(model, args): - 
optimizer = SGD(model.parameters(), - lr=args.lr, - momentum=args.momentum, - weight_decay=args.weight_decay) - return optimizer - - -def model_to_fp16(model): +def model_to_fp16(args, model): # To prevent OOM for model sizes that cannot fit in GPU memory in full precision - if config.fp16: + if args.fp16: main_proc_print(" > use fp16...") model.half() return model - - -def model_to_ddp(model: nn.Module) -> nn.Module: - if dist.is_available() and dist.is_initialized(): - from torch.nn.parallel import DistributedDataParallel as DDP - model = DDP(model) - return model - - -def create_grad_scaler(): +def create_grad_scaler(args): return None - - -def backward(step: int, loss: torch.Tensor, optimizer: Optimizer): - loss.backward() - update_step = step % config.gradient_accumulation_steps == 0 - if update_step: - optimizer.step() - optimizer.zero_grad() diff --git a/training/nvidia/efficientnet-pytorch/config/config_common.py b/training/nvidia/efficientnet-pytorch/config/config_common.py index af56e1cb3..b8e579625 100644 --- a/training/nvidia/efficientnet-pytorch/config/config_common.py +++ b/training/nvidia/efficientnet-pytorch/config/config_common.py @@ -1,3 +1,26 @@ vendor = "nvidia" dist_backend = "nccl" -fp16 = False \ No newline at end of file +fp16 = False + +lr = 0.5 +lr_scheduler = "cosineannealinglr" +lr_warmup_epochs = 5 +lr_warmup_method = "linear" +auto_augment = "ta_wide" +random_erase = 0.1 +label_smoothing = 0.1 +mixup_alpha = 0.2 +cutmix_alpha = 1.0 +weight_decay = 0.00002 +norm_weight_decay = 0.0 +model_ema = True +ra_sampler = True +ra_reps = 4 +epochs = 600 + +# efficientnet_v2_s +TRAIN_SIZE = 300 +train_crop_size = TRAIN_SIZE +EVAL_SIZE = 384 +val_crop_size = EVAL_SIZE +val_resize_size = EVAL_SIZE \ No newline at end of file From acfde4178f6f97edcd4b7c65bc9e03638349773a Mon Sep 17 00:00:00 2001 From: Feilei Du Date: Wed, 24 May 2023 07:37:39 +0000 Subject: [PATCH 03/24] modify config --- .../nvidia/efficientnet-pytorch/config/config_A100x1x1.py | 2 +- .../nvidia/efficientnet-pytorch/config/config_A100x1x8.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/training/nvidia/efficientnet-pytorch/config/config_A100x1x1.py b/training/nvidia/efficientnet-pytorch/config/config_A100x1x1.py index d6f1e735e..0b0a0006f 100644 --- a/training/nvidia/efficientnet-pytorch/config/config_A100x1x1.py +++ b/training/nvidia/efficientnet-pytorch/config/config_A100x1x1.py @@ -1,4 +1,4 @@ from config_common import * -train_batch_size = 256 +train_batch_size = 128 eval_batch_size = 128 diff --git a/training/nvidia/efficientnet-pytorch/config/config_A100x1x8.py b/training/nvidia/efficientnet-pytorch/config/config_A100x1x8.py index 6fb81b718..fb4a39e51 100644 --- a/training/nvidia/efficientnet-pytorch/config/config_A100x1x8.py +++ b/training/nvidia/efficientnet-pytorch/config/config_A100x1x8.py @@ -1,6 +1,6 @@ from config_common import * -train_batch_size = 32 -eval_batch_size = 16 +train_batch_size = 128 +eval_batch_size = 128 gradient_accumulation_steps = 1 From b4e96274a9d33083b73d94d46ec37d60980ee651 Mon Sep 17 00:00:00 2001 From: Feilei Du Date: Wed, 24 May 2023 07:42:54 +0000 Subject: [PATCH 04/24] modify config --- training/nvidia/efficientnet-pytorch/config/config_common.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/training/nvidia/efficientnet-pytorch/config/config_common.py b/training/nvidia/efficientnet-pytorch/config/config_common.py index b8e579625..8f0cda1b6 100644 --- a/training/nvidia/efficientnet-pytorch/config/config_common.py +++ 
b/training/nvidia/efficientnet-pytorch/config/config_common.py @@ -17,10 +17,11 @@ ra_sampler = True ra_reps = 4 epochs = 600 +num_workers = 8 # efficientnet_v2_s TRAIN_SIZE = 300 train_crop_size = TRAIN_SIZE EVAL_SIZE = 384 val_crop_size = EVAL_SIZE -val_resize_size = EVAL_SIZE \ No newline at end of file +val_resize_size = EVAL_SIZE From c6fbea341002ade97c012e3b1f8045ddf418c1e5 Mon Sep 17 00:00:00 2001 From: Feilei Du Date: Wed, 24 May 2023 08:42:00 +0000 Subject: [PATCH 05/24] modify config --- training/benchmarks/efficientnet/pytorch/train/evaluator.py | 2 +- .../nvidia/efficientnet-pytorch/config/config_A100x1x4.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/training/benchmarks/efficientnet/pytorch/train/evaluator.py b/training/benchmarks/efficientnet/pytorch/train/evaluator.py index 619bf1f29..1d8bb02e4 100644 --- a/training/benchmarks/efficientnet/pytorch/train/evaluator.py +++ b/training/benchmarks/efficientnet/pytorch/train/evaluator.py @@ -36,7 +36,7 @@ def evaluate(self, trainer): dtype=torch.float32, device=self.args.device) dist.all_reduce(total, dist.ReduceOp.SUM, async_op=False) - self.total_loss, self.total_acc1, self.total_acc5, self.total_batch = total.tolist( + self.total_loss, self.total_acc1, self.total_acc5, self.total_size = total.tolist( ) loss = self.total_loss / self.total_size diff --git a/training/nvidia/efficientnet-pytorch/config/config_A100x1x4.py b/training/nvidia/efficientnet-pytorch/config/config_A100x1x4.py index 0b08a1cf4..fb4a39e51 100644 --- a/training/nvidia/efficientnet-pytorch/config/config_A100x1x4.py +++ b/training/nvidia/efficientnet-pytorch/config/config_A100x1x4.py @@ -1,6 +1,6 @@ from config_common import * -train_batch_size = 64 -eval_batch_size = 32 +train_batch_size = 128 +eval_batch_size = 128 gradient_accumulation_steps = 1 From fce71f231c4234166b071a1e011bc3d068dfa7f3 Mon Sep 17 00:00:00 2001 From: Feilei Du Date: Wed, 24 May 2023 11:01:44 +0000 Subject: [PATCH 06/24] add efficientnet --- .../efficientnet/pytorch/train/trainer.py | 57 ++++++++++++++----- .../pytorch/train/training_state.py | 16 ++++++ 2 files changed, 59 insertions(+), 14 deletions(-) diff --git a/training/benchmarks/efficientnet/pytorch/train/trainer.py b/training/benchmarks/efficientnet/pytorch/train/trainer.py index df03905b6..8713fecc0 100644 --- a/training/benchmarks/efficientnet/pytorch/train/trainer.py +++ b/training/benchmarks/efficientnet/pytorch/train/trainer.py @@ -11,7 +11,7 @@ from train.evaluator import Evaluator from train.training_state import TrainingState -from train.utils import accuracy +from train import utils import config @@ -26,7 +26,7 @@ def __init__(self, driver: Driver, adapter, evaluator: Evaluator, self.driver = driver self.adapter = adapter self.training_state = training_state - self.grad_scaler = None + self.scaler = None self.device = device self.optimizer = None @@ -45,21 +45,37 @@ def init(self): self.model = self.adapter.model_to_ddp(self.config, self.model) self.lr_scheduler = create_scheduler(self.config, self.optimizer) - self.grad_scaler = self.adapter.create_grad_scaler(self.config) + self.scaler = self.adapter.create_grad_scaler(self.config) self.criterion = torch.nn.CrossEntropyLoss(label_smoothing=self.config.label_smoothing) + self.resume() + def _init_model(self, model, args, device): - checkpoint_name = config.init_checkpoint - if os.path.isfile(checkpoint_name): - print('checkpoint_name', checkpoint_name) - print('global rank {} is loading pretrained model {}'.format( - dist_pytorch.get_rank(), 
checkpoint_name)) - # Load the checkpoint. - checkpoint = torch.load(checkpoint_name, map_location='cpu') - model.load_state_dict(checkpoint['state_dict']) + # checkpoint_name = config.init_checkpoint + # if os.path.isfile(checkpoint_name): + # print('checkpoint_name', checkpoint_name) + # print('global rank {} is loading pretrained model {}'.format( + # dist_pytorch.get_rank(), checkpoint_name)) + # # Load the checkpoint. + # checkpoint = torch.load(checkpoint_name, map_location='cpu') + # model.load_state_dict(checkpoint['state_dict']) model = model.to(device) return model + + def resume(self): + args = self.config + if args.resume and os.path.isfile(args.resume): + print('global rank {} is loading checkpoint {}'.format( + dist_pytorch.get_rank(), args.resume)) + checkpoint = torch.load(args.resume, map_location="cpu") + self.model.load_state_dict(checkpoint["model"]) + self.optimizer.load_state_dict(checkpoint["optimizer"]) + self.lr_scheduler.load_state_dict(checkpoint["lr_scheduler"]) + self.training_state.load_state_dict(checkpoint["training_state"]) + self.training_state.epoch += 1 + if self.scaler: + self.scaler.load_state_dict(checkpoint["scaler"]) def train_one_epoch(self, dataloader): state = self.training_state @@ -121,6 +137,19 @@ def train_one_epoch(self, dataloader): state.num_trained_samples = epoch_start_num_sample self.lr_scheduler.step() + if self.config.output_dir: + checkpoint = { + "model": self.model.state_dict(), + "optimizer": self.optimizer.state_dict(), + "lr_scheduler": self.lr_scheduler.state_dict(), + "training_state": self.training_state.state_dict(), + #"epoch": epoch, + #"args": args, + } + if self.scaler: + checkpoint["scaler"] = self.scaler.state_dict() + utils.save_on_master(checkpoint, os.path.join(self.config.output_dir, f"model_{self.training_state.epoch}.pth")) + utils.save_on_master(checkpoint, os.path.join(self.config.output_dir, "checkpoint.pth")) driver.event(Event.EPOCH_END, state.epoch) def train_one_step(self, batch): @@ -129,7 +158,7 @@ def train_one_step(self, batch): state = self.training_state self.model.train() state.loss, state.acc1, state.acc5 = self.forward(batch) - self.adapter.backward(self.config, state.global_steps, state.epoch, state.loss, self.model, self.optimizer, self.grad_scaler) + self.adapter.backward(self.config, state.global_steps, state.epoch, state.loss, self.model, self.optimizer, self.scaler) if dist.is_available() and dist.is_initialized(): total = torch.tensor([state.loss, state.acc1, state.acc5], dtype=torch.float32, @@ -138,7 +167,7 @@ def train_one_step(self, batch): total = total / dist.get_world_size() state.loss, state.acc1, state.acc5 = total.tolist() self.driver.event(Event.BACKWARD, state.global_steps, state.loss, - self.optimizer, self.grad_scaler) + self.optimizer, self.scaler) def detect_training_status(self, state): config = self.config @@ -168,7 +197,7 @@ def forward(self, batch): images, target = batch output = self.model(images) loss = self.criterion(output, target) - acc1, acc5 = accuracy(output, target, topk=(1, 5)) + acc1, acc5 = utils.accuracy(output, target, topk=(1, 5)) return loss, acc1, acc5 def inference(self, batch): diff --git a/training/benchmarks/efficientnet/pytorch/train/training_state.py b/training/benchmarks/efficientnet/pytorch/train/training_state.py index 2e5a1fca8..3e7391175 100644 --- a/training/benchmarks/efficientnet/pytorch/train/training_state.py +++ b/training/benchmarks/efficientnet/pytorch/train/training_state.py @@ -72,3 +72,19 @@ def to_dict(self, **kwargs): 
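
[Editor's note] Together with the trainer.py changes above, the state_dict/load_state_dict pair added to TrainingState in the hunk below completes a checkpoint protocol: each epoch the trainer dumps model, optimizer, lr_scheduler and training_state (plus the grad scaler when one exists), and resume() restores them and advances the epoch. A condensed round-trip, assuming an initialized `trainer` and an illustrative path:

```python
import torch

ckpt = {
    "model": trainer.model.state_dict(),
    "optimizer": trainer.optimizer.state_dict(),
    "lr_scheduler": trainer.lr_scheduler.state_dict(),
    "training_state": trainer.training_state.state_dict(),
}
if trainer.scaler:
    ckpt["scaler"] = trainer.scaler.state_dict()
# The patch routes this through utils.save_on_master so only rank 0 writes.
torch.save(ckpt, "checkpoint.pth")

# resume() mirrors the save: each component reloads its dict, then the epoch
# counter is advanced so training restarts at the *next* epoch rather than
# repeating the one that produced the checkpoint.
ckpt = torch.load("checkpoint.pth", map_location="cpu")
trainer.model.load_state_dict(ckpt["model"])
trainer.optimizer.load_state_dict(ckpt["optimizer"])
trainer.lr_scheduler.load_state_dict(ckpt["lr_scheduler"])
trainer.training_state.load_state_dict(ckpt["training_state"])
trainer.training_state.epoch += 1
```
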
state_dict[k] = state_dict[k].item() return state_dict + + def state_dict(self): + """Returns the state of the scheduler as a :class:`dict`. + + It contains an entry for every variable in self.__dict__ + """ + return {key: value for key, value in self.__dict__.items()} + + def load_state_dict(self, state_dict): + """Loads the schedulers state. + + Args: + state_dict (dict): scheduler state. Should be an object returned + from a call to :meth:`state_dict`. + """ + self.__dict__.update(state_dict) From ef390bc5b1c4e76cb82b7cca30f30b993c54ba50 Mon Sep 17 00:00:00 2001 From: Feilei Du Date: Wed, 24 May 2023 11:15:49 +0000 Subject: [PATCH 07/24] modify config --- training/benchmarks/efficientnet/pytorch/config/_base.py | 1 + .../benchmarks/efficientnet/pytorch/config/mutable_params.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/training/benchmarks/efficientnet/pytorch/config/_base.py b/training/benchmarks/efficientnet/pytorch/config/_base.py index 9319137f1..039a25cee 100644 --- a/training/benchmarks/efficientnet/pytorch/config/_base.py +++ b/training/benchmarks/efficientnet/pytorch/config/_base.py @@ -14,6 +14,7 @@ eval_data: str = "val" output_dir: str = "" init_checkpoint: str = "" +resume: str = "" # ========================================================= # train && evaluate diff --git a/training/benchmarks/efficientnet/pytorch/config/mutable_params.py b/training/benchmarks/efficientnet/pytorch/config/mutable_params.py index c0bd33461..356085b31 100644 --- a/training/benchmarks/efficientnet/pytorch/config/mutable_params.py +++ b/training/benchmarks/efficientnet/pytorch/config/mutable_params.py @@ -3,4 +3,4 @@ 'eval_batch_size', 'dist_backend', 'lr', 'weight_decay', "vendor" ] -mutable_params += ["local_rank", "do_train", "data_dir", "log_freq"] +mutable_params += ["local_rank", "do_train", "data_dir", "log_freq", "output_dir"] From 51847e1c974bc0202a17ec3f8238b72e9e464511 Mon Sep 17 00:00:00 2001 From: Feilei Du Date: Thu, 25 May 2023 06:23:45 +0000 Subject: [PATCH 08/24] add efficientnet --- training/benchmarks/efficientnet/pytorch/run_pretraining.py | 3 +-- training/benchmarks/efficientnet/pytorch/train/evaluator.py | 3 +-- training/benchmarks/efficientnet/pytorch/train/utils.py | 3 +-- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/training/benchmarks/efficientnet/pytorch/run_pretraining.py b/training/benchmarks/efficientnet/pytorch/run_pretraining.py index e5afe6be8..a32f2dc54 100755 --- a/training/benchmarks/efficientnet/pytorch/run_pretraining.py +++ b/training/benchmarks/efficientnet/pytorch/run_pretraining.py @@ -57,8 +57,7 @@ def main() -> Tuple[Any, Any]: dist_pytorch.barrier(config.vendor) init_evaluation_start = time.time() - #training_state.eval_loss, training_state.eval_acc1, training_state.eval_acc5 = evaluator.evaluate(trainer) - training_state.eval_loss, training_state.eval_acc1, training_state.eval_acc5 = 0.0, 0.0, 0.0 + training_state.eval_loss, training_state.eval_acc1, training_state.eval_acc5 = evaluator.evaluate(trainer) init_evaluation_end = time.time() init_evaluation_info = dict(eval_acc1=training_state.eval_acc1, eval_acc5=training_state.eval_acc5, diff --git a/training/benchmarks/efficientnet/pytorch/train/evaluator.py b/training/benchmarks/efficientnet/pytorch/train/evaluator.py index 1d8bb02e4..94bb3e401 100644 --- a/training/benchmarks/efficientnet/pytorch/train/evaluator.py +++ b/training/benchmarks/efficientnet/pytorch/train/evaluator.py @@ -21,8 +21,7 @@ def __update(self, loss, acc1, acc5, n): def evaluate(self, 
trainer):
         self.total_loss, self.total_acc1, self.total_acc5 = 0.0, 0.0, 0.0
         self.total_size = 0
-        with torch.inference_mode():
-        #with torch.no_grad():
+        with torch.no_grad():
             for i, batch in enumerate(self.dataloader):
                 batch = trainer.process_batch(batch, self.args.device)
                 loss, acc1, acc5 = trainer.inference(batch)
diff --git a/training/benchmarks/efficientnet/pytorch/train/utils.py b/training/benchmarks/efficientnet/pytorch/train/utils.py
index 3e6c2e89e..d12f66f3f 100644
--- a/training/benchmarks/efficientnet/pytorch/train/utils.py
+++ b/training/benchmarks/efficientnet/pytorch/train/utils.py
@@ -172,7 +172,7 @@ def ema_avg(avg_model_param, model_param, num_averaged):
 
 def accuracy(output, target, topk=(1,)):
     """Computes the accuracy over the k top predictions for the specified values of k"""
-    with torch.inference_mode():
+    with torch.no_grad():
         maxk = max(topk)
         batch_size = target.size(0)
         if target.ndim == 2:
@@ -188,7 +188,6 @@ def accuracy(output, target, topk=(1,)):
         res.append(correct_k * (100.0 / batch_size))
     return res
 
-
 def mkdir(path):
     try:
         os.makedirs(path)

From 3f904db6a2cd12b0abb4eaa2af85b6c3e7ff57ad Mon Sep 17 00:00:00 2001
From: Feilei Du
Date: Thu, 25 May 2023 07:45:49 +0000
Subject: [PATCH 09/24] bug fix

---
 .../benchmarks/efficientnet/pytorch/config/mutable_params.py | 2 +-
 training/benchmarks/efficientnet/pytorch/run_pretraining.py | 4 +---
 .../benchmarks/efficientnet/pytorch/train/training_state.py | 4 ++--
 3 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/training/benchmarks/efficientnet/pytorch/config/mutable_params.py b/training/benchmarks/efficientnet/pytorch/config/mutable_params.py
index 356085b31..e842767a4 100644
--- a/training/benchmarks/efficientnet/pytorch/config/mutable_params.py
+++ b/training/benchmarks/efficientnet/pytorch/config/mutable_params.py
@@ -3,4 +3,4 @@
     'eval_batch_size', 'dist_backend', 'lr', 'weight_decay', "vendor"
 ]
 
-mutable_params += ["local_rank", "do_train", "data_dir", "log_freq", "output_dir"]
+mutable_params += ["local_rank", "do_train", "data_dir", "log_freq", "output_dir", "resume"]
diff --git a/training/benchmarks/efficientnet/pytorch/run_pretraining.py b/training/benchmarks/efficientnet/pytorch/run_pretraining.py
index a32f2dc54..c097442b2 100755
--- a/training/benchmarks/efficientnet/pytorch/run_pretraining.py
+++ b/training/benchmarks/efficientnet/pytorch/run_pretraining.py
@@ -76,12 +76,10 @@ def main() -> Tuple[Any, Any]:
     model_driver.event(Event.TRAIN_START)
     raw_train_start_time = logger.previous_log_time
 
-    epoch = 0
     while training_state.epoch < config.epochs and \
             not training_state.end_training:
-        training_state.epoch = epoch
         trainer.train_one_epoch(train_dataloader)
-        epoch += 1
+        training_state.epoch += 1
 
     model_driver.event(Event.TRAIN_END)
     raw_train_end_time = logger.previous_log_time
diff --git a/training/benchmarks/efficientnet/pytorch/train/training_state.py b/training/benchmarks/efficientnet/pytorch/train/training_state.py
index 3e7391175..93e8c94f9 100644
--- a/training/benchmarks/efficientnet/pytorch/train/training_state.py
+++ b/training/benchmarks/efficientnet/pytorch/train/training_state.py
@@ -20,7 +20,7 @@ class TrainingState:
     eval_acc1: float = 0.0
     eval_acc5: float = 0.0
 
-    epoch: int = 1
+    epoch: int = 0
     num_trained_samples = 0
     end_training: bool = False
     converged: bool = False
@@ -78,7 +78,7 @@ def state_dict(self):
 
         It contains an entry for every variable in self.__dict__
         """
-        return {key: value for key, value in self.__dict__.items()}
+        return {key: value for key, value in 
self.__dict__.items() if key != '_trainer'} def load_state_dict(self, state_dict): """Loads the schedulers state. From 48e835d8db93cab40eba5ab134e707e54e7a5291 Mon Sep 17 00:00:00 2001 From: Feilei Du Date: Thu, 25 May 2023 08:12:00 +0000 Subject: [PATCH 10/24] add efficientnet --- .../benchmarks/efficientnet/pytorch/config/_base.py | 8 ++++---- .../benchmarks/efficientnet/pytorch/train/trainer.py | 12 ------------ 2 files changed, 4 insertions(+), 16 deletions(-) diff --git a/training/benchmarks/efficientnet/pytorch/config/_base.py b/training/benchmarks/efficientnet/pytorch/config/_base.py index 039a25cee..edf327f09 100644 --- a/training/benchmarks/efficientnet/pytorch/config/_base.py +++ b/training/benchmarks/efficientnet/pytorch/config/_base.py @@ -128,11 +128,11 @@ # Sample to begin performing eval. eval_iter_start_samples: int = 100 -# If set to -1, disable eval, else evaluate every eval_iter_samples during training TODO -eval_interval_samples: int = 5005 * 256 * 1 # 1 epoch +# If set to -1, disable eval, else evaluate every eval_iter_samples during training +eval_interval_samples: int = 1281167 # 1 epoch -# Total number of training samples to run. TODO -max_samples_termination: float = 5005 * 256 * 600 # 600 epoch +# Total number of training samples to run. +max_samples_termination: float = 1281167 * 600 # 600 epoch # local_rank for distributed training on gpus local_rank: int = 0 diff --git a/training/benchmarks/efficientnet/pytorch/train/trainer.py b/training/benchmarks/efficientnet/pytorch/train/trainer.py index 8713fecc0..5330bae3f 100644 --- a/training/benchmarks/efficientnet/pytorch/train/trainer.py +++ b/training/benchmarks/efficientnet/pytorch/train/trainer.py @@ -47,19 +47,9 @@ def init(self): self.lr_scheduler = create_scheduler(self.config, self.optimizer) self.scaler = self.adapter.create_grad_scaler(self.config) self.criterion = torch.nn.CrossEntropyLoss(label_smoothing=self.config.label_smoothing) - self.resume() def _init_model(self, model, args, device): - # checkpoint_name = config.init_checkpoint - # if os.path.isfile(checkpoint_name): - # print('checkpoint_name', checkpoint_name) - # print('global rank {} is loading pretrained model {}'.format( - # dist_pytorch.get_rank(), checkpoint_name)) - # # Load the checkpoint. - # checkpoint = torch.load(checkpoint_name, map_location='cpu') - # model.load_state_dict(checkpoint['state_dict']) - model = model.to(device) return model @@ -143,8 +133,6 @@ def train_one_epoch(self, dataloader): "optimizer": self.optimizer.state_dict(), "lr_scheduler": self.lr_scheduler.state_dict(), "training_state": self.training_state.state_dict(), - #"epoch": epoch, - #"args": args, } if self.scaler: checkpoint["scaler"] = self.scaler.state_dict() From 37d78be5c4e25957c50485e660041165362760e6 Mon Sep 17 00:00:00 2001 From: Feilei Du Date: Thu, 25 May 2023 08:17:28 +0000 Subject: [PATCH 11/24] add efficientnet --- training/benchmarks/efficientnet/README.md | 6 +++--- training/kunlunxin/efficientnet-pytorch/README.md | 6 +++--- training/nvidia/efficientnet-pytorch/README.md | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/training/benchmarks/efficientnet/README.md b/training/benchmarks/efficientnet/README.md index ce707c0e4..38071be69 100644 --- a/training/benchmarks/efficientnet/README.md +++ b/training/benchmarks/efficientnet/README.md @@ -1,10 +1,10 @@ ### 模型信息 - 模型介绍 ->MobileNet-v2 is a convolutional neural network that is 53 layers deep. 
You can load a pretrained version of the network trained on more than a million images from the ImageNet database. The pretrained network can classify images into 1000 object categories, such as keyboard, mouse, pencil, and many animals. ->Refer to Sandler, M., Howard, A., Zhu, M., Zhmoginov, A. and Chen, L.C. "MobileNetV2: Inverted Residuals and Linear Bottlenecks." In 2018 IEEE/CVF Conference on Computer Vision and Pattern Recognition (pp. 4510-4520). IEEE. +>EfficientNet is a convolutional neural network architecture and scaling method that uniformly scales all dimensions of depth/width/resolution using a compound coefficient. Unlike conventional practice that arbitrary scales these factors, the EfficientNet scaling method uniformly scales network width, depth, and resolution with a set of fixed scaling coefficients. +>Refer to EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks - 模型代码来源 -> https://github.com/pytorch/vision/blob/main/torchvision/models/mobilenetv2.py +> https://github.com/pytorch/vision/blob/main/torchvision/models/efficientnet.py ### 数据集 - 数据集下载地址 diff --git a/training/kunlunxin/efficientnet-pytorch/README.md b/training/kunlunxin/efficientnet-pytorch/README.md index a547c7d3a..1e0a9e617 100644 --- a/training/kunlunxin/efficientnet-pytorch/README.md +++ b/training/kunlunxin/efficientnet-pytorch/README.md @@ -1,7 +1,7 @@ ### 模型Checkpoint下载 -[模型Checkpoint下载](../../benchmarks/mobilenetv2/README.md#模型checkpoint) +[模型Checkpoint下载](../../benchmarks/efficientnet/README.md#模型checkpoint) ### 测试数据集下载 -[测试数据集下载](../../benchmarks/mobilenetv2/README.md#数据集) +[测试数据集下载](../../benchmarks/efficientnet/README.md#数据集) ### 昆仑芯XPU配置与运行信息参考 #### 环境配置 @@ -25,7 +25,7 @@ | 单机1卡 | config_R300x1x1 | | | | | | | 单机2卡 | config_R300x1x2 | | | | | | | 单机4卡 | config_R300x1x4 | | | | | | -| 单机8卡 | config_R300x1x8 | | 70.634 | 69.549 | 1501500 | | +| 单机8卡 | config_R300x1x8 | | | | | | | 两机8卡 | config_R300x2x8 | | | | | | ### 许可证 diff --git a/training/nvidia/efficientnet-pytorch/README.md b/training/nvidia/efficientnet-pytorch/README.md index af9acab76..44c958efa 100644 --- a/training/nvidia/efficientnet-pytorch/README.md +++ b/training/nvidia/efficientnet-pytorch/README.md @@ -1,5 +1,5 @@ ### 测试数据集下载 -[测试数据集下载](../../benchmarks/mobilenetv2/README.md#数据集) +[测试数据集下载](../../benchmarks/efficientnet/README.md#数据集) ### Nvidia GPU配置与运行信息参考 #### 环境配置 @@ -21,7 +21,7 @@ | 单机1卡 | config_A100x1x1 | | | | | | | 单机2卡 | config_A100x1x2 | | | | | | | 单机4卡 | config_A100x1x4 | | | | | | -| 单机8卡 | config_A100x1x8 | 94208.62 | 70.634 | 70.634 | 1501500 | 4081.72 | +| 单机8卡 | config_A100x1x8 | | | | | | | 两机8卡 | config_A100x2x8 | | | | | | ### 许可证 From 98361a5aaf7330108f6abe5a33fb8aed5bafaf2f Mon Sep 17 00:00:00 2001 From: Feilei Du Date: Fri, 26 May 2023 09:35:39 +0000 Subject: [PATCH 12/24] fix code style --- training/benchmarks/efficientnet/README.md | 11 +++++++---- .../benchmarks/efficientnet/pytorch/config/_base.py | 2 +- .../efficientnet/pytorch/config/mutable_params.py | 11 ++++++++--- .../efficientnet/pytorch/dataloaders/dataloader.py | 9 +++++---- .../efficientnet/pytorch/train/evaluator.py | 4 ++-- .../benchmarks/efficientnet/pytorch/train/trainer.py | 4 ++-- .../efficientnet/pytorch/train/trainer_adapter.py | 2 +- .../efficientnet/pytorch/train/training_state.py | 1 - .../efficientnet-pytorch/config/config_R300x1x1.py | 4 ++-- .../efficientnet-pytorch/config/config_R300x1x2.py | 4 +--- .../efficientnet-pytorch/config/config_R300x1x4.py | 6 ++---- 
.../efficientnet-pytorch/config/config_R300x1x8.py | 2 -- .../efficientnet-pytorch/config/config_common.py | 1 - .../efficientnet-pytorch/extern/trainer_adapter.py | 6 ------ .../efficientnet-pytorch/config/config_A100x1x2.py | 2 -- .../efficientnet-pytorch/config/config_A100x1x4.py | 2 -- .../efficientnet-pytorch/config/config_A100x1x8.py | 2 -- .../efficientnet-pytorch/config/config_common.py | 1 - 18 files changed, 31 insertions(+), 43 deletions(-) diff --git a/training/benchmarks/efficientnet/README.md b/training/benchmarks/efficientnet/README.md index 38071be69..db3d99de8 100644 --- a/training/benchmarks/efficientnet/README.md +++ b/training/benchmarks/efficientnet/README.md @@ -1,17 +1,20 @@ ### 模型信息 - 模型介绍 >EfficientNet is a convolutional neural network architecture and scaling method that uniformly scales all dimensions of depth/width/resolution using a compound coefficient. Unlike conventional practice that arbitrary scales these factors, the EfficientNet scaling method uniformly scales network width, depth, and resolution with a set of fixed scaling coefficients. ->Refer to EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks + +- 论文 +> [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/pdf/1905.11946.pdf) - 模型代码来源 > https://github.com/pytorch/vision/blob/main/torchvision/models/efficientnet.py ### 数据集 -- 数据集下载地址 -> `https://image-net.org/download.php` (Imagenet2012 1K) +> ImageNet_1k_2012数据集 + +> ImageNet官网地址:https://www.image-net.org/challenges/LSVRC/2012/ - 预处理 -> 无需预处理 +> 无需预处理 ### 框架与芯片支持情况 diff --git a/training/benchmarks/efficientnet/pytorch/config/_base.py b/training/benchmarks/efficientnet/pytorch/config/_base.py index edf327f09..c9e71c291 100644 --- a/training/benchmarks/efficientnet/pytorch/config/_base.py +++ b/training/benchmarks/efficientnet/pytorch/config/_base.py @@ -5,7 +5,7 @@ name: str = "EfficientNet" do_train = True -fp16 = True +fp16 = False # ========================================================= # data # ========================================================= diff --git a/training/benchmarks/efficientnet/pytorch/config/mutable_params.py b/training/benchmarks/efficientnet/pytorch/config/mutable_params.py index e842767a4..9b367b9ea 100644 --- a/training/benchmarks/efficientnet/pytorch/config/mutable_params.py +++ b/training/benchmarks/efficientnet/pytorch/config/mutable_params.py @@ -1,6 +1,11 @@ mutable_params = [ 'train_data', 'eval_data', 'init_checkpoint', 'train_batch_size', - 'eval_batch_size', 'dist_backend', 'lr', 'weight_decay', "vendor" + 'eval_batch_size', 'dist_backend', 'vendor', + 'local_rank', 'do_train', 'data_dir', 'log_freq', 'output_dir', 'resume' +] +mutable_params += ['lr', 'lr_scheduler', 'lr_warmup_epochs', 'lr_warmup_method', + 'auto_augment', 'random_erase', 'label_smoothing', 'mixup_alpha', + 'cutmix_alpha', 'weight_decay', 'norm_weight_decay', 'model_ema', + 'ra_sampler', 'ra_reps', 'epochs', 'num_workers', 'train_crop_size', + 'val_crop_size', 'val_resize_size', 'train_batch_size', 'eval_batch_size' ] - -mutable_params += ["local_rank", "do_train", "data_dir", "log_freq", "output_dir", "resume"] diff --git a/training/benchmarks/efficientnet/pytorch/dataloaders/dataloader.py b/training/benchmarks/efficientnet/pytorch/dataloaders/dataloader.py index 40783d56c..e16ba41c1 100644 --- a/training/benchmarks/efficientnet/pytorch/dataloaders/dataloader.py +++ b/training/benchmarks/efficientnet/pytorch/dataloaders/dataloader.py @@ -13,6 +13,7 @@ CURR_PATH = 
os.path.abspath(os.path.dirname(__file__)) sys.path.append(os.path.abspath(os.path.join(CURR_PATH, "../../../"))) from driver import dist_pytorch +from train import utils def build_train_dataset(args): @@ -55,10 +56,10 @@ def build_eval_dataset(args): def build_train_dataloader(train_dataset, args): - """Traing dataloaders.""" + """Training dataloaders.""" dist_pytorch.main_proc_print('building train dataloaders ...') - if torch.distributed.is_available() and torch.distributed.is_initialized(): + if utils.is_dist_avail_and_initialized(): if hasattr(args, "ra_sampler") and args.ra_sampler: train_sampler = RASampler(train_dataset, shuffle=True, repetitions=args.ra_reps) else: @@ -97,10 +98,10 @@ def collate_fn(batch): def build_eval_dataloader(eval_dataset, args): - """Traing and validation dataloaders.""" + """Training and validation dataloaders.""" dist_pytorch.main_proc_print('building eval dataloaders ...') - if torch.distributed.is_available() and torch.distributed.is_initialized(): + if utils.is_dist_avail_and_initialized(): val_sampler = torch.utils.data.distributed.DistributedSampler( eval_dataset, shuffle=False, drop_last=True) dist_pytorch.main_proc_print( diff --git a/training/benchmarks/efficientnet/pytorch/train/evaluator.py b/training/benchmarks/efficientnet/pytorch/train/evaluator.py index 94bb3e401..4a091e0a0 100644 --- a/training/benchmarks/efficientnet/pytorch/train/evaluator.py +++ b/training/benchmarks/efficientnet/pytorch/train/evaluator.py @@ -1,6 +1,6 @@ import torch import torch.distributed as dist - +from train import utils class Evaluator: @@ -27,7 +27,7 @@ def evaluate(self, trainer): loss, acc1, acc5 = trainer.inference(batch) self.__update(loss.item(), acc1.item(), acc5.item(), batch[0].shape[0]) - if dist.is_available() and dist.is_initialized(): + if utils.is_dist_avail_and_initialized(): total = torch.tensor([ self.total_loss, self.total_acc1, self.total_acc5, self.total_size diff --git a/training/benchmarks/efficientnet/pytorch/train/trainer.py b/training/benchmarks/efficientnet/pytorch/train/trainer.py index 5330bae3f..ac50b698e 100644 --- a/training/benchmarks/efficientnet/pytorch/train/trainer.py +++ b/training/benchmarks/efficientnet/pytorch/train/trainer.py @@ -75,7 +75,7 @@ def train_one_epoch(self, dataloader): step_start_time = time.time() epoch_start_num_sample = state.num_trained_samples - if dist.is_available() and dist.is_initialized(): + if utils.is_dist_avail_and_initialized(): dataloader.sampler.set_epoch(state.epoch) for batch_idx, batch in enumerate(dataloader): @@ -147,7 +147,7 @@ def train_one_step(self, batch): self.model.train() state.loss, state.acc1, state.acc5 = self.forward(batch) self.adapter.backward(self.config, state.global_steps, state.epoch, state.loss, self.model, self.optimizer, self.scaler) - if dist.is_available() and dist.is_initialized(): + if utils.is_dist_avail_and_initialized(): total = torch.tensor([state.loss, state.acc1, state.acc5], dtype=torch.float32, device=self.config.device) diff --git a/training/benchmarks/efficientnet/pytorch/train/trainer_adapter.py b/training/benchmarks/efficientnet/pytorch/train/trainer_adapter.py index 501e9e25f..cee6e563b 100644 --- a/training/benchmarks/efficientnet/pytorch/train/trainer_adapter.py +++ b/training/benchmarks/efficientnet/pytorch/train/trainer_adapter.py @@ -9,7 +9,7 @@ from train import utils def convert_model(args, model: nn.Module) -> nn.Module: - if dist.is_available() and dist.is_initialized() and args.sync_bn: + if utils.is_dist_avail_and_initialized() and 
args.sync_bn: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) return model diff --git a/training/benchmarks/efficientnet/pytorch/train/training_state.py b/training/benchmarks/efficientnet/pytorch/train/training_state.py index 93e8c94f9..b8b26b23e 100644 --- a/training/benchmarks/efficientnet/pytorch/train/training_state.py +++ b/training/benchmarks/efficientnet/pytorch/train/training_state.py @@ -10,7 +10,6 @@ class TrainingState: global_steps = 0 skipped_steps = 0 - iter_dataloader_idx = 0 loss: float = 0.0 acc1: float = 0.0 diff --git a/training/kunlunxin/efficientnet-pytorch/config/config_R300x1x1.py b/training/kunlunxin/efficientnet-pytorch/config/config_R300x1x1.py index fbaea0e5d..0b0a0006f 100644 --- a/training/kunlunxin/efficientnet-pytorch/config/config_R300x1x1.py +++ b/training/kunlunxin/efficientnet-pytorch/config/config_R300x1x1.py @@ -1,4 +1,4 @@ from config_common import * -train_batch_size = 64 -eval_batch_size = 64 +train_batch_size = 128 +eval_batch_size = 128 diff --git a/training/kunlunxin/efficientnet-pytorch/config/config_R300x1x2.py b/training/kunlunxin/efficientnet-pytorch/config/config_R300x1x2.py index fb4a39e51..ee760dece 100644 --- a/training/kunlunxin/efficientnet-pytorch/config/config_R300x1x2.py +++ b/training/kunlunxin/efficientnet-pytorch/config/config_R300x1x2.py @@ -1,6 +1,4 @@ from config_common import * train_batch_size = 128 -eval_batch_size = 128 - -gradient_accumulation_steps = 1 +eval_batch_size = 128 \ No newline at end of file diff --git a/training/kunlunxin/efficientnet-pytorch/config/config_R300x1x4.py b/training/kunlunxin/efficientnet-pytorch/config/config_R300x1x4.py index 0b08a1cf4..0b0a0006f 100644 --- a/training/kunlunxin/efficientnet-pytorch/config/config_R300x1x4.py +++ b/training/kunlunxin/efficientnet-pytorch/config/config_R300x1x4.py @@ -1,6 +1,4 @@ from config_common import * -train_batch_size = 64 -eval_batch_size = 32 - -gradient_accumulation_steps = 1 +train_batch_size = 128 +eval_batch_size = 128 diff --git a/training/kunlunxin/efficientnet-pytorch/config/config_R300x1x8.py b/training/kunlunxin/efficientnet-pytorch/config/config_R300x1x8.py index fb4a39e51..0b0a0006f 100644 --- a/training/kunlunxin/efficientnet-pytorch/config/config_R300x1x8.py +++ b/training/kunlunxin/efficientnet-pytorch/config/config_R300x1x8.py @@ -2,5 +2,3 @@ train_batch_size = 128 eval_batch_size = 128 - -gradient_accumulation_steps = 1 diff --git a/training/kunlunxin/efficientnet-pytorch/config/config_common.py b/training/kunlunxin/efficientnet-pytorch/config/config_common.py index 6ee3fc54c..212075de6 100644 --- a/training/kunlunxin/efficientnet-pytorch/config/config_common.py +++ b/training/kunlunxin/efficientnet-pytorch/config/config_common.py @@ -1,6 +1,5 @@ vendor = "kunlunxin" dist_backend = "xccl" -fp16 = False lr = 0.5 lr_scheduler = "cosineannealinglr" diff --git a/training/kunlunxin/efficientnet-pytorch/extern/trainer_adapter.py b/training/kunlunxin/efficientnet-pytorch/extern/trainer_adapter.py index 4b1aede50..51008da61 100644 --- a/training/kunlunxin/efficientnet-pytorch/extern/trainer_adapter.py +++ b/training/kunlunxin/efficientnet-pytorch/extern/trainer_adapter.py @@ -1,15 +1,9 @@ from torch import nn -from driver.dist_pytorch import main_proc_print def convert_model(args, model: nn.Module) -> nn.Module: return model - def model_to_fp16(args, model): - # To prevent OOM for model sizes that cannot fit in GPU memory in full precision - if args.fp16: - main_proc_print(" > use fp16...") - model.half() return model def 
create_grad_scaler(args): return None diff --git a/training/nvidia/efficientnet-pytorch/config/config_A100x1x2.py b/training/nvidia/efficientnet-pytorch/config/config_A100x1x2.py index fb4a39e51..0b0a0006f 100644 --- a/training/nvidia/efficientnet-pytorch/config/config_A100x1x2.py +++ b/training/nvidia/efficientnet-pytorch/config/config_A100x1x2.py @@ -2,5 +2,3 @@ train_batch_size = 128 eval_batch_size = 128 - -gradient_accumulation_steps = 1 diff --git a/training/nvidia/efficientnet-pytorch/config/config_A100x1x4.py b/training/nvidia/efficientnet-pytorch/config/config_A100x1x4.py index fb4a39e51..0b0a0006f 100644 --- a/training/nvidia/efficientnet-pytorch/config/config_A100x1x4.py +++ b/training/nvidia/efficientnet-pytorch/config/config_A100x1x4.py @@ -2,5 +2,3 @@ train_batch_size = 128 eval_batch_size = 128 - -gradient_accumulation_steps = 1 diff --git a/training/nvidia/efficientnet-pytorch/config/config_A100x1x8.py b/training/nvidia/efficientnet-pytorch/config/config_A100x1x8.py index fb4a39e51..0b0a0006f 100644 --- a/training/nvidia/efficientnet-pytorch/config/config_A100x1x8.py +++ b/training/nvidia/efficientnet-pytorch/config/config_A100x1x8.py @@ -2,5 +2,3 @@ train_batch_size = 128 eval_batch_size = 128 - -gradient_accumulation_steps = 1 diff --git a/training/nvidia/efficientnet-pytorch/config/config_common.py b/training/nvidia/efficientnet-pytorch/config/config_common.py index 8f0cda1b6..d550fb4dc 100644 --- a/training/nvidia/efficientnet-pytorch/config/config_common.py +++ b/training/nvidia/efficientnet-pytorch/config/config_common.py @@ -1,6 +1,5 @@ vendor = "nvidia" dist_backend = "nccl" -fp16 = False lr = 0.5 lr_scheduler = "cosineannealinglr" From e6005bf4e70f0a322fd66056186545287627570a Mon Sep 17 00:00:00 2001 From: Feilei Du Date: Fri, 26 May 2023 09:46:07 +0000 Subject: [PATCH 13/24] fix code style --- .../pytorch/config/mutable_params.py | 15 ++-- .../pytorch/dataloaders/dataloader.py | 34 +++++--- .../pytorch/dataloaders/presets.py | 65 ++++++++------ .../pytorch/dataloaders/sampler.py | 27 ++++-- .../pytorch/dataloaders/transforms.py | 71 +++++++++------- .../efficientnet/pytorch/run_pretraining.py | 3 +- .../pytorch/schedulers/__init__.py | 29 ++++--- .../efficientnet/pytorch/train/evaluator.py | 4 +- .../efficientnet/pytorch/train/trainer.py | 23 +++-- .../pytorch/train/trainer_adapter.py | 36 +++++--- .../pytorch/train/training_state.py | 5 +- .../efficientnet/pytorch/train/utils.py | 85 +++++++++++-------- .../config/config_common.py | 1 - .../extern/trainer_adapter.py | 4 + .../config/config_common.py | 1 - 15 files changed, 252 insertions(+), 151 deletions(-) diff --git a/training/benchmarks/efficientnet/pytorch/config/mutable_params.py b/training/benchmarks/efficientnet/pytorch/config/mutable_params.py index 9b367b9ea..bf4c1e7c5 100644 --- a/training/benchmarks/efficientnet/pytorch/config/mutable_params.py +++ b/training/benchmarks/efficientnet/pytorch/config/mutable_params.py @@ -1,11 +1,12 @@ mutable_params = [ 'train_data', 'eval_data', 'init_checkpoint', 'train_batch_size', - 'eval_batch_size', 'dist_backend', 'vendor', - 'local_rank', 'do_train', 'data_dir', 'log_freq', 'output_dir', 'resume' + 'eval_batch_size', 'dist_backend', 'vendor', 'local_rank', 'do_train', + 'data_dir', 'log_freq', 'output_dir', 'resume' ] -mutable_params += ['lr', 'lr_scheduler', 'lr_warmup_epochs', 'lr_warmup_method', - 'auto_augment', 'random_erase', 'label_smoothing', 'mixup_alpha', - 'cutmix_alpha', 'weight_decay', 'norm_weight_decay', 'model_ema', - 'ra_sampler', 'ra_reps', 
'epochs', 'num_workers', 'train_crop_size', - 'val_crop_size', 'val_resize_size', 'train_batch_size', 'eval_batch_size' +mutable_params += [ + 'lr', 'lr_scheduler', 'lr_warmup_epochs', 'lr_warmup_method', + 'auto_augment', 'random_erase', 'label_smoothing', 'mixup_alpha', + 'cutmix_alpha', 'weight_decay', 'norm_weight_decay', 'ra_sampler', + 'ra_reps', 'epochs', 'num_workers', 'train_crop_size', 'val_crop_size', + 'val_resize_size', 'train_batch_size', 'eval_batch_size' ] diff --git a/training/benchmarks/efficientnet/pytorch/dataloaders/dataloader.py b/training/benchmarks/efficientnet/pytorch/dataloaders/dataloader.py index e16ba41c1..2bcc83841 100644 --- a/training/benchmarks/efficientnet/pytorch/dataloaders/dataloader.py +++ b/training/benchmarks/efficientnet/pytorch/dataloaders/dataloader.py @@ -9,7 +9,6 @@ from torchvision.transforms.functional import InterpolationMode import torchvision - CURR_PATH = os.path.abspath(os.path.dirname(__file__)) sys.path.append(os.path.abspath(os.path.join(CURR_PATH, "../../../"))) from driver import dist_pytorch @@ -44,12 +43,13 @@ def build_eval_dataset(args): valdir = os.path.join(args.data_dir, args.eval_data) interpolation = InterpolationMode(args.interpolation) preprocessing = presets.ClassificationPresetEval( - crop_size=args.val_crop_size, resize_size=args.val_resize_size, interpolation=interpolation - ) + crop_size=args.val_crop_size, + resize_size=args.val_resize_size, + interpolation=interpolation) val_dataset = torchvision.datasets.ImageFolder( - valdir, - preprocessing, + valdir, + preprocessing, ) return val_dataset @@ -61,9 +61,12 @@ def build_train_dataloader(train_dataset, args): if utils.is_dist_avail_and_initialized(): if hasattr(args, "ra_sampler") and args.ra_sampler: - train_sampler = RASampler(train_dataset, shuffle=True, repetitions=args.ra_reps) + train_sampler = RASampler(train_dataset, + shuffle=True, + repetitions=args.ra_reps) else: - train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) + train_sampler = torch.utils.data.distributed.DistributedSampler( + train_dataset) dist_pytorch.main_proc_print( f"use sampler: DistributedSampler, num_replicas:{args.n_device}") else: @@ -73,9 +76,13 @@ def build_train_dataloader(train_dataset, args): num_classes = len(train_dataset.classes) mixup_transforms = [] if args.mixup_alpha > 0.0: - mixup_transforms.append(transforms.RandomMixup(num_classes, p=1.0, alpha=args.mixup_alpha)) + mixup_transforms.append( + transforms.RandomMixup(num_classes, p=1.0, alpha=args.mixup_alpha)) if args.cutmix_alpha > 0.0: - mixup_transforms.append(transforms.RandomCutmix(num_classes, p=1.0, alpha=args.cutmix_alpha)) + mixup_transforms.append( + transforms.RandomCutmix(num_classes, + p=1.0, + alpha=args.cutmix_alpha)) if mixup_transforms: mixupcutmix = torchvision.transforms.RandomChoice(mixup_transforms) @@ -108,10 +115,13 @@ def build_eval_dataloader(eval_dataset, args): f"use sampler: DistributedSampler, num_replicas:{args.n_device}") else: val_sampler = torch.utils.data.SequentialSampler(eval_dataset) - + eval_dataloader = torch.utils.data.DataLoader( - eval_dataset, batch_size=args.eval_batch_size, sampler=val_sampler, num_workers=args.num_workers, pin_memory=True - ) + eval_dataset, + batch_size=args.eval_batch_size, + sampler=val_sampler, + num_workers=args.num_workers, + pin_memory=True) dist_pytorch.main_proc_print( f'eval samples:{len(eval_dataset)}, batch size:{args.eval_batch_size}') diff --git a/training/benchmarks/efficientnet/pytorch/dataloaders/presets.py 
b/training/benchmarks/efficientnet/pytorch/dataloaders/presets.py index 5d1bf1cc7..a9ee3e849 100644 --- a/training/benchmarks/efficientnet/pytorch/dataloaders/presets.py +++ b/training/benchmarks/efficientnet/pytorch/dataloaders/presets.py @@ -4,6 +4,7 @@ class ClassificationPresetTrain: + def __init__( self, *, @@ -17,26 +18,35 @@ def __init__( augmix_severity=3, random_erase_prob=0.0, ): - trans = [transforms.RandomResizedCrop(crop_size, interpolation=interpolation)] + trans = [ + transforms.RandomResizedCrop(crop_size, + interpolation=interpolation) + ] if hflip_prob > 0: trans.append(transforms.RandomHorizontalFlip(hflip_prob)) if auto_augment_policy is not None: if auto_augment_policy == "ra": - trans.append(autoaugment.RandAugment(interpolation=interpolation, magnitude=ra_magnitude)) + trans.append( + autoaugment.RandAugment(interpolation=interpolation, + magnitude=ra_magnitude)) elif auto_augment_policy == "ta_wide": - trans.append(autoaugment.TrivialAugmentWide(interpolation=interpolation)) + trans.append( + autoaugment.TrivialAugmentWide( + interpolation=interpolation)) elif auto_augment_policy == "augmix": - trans.append(autoaugment.AugMix(interpolation=interpolation, severity=augmix_severity)) + trans.append( + autoaugment.AugMix(interpolation=interpolation, + severity=augmix_severity)) else: aa_policy = autoaugment.AutoAugmentPolicy(auto_augment_policy) - trans.append(autoaugment.AutoAugment(policy=aa_policy, interpolation=interpolation)) - trans.extend( - [ - transforms.PILToTensor(), - transforms.ConvertImageDtype(torch.float), - transforms.Normalize(mean=mean, std=std), - ] - ) + trans.append( + autoaugment.AutoAugment(policy=aa_policy, + interpolation=interpolation)) + trans.extend([ + transforms.PILToTensor(), + transforms.ConvertImageDtype(torch.float), + transforms.Normalize(mean=mean, std=std), + ]) if random_erase_prob > 0: trans.append(transforms.RandomErasing(p=random_erase_prob)) @@ -47,25 +57,24 @@ def __call__(self, img): class ClassificationPresetEval: + def __init__( - self, - *, - crop_size, - resize_size=256, - mean=(0.485, 0.456, 0.406), - std=(0.229, 0.224, 0.225), - interpolation=InterpolationMode.BILINEAR, + self, + *, + crop_size, + resize_size=256, + mean=(0.485, 0.456, 0.406), + std=(0.229, 0.224, 0.225), + interpolation=InterpolationMode.BILINEAR, ): - self.transforms = transforms.Compose( - [ - transforms.Resize(resize_size, interpolation=interpolation), - transforms.CenterCrop(crop_size), - transforms.PILToTensor(), - transforms.ConvertImageDtype(torch.float), - transforms.Normalize(mean=mean, std=std), - ] - ) + self.transforms = transforms.Compose([ + transforms.Resize(resize_size, interpolation=interpolation), + transforms.CenterCrop(crop_size), + transforms.PILToTensor(), + transforms.ConvertImageDtype(torch.float), + transforms.Normalize(mean=mean, std=std), + ]) def __call__(self, img): return self.transforms(img) diff --git a/training/benchmarks/efficientnet/pytorch/dataloaders/sampler.py b/training/benchmarks/efficientnet/pytorch/dataloaders/sampler.py index e9dc1735a..0ce118c51 100644 --- a/training/benchmarks/efficientnet/pytorch/dataloaders/sampler.py +++ b/training/benchmarks/efficientnet/pytorch/dataloaders/sampler.py @@ -15,22 +15,33 @@ class RASampler(torch.utils.data.Sampler): https://github.com/facebookresearch/deit/blob/main/samplers.py """ - def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True, seed=0, repetitions=3): + def __init__(self, + dataset, + num_replicas=None, + rank=None, + shuffle=True, + seed=0, + 
repetitions=3): if num_replicas is None: if not dist.is_available(): - raise RuntimeError("Requires distributed package to be available!") + raise RuntimeError( + "Requires distributed package to be available!") num_replicas = dist.get_world_size() if rank is None: if not dist.is_available(): - raise RuntimeError("Requires distributed package to be available!") + raise RuntimeError( + "Requires distributed package to be available!") rank = dist.get_rank() self.dataset = dataset self.num_replicas = num_replicas self.rank = rank self.epoch = 0 - self.num_samples = int(math.ceil(len(self.dataset) * float(repetitions) / self.num_replicas)) + self.num_samples = int( + math.ceil( + len(self.dataset) * float(repetitions) / self.num_replicas)) self.total_size = self.num_samples * self.num_replicas - self.num_selected_samples = int(math.floor(len(self.dataset) // 256 * 256 / self.num_replicas)) + self.num_selected_samples = int( + math.floor(len(self.dataset) // 256 * 256 / self.num_replicas)) self.shuffle = shuffle self.seed = seed self.repetitions = repetitions @@ -46,14 +57,14 @@ def __iter__(self): # Add extra samples to make it evenly divisible indices = [ele for ele in indices for i in range(self.repetitions)] - indices += indices[: (self.total_size - len(indices))] + indices += indices[:(self.total_size - len(indices))] assert len(indices) == self.total_size # Subsample - indices = indices[self.rank : self.total_size : self.num_replicas] + indices = indices[self.rank:self.total_size:self.num_replicas] assert len(indices) == self.num_samples - return iter(indices[: self.num_selected_samples]) + return iter(indices[:self.num_selected_samples]) def __len__(self): return self.num_selected_samples diff --git a/training/benchmarks/efficientnet/pytorch/dataloaders/transforms.py b/training/benchmarks/efficientnet/pytorch/dataloaders/transforms.py index 9a8ef7877..fa00c270e 100644 --- a/training/benchmarks/efficientnet/pytorch/dataloaders/transforms.py +++ b/training/benchmarks/efficientnet/pytorch/dataloaders/transforms.py @@ -19,7 +19,11 @@ class RandomMixup(torch.nn.Module): inplace (bool): boolean to make this transform inplace. Default set to False. """ - def __init__(self, num_classes: int, p: float = 0.5, alpha: float = 1.0, inplace: bool = False) -> None: + def __init__(self, + num_classes: int, + p: float = 0.5, + alpha: float = 1.0, + inplace: bool = False) -> None: super().__init__() if num_classes < 1: @@ -49,16 +53,19 @@ def forward(self, batch: Tensor, target: Tensor) -> Tuple[Tensor, Tensor]: if target.ndim != 1: raise ValueError(f"Target ndim should be 1. Got {target.ndim}") if not batch.is_floating_point(): - raise TypeError(f"Batch dtype should be a float tensor. Got {batch.dtype}.") + raise TypeError( + f"Batch dtype should be a float tensor. Got {batch.dtype}.") if target.dtype != torch.int64: - raise TypeError(f"Target dtype should be torch.int64. Got {target.dtype}") + raise TypeError( + f"Target dtype should be torch.int64. Got {target.dtype}") if not self.inplace: batch = batch.clone() target = target.clone() if target.ndim == 1: - target = torch.nn.functional.one_hot(target, num_classes=self.num_classes).to(dtype=batch.dtype) + target = torch.nn.functional.one_hot( + target, num_classes=self.num_classes).to(dtype=batch.dtype) if torch.rand(1).item() >= self.p: return batch, target @@ -68,7 +75,8 @@ def forward(self, batch: Tensor, target: Tensor) -> Tuple[Tensor, Tensor]: target_rolled = target.roll(1, 0) # Implemented as on mixup paper, page 3. 
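The `lambda_param` sampling reflowed just below relies on the private helper `torch._sample_dirichlet`; a two-component Dirichlet with equal concentrations has Beta(α, α) marginals, so the draw is equivalent to λ ~ Beta(α, α). A minimal sketch, assuming an illustrative α value:

```python
import torch

alpha = 0.2  # illustrative mixup alpha; not a value fixed by this benchmark

# Private helper used by RandomMixup above ...
lam_dirichlet = float(
    torch._sample_dirichlet(torch.tensor([alpha, alpha]))[0])
# ... and the equivalent public API: Dirichlet([a, a]) marginals are Beta(a, a).
lam_beta = float(torch.distributions.Beta(alpha, alpha).sample())


def mix(batch, lam):
    # Convex combination of the batch with itself rolled by one sample,
    # mirroring the batch_rolled.mul_/add_ sequence in RandomMixup.forward.
    return lam * batch + (1.0 - lam) * batch.roll(1, 0)
```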
- lambda_param = float(torch._sample_dirichlet(torch.tensor([self.alpha, self.alpha]))[0]) + lambda_param = float( + torch._sample_dirichlet(torch.tensor([self.alpha, self.alpha]))[0]) batch_rolled.mul_(1.0 - lambda_param) batch.mul_(lambda_param).add_(batch_rolled) @@ -78,14 +86,12 @@ def forward(self, batch: Tensor, target: Tensor) -> Tuple[Tensor, Tensor]: return batch, target def __repr__(self) -> str: - s = ( - f"{self.__class__.__name__}(" - f"num_classes={self.num_classes}" - f", p={self.p}" - f", alpha={self.alpha}" - f", inplace={self.inplace}" - f")" - ) + s = (f"{self.__class__.__name__}(" + f"num_classes={self.num_classes}" + f", p={self.p}" + f", alpha={self.alpha}" + f", inplace={self.inplace}" + f")") return s @@ -103,10 +109,15 @@ class RandomCutmix(torch.nn.Module): inplace (bool): boolean to make this transform inplace. Default set to False. """ - def __init__(self, num_classes: int, p: float = 0.5, alpha: float = 1.0, inplace: bool = False) -> None: + def __init__(self, + num_classes: int, + p: float = 0.5, + alpha: float = 1.0, + inplace: bool = False) -> None: super().__init__() if num_classes < 1: - raise ValueError("Please provide a valid positive value for the num_classes.") + raise ValueError( + "Please provide a valid positive value for the num_classes.") if alpha <= 0: raise ValueError("Alpha param can't be zero.") @@ -129,16 +140,19 @@ def forward(self, batch: Tensor, target: Tensor) -> Tuple[Tensor, Tensor]: if target.ndim != 1: raise ValueError(f"Target ndim should be 1. Got {target.ndim}") if not batch.is_floating_point(): - raise TypeError(f"Batch dtype should be a float tensor. Got {batch.dtype}.") + raise TypeError( + f"Batch dtype should be a float tensor. Got {batch.dtype}.") if target.dtype != torch.int64: - raise TypeError(f"Target dtype should be torch.int64. Got {target.dtype}") + raise TypeError( + f"Target dtype should be torch.int64. Got {target.dtype}") if not self.inplace: batch = batch.clone() target = target.clone() if target.ndim == 1: - target = torch.nn.functional.one_hot(target, num_classes=self.num_classes).to(dtype=batch.dtype) + target = torch.nn.functional.one_hot( + target, num_classes=self.num_classes).to(dtype=batch.dtype) if torch.rand(1).item() >= self.p: return batch, target @@ -148,11 +162,12 @@ def forward(self, batch: Tensor, target: Tensor) -> Tuple[Tensor, Tensor]: target_rolled = target.roll(1, 0) # Implemented as on cutmix paper, page 12 (with minor corrections on typos). 
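The box geometry computed just below keeps the pasted area proportional to 1 − λ: with half-side ratio r = 0.5·sqrt(1 − λ), the full box covers roughly (1 − λ)·W·H pixels whenever it is not clipped at a border, which is why RandomCutmix re-derives the mixing weight from the realised box area. A sanity-check sketch, assuming an input size and λ value:

```python
import math
import torch

H, W = 224, 224   # assumed input size
lam = 0.3         # assumed cutmix coefficient, normally drawn from Beta(alpha, alpha)

r = 0.5 * math.sqrt(1.0 - lam)                             # half-side ratio
r_x, r_y = torch.randint(W, (1,)), torch.randint(H, (1,))  # random box centre
r_w_half, r_h_half = int(r * W), int(r * H)

x1 = int(torch.clamp(r_x - r_w_half, min=0))
x2 = int(torch.clamp(r_x + r_w_half, max=W))
y1 = int(torch.clamp(r_y - r_h_half, min=0))
y2 = int(torch.clamp(r_y + r_h_half, max=H))

# Whenever the box is not clipped by a border, this recovers ~lam.
lam_adjusted = 1.0 - (x2 - x1) * (y2 - y1) / (W * H)
print(lam_adjusted)
```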
- lambda_param = float(torch._sample_dirichlet(torch.tensor([self.alpha, self.alpha]))[0]) + lambda_param = float( + torch._sample_dirichlet(torch.tensor([self.alpha, self.alpha]))[0]) _, H, W = F.get_dimensions(batch) - r_x = torch.randint(W, (1,)) - r_y = torch.randint(H, (1,)) + r_x = torch.randint(W, (1, )) + r_y = torch.randint(H, (1, )) r = 0.5 * math.sqrt(1.0 - lambda_param) r_w_half = int(r * W) @@ -172,12 +187,10 @@ def forward(self, batch: Tensor, target: Tensor) -> Tuple[Tensor, Tensor]: return batch, target def __repr__(self) -> str: - s = ( - f"{self.__class__.__name__}(" - f"num_classes={self.num_classes}" - f", p={self.p}" - f", alpha={self.alpha}" - f", inplace={self.inplace}" - f")" - ) + s = (f"{self.__class__.__name__}(" + f"num_classes={self.num_classes}" + f", p={self.p}" + f", alpha={self.alpha}" + f", inplace={self.inplace}" + f")") return s diff --git a/training/benchmarks/efficientnet/pytorch/run_pretraining.py b/training/benchmarks/efficientnet/pytorch/run_pretraining.py index c097442b2..9a61f9303 100755 --- a/training/benchmarks/efficientnet/pytorch/run_pretraining.py +++ b/training/benchmarks/efficientnet/pytorch/run_pretraining.py @@ -57,7 +57,8 @@ def main() -> Tuple[Any, Any]: dist_pytorch.barrier(config.vendor) init_evaluation_start = time.time() - training_state.eval_loss, training_state.eval_acc1, training_state.eval_acc5 = evaluator.evaluate(trainer) + training_state.eval_loss, training_state.eval_acc1, training_state.eval_acc5 = evaluator.evaluate( + trainer) init_evaluation_end = time.time() init_evaluation_info = dict(eval_acc1=training_state.eval_acc1, eval_acc5=training_state.eval_acc5, diff --git a/training/benchmarks/efficientnet/pytorch/schedulers/__init__.py b/training/benchmarks/efficientnet/pytorch/schedulers/__init__.py index aff4fce9f..08051c4d1 100644 --- a/training/benchmarks/efficientnet/pytorch/schedulers/__init__.py +++ b/training/benchmarks/efficientnet/pytorch/schedulers/__init__.py @@ -6,35 +6,40 @@ def create_scheduler(args, optimizer): args.lr_scheduler = args.lr_scheduler.lower() if args.lr_scheduler == "steplr": - main_lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.lr_step_size, gamma=args.lr_gamma) + main_lr_scheduler = torch.optim.lr_scheduler.StepLR( + optimizer, step_size=args.lr_step_size, gamma=args.lr_gamma) elif args.lr_scheduler == "cosineannealinglr": main_lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( - optimizer, T_max=args.epochs - args.lr_warmup_epochs, eta_min=args.lr_min - ) + optimizer, + T_max=args.epochs - args.lr_warmup_epochs, + eta_min=args.lr_min) elif args.lr_scheduler == "exponentiallr": - main_lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=args.lr_gamma) + main_lr_scheduler = torch.optim.lr_scheduler.ExponentialLR( + optimizer, gamma=args.lr_gamma) else: raise RuntimeError( f"Invalid lr scheduler '{args.lr_scheduler}'. Only StepLR, CosineAnnealingLR and ExponentialLR " - "are supported." 
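For context, the warmup chaining reformatted in the remainder of this hunk can be exercised standalone. A minimal sketch; the model, optimizer, and hyperparameter values below are assumptions, not benchmark defaults:

```python
import torch

model = torch.nn.Linear(8, 2)  # stand-in model
optimizer = torch.optim.SGD(model.parameters(), lr=0.5)

epochs, lr_warmup_epochs, lr_warmup_decay = 600, 5, 0.01  # assumed values

warmup = torch.optim.lr_scheduler.LinearLR(
    optimizer, start_factor=lr_warmup_decay, total_iters=lr_warmup_epochs)
main = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer, T_max=epochs - lr_warmup_epochs, eta_min=0.0)

# SequentialLR runs the warmup schedule first and hands over to the cosine
# schedule after `milestones[0]` steps, matching the lr_warmup_epochs branch.
lr_scheduler = torch.optim.lr_scheduler.SequentialLR(
    optimizer, schedulers=[warmup, main], milestones=[lr_warmup_epochs])

for _ in range(epochs):   # one scheduler step per loop iteration
    optimizer.step()
    lr_scheduler.step()
```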
- ) + "are supported.") if args.lr_warmup_epochs > 0: if args.lr_warmup_method == "linear": warmup_lr_scheduler = torch.optim.lr_scheduler.LinearLR( - optimizer, start_factor=args.lr_warmup_decay, total_iters=args.lr_warmup_epochs - ) + optimizer, + start_factor=args.lr_warmup_decay, + total_iters=args.lr_warmup_epochs) elif args.lr_warmup_method == "constant": warmup_lr_scheduler = torch.optim.lr_scheduler.ConstantLR( - optimizer, factor=args.lr_warmup_decay, total_iters=args.lr_warmup_epochs - ) + optimizer, + factor=args.lr_warmup_decay, + total_iters=args.lr_warmup_epochs) else: raise RuntimeError( f"Invalid warmup lr method '{args.lr_warmup_method}'. Only linear and constant are supported." ) lr_scheduler = torch.optim.lr_scheduler.SequentialLR( - optimizer, schedulers=[warmup_lr_scheduler, main_lr_scheduler], milestones=[args.lr_warmup_epochs] - ) + optimizer, + schedulers=[warmup_lr_scheduler, main_lr_scheduler], + milestones=[args.lr_warmup_epochs]) else: lr_scheduler = main_lr_scheduler return lr_scheduler diff --git a/training/benchmarks/efficientnet/pytorch/train/evaluator.py b/training/benchmarks/efficientnet/pytorch/train/evaluator.py index 4a091e0a0..da98aacc5 100644 --- a/training/benchmarks/efficientnet/pytorch/train/evaluator.py +++ b/training/benchmarks/efficientnet/pytorch/train/evaluator.py @@ -2,6 +2,7 @@ import torch.distributed as dist from train import utils + class Evaluator: def __init__(self, args, dataloader): @@ -25,7 +26,8 @@ def evaluate(self, trainer): for i, batch in enumerate(self.dataloader): batch = trainer.process_batch(batch, self.args.device) loss, acc1, acc5 = trainer.inference(batch) - self.__update(loss.item(), acc1.item(), acc5.item(), batch[0].shape[0]) + self.__update(loss.item(), acc1.item(), acc5.item(), + batch[0].shape[0]) if utils.is_dist_avail_and_initialized(): total = torch.tensor([ diff --git a/training/benchmarks/efficientnet/pytorch/train/trainer.py b/training/benchmarks/efficientnet/pytorch/train/trainer.py index ac50b698e..317cf8d7c 100644 --- a/training/benchmarks/efficientnet/pytorch/train/trainer.py +++ b/training/benchmarks/efficientnet/pytorch/train/trainer.py @@ -18,6 +18,8 @@ CURR_PATH = os.path.abspath(os.path.dirname(__file__)) sys.path.append(os.path.abspath(os.path.join(CURR_PATH, "../../"))) from driver import Driver, Event, dist_pytorch + + class Trainer: def __init__(self, driver: Driver, adapter, evaluator: Evaluator, @@ -46,13 +48,14 @@ def init(self): self.lr_scheduler = create_scheduler(self.config, self.optimizer) self.scaler = self.adapter.create_grad_scaler(self.config) - self.criterion = torch.nn.CrossEntropyLoss(label_smoothing=self.config.label_smoothing) + self.criterion = torch.nn.CrossEntropyLoss( + label_smoothing=self.config.label_smoothing) self.resume() def _init_model(self, model, args, device): model = model.to(device) return model - + def resume(self): args = self.config if args.resume and os.path.isfile(args.resume): @@ -92,7 +95,8 @@ def train_one_epoch(self, dataloader): step_end_time = time.time() step_total_time = step_end_time - step_start_time step_start_time = step_end_time - images_per_second = dist_pytorch.global_batch_size(self.config) / step_total_time + images_per_second = dist_pytorch.global_batch_size( + self.config) / step_total_time other_state["img/s"] = images_per_second if hasattr(self.optimizer, 'loss_scaler'): loss_scale = self.optimizer.loss_scaler.loss_scale @@ -136,8 +140,13 @@ def train_one_epoch(self, dataloader): } if self.scaler: checkpoint["scaler"] = 
self.scaler.state_dict() - utils.save_on_master(checkpoint, os.path.join(self.config.output_dir, f"model_{self.training_state.epoch}.pth")) - utils.save_on_master(checkpoint, os.path.join(self.config.output_dir, "checkpoint.pth")) + utils.save_on_master( + checkpoint, + os.path.join(self.config.output_dir, + f"model_{self.training_state.epoch}.pth")) + utils.save_on_master( + checkpoint, + os.path.join(self.config.output_dir, "checkpoint.pth")) driver.event(Event.EPOCH_END, state.epoch) def train_one_step(self, batch): @@ -146,7 +155,9 @@ def train_one_step(self, batch): state = self.training_state self.model.train() state.loss, state.acc1, state.acc5 = self.forward(batch) - self.adapter.backward(self.config, state.global_steps, state.epoch, state.loss, self.model, self.optimizer, self.scaler) + self.adapter.backward(self.config, state.global_steps, state.epoch, + state.loss, self.model, self.optimizer, + self.scaler) if utils.is_dist_avail_and_initialized(): total = torch.tensor([state.loss, state.acc1, state.acc5], dtype=torch.float32, diff --git a/training/benchmarks/efficientnet/pytorch/train/trainer_adapter.py b/training/benchmarks/efficientnet/pytorch/train/trainer_adapter.py index cee6e563b..2b4bc8f72 100644 --- a/training/benchmarks/efficientnet/pytorch/train/trainer_adapter.py +++ b/training/benchmarks/efficientnet/pytorch/train/trainer_adapter.py @@ -8,6 +8,7 @@ from torch.nn.parallel import DistributedDataParallel as DDP from train import utils + def convert_model(args, model: nn.Module) -> nn.Module: if utils.is_dist_avail_and_initialized() and args.sync_bn: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) @@ -19,15 +20,20 @@ def create_optimizer(args, model): if args.bias_weight_decay is not None: custom_keys_weight_decay.append(("bias", args.bias_weight_decay)) if args.transformer_embedding_decay is not None: - for key in ["class_token", "position_embedding", "relative_position_bias_table"]: - custom_keys_weight_decay.append((key, args.transformer_embedding_decay)) + for key in [ + "class_token", "position_embedding", + "relative_position_bias_table" + ]: + custom_keys_weight_decay.append( + (key, args.transformer_embedding_decay)) parameters = utils.set_weight_decay( model, args.weight_decay, norm_weight_decay=args.norm_weight_decay, - custom_keys_weight_decay=custom_keys_weight_decay if len(custom_keys_weight_decay) > 0 else None, + custom_keys_weight_decay=custom_keys_weight_decay + if len(custom_keys_weight_decay) > 0 else None, ) - + opt_name = args.opt.lower() if opt_name.startswith("sgd"): optimizer = torch.optim.SGD( @@ -38,13 +44,20 @@ def create_optimizer(args, model): nesterov="nesterov" in opt_name, ) elif opt_name == "rmsprop": - optimizer = torch.optim.RMSprop( - parameters, lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay, eps=0.0316, alpha=0.9 - ) + optimizer = torch.optim.RMSprop(parameters, + lr=args.lr, + momentum=args.momentum, + weight_decay=args.weight_decay, + eps=0.0316, + alpha=0.9) elif opt_name == "adamw": - optimizer = torch.optim.AdamW(parameters, lr=args.lr, weight_decay=args.weight_decay) + optimizer = torch.optim.AdamW(parameters, + lr=args.lr, + weight_decay=args.weight_decay) else: - raise RuntimeError(f"Invalid optimizer {args.opt}. Only SGD, RMSprop and AdamW are supported.") + raise RuntimeError( + f"Invalid optimizer {args.opt}. Only SGD, RMSprop and AdamW are supported." 
+ ) return optimizer @@ -61,11 +74,14 @@ def model_to_ddp(args, model: nn.Module) -> nn.Module: model = DDP(model, device_ids=[args.local_rank]) return model + def create_grad_scaler(args): scaler = torch.cuda.amp.GradScaler() if args.amp else None return scaler -def backward(args, step: int, epoch: int, loss: torch.Tensor, model: nn.Module, optimizer: Optimizer, scaler): + +def backward(args, step: int, epoch: int, loss: torch.Tensor, model: nn.Module, + optimizer: Optimizer, scaler): optimizer.zero_grad() if scaler is not None: scaler.scale(loss).backward() diff --git a/training/benchmarks/efficientnet/pytorch/train/training_state.py b/training/benchmarks/efficientnet/pytorch/train/training_state.py index b8b26b23e..7a81bb888 100644 --- a/training/benchmarks/efficientnet/pytorch/train/training_state.py +++ b/training/benchmarks/efficientnet/pytorch/train/training_state.py @@ -77,7 +77,10 @@ def state_dict(self): It contains an entry for every variable in self.__dict__ """ - return {key: value for key, value in self.__dict__.items() if key != '_trainer'} + return { + key: value + for key, value in self.__dict__.items() if key != '_trainer' + } def load_state_dict(self, state_dict): """Loads the schedulers state. diff --git a/training/benchmarks/efficientnet/pytorch/train/utils.py b/training/benchmarks/efficientnet/pytorch/train/utils.py index d12f66f3f..89e5b83ca 100644 --- a/training/benchmarks/efficientnet/pytorch/train/utils.py +++ b/training/benchmarks/efficientnet/pytorch/train/utils.py @@ -61,12 +61,15 @@ def value(self): return self.deque[-1] def __str__(self): - return self.fmt.format( - median=self.median, avg=self.avg, global_avg=self.global_avg, max=self.max, value=self.value - ) + return self.fmt.format(median=self.median, + avg=self.avg, + global_avg=self.global_avg, + max=self.max, + value=self.value) class MetricLogger: + def __init__(self, delimiter="\t"): self.meters = defaultdict(SmoothedValue) self.delimiter = delimiter @@ -83,7 +86,8 @@ def __getattr__(self, attr): return self.meters[attr] if attr in self.__dict__: return self.__dict__[attr] - raise AttributeError(f"'{type(self).__name__}' object has no attribute '{attr}'") + raise AttributeError( + f"'{type(self).__name__}' object has no attribute '{attr}'") def __str__(self): loss_str = [] @@ -108,21 +112,20 @@ def log_every(self, iterable, print_freq, header=None): data_time = SmoothedValue(fmt="{avg:.4f}") space_fmt = ":" + str(len(str(len(iterable)))) + "d" if torch.cuda.is_available(): - log_msg = self.delimiter.join( - [ - header, - "[{0" + space_fmt + "}/{1}]", - "eta: {eta}", - "{meters}", - "time: {time}", - "data: {data}", - "max mem: {memory:.0f}", - ] - ) + log_msg = self.delimiter.join([ + header, + "[{0" + space_fmt + "}/{1}]", + "eta: {eta}", + "{meters}", + "time: {time}", + "data: {data}", + "max mem: {memory:.0f}", + ]) else: - log_msg = self.delimiter.join( - [header, "[{0" + space_fmt + "}/{1}]", "eta: {eta}", "{meters}", "time: {time}", "data: {data}"] - ) + log_msg = self.delimiter.join([ + header, "[{0" + space_fmt + "}/{1}]", "eta: {eta}", "{meters}", + "time: {time}", "data: {data}" + ]) MB = 1024.0 * 1024.0 for obj in iterable: data_time.update(time.time() - end) @@ -141,14 +144,15 @@ def log_every(self, iterable, print_freq, header=None): time=str(iter_time), data=str(data_time), memory=torch.cuda.max_memory_allocated() / MB, - ) - ) + )) else: print( - log_msg.format( - i, len(iterable), eta=eta_string, meters=str(self), time=str(iter_time), data=str(data_time) - ) - ) + log_msg.format(i, + 
len(iterable), + eta=eta_string, + meters=str(self), + time=str(iter_time), + data=str(data_time))) i += 1 end = time.time() total_time = time.time() - start_time @@ -164,13 +168,14 @@ class ExponentialMovingAverage(torch.optim.swa_utils.AveragedModel): """ def __init__(self, model, decay, device="cpu"): + def ema_avg(avg_model_param, model_param, num_averaged): return decay * avg_model_param + (1 - decay) * model_param super().__init__(model, device, ema_avg, use_buffers=True) -def accuracy(output, target, topk=(1,)): +def accuracy(output, target, topk=(1, )): """Computes the accuracy over the k top predictions for the specified values of k""" with torch.no_grad(): maxk = max(topk) @@ -188,6 +193,7 @@ def accuracy(output, target, topk=(1,)): res.append(correct_k * (100.0 / batch_size)) return res + def mkdir(path): try: os.makedirs(path) @@ -260,10 +266,12 @@ def init_distributed_mode(args): torch.cuda.set_device(args.gpu) args.dist_backend = "nccl" - print(f"| distributed init (rank {args.rank}): {args.dist_url}", flush=True) - torch.distributed.init_process_group( - backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size, rank=args.rank - ) + print(f"| distributed init (rank {args.rank}): {args.dist_url}", + flush=True) + torch.distributed.init_process_group(backend=args.dist_backend, + init_method=args.dist_url, + world_size=args.world_size, + rank=args.rank) torch.distributed.barrier() setup_for_distributed(args.rank == 0) @@ -287,7 +295,8 @@ def average_checkpoints(inputs): with open(fpath, "rb") as f: state = torch.load( f, - map_location=(lambda s, _: torch.serialization.default_restore_location(s, "cpu")), + map_location=(lambda s, _: torch.serialization. + default_restore_location(s, "cpu")), ) # Copies over the settings from the first checkpoint if new_state is None: @@ -320,7 +329,10 @@ def average_checkpoints(inputs): return new_state -def store_model_weights(model, checkpoint_path, checkpoint_key="model", strict=True): +def store_model_weights(model, + checkpoint_path, + checkpoint_key="model", + strict=True): """ This method can be used to prepare weights files for new models. It receives as input a model architecture and a checkpoint from the training script and produces @@ -372,7 +384,8 @@ def store_model_weights(model, checkpoint_path, checkpoint_key="model", strict=T # and remove unnecessary weights (such as auxiliaries, etc.) 
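As a quick check of the `accuracy` helper reformatted earlier in this utils.py diff, a hand-computable example that mirrors its top-k steps (the logits below are made up for illustration):

```python
import torch

# Batch of 3 samples over 4 classes; the true class is the top-1 prediction
# for the first two samples and only the top-2 prediction for the third.
output = torch.tensor([[4.0, 1.0, 0.0, 0.0],
                       [0.0, 3.0, 2.0, 0.0],
                       [2.5, 0.0, 3.0, 0.0]])
target = torch.tensor([0, 1, 0])

# Same steps as utils.accuracy(output, target, topk=(1, 2)):
_, pred = output.topk(2, 1, True, True)   # top-2 class indices per sample
correct = pred.t().eq(target[None])       # (k, batch) hit mask
for k in (1, 2):
    correct_k = correct[:k].flatten().sum(dtype=torch.float32)
    print(f"acc@{k}:", float(correct_k * (100.0 / target.size(0))))
# expected: acc@1 ~ 66.67, acc@2 = 100.0
```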
if checkpoint_key == "model_ema": del checkpoint[checkpoint_key]["n_averaged"] - torch.nn.modules.utils.consume_prefix_in_state_dict_if_present(checkpoint[checkpoint_key], "module.") + torch.nn.modules.utils.consume_prefix_in_state_dict_if_present( + checkpoint[checkpoint_key], "module.") model.load_state_dict(checkpoint[checkpoint_key], strict=strict) tmp_path = os.path.join(output_dir, str(model.__hash__())) @@ -446,7 +459,8 @@ def _add_params(module, prefix=""): is_custom_key = True break if not is_custom_key: - if norm_weight_decay is not None and isinstance(module, norm_classes): + if norm_weight_decay is not None and isinstance( + module, norm_classes): params["norm"].append(p) else: params["other"].append(p) @@ -460,5 +474,8 @@ def _add_params(module, prefix=""): param_groups = [] for key in params: if len(params[key]) > 0: - param_groups.append({"params": params[key], "weight_decay": params_weight_decay[key]}) + param_groups.append({ + "params": params[key], + "weight_decay": params_weight_decay[key] + }) return param_groups diff --git a/training/kunlunxin/efficientnet-pytorch/config/config_common.py b/training/kunlunxin/efficientnet-pytorch/config/config_common.py index 212075de6..43296a7ee 100644 --- a/training/kunlunxin/efficientnet-pytorch/config/config_common.py +++ b/training/kunlunxin/efficientnet-pytorch/config/config_common.py @@ -12,7 +12,6 @@ cutmix_alpha = 1.0 weight_decay = 0.00002 norm_weight_decay = 0.0 -model_ema = True ra_sampler = True ra_reps = 4 epochs = 600 diff --git a/training/kunlunxin/efficientnet-pytorch/extern/trainer_adapter.py b/training/kunlunxin/efficientnet-pytorch/extern/trainer_adapter.py index 51008da61..289953195 100644 --- a/training/kunlunxin/efficientnet-pytorch/extern/trainer_adapter.py +++ b/training/kunlunxin/efficientnet-pytorch/extern/trainer_adapter.py @@ -3,7 +3,11 @@ def convert_model(args, model: nn.Module) -> nn.Module: return model + + def model_to_fp16(args, model): return model + + def create_grad_scaler(args): return None diff --git a/training/nvidia/efficientnet-pytorch/config/config_common.py b/training/nvidia/efficientnet-pytorch/config/config_common.py index d550fb4dc..4cbe939fb 100644 --- a/training/nvidia/efficientnet-pytorch/config/config_common.py +++ b/training/nvidia/efficientnet-pytorch/config/config_common.py @@ -12,7 +12,6 @@ cutmix_alpha = 1.0 weight_decay = 0.00002 norm_weight_decay = 0.0 -model_ema = True ra_sampler = True ra_reps = 4 epochs = 600 From ae861095fbd0bbc31ebac82962e821f65c224ae1 Mon Sep 17 00:00:00 2001 From: Feilei Du Date: Mon, 29 May 2023 03:35:43 +0000 Subject: [PATCH 14/24] fix code style --- .../efficientnet/pytorch/train/trainer_adapter.py | 9 +++++++++ .../kunlunxin/efficientnet-pytorch/extern/.gitkeep | 0 .../efficientnet-pytorch/extern/trainer_adapter.py | 13 ------------- 3 files changed, 9 insertions(+), 13 deletions(-) create mode 100644 training/kunlunxin/efficientnet-pytorch/extern/.gitkeep delete mode 100644 training/kunlunxin/efficientnet-pytorch/extern/trainer_adapter.py diff --git a/training/benchmarks/efficientnet/pytorch/train/trainer_adapter.py b/training/benchmarks/efficientnet/pytorch/train/trainer_adapter.py index 2b4bc8f72..5fe3fd09d 100644 --- a/training/benchmarks/efficientnet/pytorch/train/trainer_adapter.py +++ b/training/benchmarks/efficientnet/pytorch/train/trainer_adapter.py @@ -10,6 +10,9 @@ def convert_model(args, model: nn.Module) -> nn.Module: + if args.vendor == 'kunlunxin': + # not support yet + return model if utils.is_dist_avail_and_initialized() and 
args.sync_bn: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) return model @@ -62,6 +65,9 @@ def create_optimizer(args, model): def model_to_fp16(args, model): + if args.vendor == 'kunlunxin': + # not support yet + return model # To prevent OOM for model sizes that cannot fit in GPU memory in full precision if args.fp16: main_proc_print(" > use fp16...") @@ -76,6 +82,9 @@ def model_to_ddp(args, model: nn.Module) -> nn.Module: def create_grad_scaler(args): + if args.vendor == 'kunlunxin': + # not support yet + return None scaler = torch.cuda.amp.GradScaler() if args.amp else None return scaler diff --git a/training/kunlunxin/efficientnet-pytorch/extern/.gitkeep b/training/kunlunxin/efficientnet-pytorch/extern/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/training/kunlunxin/efficientnet-pytorch/extern/trainer_adapter.py b/training/kunlunxin/efficientnet-pytorch/extern/trainer_adapter.py deleted file mode 100644 index 289953195..000000000 --- a/training/kunlunxin/efficientnet-pytorch/extern/trainer_adapter.py +++ /dev/null @@ -1,13 +0,0 @@ -from torch import nn - - -def convert_model(args, model: nn.Module) -> nn.Module: - return model - - -def model_to_fp16(args, model): - return model - - -def create_grad_scaler(args): - return None From fe6a41889650da63734265e99e7453d1d285d90a Mon Sep 17 00:00:00 2001 From: Feilei Du Date: Mon, 29 May 2023 07:23:28 +0000 Subject: [PATCH 15/24] Revert "fix code style" This reverts commit ae861095fbd0bbc31ebac82962e821f65c224ae1. --- .../efficientnet/pytorch/train/trainer_adapter.py | 9 --------- .../kunlunxin/efficientnet-pytorch/extern/.gitkeep | 0 .../efficientnet-pytorch/extern/trainer_adapter.py | 13 +++++++++++++ 3 files changed, 13 insertions(+), 9 deletions(-) delete mode 100644 training/kunlunxin/efficientnet-pytorch/extern/.gitkeep create mode 100644 training/kunlunxin/efficientnet-pytorch/extern/trainer_adapter.py diff --git a/training/benchmarks/efficientnet/pytorch/train/trainer_adapter.py b/training/benchmarks/efficientnet/pytorch/train/trainer_adapter.py index 5fe3fd09d..2b4bc8f72 100644 --- a/training/benchmarks/efficientnet/pytorch/train/trainer_adapter.py +++ b/training/benchmarks/efficientnet/pytorch/train/trainer_adapter.py @@ -10,9 +10,6 @@ def convert_model(args, model: nn.Module) -> nn.Module: - if args.vendor == 'kunlunxin': - # not support yet - return model if utils.is_dist_avail_and_initialized() and args.sync_bn: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) return model @@ -65,9 +62,6 @@ def create_optimizer(args, model): def model_to_fp16(args, model): - if args.vendor == 'kunlunxin': - # not support yet - return model # To prevent OOM for model sizes that cannot fit in GPU memory in full precision if args.fp16: main_proc_print(" > use fp16...") @@ -82,9 +76,6 @@ def model_to_ddp(args, model: nn.Module) -> nn.Module: def create_grad_scaler(args): - if args.vendor == 'kunlunxin': - # not support yet - return None scaler = torch.cuda.amp.GradScaler() if args.amp else None return scaler diff --git a/training/kunlunxin/efficientnet-pytorch/extern/.gitkeep b/training/kunlunxin/efficientnet-pytorch/extern/.gitkeep deleted file mode 100644 index e69de29bb..000000000 diff --git a/training/kunlunxin/efficientnet-pytorch/extern/trainer_adapter.py b/training/kunlunxin/efficientnet-pytorch/extern/trainer_adapter.py new file mode 100644 index 000000000..289953195 --- /dev/null +++ b/training/kunlunxin/efficientnet-pytorch/extern/trainer_adapter.py @@ -0,0 +1,13 @@ +from torch import nn + + 
+def convert_model(args, model: nn.Module) -> nn.Module: + return model + + +def model_to_fp16(args, model): + return model + + +def create_grad_scaler(args): + return None From 6684a5df4c13b62e07dc27e463b9590d835265bd Mon Sep 17 00:00:00 2001 From: Feilei Du Date: Mon, 29 May 2023 07:45:41 +0000 Subject: [PATCH 16/24] fix code style --- training/benchmarks/driver/dist_pytorch.py | 6 ++ .../efficientnet/pytorch/train/evaluator.py | 8 +- .../efficientnet/pytorch/train/trainer.py | 4 +- .../pytorch/train/trainer_adapter.py | 8 +- .../efficientnet/pytorch/train/utils.py | 78 ++----------------- 5 files changed, 27 insertions(+), 77 deletions(-) diff --git a/training/benchmarks/driver/dist_pytorch.py b/training/benchmarks/driver/dist_pytorch.py index 65e6a29d5..c46133a90 100755 --- a/training/benchmarks/driver/dist_pytorch.py +++ b/training/benchmarks/driver/dist_pytorch.py @@ -226,6 +226,12 @@ def format_step(step): s += "Validation Iteration: {} ".format(step[2]) return s +def is_dist_avail_and_initialized(): + if not torch.distributed.is_available(): + return False + if not torch.distributed.is_initialized(): + return False + return True class PyTorchDistributedDataParallel(DDP): diff --git a/training/benchmarks/efficientnet/pytorch/train/evaluator.py b/training/benchmarks/efficientnet/pytorch/train/evaluator.py index da98aacc5..71ab102b2 100644 --- a/training/benchmarks/efficientnet/pytorch/train/evaluator.py +++ b/training/benchmarks/efficientnet/pytorch/train/evaluator.py @@ -1,7 +1,11 @@ import torch import torch.distributed as dist from train import utils - +import os +import sys +CURR_PATH = os.path.abspath(os.path.dirname(__file__)) +sys.path.append(os.path.abspath(os.path.join(CURR_PATH, "../../"))) +from driver import dist_pytorch class Evaluator: @@ -29,7 +33,7 @@ def evaluate(self, trainer): self.__update(loss.item(), acc1.item(), acc5.item(), batch[0].shape[0]) - if utils.is_dist_avail_and_initialized(): + if dist_pytorch.is_dist_avail_and_initialized(): total = torch.tensor([ self.total_loss, self.total_acc1, self.total_acc5, self.total_size diff --git a/training/benchmarks/efficientnet/pytorch/train/trainer.py b/training/benchmarks/efficientnet/pytorch/train/trainer.py index 317cf8d7c..8d84c09ce 100644 --- a/training/benchmarks/efficientnet/pytorch/train/trainer.py +++ b/training/benchmarks/efficientnet/pytorch/train/trainer.py @@ -78,7 +78,7 @@ def train_one_epoch(self, dataloader): step_start_time = time.time() epoch_start_num_sample = state.num_trained_samples - if utils.is_dist_avail_and_initialized(): + if dist_pytorch.is_dist_avail_and_initialized(): dataloader.sampler.set_epoch(state.epoch) for batch_idx, batch in enumerate(dataloader): @@ -158,7 +158,7 @@ def train_one_step(self, batch): self.adapter.backward(self.config, state.global_steps, state.epoch, state.loss, self.model, self.optimizer, self.scaler) - if utils.is_dist_avail_and_initialized(): + if dist_pytorch.is_dist_avail_and_initialized(): total = torch.tensor([state.loss, state.acc1, state.acc5], dtype=torch.float32, device=self.config.device) diff --git a/training/benchmarks/efficientnet/pytorch/train/trainer_adapter.py b/training/benchmarks/efficientnet/pytorch/train/trainer_adapter.py index 2b4bc8f72..ffc66df8b 100644 --- a/training/benchmarks/efficientnet/pytorch/train/trainer_adapter.py +++ b/training/benchmarks/efficientnet/pytorch/train/trainer_adapter.py @@ -7,10 +7,14 @@ from typing import Tuple from torch.nn.parallel import DistributedDataParallel as DDP from train import utils - +import os +import 
sys +CURR_PATH = os.path.abspath(os.path.dirname(__file__)) +sys.path.append(os.path.abspath(os.path.join(CURR_PATH, "../../"))) +from driver import dist_pytorch def convert_model(args, model: nn.Module) -> nn.Module: - if utils.is_dist_avail_and_initialized() and args.sync_bn: + if dist_pytorch.is_dist_avail_and_initialized() and args.sync_bn: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) return model diff --git a/training/benchmarks/efficientnet/pytorch/train/utils.py b/training/benchmarks/efficientnet/pytorch/train/utils.py index 89e5b83ca..88c3779e3 100644 --- a/training/benchmarks/efficientnet/pytorch/train/utils.py +++ b/training/benchmarks/efficientnet/pytorch/train/utils.py @@ -3,12 +3,17 @@ import errno import hashlib import os +import sys import time from collections import defaultdict, deque, OrderedDict from typing import List, Optional, Tuple import torch import torch.distributed as dist +CURR_PATH = os.path.abspath(os.path.dirname(__file__)) +sys.path.append(os.path.abspath(os.path.join(CURR_PATH, "../../"))) +from driver import dist_pytorch + class SmoothedValue: @@ -202,80 +207,11 @@ def mkdir(path): raise -def setup_for_distributed(is_master): - """ - This function disables printing when not in master process - """ - import builtins as __builtin__ - - builtin_print = __builtin__.print - - def print(*args, **kwargs): - force = kwargs.pop("force", False) - if is_master or force: - builtin_print(*args, **kwargs) - - __builtin__.print = print - - -def is_dist_avail_and_initialized(): - if not dist.is_available(): - return False - if not dist.is_initialized(): - return False - return True - - -def get_world_size(): - if not is_dist_avail_and_initialized(): - return 1 - return dist.get_world_size() - - -def get_rank(): - if not is_dist_avail_and_initialized(): - return 0 - return dist.get_rank() - - -def is_main_process(): - return get_rank() == 0 - - def save_on_master(*args, **kwargs): - if is_main_process(): + if dist_pytorch.is_main_process(): torch.save(*args, **kwargs) -def init_distributed_mode(args): - if "RANK" in os.environ and "WORLD_SIZE" in os.environ: - args.rank = int(os.environ["RANK"]) - args.world_size = int(os.environ["WORLD_SIZE"]) - args.gpu = int(os.environ["LOCAL_RANK"]) - elif "SLURM_PROCID" in os.environ: - args.rank = int(os.environ["SLURM_PROCID"]) - args.gpu = args.rank % torch.cuda.device_count() - elif hasattr(args, "rank"): - pass - else: - print("Not using distributed mode") - args.distributed = False - return - - args.distributed = True - - torch.cuda.set_device(args.gpu) - args.dist_backend = "nccl" - print(f"| distributed init (rank {args.rank}): {args.dist_url}", - flush=True) - torch.distributed.init_process_group(backend=args.dist_backend, - init_method=args.dist_url, - world_size=args.world_size, - rank=args.rank) - torch.distributed.barrier() - setup_for_distributed(args.rank == 0) - - def average_checkpoints(inputs): """Loads checkpoints from inputs and returns a model with averaged weights. Original implementation taken from: https://github.com/pytorch/fairseq/blob/a48f235636557b8d3bc4922a6fa90f3a0fa57955/scripts/average_checkpoints.py#L16 @@ -405,7 +341,7 @@ def store_model_weights(model, def reduce_across_processes(val): - if not is_dist_avail_and_initialized(): + if not dist_pytorch.is_dist_avail_and_initialized(): # nothing to sync, but we still convert to tensor for consistency with the distributed case. 
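Taken together, this patch centralizes the distributed guard in `driver/dist_pytorch.py` and routes utils.py through it. A self-contained sketch of the resulting fallback pattern; the `all_reduce` tail follows the torchvision reference these helpers are derived from, and the `device="cuda"` choice is an assumption:

```python
import torch
import torch.distributed as dist


def is_dist_avail_and_initialized():
    # The guard added to driver/dist_pytorch.py: treat the run as
    # single-process unless torch.distributed is both available and set up.
    return dist.is_available() and dist.is_initialized()


def reduce_across_processes(val):
    if not is_dist_avail_and_initialized():
        # Nothing to sync; still convert to tensor for a consistent return type.
        return torch.tensor(val)
    t = torch.tensor(val, device="cuda")
    dist.barrier()       # make sure every rank reaches the reduction
    dist.all_reduce(t)   # in-place sum across ranks
    return t
```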
return torch.tensor(val) From 746377a9ffaed077ec5e77d6dee7f0bf6dc18354 Mon Sep 17 00:00:00 2001 From: Feilei Du Date: Mon, 29 May 2023 07:55:36 +0000 Subject: [PATCH 17/24] fix code style --- .../benchmarks/efficientnet/pytorch/dataloaders/dataloader.py | 4 ++-- training/benchmarks/efficientnet/pytorch/run_pretraining.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/training/benchmarks/efficientnet/pytorch/dataloaders/dataloader.py b/training/benchmarks/efficientnet/pytorch/dataloaders/dataloader.py index 2bcc83841..fe78a8d20 100644 --- a/training/benchmarks/efficientnet/pytorch/dataloaders/dataloader.py +++ b/training/benchmarks/efficientnet/pytorch/dataloaders/dataloader.py @@ -59,7 +59,7 @@ def build_train_dataloader(train_dataset, args): """Training dataloaders.""" dist_pytorch.main_proc_print('building train dataloaders ...') - if utils.is_dist_avail_and_initialized(): + if dist_pytorch.is_dist_avail_and_initialized(): if hasattr(args, "ra_sampler") and args.ra_sampler: train_sampler = RASampler(train_dataset, shuffle=True, @@ -108,7 +108,7 @@ def build_eval_dataloader(eval_dataset, args): """Training and validation dataloaders.""" dist_pytorch.main_proc_print('building eval dataloaders ...') - if utils.is_dist_avail_and_initialized(): + if dist_pytorch.is_dist_avail_and_initialized(): val_sampler = torch.utils.data.distributed.DistributedSampler( eval_dataset, shuffle=False, drop_last=True) dist_pytorch.main_proc_print( diff --git a/training/benchmarks/efficientnet/pytorch/run_pretraining.py b/training/benchmarks/efficientnet/pytorch/run_pretraining.py index 9a61f9303..daef61526 100755 --- a/training/benchmarks/efficientnet/pytorch/run_pretraining.py +++ b/training/benchmarks/efficientnet/pytorch/run_pretraining.py @@ -80,7 +80,7 @@ def main() -> Tuple[Any, Any]: while training_state.epoch < config.epochs and \ not training_state.end_training: trainer.train_one_epoch(train_dataloader) - training_state.epoch += epoch + training_state.epoch += 1 model_driver.event(Event.TRAIN_END) raw_train_end_time = logger.previous_log_time From b3d978640f6e86d5e9a827a5fd4715e9868191e1 Mon Sep 17 00:00:00 2001 From: Feilei Du Date: Mon, 29 May 2023 07:57:50 +0000 Subject: [PATCH 18/24] fix code style --- training/benchmarks/driver/dist_pytorch.py | 2 ++ training/benchmarks/efficientnet/pytorch/train/evaluator.py | 2 ++ .../benchmarks/efficientnet/pytorch/train/trainer_adapter.py | 2 ++ training/benchmarks/efficientnet/pytorch/train/utils.py | 2 +- 4 files changed, 7 insertions(+), 1 deletion(-) diff --git a/training/benchmarks/driver/dist_pytorch.py b/training/benchmarks/driver/dist_pytorch.py index c46133a90..6c824c422 100755 --- a/training/benchmarks/driver/dist_pytorch.py +++ b/training/benchmarks/driver/dist_pytorch.py @@ -226,6 +226,7 @@ def format_step(step): s += "Validation Iteration: {} ".format(step[2]) return s + def is_dist_avail_and_initialized(): if not torch.distributed.is_available(): return False @@ -233,6 +234,7 @@ def is_dist_avail_and_initialized(): return False return True + class PyTorchDistributedDataParallel(DDP): def named_parameters(self, prefix: str = '', recurse: bool = True): diff --git a/training/benchmarks/efficientnet/pytorch/train/evaluator.py b/training/benchmarks/efficientnet/pytorch/train/evaluator.py index 71ab102b2..b0b0cbd79 100644 --- a/training/benchmarks/efficientnet/pytorch/train/evaluator.py +++ b/training/benchmarks/efficientnet/pytorch/train/evaluator.py @@ -3,10 +3,12 @@ from train import utils import os import sys + CURR_PATH = 
os.path.abspath(os.path.dirname(__file__)) sys.path.append(os.path.abspath(os.path.join(CURR_PATH, "../../"))) from driver import dist_pytorch + class Evaluator: def __init__(self, args, dataloader): diff --git a/training/benchmarks/efficientnet/pytorch/train/trainer_adapter.py b/training/benchmarks/efficientnet/pytorch/train/trainer_adapter.py index ffc66df8b..7cefd0970 100644 --- a/training/benchmarks/efficientnet/pytorch/train/trainer_adapter.py +++ b/training/benchmarks/efficientnet/pytorch/train/trainer_adapter.py @@ -9,10 +9,12 @@ from train import utils import os import sys + CURR_PATH = os.path.abspath(os.path.dirname(__file__)) sys.path.append(os.path.abspath(os.path.join(CURR_PATH, "../../"))) from driver import dist_pytorch + def convert_model(args, model: nn.Module) -> nn.Module: if dist_pytorch.is_dist_avail_and_initialized() and args.sync_bn: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) diff --git a/training/benchmarks/efficientnet/pytorch/train/utils.py b/training/benchmarks/efficientnet/pytorch/train/utils.py index 88c3779e3..d7bdb68cb 100644 --- a/training/benchmarks/efficientnet/pytorch/train/utils.py +++ b/training/benchmarks/efficientnet/pytorch/train/utils.py @@ -10,12 +10,12 @@ import torch import torch.distributed as dist + CURR_PATH = os.path.abspath(os.path.dirname(__file__)) sys.path.append(os.path.abspath(os.path.join(CURR_PATH, "../../"))) from driver import dist_pytorch - class SmoothedValue: """Track a series of values and provide access to smoothed values over a window or the global series average. From a70db8df6d40ccc808ebd203a1272a55c10a8209 Mon Sep 17 00:00:00 2001 From: Feilei Du Date: Mon, 29 May 2023 08:00:09 +0000 Subject: [PATCH 19/24] fix code style --- .../kunlunxin/efficientnet-pytorch/config/config_R300x1x1.py | 4 ---- .../kunlunxin/efficientnet-pytorch/config/config_R300x1x2.py | 4 ---- .../kunlunxin/efficientnet-pytorch/config/config_R300x1x4.py | 4 ---- .../nvidia/efficientnet-pytorch/config/config_A100x1x1.py | 4 ---- .../nvidia/efficientnet-pytorch/config/config_A100x1x2.py | 4 ---- .../nvidia/efficientnet-pytorch/config/config_A100x1x4.py | 4 ---- 6 files changed, 24 deletions(-) delete mode 100644 training/kunlunxin/efficientnet-pytorch/config/config_R300x1x1.py delete mode 100644 training/kunlunxin/efficientnet-pytorch/config/config_R300x1x2.py delete mode 100644 training/kunlunxin/efficientnet-pytorch/config/config_R300x1x4.py delete mode 100644 training/nvidia/efficientnet-pytorch/config/config_A100x1x1.py delete mode 100644 training/nvidia/efficientnet-pytorch/config/config_A100x1x2.py delete mode 100644 training/nvidia/efficientnet-pytorch/config/config_A100x1x4.py diff --git a/training/kunlunxin/efficientnet-pytorch/config/config_R300x1x1.py b/training/kunlunxin/efficientnet-pytorch/config/config_R300x1x1.py deleted file mode 100644 index 0b0a0006f..000000000 --- a/training/kunlunxin/efficientnet-pytorch/config/config_R300x1x1.py +++ /dev/null @@ -1,4 +0,0 @@ -from config_common import * - -train_batch_size = 128 -eval_batch_size = 128 diff --git a/training/kunlunxin/efficientnet-pytorch/config/config_R300x1x2.py b/training/kunlunxin/efficientnet-pytorch/config/config_R300x1x2.py deleted file mode 100644 index ee760dece..000000000 --- a/training/kunlunxin/efficientnet-pytorch/config/config_R300x1x2.py +++ /dev/null @@ -1,4 +0,0 @@ -from config_common import * - -train_batch_size = 128 -eval_batch_size = 128 \ No newline at end of file diff --git a/training/kunlunxin/efficientnet-pytorch/config/config_R300x1x4.py 
b/training/kunlunxin/efficientnet-pytorch/config/config_R300x1x4.py
deleted file mode 100644
index 0b0a0006f..000000000
--- a/training/kunlunxin/efficientnet-pytorch/config/config_R300x1x4.py
+++ /dev/null
@@ -1,4 +0,0 @@
-from config_common import *
-
-train_batch_size = 128
-eval_batch_size = 128
diff --git a/training/nvidia/efficientnet-pytorch/config/config_A100x1x1.py b/training/nvidia/efficientnet-pytorch/config/config_A100x1x1.py
deleted file mode 100644
index 0b0a0006f..000000000
--- a/training/nvidia/efficientnet-pytorch/config/config_A100x1x1.py
+++ /dev/null
@@ -1,4 +0,0 @@
-from config_common import *
-
-train_batch_size = 128
-eval_batch_size = 128
diff --git a/training/nvidia/efficientnet-pytorch/config/config_A100x1x2.py b/training/nvidia/efficientnet-pytorch/config/config_A100x1x2.py
deleted file mode 100644
index 0b0a0006f..000000000
--- a/training/nvidia/efficientnet-pytorch/config/config_A100x1x2.py
+++ /dev/null
@@ -1,4 +0,0 @@
-from config_common import *
-
-train_batch_size = 128
-eval_batch_size = 128
diff --git a/training/nvidia/efficientnet-pytorch/config/config_A100x1x4.py b/training/nvidia/efficientnet-pytorch/config/config_A100x1x4.py
deleted file mode 100644
index 0b0a0006f..000000000
--- a/training/nvidia/efficientnet-pytorch/config/config_A100x1x4.py
+++ /dev/null
@@ -1,4 +0,0 @@
-from config_common import *
-
-train_batch_size = 128
-eval_batch_size = 128

From b672228e20f9f853522b2bebecda2e0892e134ff Mon Sep 17 00:00:00 2001
From: Feilei Du
Date: Mon, 29 May 2023 08:13:28 +0000
Subject: [PATCH 20/24] fix code style

---
 .../kunlunxin/efficientnet-pytorch/config/config_R300x1x8.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/training/kunlunxin/efficientnet-pytorch/config/config_R300x1x8.py b/training/kunlunxin/efficientnet-pytorch/config/config_R300x1x8.py
index 0b0a0006f..fbaea0e5d 100644
--- a/training/kunlunxin/efficientnet-pytorch/config/config_R300x1x8.py
+++ b/training/kunlunxin/efficientnet-pytorch/config/config_R300x1x8.py
@@ -1,4 +1,4 @@
 from config_common import *
 
-train_batch_size = 128
-eval_batch_size = 128
+train_batch_size = 64
+eval_batch_size = 64

From 352f4a641b74cc07de1438cbfd49006e5e745105 Mon Sep 17 00:00:00 2001
From: Feilei Du
Date: Mon, 5 Jun 2023 08:29:48 +0000
Subject: [PATCH 21/24] add standard case readme

---
 training/benchmarks/efficientnet/pytorch/config/_base.py | 4 ++--
 training/nvidia/efficientnet-pytorch/README.md           | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/training/benchmarks/efficientnet/pytorch/config/_base.py b/training/benchmarks/efficientnet/pytorch/config/_base.py
index c9e71c291..a383a00b0 100644
--- a/training/benchmarks/efficientnet/pytorch/config/_base.py
+++ b/training/benchmarks/efficientnet/pytorch/config/_base.py
@@ -122,8 +122,8 @@
 seed: int = 41
 
-# Stop training after reaching this accuracy TODO
-target_acc1: float = 84.228
+# Stop training after reaching this accuracy
+target_acc1: float = 82.672
 
 # Sample to begin performing eval.
 eval_iter_start_samples: int = 100
diff --git a/training/nvidia/efficientnet-pytorch/README.md b/training/nvidia/efficientnet-pytorch/README.md
index 44c958efa..cf3aafa45 100644
--- a/training/nvidia/efficientnet-pytorch/README.md
+++ b/training/nvidia/efficientnet-pytorch/README.md
@@ -21,7 +21,7 @@
 | 1 node x 1 GPU   | config_A100x1x1 | | | | | |
 | 1 node x 2 GPUs  | config_A100x1x2 | | | | | |
 | 1 node x 4 GPUs  | config_A100x1x4 | | | | | |
-| 1 node x 8 GPUs  | config_A100x1x8 | | | | | |
+| 1 node x 8 GPUs  | config_A100x1x8 | 328383.49 | 82.672 | 82.672 | 750600 | 2340.6 |
 | 2 nodes x 8 GPUs | config_A100x2x8 | | | | | |
 
 ### License

From b6ba09332d2e7b3bc4156c2f33a9507842454ee6 Mon Sep 17 00:00:00 2001
From: Feilei Du
Date: Tue, 6 Jun 2023 03:25:00 +0000
Subject: [PATCH 22/24] fix code style

---
 training/nvidia/efficientnet-pytorch/README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/training/nvidia/efficientnet-pytorch/README.md b/training/nvidia/efficientnet-pytorch/README.md
index cf3aafa45..9aa0942de 100644
--- a/training/nvidia/efficientnet-pytorch/README.md
+++ b/training/nvidia/efficientnet-pytorch/README.md
@@ -24,6 +24,8 @@
 | 1 node x 8 GPUs  | config_A100x1x8 | 328383.49 | 82.672 | 82.672 | 750600 | 2340.6 |
 | 2 nodes x 8 GPUs | config_A100x2x8 | | | | | |
 
+The [official accuracy](https://github.com/pytorch/vision/blob/main/torchvision/models/efficientnet.py#L669) is 84.228; trained with the [official recipe](https://github.com/pytorch/vision/blob/main/references/classification/README.md), this case converges to 82.672. Closing the gap is scheduled as follow-up work.
+
 ### License

From 23f5f58b7b80587a01eae9b6b29febf834147d80 Mon Sep 17 00:00:00 2001
From: Feilei Du
Date: Tue, 18 Jul 2023 14:23:25 +0800
Subject: [PATCH 23/24] add efficientnet xpu case

---
 .../kunlunxin/efficientnet-pytorch/README.md    |   7 +++++--
 training/kunlunxin/efficientnet-pytorch/acc.png | Bin 0 -> 25813 bytes
 2 files changed, 5 insertions(+), 2 deletions(-)
 create mode 100644 training/kunlunxin/efficientnet-pytorch/acc.png

diff --git a/training/kunlunxin/efficientnet-pytorch/README.md b/training/kunlunxin/efficientnet-pytorch/README.md
index 1e0a9e617..4a3af61f8 100644
--- a/training/kunlunxin/efficientnet-pytorch/README.md
+++ b/training/kunlunxin/efficientnet-pytorch/README.md
@@ -20,14 +20,17 @@
 
 ### Results
 
-| Training setup | Config file | Runtime (s) | Target acc1 | Converged acc1 | Steps | Throughput(samples/s) |
+| Training setup | Config file | Runtime (s) | Target acc1 | Converged acc1 | Steps | Throughput (samples/s)|
 | -------- | --------------- | ----------- | -------- | -------- | ------- | ---------------- |
 | 1 node x 1 GPU   | config_R300x1x1 | | | | | |
 | 1 node x 2 GPUs  | config_R300x1x2 | | | | | |
 | 1 node x 4 GPUs  | config_R300x1x4 | | | | | |
-| 1 node x 8 GPUs  | config_R300x1x8 | | | | | |
+| 1 node x 8 GPUs  | config_R300x1x8 | | 82.672 | 72.666 | 868540 | |
 | 2 nodes x 8 GPUs | config_R300x2x8 | | | | | |
 
+### Convergence curve
+![acc](acc.png)
+
 ### License
 
 Apache 2.0 license.

diff --git a/training/kunlunxin/efficientnet-pytorch/acc.png b/training/kunlunxin/efficientnet-pytorch/acc.png
new file mode 100644
index 0000000000000000000000000000000000000000..8256f6909f25a4b9bc424723c6c0ec73bbcbb86a
GIT binary patch
literal 25813
[25,813 bytes of encoded PNG payload omitted: acc.png is the top-1 accuracy convergence curve for the Kunlunxin R300x1x8 run]
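[Editor's note] PATCH 23 reports a converged top-1 of 72.666 against the 82.672 target set in PATCH 21. The target matters because `target_acc1` is what ends the `while ... not training_state.end_training` loop in run_pretraining.py: once an eval pass reaches it, training stops early. A minimal sketch of that stop condition, with assumed names (`eval_acc1` and the helper's shape are illustrations; the real check lives in the benchmark's trainer and TrainingState):

```python
# Sketch only -- `eval_acc1` and this helper's signature are assumptions;
# in the benchmark the check sits inside Trainer, next to the Evaluator call.
def update_convergence(training_state, config, eval_acc1: float) -> None:
    training_state.eval_acc1 = eval_acc1
    if eval_acc1 >= config.target_acc1:  # 82.672 for this case
        # Flipping this flag exits the epoch loop in run_pretraining.py.
        training_state.end_training = True
```

Since the R300 run above never reached the target, it presumably ended via another bound (e.g. `max_samples_termination`) rather than this check.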
zIxL$?fKP~uFTD4hcMALOcgkD6o^!@X!L2!)IqCE@5vUaK^^W1>fs1&@Rxpmu}Zj}WuX!BXVxE&`}wV_ zFI=L#u2tg%lNc;3Om!Wx6J_1$CoOv{LuZOk@q!*X1|h);B})xHX1^87xmWU7GZ|y; zI~|?y-iat{D`Yq`(udHm8*|qV2=qvXo0b!LB&mq=it4#{w z%nft4*<*iTXsBoByvk)mR>NX#F+WF zaWi04{?4~OrEVnk;kC+@el)}F{eP>GW{BGM{_01XCiPsr^>V>$(wM`L7`(6{G z%#jx#869(*LiFX_g$o+!4H!WbQsJg#TH5w`z(#qzo9%FYyxp0qchG-)yzZ#+zjakp zR|n6v`f&UB`0zL`5kZj92uy0`trzQAO+@7y9TxfEm#s$Z%vzf8f9^yLm^9rB~ReXW?#I7Y#fSKi4lf8#Wfn z2FsJ|Y!Xap>LuOS=JerzRZj6rzo|lp8i$BcE>L8rZ_5&;tkODybl$$yRZw^l85xO3 zNC@khG}6@6w4bXCJ~*)d_U#*jhUsAo?5HJO(2X9j@Fmmx;eQ`+um$a-j3BY}63~*> zS_RlKocz3$!E~mK=xCAzR)QfMU+_sgdMzynRaP$WNPIgo!Zgr>RGB9Qe8N74G*?v6 zI?Z9Ev$e?!6W=2|gE`i#Yw`?y*Z>7o#d{8HJY(N)XoMq(8e5NrGLJg@dr(0-{sbM%)R zh-0M#8GUeBnD+R3gyQ$b|QJ7leWEb_B6;>p(r%>7~d`wH7v>3uJi-Pw8^ zWKS}@K+6jj=Vx?nOg=CMOD7^_<1u<89yx4PhwtVa&vG90*(Y%?#dp8HdGfG7_#EI! zE6qeW84{xc)&aI78!aqupU)@C{i582?&hGoA%J}&1y-KOJmu@*+}S%vO+A~hrMa}V1g%`kYx57Z*3Kg_EWMQ3hLT6qQu?$scu<_>>gS0dX3#7RdhR zqtu9J*`MDqt{@N_g60g53Vu#XdG^eO2F13J<%r|g!&|LBfz8E&Ed^%^Y`TZ6y8E0! zb|39y9OzDE=|E!*z6l)_Z3TUFUg@Vli1L#%CbG7Dzy1sRvGTVdvCG3U77-wTxByeYHi~HUK-hGla*GV;Mcm+(zcmn zDEjT6YGY>ny-dP*{D#{VVuFSYUk(|4(n4ITJ1?Dn-4_@>U(5eJ$2h_drnj=!p41!6 zuk$vmEDb!JjI|i?HvxgV#1m}N$74}eBUX+wKJSC2AO?IKt=hs=Q(5^v_7T)oeZ$?tAAK$KJ zZes6lhr=QjKcX4c9Q8P%B;Ct?9C==H!jKY&0r^70TU_<98j3kt{o$>$&>rpQs3@Pc zFhXx2tRdzw0fE#DmGc8&S1dY}koExH4c;}_9 z8Bmlhq4c9APD=#^iOH}E_y z;l#B0d*!1Cs*JHLX8DS_;k5jqwVC``=T95F)sUUmo(Kp22+mALu!^9uZ1!(R@z@{- zD%g6ah?1euxVK_7ad&$zylh(HE_H6r#g1#cJ@M9!?uu-&N(siI1ILLcOSu8PhT2iKT>@bo<&lXHfm41SGdzecP0#)flb<&4)x<= z_l*1s`Z~MAfI-`yB9_kk!UV{R`4bd}@9aR|UCVT48|yW?LU^M_*o^hV;QbC^QOlz=eU z!qSq$!H1f-SNXaw1j}#w~ z6MfD`rH~uG<+_4IdcZ}(F&V%g^yMama+!5rf!Z9#W8Pil0Age;S|6OZBM!19WjpLB zyd0&F=_QWU2^Hs%7i!vmh1v1*S)+dpzLEOkY}W|>Ji{j9Dqu2i)}z?Hfkfj0HaRur z(z1+)k3VgV#KOpU!fxgHO1TiXaLQ+VgPQt`G)M7fliu3nwY$L3-{Nc!G@fG=(S9AO zF?|fD*s3>1Ju@7~;xnM=cyJmtODGUC5=*!^ckf8a0Y$4jK9-h;9(l zJG8w?9&FO=oApp1^v^n|o!5tsrq`3t9(P12wd-Q>{GO%e&UM4j*~F{pf{vlxB*8HK zd4_J|f;S&GVj14YDApQLd$3p(2G8#wwH$8)i4+w-f9ild^DzM-;S`Vpt$1%>$i|Sp z=q%POv7RkorW8ZG@ykFixY`Yj{1{hfEHTJeDbqR3Tozz%sk=Y+Sh+*o677h!7*{+T zC>`Jdk+U;X`+3zEd7;A^Kxc`rUOZ-|pcGX6T5)dIp*Jaw>EisMv!$NuNB_+Lo8g-x z;&ao54)JLmD)*&~3u4uoQg>1@maIn!W2 zWzg_j_BQD^ZxCh;%y|ccfW`jgAAr=k@Et~{7VCfhyfg!8nJr{AXr8|fRl)|my^+n) z>bO*7=wc6_SfI?dA5pOK?Wzpk6JE{FEfLFhGfz#}TtM!`F}BMo6AQgW|5wB43 zP5hrEzb;f(}`kb6-a%e2ETW z+mZTSe=>xkA=ii16nG^WbkW>PwxZmgMP5SaY_-WuOO)IgX+fl-rlmQ;rxp=;5^t`r zkkAlUd_{B9p}sIl%S4iCS{Z0X{6f{UnzRpxxC;J_GQ)Y1E#q+n5lw2&`FYO*pm@9#4CvV8v*p>$M6kY#*1U)|!)8^fQm_!YDb+cA02p4tB%z>UB9 zI?Ob-li5o0(bamKY!nX7!LEjs%w^yS@=e>GBKn^aOQ+~9SFkdt=5P%S)tEh=XfLkyqGM<2xip?{HQl-L?|>6Y zB3@j*9W5{eS*B^+^roHHEXZ!+e1_Wxl=R+2N?I#blUy;YYMRj z=JY{MUb|?8@~U~760myz1wGa4gp@DqSny6`|#`CCOzC1^ZtU~pzZm$1e1BY84W6zBADJTXoUph zFkIKI$H5SN1lAL=Gk5w(p4DvsTKrbeM7Fe=o~PXRbMFsif6pEXd-XfLF2^lD6X{N4 zqu%RLXr!mU-E`ow-O$5U(3ZoF=$ykz0JQLCCw0-A{tSA+x1rFyDwqSaVc<^P>cX)L zxo^7I1jAn3s4c|3ArL-$s}SQcUaiU!#dl=cHz{&9l9B~w!KHCS_jkpW-RU^V)mp~f zug{s}yQ~}~jPEG~d9%!$m(Daw+%K6A)LV@VLdA^!o_fFZY?goE-GG-ucy9uCAFjoN z{(N)gzdvig^mwDrIBzx*mAXJ-2}Rq>{tACq((vkQ5wBjoH#bY5DU>%br;A<%F{3OH z8(l!GTko&El{AU4yqR45U<0XQZ{{1NpyyfBb(}5COG&kU2_2UX;dZKQ4Pypf z@a(ye>BYERp7cac+%9e9d{}@?R0FYv7Vp? 
z!c~ynO7C=uw%d9nEQMaL(nRvhv3{iq_;695dzRn!&vS^wV*Ny0%Vp3bkWdRFPe+ka zYp3)2@~(HnUo5Hqr7p5(y~^&;31uX!iko>seE?H-)z)G422nzvm*dPe5u~Vmnv>j zFMi_E!ZD)jZ6 z=rMgb%Hg#a@N_~sf?*Yuf=(tYG*Q`O1xLl3>#b^#MJ`g=Xr*Ob^$Md&#GN#9S(F_Y zBj^<-hIN|j`l{;pPG^FS@gAi^Qh-_t7A57_?!@=oe(^_1S68m0qM{{_S;IN|dJ;FA zkxc&BPZYipHDh9#W8(0vy{W?U2{}qI7CHGlXnTOqNf*lC=%^*^>ToF#b{OpY7ROfU zyb_uB&}v7|yP47=_ZeYZ6SR)Nfn{@ghhohIrrznaUA*u8GV9W~g!d>F^Q%H^#_WSF z&dpV=d+N4nyC1O%$p-Oey3|et9k23omIq%blx0J0bb|oKLp}UA040iFU6My7IY;8C z`i6*G{2F07ZD{Dti2j{Q6Ff*m^&& zu*-dxN`C*)(tuGTHLmmeSC<|2#?BQ8VaKwBqyoN_P`(@dL>cz3%+;s@k=@IN2>BhU zP8-@Gw{6Tf`%H?UJSXhwnX*pZrq>d)1>%W%V z#zbjpX-|N<8JEQX7W|aZ*gw9?ynnLBN@bzmVMq#7w0E^D6!FxPHp$+|`ir&b7T>Pd zp`;4T^xTo8PiRn4uyX^Gd8IyQQU9E2D+>Cm{#u6jCHun_E=5#UGzPA5kLTR5E#s2O zNM^$t>CZWv?8YrW&`PmbWFpA{b(uR7h*IE_s;*I=PrTc0>g{uB3Ep^?Yu>^d2SizS z9PP(HI1RJ?V`Y6Op<}Xztqo03{*0C{J5|K={LSp&x%d)un5tOjYe^xJhw7)v*N{Uu z5F^h`_zDICSh$agM(Z?+nxYc#jnzh)`)QDhnY z7W3Zlh0B)+Br)1tK7tLKQZvp-C;Lc?8bZ*gCEqC%>3Z*IJe{0@2H zo<2T;*Gq1gsm6g#LB>G%2NdvUBr7J`17HtYniENhuJ7}av+GpSYNRI#esXTjPSNanMUX`*HFNRZWRGEns7RINDI5bxo(salNU`Ti`?Q z>aV`Acw_KM;||&5zUU{SVvdqL92DI_PNiFY_VuiQE`H+v{tnk$uNTtmA5xA53eb;x zZL%lR>!@>$&)+uP_Y(9T#e^49BAHZGu!`#QcMF&!AFK@UvZ{brZ=?LG0<8phuI&E? zjZMTnU&ueU$+y#RQ@a<~nSc{JOD|X_lZ>%1D*m}+P*Y(9{SE|zt=|%5e|!JA+vn< zOhr%_4fm$+lux!uVZJ{V*K5HLC=OQkn>D`WYU+j&|jNKU>sH5$d<8E1lMlOnm!T@DSM`q4io_|j~(o`-C`&EodX@Bw|Df7b(P* z64oDE!Z^%YgJ=A?GKob}5u_{XcUkV7iOYIE2+M)*Pm81*jo2xdxw5zq$X6nm=y@v_ zGpj)X(SSvPI(DgE9V(E|kL8Fy9R*p>Rhs4vCEVFhd7aE3a#-M4TvlEA6r0>xGv9;n z@2HGB{Q2V%8C?B`paCm0fu&am%;UlH>T>L#zlJJ>JKy%i4y+ia(2Jz z??-#KgT!AkYJ}M&3`hK3F0y7kPR8+4bz?u|v_cV~Qu@!_g-`->290GOa`7KSdK=aC z5S-^D=Yw1eBAy|#B6;Dx@nC9{7=J*;K0@_AG>{*FxaCwKpy?p;=ZJw(Qvmn0D@t${ zRGm|%4^QUBG^%*JwkW5&Za306;CFK$o*3DH$h;D)73v)1juxYNyv9vkd6+&xl*jWG z|CG+|nt7|CMD>29TQ=zHSR>)h2RE=v$DNSdTW-dhU#I-$1pSLz310W2_PnHAR@0u} z4zZWGTXAzAvc}ZiX29bdqg^MC?*TPsbpTulFItUZA{ zyU34BtVlrcqJampi#^g7)odaqJB1af>s}!bw|3w8j*B|q*PZmd)!K)i?V+^GTJR+A zyxS-|L%#4C%doX>NtagpNa`0#Pt-C~$y&|Cl;a?P&aPgfVy?7VifCk;${L0TWmAkW zH<_05-)jx_{+?h( z!_u8Yt$=53?4g2~I&QnX$Cwr94SL6S23;wRv#NaiHVcQhTIWMBPS5gst;V_UbMmhy zSgQE+vO;DJyGm<^8ZR;Ueth|yq_p;ceAL*Cq zehH*;ctaKW1N457o$&E0f3^gIhZUphFf?CEZ(BU<_`5g2DRNQp<@l}``8NmgkOq>B zE0Im7TeA<~DE`CkJIS)hYPmlmI;}`6jh!UA!Z3=G#rl1unX0xjpCWQbCRVDiu4tM= zD^#}qu`{Ozb|iSkpG}TJ{TuTj%mzaz>B#5)` zq8KaVPBFi7=%ah6E<2hiXp>hy4mRJjr>LM%=yRbFa9SmfZkW!39?37cX~`Xl)tDc; zZ+(t<7KdO$UbR?85dQfy>bLKh=Go&MTb}xfv$E3C_k-!CrPa@k$WRL~=}UW~9Xr40 zR?s(8hL{?XaR2;(-WSdMiVgq#LXN|p&oiZ|(Q`rV^^%cVt8inf9}BIenueD0QG&{@ zju5842Y#|F&x9o%U9OnKo1OF0{DJmLtsf>Z#VgOan;y!}@TM7Y()zIMlE!Xcd`QAY zyNJ0`@0?KlBeppA;$k+#V_QO|^^Pjy6+l^TcT0)18WrpD@dy~Cocswy@RCIS$GKROChSEslBn_F9^ z4U2=ne@_N1l+a>+BQ?=#`Dp+r=RA7j?Udzl@NKWu$yo%EqUudraO`}#x=oci9|$*^ zw8?Yn*<*tV?pRPuaD07*Pp@einw{AyF9!9OfD!|P&GzKN!`MUVy0*x59~pG7Z{V>g@`6Px4Z@ospr$bVk#he$K#D zDp{;(x2&Tn($g*t2LlC>&&;B&Zd<1p_E(vz6~bClYV>nK=isu}_W%lF`RjHfCH9vhES!s6OlN{K8SjX2 zZ<#>M0lG_PqxAKf%aVjJ%bYe}5dkuRV5aY}R<|=T?zUO0#zrjMg5jl`M0|u=?+x~- zCN0OSPLCfqU@eiK%lLgP;K465fjJJCs)0&@WAC{5cu8B^*A%fkV;xk{f@Cl4>;Amc zW-raepfqT(>VcGO>nGl`d6TraVlI7{`Kn1daR%xrusotB5D1Sqhou+d7=4x0WScJd zB3#n%?6BGLgS5p)qdf_}pmk>P_lrA=?iWJRLahfh#KfwAsoNsz^w_{^y$cG`dF(6*oONn`^9#>O|I?5XHWX0)5d8U4r~8E6MkEC;R1@!6c*b4AEAMbcbI zPnDBx!t+eYhT$I%w#UmN=_6tIkh<`Lz%jbd5%SsUAQ5B$_HtpL`KNYs3u>QMc^1$K z(QVLK{IXzvWDsV0=luRqeCor33kiitCgEw61I_U~C4Ir~FyhqFRC0>dgR8*|rL$ell-dCZMwspbXa*lDcYQ>zyglnkWqj9LA z?KQ)AY^O7KMc-zm2bA~f2!xSGLFr5v6|>SxRRklARlGCC3ProjW0*inMPj&^*u$5e zTloEHJH??n9L5>!s=8d$X5>fk9Nr^!JzK!`BsE 
zjDtNdMPQF=2M>KN8*%k>*CMcbvlSF~y@zie!6V#d0kvj|M}s1RQ#`L>_*xIBoti#U zy21{`FO8i(kEBL+$!O%hy`61U1S+1@)3d>R_-~tdCQ;+-2^fwE)ha}q==Qx*ssdJ( zga}3;-$ur7Tc!MtG4v^&y=szl+S8mT+sPGl(QS=8_a8xN?((T)jzn)gdvxj#%JWpN zs*G#2q@@9P-&LZv#$c}%bZzUi%*++abv(9tb&DjS7T+V<*w|v{xQB|Fv>xK_2tPc0 zttf9inr5DH<7xoTdDy~gQ2*$fV}Hf#LfVm6P5T0}W%(90pvr<6>AuC6)@|0WX?P;U zk$p;Iy>iUTiUBt;hkqPi~Z9e-H0EJ0N=$e|$9L8gh+_@Q{9gbmf|IDEzqPowL(cc&_l9E^C}G z*{Z{6ERoQ2Cz|ccHGA_nED9SmBfBH%J~(>hJG&hKU-uFG@Sp_}d3^^$qRO4sbF1G1 zMZA6JQ7;iOD!+%~%v2bS%DG%`a8uv@@gmFBtTnaYWVon%s6IBBQl>Sc#-PNP#+JhW zP$YVaUrO8c(;}yWCUY#UFRcGVSHZ1DhZ}J`T@z5L5~e2T@5M;cSiCvdK~T0{jyumu zADdZD9&Y6NRNHHvU`8fD?V#b)8YCigRW?!SN>9rhx*sHzAxuw=<9Ttd4?g+1i-`B} zip`iH5rMxn(HF13(D^h%n?fqi(LJ^A1o+SF zhDUeIqW#a~@9lbuvjj9wd_Gf4&w_0RT*=piW4g-@jqL$MW@K zDfK3WbZ2+c1r5#Mgg>I7ZP4JYMIu>6YL4k3wF- z8bCYn9v)^GO^SAY7v#L{uTJ*f_V1aR&L+z2NO+F;c~~bQ+LfX~{MU>6t+^m>lOC&< zkdmxi2Qdc8b z+3rfv#Q%m2N`mOtR>fsPA0vm9?YqK6Cp$G|!$jkoO1wCxAF=x7N-gHkQtnsnMi^l; z5AEXRFyplGGFFWOvq0G^Z>4~S-(tQA`tvCL@gd-yZ^_YCKrXe zY?{k|u5rv0X@`e3-!0TOdNYOSFudzLH1FKl)^nqt#Qgx83d~s)e-c;HX}LK$)2TgO zMeOz?kT2-f+5Y-dq`o&@LTG})*SKHa;(gS#^K>j`+!3gh&J{LOrcXI}$s4~(rNfK1 zB0vctx1=U)aWYlvWvP$vj%_Ds-SsAdWQwG!oA9lE=j2s`_2FRJSG^8H40z%xCxOC-3!02JhRgRG><` z9V2{XG7v{qYS5JMlX2vQ*B=p0j(vhoz2xuls>8xDEs>lCAiTw?PQ;70&uBtRy#)@! z99E*yrbp=obZzIs6{LFt$iJe1{a6lQVBi2aa$tDa;_siL$AJ0y`NxWPQ=6y#lsi|8 zPF6Q3YoOM%-^@Sq^EutNmDn0rL(Glb7*Fq%4BkSiEw7PLhmoIxdlj=I`ab<5afS(X z5Vo?w8nSOMU!I&6;!PFv=7ULg+zOq<^izlRafecVz^UVtNOyAl*ns52{dgrC_;obM zzb71Hb7zO!dc57L9|so~C{bgc?~IAB@*1)c0*D7*qw0>ZeJ{?NswV%Nt`<=1hG_8b zk15hGvn{g6|NaSJ@M;5@ksd9M{+_?S?9zL}?r;A5Xb&>z2yBD1Dv@j^AbQME3Sd!2~o!PIUd5l_iRhS zt0=~HcXhTAWwX{q9XM>hXufdr^~tYL4_@}T>DjwNnwKONPYConb^sF=AV%5dd$}Hr zQo7aM>~%c)8K2o3z!wvX`d(lgx~=y{Dy4CBbr`CteZy+}1lZz=%ro8FL#6d%($!97 zyPk{|r}PSZuiBNV->GxGcRQ7GrL)I%V)zT$rbv<)XIR+SMYvR?AZ{)$02(6oCE#$e zVJF*b1^h}>lpGQgg22Vab-#m$hc{lP-+*W|wdcO>F0@~;>U%V8Mft&^Yr~!Qx4>w; zi=$3pdt*k640-Ec&}RIPpF;lLI{ zJ-SI9uncE@REtPUEnZHR!)aXraHa{UtUm1>K(np>{8W8(b8OzD|2;I+bbmGkfQP;X z!5@b+c;C8#9>KA*{U0h=3Tlf`68d1M;N{X;)@2&*j;r~eXuXsEK@aLgdS zweaBsIp@QKX8Nu+Yib)MQeFdc`6LN`|I&74A^YiEpa(Sdkx(paC};#e*?-FnIx59I zmChQ(&*tJlr|F!2YX7SmC@EAg-Kxw(q|^;L^X%-_Yp}!GYUq;E%bjBMyv+}#SOl=| zf^Psk`+M=7WGn^V)bX;X9%pYj zvy4zXUbBOwWA7+`cs%n$>McZSeDl;8&eK_f@of2r%e%`T_;Bg!@93TtwJ+Vtz#}mY z?J9h0H|Rv}UY&n&fW8(0Dh_X{K}$@`wk^l% za0JrKD6ZqjM8|M>0G9%UBN2blGM?+%>a^^l!Dmlp^Dkvw>$`ISU+`QNltj`>+OZsD z++3_l$rQ zd=uFWK~lm!PDp*oq{C>DQcJ0eUqYxI&IDSQS4A^PdtK~&-g{7YUtRWbyWBWkqcKHs zs!P))_I&+A&suiX``URD5HLk*Dj8OYsqNi5YpZ}0BHGMxo?MqO$X|w?P1KJgocm?Q zRkgCxF;3@eV1xZ&`-#M%R0h=z501T7r=M!_7Xu-(k*R3x%g($I)CTIq17e&IO<+;w z4c~*%)iXUc$0z2j!#bX$pg%IG9wHwLcTr$CsG$8PTs5mdNMr?khsAlJ4vBT~$%p}L znD&xSQU?f6;Z$CAu@1kI-8rtFbzY1Hv&sW=jsXxT9JL?3ki>9>WbErbt@AzsUFoQT z(4{ibm^n@+j98zDQ>F@F4s-{$(!8RoP@UIQ-7hfoQJQI=Ry`aK)H7aSi0{rlqf1(J z_^O*?E(BVtP@_Irf3hj)s{buMSv{fvQ3fk>%2~Shzz#mc{qz>TtugBz;7~ z1mGT-IRFCLZkb-(wJ!(1x?8fMdd=C14^Tx0stSOY2-qKEtb5)iv!+DIIv^Ov0B<~l zAeaypx%)BcUyFi{YRgTgIs$>!b^h@TOSXV@L5Z|t`hgVsO$AZ{3^ZOM65SoFc3n@P zdA7xlBm%qiHL5H_>9n4t>C^;zkN0=->NF3|l-OHvlksG%0^fQW73O1t1c1rA0zQ0+%*AvaDFw-FdhlReJuY zMK$3Cq>h9Bh7W|`elQmgoR8lcLjz%D@VT4QjavZxnkE(?GaO=K1z+F0Kv>lO8%7pJv9GrO(I$gsr8e`~?}#frUExIi zXSMWz*j}{8If4ypEHMCfQ-MqrK{N^PXW&>G@qfppF8(h}YGB^)G7ms+7+Jig=v~Lo zgHDN8qtWbDpXTCBwTLiRs6uJvyGQqQ3=32;8-Wl)VC^HW4uD*~y}cD6dzp}sVC(+` z_W}MyX)E~O@U6x7dZ$w#t^kX56nQ563o}UsM+>;S_^MlD*?Kvn zFD)lWIl;^DKk%dabTds``emRwQO}(m&>UAK($liMf4;}I3RP58{00DrN)205E-u^w z&@Di);2mXRWE5EC4)uGNB@hpgHsH+s{0%WBVnX8E1mN-R`a(_h|MtWiBBajhYAzt0 
zl#`W37WKR8b!_1FyRwI$0?xO}=iJiO)%B$>M&Xpgn>TO1Xjh)yDrp_qfR)${Z4Q+j zC_6ttDUZ%ad%}}1{3`IolykRHuMv_XTpTkklI6VO6!7n5p;mIKs`L~ zxW{GwXaj*9Ej_)OfdQGua@W3R66eLKle1sIumg9tWu}z`{FGz&NFo(T#&Ddw?yg}v zsF_xvp25wEnf0qXfhGrn$Y7?O5($9)<~5HBhBd#KN`!u))E5>;$NZ;UfK!ng zE->5zT|~0?)<0NP%Dh>eLNrKzvJ$ue#;(S+aQZ=5z%se+`}`?ZBH2IT&jABTYh#5R z(v4o^D{L3J~;9H2JLZ~c8 zJnfL;#9S83$bBDGfKFdZfH^*vE*c?AXj`?{NsYmQCI-O3@Z#$H5KKtjjzMuT;8NiD zLpmfG!Q{Da%lGK#tri-k^`1OO{|qiIBQWeZUZfATob_0HB?wjsfE969c-na%0pUn1 zB!4>0XKS9D(HAE1Ld<-;e2Nnuld#_OuI#|?g!uJ@Nv@w8a6Uc%2Y9~sj(i67)iU)J zI>-j_mcV5czzS3I9lvHkVAstN1#^7Ym)L)@tE>4>*N=+xuw|rIk@RV4`7r)<`{St@ zU`;HD!e(Usk>{up^y8k^gB4hhDi>{}JwHT~HyfPSiw=kH2`hnq z?!iDml}+QQ5T=#vnU_E$08E$?&@XVy?`5K!2sgk~|3rFX13vAyjF%kz8clDgPnnf~*o|H((fv?;To4-nlyQxpWSJ9EzOloX3a-0RC4 zJb!qg|L1pcF7QD6P3wx71uqQ0c+A8UwEYhuh$40RmOGC(*tE*{Lja`sX<~l ztD!{UU6-@Q$53re&0mc!8}bSYRBUW9@icO3@84rJQpdhm4a+3x<6AF!=7-K>OSzly zq__iN{vd-|YwR(Rb9BRI_oAwx`UgP^e$zi7Djb#dhg z3c!)EiN}4P-o6QQ2eB|Sn*(vS2!I0Nu^#X2=xA(A1MEn!9A!Wf7!i3KK+qH@Xa5Gc zrTrmDXb3u|Yd}KTK}=9uhqR z!%p6;hfuaa_NY+VgEPLSxBNd9CJT>4hn$|Cx&vu`U;Esg4#1GOhxPv-90J+h)gfs_ zLadI958Vsd{5tGpqB*lCsTw^W1bH+5AC105s>hrN{x2BcA0!q)+dsXsM_A1gbhEXW zlaYC1MA56{kfnv#2G3fVX%Pe?`d<;*V_*H-8J7~{7zo5b=<-yDxLSmdOJTYIMHb|| zZ>x1BOEOuy$dIoy;=qCkN=p)gIv(%rl*Jz|9?9XtTbb>%n!X|ZU?Yh_l{DGk}6QfJKL|ENd zCX@f!t;qQ>_Y;*5Q{)8X)bYSKF)`u3HyOs>eiM?YEW|Xvg_L}lugNpPraj?E&Cz)cZGU7?{)Ye=c5}c_`wz&kmz?0;VQ9Lo-Y%fms73%j|EdhJl+EOS zN~He_shp$ZQ6LOJZ2t!KH^O_>lLe!{lZ^j8VGqbmyWBt_mO}Ju5!&YGrB!Y~3`{n0 z%&PjkJ5=Ne*8u3c6uvune9s-Q2$j%BAV9<>CQgcFL6$ZR%e2kC%_NZcXY#(jXa)&* zpP119Q`#V7;P|*<#g`eNirIpg(b$maG(oiUugl2)N2_@5?`)qa&I~!;)FT~ElO@{h+r#O& zyJsP?fFXbBrw^JEMyUm)id|$JQjX;Ocz^2;)a`ie=ddzI0Jr-L7@gJR4+R6i!#Wnj zW}b06HhtbqAW<><*56+Qgh9$j%g&Aou^h>$vYjrLNFmTzSDeAJ zxp5Z8q57(jH}qtxd}(*}K*An>dt=2PnDotHRqFqnz6}mGEow#ggn&D z^@yu6W8J5jRrSs)n5CszE=N1oQFRr3Rg zX!Hi8{hz$v0dD~8h?@?_>0*3fFEWRPg#p$*ug`kbdHXVbl?_E$T9)bQdl&<2fUbDI z8XO!9W~2HExZ}uQdI+NwUO9JrF=HH+763&f=txt{1rtX_a4{Mzk`$r(P4RQm5tgMV*E30xsd z)cxyU6;%2Obx8D4@|7+4X$gDsK{}Yp8a@%ri%16=|E+4z_oquO#gUr-xp^`wz?1%5 zI$WMb`1x;x{WqYd61}nin#5KIn*6sx<)naN0X)xM8x4KZf95|87!_4+i9a;F83KrR zlOxO?l*RP{6>y)x*Cm2KetZT5@i&`^D);>*#l=MYj$}+sOufH;ak_5GqoSe$9`Gx- zh!%*4h74=(x${%brUnJ#Z~KK>CQPxLS8r2TOaVkHIuQqCt>@GE`ItB&#}hI0W)%@2 zW{xg3xeX2tnfFFugUj#6fDH3s1r?;!()<)BZl1^>H$$3T0p8Kf+~bZod@zYgT?YV7 z1Nq7pK-gIY;sdKr2pFW4EtGt9xcF3|`{x8O2c1wLL>bSXylRZZKt1R&Wp@7qK#WOB z1<>`0JqdJgPrOCYNq7c=lz{|c;P-DcK;2Q=VurT9{rK^tZ(oPmO%t{Q;<>m7>MFp; zH{xDGhEmxX*w|hcP7UH~8+1hza|0zz#Q!^G%gRkjkqr!dl&s*lnWP2Ezz-F}!^1*> zC@^L(A6}&Br)H9YW&kVoVZQnUi$P=DQ&o53yIhUy)^D2j6`7P;X6tFtQsr*duA3mH-o?T8%;8=HtEL#CuCqLiYLdCHclB4ay6?M$gy zGG>USK}kdzOS>Wpkz}Y)NkW-f=YH4npMU@VIp;s;{9V`H*Tq`j_r2fyzR&YM&wW47 zZGo5C$~*2Y++nF&fwxgoJ~x_1_kuRS`=e!m0ckI|Eq3nIMV`CqX1||?w?@hl>oGB! 
z*S=*}ow|vemkIY<)I9{<5i143zvOAR{+_3?TJ)sl-&>r_+y&oGc`WM2kc2e+WGpQ$ zNjnB<#X#4)_U!lFcs-$wN7pQqGG3*wPQ@0vaj#Dp+vNA-3lw#!&j!R6Tw(5SexSX@ zhr6CX5CqV8^KM4aSZIo zhsnh$dqk*qZOJRW4W|Hm5_N(;l?7;GiqzdkaPv&e$^o{o64xTMUmfBbxD zVKMv8Oop9v;bkPU&08M2EyWWeSN?(_=rcab+8{$dQt4y)+lwD2->tA-sL#AN(wc#| z^H1G(u#v>YUI`QVx+w*B>uG)9;EMXwBJJ zWMd%*G?pqww7v;(U2|pN;*EW#$3E8_N0kcovJ>jtfO&B9M+uHn<@ahfUX|>CnSkz3y2-bEDZ`M9y6F z=A{$MFh%?M0P^&YdZU@V>E=NljU%5I=!S%7^F*A4Zxrs&$dF z9@9{9%%d~fAr|l+&)iFPkHl+ekPccb0aj{Ma(lu}C z3^uCDn|~Hud{WL6BEAF1Jb}hBP$oCL@4GalGLwzLu3~gXZaCa&Y+~~KmcqEH+plWm zNR|g8r{>wAW)nJlq%&|x>)reJvkgCfk&@y^e~G=lxWnUUr|R&9;ELj7V=Gl!tu~vO zm^4OFPHQz)x-xIU4j*;Zn95rDL^{pNfezmF-iPx>6#p-`9v8g!z#idGc=jlE^=ALi zc~_&ovmi{SuSZ{e0#^XIWl_!sy$ zscNTsbyxc^32lbnlVNZUwsD(QHpPGN7C_$(co}t{?fgIV9ghf6T8jj86YSnj?5iBXf%MAG2fh?pMzFxRCfO!`QJdFiAx3ng zLF~851<^YL1Tlc2isp_@MZPWMj+EcdbbQgUqMhc05|?Bt;#X9-89s{qu6xA)j54DB z%gmjX3Crc(V?G_}aG0`(=?z5S4Jvzwr`T4RFasbm08XnGZTlqyLb~hu?)=R+rPH=9 znl~}>8f-V*(?XX+hkTeI&N^8+N~y;SmK*q%z8-u=aKTmf zMiIWByY=u3$)Cu8PQMXe@a_!r=*Yy;!Mnl}rofmV_mh~5f^i|;G;5|m2i=UDYBf;Y zdB_~+Ex{aDrMggl+r>PW2xfAED`q)G;>t2sE39Q?I4j1s>g8^pFfFElTXZU!-*5X_ z8inG2YGVvTQij5z9@A=Y;U;TXZplN+KH{D;fUEtveBbd%Wm%TUXd`aMQO3{z6Xm28 zcB2K264}Vex$@Y#!#3T>t=>G7Dxh1>MuOO))sAf4tp`Kv2{H?R2C9>)?D9&@Gf9f{ zq6!HKk^IKBan-A9mdg_D>AKr+LqxLI8HR1J){)H?{WL)Qokl)9;nbW%jtnPe`V9$E zp;wQdy@P|$I!7*$0gp>w>g?UKYumM++Z&RTILKN=+c3P~<_I1q1X`S$&Ph_p64Z*j zLN=n!gtHrM+N*(xU_3rm?1C$wnbCI!Y#T0L62XiLK9cxF>~(=13DHQPSFu(}DNZ|h zN-=b9hO+l=1#Lu}LW?qt^kW9+3m3u@Oc;!ej%`rJj1b!@^)s!x9@WZUsEo z#)#IV4gTQ{)wlXsNK_xFgB>7HI%ozyD8l8Q=as46%@HQQ-Iq8I!$Gds6o1I8IVAOM ziuw6;)r0ZQXZSC0alS@S^k*N%%0>o+U}jW3B2+n5y?)0^MEl~I%owF;GmB!Rc|g!Z zCfHhJgdZ>BlvlY{bVkB~VZ~YCbvSNuA5-z(!)n{4%p5j$vIR9iNew9#^GlZ)Un)*e zt2xdwZ-0NV+Ac*%zpSj7y)olVI+Ba|TO}muXLhtlnsB?yV-EN=$j$CO+bj-X_9PRR zT}_5{PwwoQ^+A6J*%~t_D;@?dFM}i-Pk{kvi7ry9IywUG5#$ zbUCx#l~^jnL7=+!Tg{ayq6YS1a=0vR=M~&b+|&Y9N2YturNwBfW${`F;B2K_|TQ#mG}=WJbF$00q!t)DfLPM`5VfR=iCB7?0Ah~ z5m{qoRxAn^Qv(0;i#+$6F$=CyKAtmO$VlDzx8q5B?!7U3nMzr4F_+=+iwsy44bAuL z<#*Y`(j9qd1;Z^$Ucl}0Xy>h`H(a^Hj|WCJi?Ua)Dk}5hq&PkNU0$4dOp#HTJ=%De zvQr?BfrjN-W27|qgyj^uP zr8Ek~uGM}fB(Zea7>gIY)%wGo$+@{Q5V)&BRR_>znYVF^ zbS?GSA`9kMTg6U<(iJqiL&`X@()!rjM*xg;TW?qskM^ecFt4liww70v*r%?Hqu6k8 z(Ex?WnE4Q;J|IE>?<&!^zr)ygF<>@(-2siFJi)L#Qu3_(!9GWZ=M*dn^DMsni_s-T z_*c62<@(z6O(&sw6GD+=+kVYl*RWBtky!1tIGeR@`j?J-%YeM>-(T%R!OnH)Oc0@1G``H+Ns|tn*0z7dtt?C$6NNTkQbP4uXDH5LaX4TwuPVAno?eYx14``J=yCubkZ#e`SeRv2zlK779?zf z6Qs(y@WmasCXHDksCL@T8Z9a14E*#@*L~ewa(8BMdFriwB;+{9fUAgnYvrn5KaG1R z^7GP2fkdaRAMbxYY|D7nm1)}-!D>OwSexIOiBA0siP%l9|K0BY+zeUGD7zIZHlQ#@ ziMFIY2o9tJO9cfV1IPzo%w_wi=sixGzB=WGECl}GZ=i2DvNtx3Aq{fWM7#IPLK!AXs$#jP%J*unH%P%<>$xr9#EH!@DVf7p8PPY z%?nMDEm6Lor-Kuju-34tdon;EK&Dk(f8H)#}_7A&^vviX=7@mZX zi>mv|n?vJvpk+wq1gPw9ll^J$hy7P*`YXbW=6vVAj8_i)Sm)8E2ok?5cw7XxS-1AV zcYcIwE~`3U@~K~#rg*{qv`lDfesA17=aAtg21VJYkiGd6`K85ueKpr-c%v| z6R|*NXo=MkZ{wk}C~)urZ;nuT+Nt&{0+p-Y9WAp&woJ6fh6-PMAb5<#Yj76aE-iMh z7Zz6aG{z`bsJQ=xy~knu52;8zz5(NfY8U9q-ZF4ztfkMm7RI4H>mCZzy>+@ zR>W-hXE$v*F&o?_m#n?0NPb--`=8nyk9~bO@Z;4)0W0cbhheiExwO*O|4(7ciC-sH zv`u&Ne79^1TmwJd+}teuSVBT#n~4ceUZ8Kg+f^m|hg={XEQ>uw2-QO3ctEW1T;UzT z-zI~sLT7)FB25gOW#!AEO+Zdrbk+vOw5W2H@=^q?{rCv+0yjd~ln#LUXsA95(p8Wv zWtvbP_?JtgiGvXqQg%iyDd#2m0|I-~PMv(q$(7ciXN^j@u%9*ja_IvK)}@VjH57r? 
z9yt79i=Uq#MK0df3rWL71*}OKICqBAvrF(*5hvI7qQTUF>>k*3tOBYhZgGd{y!hM6qUCxj&txMUc+UO+W%UK4V&yVXe z8Hu3RQn$8()i^k&qxxiJWKis?rE4w{xNX}uxct*Fo%5-w(en(edPa!pSb4F|)S9fT zSBF&9a!X5-s;jGG%T=K!k`BfVg*pOU*QdQ8ZZz5xAvOZc11HCB`rPJ8EkjdNo$h|8 zp<7cQY{z?a5xlVNd_TIBd%UX0tC40dcYN&2ldNs=AE8r9pKJ)N{#xi&FzvM}W6CmD z4k6M*d<*n8=)%FS{?SQDC?Ab-5!JJ{v=kb6bsejF64(@l4pbFScA7mDd<) ztA7{hXn@*{^m1PxAC#Kg!!YK2%V$Q6g0$1rCk!UC;8rPU=W zorIq{|K-aU;m5qa|F^L&0jXUXyE@lXPSB;*zlGoi$QSMza%a)`32Bvk95(btam6yNomXppBu}Uq^ND zY%8crRy~}mj=-uAGuj90UMgzx~EUqRbb4%pe&qCOrojr@yohtF*a^)dH4K2 znfZQtnJr}tQOV4{m)S=3;323vZb_gy(y13NIF=*R=|j1;BafJNN8R)18|38VW?R(; z?=;nYeOq=Joap+B*|7tK;AIJlw<5RLPk-*CHe!J!IKfQy(77OA)ROl>tix{XTX6>NvkAziCMLQ*rz$`0xcl2>y7{hMJv#jjA>_cThZ#{%a`J9SkU!x3 zJW1u5M{{nRy7#+>y7#jsMqI}2aBBA<5j%wZt_{)M$w`1+RO~24Q_W$0x{6+pc8AY1 z`982Q^r>`)Z>+i!gM7ZY(c(w`rP}A7ccgi@3VH=fU1gfXU-@!sQ0h* zh2RnD{*DrUMHIz$laQ_g<%YN5R8VQHqfvBq-=Y9K1vA zIVB?_Berjz(H&xMBhdKZ`>ngm8DWKS*;}-*y{fza<@U3&X|XF)A!0n==U9A6uYfO4 z@3<{e;68Yb`i~!78Czh9YKQG{%~XdO+$!yD1j_p^gZ#ZhO-}Z}ek#Ov?<28AVM$3z z?GZ#0C6F$z2e(P(iKt=BHA)>qCzR{IO{_`m6YGi~Tk;niYol#Nsz{7cTKS0?o&Fm# z`^{I>ef!V`9df7$ec{U(3DG~u^-E^KAfOeg57dBA_k-;h0h9dP)~YoPshk(tMuL#+ z{rIMss6yv@ML--~=pb3E@kbCA`|%+Ysp|EKOHQtX>1=}8 zhSCvY9K&f?f+~6S-V3{lrY$+_DTMg_t^ERSac1=S#u$tYvN${_)K{jDclkB#2!gB* z34gBHJV*jzHG&`u09X2~d!n)n&(&lf?t?+Y>^KG%bVHzvM4o&0?&V*;-2CyeP5K`i z2pp|f+g@}iFE_Uv;<>*lC_JYh@4dLsqS!Nqe!uH<4bF6OYT)%)EAcPuwA;d;!UP%_Lus7dPmogkYEc#ZI0NFz@GpV#+^{m)z7+n;o?P>Z)A1h z&xIcY6OUTzpM?OM_mgJX+pqL?#l@A!wRfSCkI)=?TO)WxkR5n*-lu}cXz_P)cAjd0 z{L~5h0JNeC(Ie50lvNihN>tUuHc~)@$kbWdsr<2QUuEs z2PWhF=;X()3LT*Ob+`gNAZfLzgrh+Y#O>zh)^YUBZzMcHuf)$`KYNy(=zI|9i`di~ zq=+)y-5g}>Mleq-jS%-EXcIQVQ!M``B+{?%xGh5>-G6`d`J(0nf_&<00FCmSVasR> ztGGYJds|e5XyFir(>RHfy`_tN8b!fAZ;!OzLYxN8_gci}VKqIJbb}aCQWV<}(4W$Q z7{bxf(QD|zxX+Vc5+*`x!6}#E!+W1;`E5mq1VM~+93QxT!oLjk5#hrmJ-&!!lm(7^ zqA-^fT^m+@{3lapQtSVR+=}GP|J$G6m=n8wy@{{AhyN4)#0X{E78Cs&dbYp+2j~Fq A5C8xG literal 0 HcmV?d00001 From b60a8aa9cfe2e9d597326c4c3cb92366ef1ed70d Mon Sep 17 00:00:00 2001 From: Feilei Du Date: Tue, 18 Jul 2023 06:28:34 +0000 Subject: [PATCH 24/24] add efficientnet xpu case --- .../pytorch/train/trainer_adapter.py | 23 +++++++++++-------- .../config/config_R300x1x8.py | 3 ++- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/training/benchmarks/efficientnet/pytorch/train/trainer_adapter.py b/training/benchmarks/efficientnet/pytorch/train/trainer_adapter.py index 7cefd0970..2bd8e0f07 100644 --- a/training/benchmarks/efficientnet/pytorch/train/trainer_adapter.py +++ b/training/benchmarks/efficientnet/pytorch/train/trainer_adapter.py @@ -88,17 +88,20 @@ def create_grad_scaler(args): def backward(args, step: int, epoch: int, loss: torch.Tensor, model: nn.Module, optimizer: Optimizer, scaler): - optimizer.zero_grad() if scaler is not None: scaler.scale(loss).backward() - if args.clip_grad_norm is not None: - # we should unscale the gradients of optimizer's assigned params if do gradient clipping - scaler.unscale_(optimizer) - nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad_norm) - scaler.step(optimizer) - scaler.update() + if step % args.gradient_accumulation_steps == 0: + if args.clip_grad_norm is not None: + # we should unscale the gradients of optimizer's assigned params if do gradient clipping + scaler.unscale_(optimizer) + nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad_norm) + scaler.step(optimizer) + optimizer.zero_grad() + scaler.update() else: loss.backward() - if args.clip_grad_norm is not None: - nn.utils.clip_grad_norm_(model.parameters(), 
args.clip_grad_norm) - optimizer.step() + if step % args.gradient_accumulation_steps == 0: + if args.clip_grad_norm is not None: + nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad_norm) + optimizer.step() + optimizer.zero_grad() diff --git a/training/kunlunxin/efficientnet-pytorch/config/config_R300x1x8.py b/training/kunlunxin/efficientnet-pytorch/config/config_R300x1x8.py index fbaea0e5d..1026a0695 100644 --- a/training/kunlunxin/efficientnet-pytorch/config/config_R300x1x8.py +++ b/training/kunlunxin/efficientnet-pytorch/config/config_R300x1x8.py @@ -1,4 +1,5 @@ from config_common import * train_batch_size = 64 -eval_batch_size = 64 +eval_batch_size = 128 +gradient_accumulation_steps = 2
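[Editor's note] After PATCH 24, `backward()` steps the optimizer (and clears gradients) only every `gradient_accumulation_steps` micro-batches, which is why config_R300x1x8 can drop `train_batch_size` to 64 and still keep an effective global batch of 64 x 8 GPUs x 2 = 1024. A sketch of a loop that would drive it correctly, assuming a 1-based `step` counter (the benchmark's real loop is `Trainer.train_one_epoch` in train/trainer.py, not shown here); dividing the loss by the accumulation factor is a common companion change that the patch itself does not include:

```python
import torch
import torch.nn.functional as F

# Illustration only: drives the backward() from the patch above.
def train_one_epoch_sketch(model, dataloader, optimizer, scaler, args, epoch):
    model.train()
    # start=1 so that `step % gradient_accumulation_steps == 0` fires after
    # each full group of accumulated micro-batches, not on the first one.
    for step, (images, targets) in enumerate(dataloader, start=1):
        images, targets = images.cuda(), targets.cuda()
        with torch.cuda.amp.autocast(enabled=scaler is not None):
            loss = F.cross_entropy(model(images), targets)
        # Average across micro-batches so two steps of batch 64 approximate
        # the gradient of one step of batch 128 (assumed; not in the patch).
        loss = loss / args.gradient_accumulation_steps
        backward(args, step, epoch, loss, model, optimizer, scaler)
```

With `gradient_accumulation_steps = 2` and an even number of batches per epoch this leaves no dangling partial accumulation; otherwise a final flush step would be needed at the end of the epoch.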