Commit
* refine retinanet
* update case readme
* update case readme for bs=512
* remove 1x4 config

Co-authored-by: zhouyu <[email protected]>
Showing 18 changed files with 454 additions and 598 deletions.
training/benchmarks/mobilenetv2/pytorch/config/_base.py (95 changes: 38 additions & 57 deletions, file mode 100644 → 100755)
@@ -1,80 +1,61 @@
from typing import ClassVar
#from train.event.base import BaseTrainingEventInterface
# DO NOT MODIFY THESE REQUIRED PARAMETERS

# case info
# chip vendor: nvidia, kunlunxin, iluvatar, cambricon etc. key vendor is required.
# Required parameters
vendor: str = None
# model name
name: str = "MobileNetV2"
data_dir: str = None
name: str = "mobilenetv2"
cudnn_benchmark: bool = False
cudnn_deterministic: bool = True

# Optional parameters

do_train = True
fp16 = True
# =========================================================
# data
# =========================================================
data_dir: str = None
train_data: str = "train"
eval_data: str = "val"
output_dir: str = ""
init_checkpoint: str = ""

# =========================================================
# train && evaluate
# loss scale
# =========================================================
train_batch_size: int = 8
eval_batch_size: int = 8
dist_backend: str = 'nccl'

lr: float = 0.045
lr_step_size: int = 1
lr_gamma: float = 0.98

weight_decay: float = 0.00004
gradient_accumulation_steps: int = 1
momentum: float = 0.9
lr_steps: list = 1
lr_gamma: float = 0.98

max_steps: int = 5005 * 300  # 300 epoch
seed: int = 41
# torch.backends.cudnn.benchmark
cudnn_benchmark: bool = False
# torch.backends.cudnn.deterministic
cudnn_deterministic: bool = True

# Stop training after reaching this accuracy
target_acc1: float = 70.634
# =========================================================
# train && evaluate
# =========================================================
train_batch_size: int = 8
eval_batch_size: int = 8

# Sample to begin performing eval.
eval_iter_start_samples: int = 100
# https://github.com/pytorch/vision/blob/main/torchvision/models/mobilenetv2.py#L193
target_acc1: float = 68.6
# https://github.com/pytorch/vision/tree/main/references/classification
max_epoch: int = 300

# If set to -1, disable eval, else evaluate every eval_iter_samples during training
eval_interval_samples: int = 5005 * 256 * 1  # 1 epoch

# Total number of training samples to run.
max_samples_termination: float = 5005 * 256 * 300  # 300 epoch
do_train = True
fp16 = False
amp: bool = False
distributed: bool = True

# number workers for dataloader
# =========================================================
# utils
# =========================================================
seed: int = 41
dist_backend: str = 'nccl'
num_workers: int = 16
device: str = None

# local_rank for distributed training on gpus
local_rank: int = 0
# Whether to read local rank from ENVVAR
# =========================================================
# for driver
# =========================================================
local_rank: int = -1
use_env: bool = True

# Number of epochs to plan seeds for. Same set across all workers.
num_epochs_to_generate_seeds_for: int = 2

# frequency of logging loss. If not positive, no logging is provided for training loss
log_freq: int = 10

# Whether to resume training from checkpoint.
# If set, precedes init_checkpoint/init_tf_checkpoint
resume_from_checkpoint: bool = False

# A object to provide some core components in training
#training_event: ClassVar[BaseTrainingEventInterface] = None

#training_event_instance: BaseTrainingEventInterface = None

# device
device: str = None
log_freq: int = 100
print_freq: int = 100
n_device: int = 1
sync_bn: bool = False
gradient_accumulation_steps: int = 1
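These flat module-level constants are typical of FlagPerf-style benchmark configs: a driver imports the module, treats its attributes as default hyperparameters, and lets vendor configs or command-line flags override them. Below is a minimal sketch of how such a module could be consumed; the load_config helper and the override order are illustrative assumptions, not the repository's actual driver code.

```python
# Hypothetical illustration of consuming a flat config module such as _base.py.
# The helper name and override behaviour are assumptions, not FlagPerf's real driver API.
import importlib
import types


def load_config(module_path: str, overrides: dict) -> types.SimpleNamespace:
    """Import a config module and apply key=value overrides on top of its defaults."""
    module = importlib.import_module(module_path)
    config = types.SimpleNamespace(**{
        k: v for k, v in vars(module).items() if not k.startswith("_")
    })
    for key, value in overrides.items():
        if not hasattr(config, key):
            raise KeyError(f"unknown config key: {key}")
        setattr(config, key, value)
    return config


# Example: raise the learning rate and batch size for a larger-scale run.
# config = load_config("config._base", {"lr": 0.09, "train_batch_size": 512})
```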
training/benchmarks/mobilenetv2/pytorch/config/mutable_params.py (11 changes: 4 additions & 7 deletions, file mode 100644 → 100755)
@@ -1,9 +1,6 @@
mutable_params = [
    'train_data', 'eval_data', 'init_checkpoint', 'train_batch_size',
    'eval_batch_size', 'dist_backend', 'lr', 'weight_decay',
    'gradient_accumulation_steps', 'max_samples_termination', "vendor",
    'cudnn_benchmark',
    'cudnn_deterministic'
    'vendor', 'data_dir', 'train_data', 'eval_data', 'lr', 'weight_decay',
    'momentum', 'lr_steps', 'lr_gamma', 'train_batch_size', 'eval_batch_size',
    'do_train', 'fp16', 'distributed', 'dist_backend', 'num_workers', 'device',
    'cudnn_benchmark', 'cudnn_deterministic'
]

mutable_params += ["local_rank", "do_train", "data_dir", "log_freq"]
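mutable_params reads as a whitelist of config attributes that vendor-specific configs or command-line arguments are allowed to override. A hedged sketch of that gating logic follows; apply_vendor_overrides is an illustrative name, not necessarily how the FlagPerf driver implements it.

```python
# Illustrative sketch: only whitelisted keys may be overridden by a vendor config.
# The function name and structure are assumptions for demonstration.
def apply_vendor_overrides(config, vendor_config: dict, mutable_params: list):
    for key, value in vendor_config.items():
        if key not in mutable_params:
            raise ValueError(f"'{key}' is not in mutable_params and cannot be overridden")
        setattr(config, key, value)
    return config


# Example: a vendor config might raise train_batch_size and lr together.
# apply_vendor_overrides(config, {"train_batch_size": 256, "lr": 0.072}, mutable_params)
```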
@@ -1,2 +0,0 @@
from .dataloader import (build_train_dataset, build_eval_dataset,
                         build_train_dataloader, build_eval_dataloader)
training/benchmarks/mobilenetv2/pytorch/dataloaders/dataloader.py (188 changes: 89 additions & 99 deletions, file mode 100644 → 100755)
@@ -1,99 +1,89 @@
# coding=utf-8

import os
import sys
import random
import numpy as np
import torch
from torch.utils.data import Dataset
from torchvision import datasets, models, transforms
import torch.distributed as dist
from torch.utils.data.dataloader import default_collate

CURR_PATH = os.path.abspath(os.path.dirname(__file__))
sys.path.append(os.path.abspath(os.path.join(CURR_PATH, "../../../")))
from driver import dist_pytorch


def build_train_dataset(args):
    traindir = os.path.join(args.data_dir, args.train_data)
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))
    return train_dataset


def build_eval_dataset(args):
    valdir = os.path.join(args.data_dir, args.eval_data)
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    val_dataset = datasets.ImageFolder(
        valdir,
        transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ]))

    return val_dataset


def build_train_dataloader(train_dataset, args):
    """Traing dataloaders."""
    dist_pytorch.main_proc_print('building train dataloaders ...')

    if torch.distributed.is_available() and torch.distributed.is_initialized():
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
        dist_pytorch.main_proc_print(
            f"use sampler: DistributedSampler, num_replicas:{args.n_device}")
    else:
        train_sampler = None

    train_dataloader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.train_batch_size,
        shuffle=(train_sampler is None),
        num_workers=args.num_workers,
        pin_memory=True,
        sampler=train_sampler)

    dist_pytorch.main_proc_print(
        f'train samples:{len(train_dataset)}, batch size:{args.train_batch_size}'
    )
    return train_dataloader


def build_eval_dataloader(eval_dataset, args):
    """Traing and validation dataloaders."""
    dist_pytorch.main_proc_print('building eval dataloaders ...')

    if torch.distributed.is_available() and torch.distributed.is_initialized():
        val_sampler = torch.utils.data.distributed.DistributedSampler(
            eval_dataset, shuffle=False, drop_last=True)
        dist_pytorch.main_proc_print(
            f"use sampler: DistributedSampler, num_replicas:{args.n_device}")
    else:
        val_sampler = None

    eval_dataloader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=args.eval_batch_size,
        shuffle=False,
        num_workers=args.num_workers,
        pin_memory=True,
        sampler=val_sampler)

    dist_pytorch.main_proc_print(
        f'eval samples:{len(eval_dataset)}, batch size:{args.eval_batch_size}')
    return eval_dataloader
import os
import torch
import torch.utils.data
import torchvision
import torchvision.transforms as t


class ToFloat16(object):

    def __call__(self, tensor):
        return tensor.to(dtype=torch.float16)


def build_train_dataset(config):
    normalize = t.Normalize(mean=[0.485, 0.456, 0.406],
                            std=[0.229, 0.224, 0.225])
    traindir = os.path.join(config.data_dir, config.train_data)
    if config.fp16:
        dataset = torchvision.datasets.ImageFolder(
            traindir,
            t.Compose([
                t.RandomResizedCrop(224),
                t.RandomHorizontalFlip(),
                t.ToTensor(),
                ToFloat16(), normalize
            ]))
    else:
        dataset = torchvision.datasets.ImageFolder(
            traindir,
            t.Compose([
                t.RandomResizedCrop(224),
                t.RandomHorizontalFlip(),
                t.ToTensor(), normalize
            ]))
    return dataset


def build_eval_dataset(config):
    normalize = t.Normalize(mean=[0.485, 0.456, 0.406],
                            std=[0.229, 0.224, 0.225])
    evaldir = os.path.join(config.data_dir, config.eval_data)
    if config.fp16:
        dataset = torchvision.datasets.ImageFolder(
            evaldir,
            t.Compose([
                t.Resize(256),
                t.CenterCrop(224),
                t.ToTensor(),
                ToFloat16(), normalize
            ]))
    else:
        dataset = torchvision.datasets.ImageFolder(
            evaldir,
            t.Compose(
                [t.Resize(256),
                 t.CenterCrop(224),
                 t.ToTensor(), normalize]))
    return dataset


def build_train_dataloader(dataset, config):
    if config.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            dataset)
    else:
        train_sampler = torch.utils.data.RandomSampler(dataset)

    data_loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=config.train_batch_size,
        sampler=train_sampler,
        num_workers=config.num_workers,
        pin_memory=True)
    return data_loader


def build_eval_dataloader(dataset, config):
    if config.distributed:
        test_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
    else:
        test_sampler = torch.utils.data.SequentialSampler(dataset)

    data_loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=config.eval_batch_size,
        sampler=test_sampler,
        num_workers=config.num_workers,
        pin_memory=True)
    return data_loader
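The new builders take the flat config object directly instead of argparse-style args. As a rough usage sketch, the snippet below assumes a namespace-style config carrying the fields shown in _base.py (data_dir, train_data, fp16, distributed, and so on); the import path and the config values are illustrative assumptions, not the benchmark's actual run script.

```python
# Hedged usage sketch for the new dataloader builders; config values here are
# example assumptions, not the benchmark's shipped defaults for every field.
from types import SimpleNamespace

from dataloaders.dataloader import (build_train_dataset, build_eval_dataset,
                                    build_train_dataloader, build_eval_dataloader)

config = SimpleNamespace(
    data_dir="/data/imagenet",   # assumed dataset root
    train_data="train",
    eval_data="val",
    fp16=False,
    distributed=False,           # single-process run for this example
    train_batch_size=8,
    eval_batch_size=8,
    num_workers=16,
)

train_set = build_train_dataset(config)
eval_set = build_eval_dataset(config)
train_loader = build_train_dataloader(train_set, config)
eval_loader = build_eval_dataloader(eval_set, config)

images, labels = next(iter(train_loader))
print(images.shape)  # torch.Size([8, 3, 224, 224])
```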
training/benchmarks/mobilenetv2/pytorch/model/__init__.py (8 changes: 3 additions & 5 deletions, file mode 100644 → 100755)
@@ -1,7 +1,5 @@
import torch
import torchvision
from torchvision.models import mobilenet_v2


def create_model(config):
    model = torchvision.models.mobilenet_v2()
    return model
def create_model():
    return mobilenet_v2()
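Since create_model() now takes no config, any precision or device handling presumably happens in the caller. The sketch below pairs the model with the fp16 input pipeline added to the dataloaders; the model.half() cast and device handling are assumptions about the calling trainer, not something this commit shows.

```python
# Hedged sketch: build the model and align its precision with the dataloader.
# The half() cast and device handling are assumptions for illustration.
import torch

from model import create_model

model = create_model()                       # plain torchvision MobileNetV2
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

fp16 = False                                 # would come from the benchmark config
if fp16:
    model.half()                             # match ToFloat16()-cast inputs

# Dummy forward pass to sanity-check shapes.
dummy = torch.randn(2, 3, 224, 224, device=device)
if fp16:
    dummy = dummy.half()
print(model(dummy).shape)                    # torch.Size([2, 1000])
```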
training/benchmarks/mobilenetv2/pytorch/optimizers/__init__.py (10 changes: 10 additions & 0 deletions, new file)
@@ -0,0 +1,10 @@
from torch.optim import SGD


def create_optimizer(model, args):
    params = [p for p in model.parameters() if p.requires_grad]
    opt = SGD(params,
              lr=args.lr,
              momentum=args.momentum,
              weight_decay=args.weight_decay)
    return opt
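Tied to the lr_steps and lr_gamma fields added in _base.py, a step-decay schedule on top of this SGD optimizer would look roughly like the sketch below. The MultiStepLR choice, the milestone values, and the loop structure are assumptions, since the trainer itself is not part of this excerpt.

```python
# Hedged sketch of wiring the optimizer and an LR schedule together.
# MultiStepLR and the epoch loop below are assumptions, not code from this commit.
import torch
from torch.optim.lr_scheduler import MultiStepLR

from optimizers import create_optimizer


class Args:
    lr = 0.045
    momentum = 0.9
    weight_decay = 0.00004
    lr_steps = [100, 200]   # assumed milestone epochs; _base.py only shows the field name
    lr_gamma = 0.98


model = torch.nn.Linear(8, 2)            # stand-in model for the example
optimizer = create_optimizer(model, Args)
scheduler = MultiStepLR(optimizer, milestones=Args.lr_steps, gamma=Args.lr_gamma)

for epoch in range(3):
    # ... one training epoch with optimizer.step() per batch would go here ...
    scheduler.step()
    print(epoch, scheduler.get_last_lr())
```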