refine mobilenetv2 (#153)
* refine retinanet

* update case readme

* update case readme for bs=512

* remove 1x4 config

---------

Co-authored-by: zhouyu <[email protected]>
yuzhou03 and zhouyu authored Jul 21, 2023
1 parent 781a768 commit dc66ac3
Showing 18 changed files with 454 additions and 598 deletions.
Empty file modified training/benchmarks/mobilenetv2/pytorch/config/__init__.py
100644 → 100755
Empty file.
95 changes: 38 additions & 57 deletions training/benchmarks/mobilenetv2/pytorch/config/_base.py
100644 → 100755
@@ -1,80 +1,61 @@
-from typing import ClassVar
-#from train.event.base import BaseTrainingEventInterface
+# DO NOT MODIFY THESE REQUIRED PARAMETERS
 
-# case info
-# chip vendor: nvidia, kunlunxin, iluvatar, cambricon etc. key vendor is required.
+# Required parameters
 vendor: str = None
-# model name
-name: str = "MobileNetV2"
+data_dir: str = None
+name: str = "mobilenetv2"
+cudnn_benchmark: bool = False
+cudnn_deterministic: bool = True
 
+# Optional parameters
 
-do_train = True
-fp16 = True
 # =========================================================
 # data
 # =========================================================
-data_dir: str = None
 train_data: str = "train"
 eval_data: str = "val"
-output_dir: str = ""
-init_checkpoint: str = ""
 
 # =========================================================
-# train && evaluate
+# loss scale
 # =========================================================
-train_batch_size: int = 8
-eval_batch_size: int = 8
-dist_backend: str = 'nccl'
 
 lr: float = 0.045
-lr_step_size: int = 1
-lr_gamma: float = 0.98
-
 weight_decay: float = 0.00004
-gradient_accumulation_steps: int = 1
 momentum: float = 0.9
+lr_steps: list = 1
+lr_gamma: float = 0.98
 
-max_steps: int = 5005 * 300 # 300 epoch
-seed: int = 41
-# torch.backends.cudnn.benchmark
-cudnn_benchmark: bool = False
-# torch.backends.cudnn.deterministic
-cudnn_deterministic: bool = True
-
-# Stop training after reaching this accuracy
-target_acc1: float = 70.634
+# =========================================================
+# train && evaluate
+# =========================================================
+train_batch_size: int = 8
+eval_batch_size: int = 8
 
-# Sample to begin performing eval.
-eval_iter_start_samples: int = 100
+# https://github.com/pytorch/vision/blob/main/torchvision/models/mobilenetv2.py#L193
+target_acc1: float = 68.6
+# https://github.com/pytorch/vision/tree/main/references/classification
+max_epoch: int = 300
 
-# If set to -1, disable eval, else evaluate every eval_iter_samples during training
-eval_interval_samples: int = 5005 * 256 * 1 # 1 epoch
-
-# Total number of training samples to run.
-max_samples_termination: float = 5005 * 256 * 300 # 300 epoch
+do_train = True
+fp16 = False
+amp: bool = False
+distributed: bool = True
 
-# number workers for dataloader
+# =========================================================
+# utils
+# =========================================================
+seed: int = 41
+dist_backend: str = 'nccl'
 num_workers: int = 16
+device: str = None
 
-# local_rank for distributed training on gpus
-local_rank: int = 0
-# Whether to read local rank from ENVVAR
+# =========================================================
+# for driver
+# =========================================================
+local_rank: int = -1
 use_env: bool = True
-
-# Number of epochs to plan seeds for. Same set across all workers.
-num_epochs_to_generate_seeds_for: int = 2
-
-# frequency of logging loss. If not positive, no logging is provided for training loss
-log_freq: int = 10
-
-# Whether to resume training from checkpoint.
-# If set, precedes init_checkpoint/init_tf_checkpoint
-resume_from_checkpoint: bool = False
-
-# A object to provide some core components in training
-#training_event: ClassVar[BaseTrainingEventInterface] = None
-
-#training_event_instance: BaseTrainingEventInterface = None
-
-# device
-device: str = None
+log_freq: int = 100
+print_freq: int = 100
+n_device: int = 1
+sync_bn: bool = False
+gradient_accumulation_steps: int = 1
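
Note: the rewrite above replaces the old sample-count bookkeeping (max_steps, eval_interval_samples and max_samples_termination, all derived from 5005 steps at batch size 256) with a single epoch-based knob, max_epoch. A minimal sketch of the equivalent arithmetic, assuming the ImageNet-1k train split; this code is not part of the commit:

import math

def steps_per_epoch(num_train_samples, global_batch_size):
    # The train DataLoader does not set drop_last, so the tail batch counts.
    return math.ceil(num_train_samples / global_batch_size)

num_samples = 1281167                          # ImageNet-1k train size (assumed)
per_epoch = steps_per_epoch(num_samples, 256)  # -> 5005, matching the old comments
total_steps = per_epoch * 300                  # equivalent of the removed max_steps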
11 changes: 4 additions & 7 deletions training/benchmarks/mobilenetv2/pytorch/config/mutable_params.py
100644 → 100755
@@ -1,9 +1,6 @@
 mutable_params = [
-    'train_data', 'eval_data', 'init_checkpoint', 'train_batch_size',
-    'eval_batch_size', 'dist_backend', 'lr', 'weight_decay',
-    'gradient_accumulation_steps', 'max_samples_termination', "vendor",
-    'cudnn_benchmark',
-    'cudnn_deterministic'
+    'vendor', 'data_dir', 'train_data', 'eval_data', 'lr', 'weight_decay',
+    'momentum', 'lr_steps', 'lr_gamma', 'train_batch_size', 'eval_batch_size',
+    'do_train', 'fp16', 'distributed', 'dist_backend', 'num_workers', 'device',
+    'cudnn_benchmark', 'cudnn_deterministic'
 ]
-
-mutable_params += ["local_rank", "do_train", "data_dir", "log_freq"]
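
mutable_params whitelists the _base.py attributes that a vendor-specific config may override; the change aligns the list with the new parameter set. A sketch of how such a merge step could work, assuming a FlagPerf-style driver (the helper below is illustrative, not this repository's implementation):

def apply_vendor_overrides(base_config, vendor_config, mutable_params):
    # Copy only whitelisted attributes from the vendor module onto the base config.
    for name in mutable_params:
        if hasattr(vendor_config, name):
            setattr(base_config, name, getattr(vendor_config, name))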
2 changes: 0 additions & 2 deletions training/benchmarks/mobilenetv2/pytorch/dataloaders/__init__.py
100644 → 100755
@@ -1,2 +0,0 @@
-from .dataloader import (build_train_dataset, build_eval_dataset,
-                         build_train_dataloader, build_eval_dataloader)
188 changes: 89 additions & 99 deletions training/benchmarks/mobilenetv2/pytorch/dataloaders/dataloader.py
100644 → 100755
@@ -1,99 +1,89 @@
-# coding=utf-8
-
-import os
-import sys
-import random
-import numpy as np
-import torch
-from torch.utils.data import Dataset
-from torchvision import datasets, models, transforms
-import torch.distributed as dist
-from torch.utils.data.dataloader import default_collate
-
-CURR_PATH = os.path.abspath(os.path.dirname(__file__))
-sys.path.append(os.path.abspath(os.path.join(CURR_PATH, "../../../")))
-from driver import dist_pytorch
-
-
-def build_train_dataset(args):
-    traindir = os.path.join(args.data_dir, args.train_data)
-    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
-                                     std=[0.229, 0.224, 0.225])
-
-    train_dataset = datasets.ImageFolder(
-        traindir,
-        transforms.Compose([
-            transforms.RandomResizedCrop(224),
-            transforms.RandomHorizontalFlip(),
-            transforms.ToTensor(),
-            normalize,
-        ]))
-    return train_dataset
-
-
-def build_eval_dataset(args):
-    valdir = os.path.join(args.data_dir, args.eval_data)
-    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
-                                     std=[0.229, 0.224, 0.225])
-
-    val_dataset = datasets.ImageFolder(
-        valdir,
-        transforms.Compose([
-            transforms.Resize(256),
-            transforms.CenterCrop(224),
-            transforms.ToTensor(),
-            normalize,
-        ]))
-
-    return val_dataset
-
-
-def build_train_dataloader(train_dataset, args):
-    """Traing dataloaders."""
-    dist_pytorch.main_proc_print('building train dataloaders ...')
-
-    if torch.distributed.is_available() and torch.distributed.is_initialized():
-        train_sampler = torch.utils.data.distributed.DistributedSampler(
-            train_dataset)
-        dist_pytorch.main_proc_print(
-            f"use sampler: DistributedSampler, num_replicas:{args.n_device}")
-    else:
-        train_sampler = None
-
-    train_dataloader = torch.utils.data.DataLoader(
-        train_dataset,
-        batch_size=args.train_batch_size,
-        shuffle=(train_sampler is None),
-        num_workers=args.num_workers,
-        pin_memory=True,
-        sampler=train_sampler)
-
-    dist_pytorch.main_proc_print(
-        f'train samples:{len(train_dataset)}, batch size:{args.train_batch_size}'
-    )
-    return train_dataloader
-
-
-def build_eval_dataloader(eval_dataset, args):
-    """Traing and validation dataloaders."""
-    dist_pytorch.main_proc_print('building eval dataloaders ...')
-
-    if torch.distributed.is_available() and torch.distributed.is_initialized():
-        val_sampler = torch.utils.data.distributed.DistributedSampler(
-            eval_dataset, shuffle=False, drop_last=True)
-        dist_pytorch.main_proc_print(
-            f"use sampler: DistributedSampler, num_replicas:{args.n_device}")
-    else:
-        val_sampler = None
-
-    eval_dataloader = torch.utils.data.DataLoader(
-        eval_dataset,
-        batch_size=args.eval_batch_size,
-        shuffle=False,
-        num_workers=args.num_workers,
-        pin_memory=True,
-        sampler=val_sampler)
-
-    dist_pytorch.main_proc_print(
-        f'eval samples:{len(eval_dataset)}, batch size:{args.eval_batch_size}')
-    return eval_dataloader
+import os
+import torch
+import torch.utils.data
+import torchvision
+import torchvision.transforms as t
+
+
+class ToFloat16(object):
+
+    def __call__(self, tensor):
+        return tensor.to(dtype=torch.float16)
+
+
+def build_train_dataset(config):
+    normalize = t.Normalize(mean=[0.485, 0.456, 0.406],
+                            std=[0.229, 0.224, 0.225])
+    traindir = os.path.join(config.data_dir, config.train_data)
+    if config.fp16:
+        dataset = torchvision.datasets.ImageFolder(
+            traindir,
+            t.Compose([
+                t.RandomResizedCrop(224),
+                t.RandomHorizontalFlip(),
+                t.ToTensor(),
+                ToFloat16(), normalize
+            ]))
+    else:
+        dataset = torchvision.datasets.ImageFolder(
+            traindir,
+            t.Compose([
+                t.RandomResizedCrop(224),
+                t.RandomHorizontalFlip(),
+                t.ToTensor(), normalize
+            ]))
+    return dataset
+
+
+def build_eval_dataset(config):
+    normalize = t.Normalize(mean=[0.485, 0.456, 0.406],
+                            std=[0.229, 0.224, 0.225])
+    evaldir = os.path.join(config.data_dir, config.eval_data)
+    if config.fp16:
+        dataset = torchvision.datasets.ImageFolder(
+            evaldir,
+            t.Compose([
+                t.Resize(256),
+                t.CenterCrop(224),
+                t.ToTensor(),
+                ToFloat16(), normalize
+            ]))
+    else:
+        dataset = torchvision.datasets.ImageFolder(
+            evaldir,
+            t.Compose(
+                [t.Resize(256),
+                 t.CenterCrop(224),
+                 t.ToTensor(), normalize]))
+    return dataset
+
+
+def build_train_dataloader(dataset, config):
+    if config.distributed:
+        train_sampler = torch.utils.data.distributed.DistributedSampler(
+            dataset)
+    else:
+        train_sampler = torch.utils.data.RandomSampler(dataset)
+
+    data_loader = torch.utils.data.DataLoader(
+        dataset,
+        batch_size=config.train_batch_size,
+        sampler=train_sampler,
+        num_workers=config.num_workers,
+        pin_memory=True)
+    return data_loader
+
+
+def build_eval_dataloader(dataset, config):
+    if config.distributed:
+        test_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
+    else:
+        test_sampler = torch.utils.data.SequentialSampler(dataset)
+
+    data_loader = torch.utils.data.DataLoader(
+        dataset,
+        batch_size=config.eval_batch_size,
+        sampler=test_sampler,
+        num_workers=config.num_workers,
+        pin_memory=True)
+    return data_loader
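
For reference, a single-process smoke test of the rewritten loaders could look like the sketch below; the import path assumes the benchmark's pytorch directory is the working directory (the package re-export was removed above), and the config values, including data_dir, are assumptions rather than defaults shipped by this commit:

from types import SimpleNamespace

from dataloaders.dataloader import (build_train_dataset, build_eval_dataset,
                                    build_train_dataloader, build_eval_dataloader)

config = SimpleNamespace(data_dir="/path/to/imagenet",  # hypothetical location
                         train_data="train",
                         eval_data="val",
                         fp16=False,
                         distributed=False,             # exercises the non-DDP samplers
                         train_batch_size=8,
                         eval_batch_size=8,
                         num_workers=4)

train_loader = build_train_dataloader(build_train_dataset(config), config)
eval_loader = build_eval_dataloader(build_eval_dataset(config), config)

When config.distributed is set, the caller still has to advance shuffling each epoch via train_loader.sampler.set_epoch(epoch), since build_train_dataloader does not return the sampler.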
8 changes: 3 additions & 5 deletions training/benchmarks/mobilenetv2/pytorch/model/__init__.py
100644 → 100755
@@ -1,7 +1,5 @@
-import torch
-import torchvision
+from torchvision.models import mobilenet_v2
 
 
-def create_model(config):
-    model = torchvision.models.mobilenet_v2()
-    return model
+def create_model():
+    return mobilenet_v2()
10 changes: 10 additions & 0 deletions training/benchmarks/mobilenetv2/pytorch/optimizers/__init__.py
@@ -0,0 +1,10 @@
+from torch.optim import SGD
+
+
+def create_optimizer(model, args):
+    params = [p for p in model.parameters() if p.requires_grad]
+    opt = SGD(params,
+              lr=args.lr,
+              momentum=args.momentum,
+              weight_decay=args.weight_decay)
+    return opt
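
The new optimizer factory pairs with the lr_steps/lr_gamma keys added to _base.py. A sketch of the expected wiring, assuming MultiStepLR (no scheduler is added by this commit) and imports from the benchmark's model and optimizers packages; the lr_steps milestones are placeholders:

from types import SimpleNamespace

from torch.optim.lr_scheduler import MultiStepLR

from model import create_model
from optimizers import create_optimizer

args = SimpleNamespace(lr=0.045,
                       momentum=0.9,
                       weight_decay=0.00004,
                       lr_steps=[100, 200],  # placeholder milestones
                       lr_gamma=0.98)

model = create_model()
optimizer = create_optimizer(model, args)
scheduler = MultiStepLR(optimizer, milestones=args.lr_steps, gamma=args.lr_gamma)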