refine mobilenetv2 #153

Merged · 4 commits · Jul 21, 2023
Empty file modified training/benchmarks/mobilenetv2/pytorch/config/__init__.py
100644 → 100755
Empty file.
95 changes: 38 additions & 57 deletions training/benchmarks/mobilenetv2/pytorch/config/_base.py
100644 → 100755
@@ -1,80 +1,61 @@
-from typing import ClassVar
-#from train.event.base import BaseTrainingEventInterface
+# DO NOT MODIFY THESE REQUIRED PARAMETERS
 
-# case info
-# chip vendor: nvidia, kunlunxin, iluvatar, cambricon etc. key vendor is required.
+# Required parameters
 vendor: str = None
-# model name
-name: str = "MobileNetV2"
+data_dir: str = None
+name: str = "mobilenetv2"
+cudnn_benchmark: bool = False
+cudnn_deterministic: bool = True
 
+# Optional parameters
 
-do_train = True
-fp16 = True
 # =========================================================
 # data
 # =========================================================
-data_dir: str = None
 train_data: str = "train"
 eval_data: str = "val"
 output_dir: str = ""
 init_checkpoint: str = ""
 
 # =========================================================
-# train && evaluate
+# loss scale
 # =========================================================
-train_batch_size: int = 8
-eval_batch_size: int = 8
-dist_backend: str = 'nccl'
-
 lr: float = 0.045
-lr_step_size: int = 1
-lr_gamma: float = 0.98
 
 weight_decay: float = 0.00004
-gradient_accumulation_steps: int = 1
 momentum: float = 0.9
+lr_steps: list = 1
+lr_gamma: float = 0.98
 
-max_steps: int = 5005 * 300  # 300 epoch
-seed: int = 41
-# torch.backends.cudnn.benchmark
-cudnn_benchmark: bool = False
-# torch.backends.cudnn.deterministic
-cudnn_deterministic: bool = True
-
-# Stop training after reaching this accuracy
-target_acc1: float = 70.634
+# =========================================================
+# train && evaluate
+# =========================================================
+train_batch_size: int = 8
+eval_batch_size: int = 8
 
-# Sample to begin performing eval.
-eval_iter_start_samples: int = 100
+# https://github.com/pytorch/vision/blob/main/torchvision/models/mobilenetv2.py#L193
+target_acc1: float = 68.6
+# https://github.com/pytorch/vision/tree/main/references/classification
+max_epoch: int = 300
 
-# If set to -1, disable eval, else evaluate every eval_iter_samples during training
-eval_interval_samples: int = 5005 * 256 * 1  # 1 epoch
-
-# Total number of training samples to run.
-max_samples_termination: float = 5005 * 256 * 300  # 300 epoch
+do_train = True
+fp16 = False
+amp: bool = False
+distributed: bool = True
 
-# number workers for dataloader
+# =========================================================
+# utils
+# =========================================================
+seed: int = 41
+dist_backend: str = 'nccl'
 num_workers: int = 16
+device: str = None
 
-# local_rank for distributed training on gpus
-local_rank: int = 0
-# Whether to read local rank from ENVVAR
+# =========================================================
+# for driver
+# =========================================================
+local_rank: int = -1
 use_env: bool = True
 
-# Number of epochs to plan seeds for. Same set across all workers.
-num_epochs_to_generate_seeds_for: int = 2
-
-# frequency of logging loss. If not positive, no logging is provided for training loss
-log_freq: int = 10
-
-# Whether to resume training from checkpoint.
-# If set, precedes init_checkpoint/init_tf_checkpoint
-resume_from_checkpoint: bool = False
-
-# A object to provide some core components in training
-#training_event: ClassVar[BaseTrainingEventInterface] = None
-
-#training_event_instance: BaseTrainingEventInterface = None
-
-# device
-device: str = None
+log_freq: int = 100
+print_freq: int = 100
+n_device: int = 1
+sync_bn: bool = False
+gradient_accumulation_steps: int = 1
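For reference, flat config modules like _base.py are typically snapshotted onto a single config object before training starts. A minimal sketch of that pattern follows; the Config class and the import path are illustrative assumptions, not code from this PR:

import config._base as base  # assumption: the config package is importable


class Config:
    """Hypothetical container: copies every public attribute of a flat
    config module, so annotated (lr: float = 0.045) and plain
    (do_train = True) assignments alike become fields."""

    def __init__(self, module):
        for key in dir(module):
            if not key.startswith("_"):
                setattr(self, key, getattr(module, key))


config = Config(base)
print(config.name, config.lr, config.max_epoch)  # mobilenetv2 0.045 300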
11 changes: 4 additions & 7 deletions training/benchmarks/mobilenetv2/pytorch/config/mutable_params.py
100644 → 100755
@@ -1,9 +1,6 @@
 mutable_params = [
-    'train_data', 'eval_data', 'init_checkpoint', 'train_batch_size',
-    'eval_batch_size', 'dist_backend', 'lr', 'weight_decay',
-    'gradient_accumulation_steps', 'max_samples_termination', "vendor",
-    'cudnn_benchmark',
-    'cudnn_deterministic'
+    'vendor', 'data_dir', 'train_data', 'eval_data', 'lr', 'weight_decay',
+    'momentum', 'lr_steps', 'lr_gamma', 'train_batch_size', 'eval_batch_size',
+    'do_train', 'fp16', 'distributed', 'dist_backend', 'num_workers', 'device',
+    'cudnn_benchmark', 'cudnn_deterministic'
 ]
-
-mutable_params += ["local_rank", "do_train", "data_dir", "log_freq"]
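mutable_params conventionally whitelists which of the _base.py values a vendor config or the launcher may override; everything else stays fixed. A hedged sketch of that gating step (apply_overrides is illustrative, not this repo's actual driver code):

def apply_overrides(config, overrides, mutable_params):
    # Copy only whitelisted keys onto the config; reject anything else.
    for key, value in overrides.items():
        if key not in mutable_params:
            raise KeyError(f"'{key}' is not in mutable_params")
        setattr(config, key, value)


# For example, a vendor file could raise the batch size:
# apply_overrides(config, {"train_batch_size": 256}, mutable_params)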
2 changes: 0 additions & 2 deletions training/benchmarks/mobilenetv2/pytorch/dataloaders/__init__.py
100644 → 100755
@@ -1,2 +0,0 @@
-from .dataloader import (build_train_dataset, build_eval_dataset,
-                         build_train_dataloader, build_eval_dataloader)
188 changes: 89 additions & 99 deletions training/benchmarks/mobilenetv2/pytorch/dataloaders/dataloader.py
100644 → 100755
@@ -1,99 +1,89 @@
-# coding=utf-8
-
-import os
-import sys
-import random
-import numpy as np
-import torch
-from torch.utils.data import Dataset
-from torchvision import datasets, models, transforms
-import torch.distributed as dist
-from torch.utils.data.dataloader import default_collate
-
-CURR_PATH = os.path.abspath(os.path.dirname(__file__))
-sys.path.append(os.path.abspath(os.path.join(CURR_PATH, "../../../")))
-from driver import dist_pytorch
-
-
-def build_train_dataset(args):
-    traindir = os.path.join(args.data_dir, args.train_data)
-    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
-                                     std=[0.229, 0.224, 0.225])
-
-    train_dataset = datasets.ImageFolder(
-        traindir,
-        transforms.Compose([
-            transforms.RandomResizedCrop(224),
-            transforms.RandomHorizontalFlip(),
-            transforms.ToTensor(),
-            normalize,
-        ]))
-    return train_dataset
-
-
-def build_eval_dataset(args):
-    valdir = os.path.join(args.data_dir, args.eval_data)
-    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
-                                     std=[0.229, 0.224, 0.225])
-
-    val_dataset = datasets.ImageFolder(
-        valdir,
-        transforms.Compose([
-            transforms.Resize(256),
-            transforms.CenterCrop(224),
-            transforms.ToTensor(),
-            normalize,
-        ]))
-
-    return val_dataset
-
-
-def build_train_dataloader(train_dataset, args):
-    """Traing dataloaders."""
-    dist_pytorch.main_proc_print('building train dataloaders ...')
-
-    if torch.distributed.is_available() and torch.distributed.is_initialized():
-        train_sampler = torch.utils.data.distributed.DistributedSampler(
-            train_dataset)
-        dist_pytorch.main_proc_print(
-            f"use sampler: DistributedSampler, num_replicas:{args.n_device}")
-    else:
-        train_sampler = None
-
-    train_dataloader = torch.utils.data.DataLoader(
-        train_dataset,
-        batch_size=args.train_batch_size,
-        shuffle=(train_sampler is None),
-        num_workers=args.num_workers,
-        pin_memory=True,
-        sampler=train_sampler)
-
-    dist_pytorch.main_proc_print(
-        f'train samples:{len(train_dataset)}, batch size:{args.train_batch_size}'
-    )
-    return train_dataloader
-
-
-def build_eval_dataloader(eval_dataset, args):
-    """Traing and validation dataloaders."""
-    dist_pytorch.main_proc_print('building eval dataloaders ...')
-
-    if torch.distributed.is_available() and torch.distributed.is_initialized():
-        val_sampler = torch.utils.data.distributed.DistributedSampler(
-            eval_dataset, shuffle=False, drop_last=True)
-        dist_pytorch.main_proc_print(
-            f"use sampler: DistributedSampler, num_replicas:{args.n_device}")
-    else:
-        val_sampler = None
-
-    eval_dataloader = torch.utils.data.DataLoader(
-        eval_dataset,
-        batch_size=args.eval_batch_size,
-        shuffle=False,
-        num_workers=args.num_workers,
-        pin_memory=True,
-        sampler=val_sampler)
-
-    dist_pytorch.main_proc_print(
-        f'eval samples:{len(eval_dataset)}, batch size:{args.eval_batch_size}')
-    return eval_dataloader
+import os
+import torch
+import torch.utils.data
+import torchvision
+import torchvision.transforms as t
+
+
+class ToFloat16(object):
+
+    def __call__(self, tensor):
+        return tensor.to(dtype=torch.float16)
+
+
+def build_train_dataset(config):
+    normalize = t.Normalize(mean=[0.485, 0.456, 0.406],
+                            std=[0.229, 0.224, 0.225])
+    traindir = os.path.join(config.data_dir, config.train_data)
+    if config.fp16:
+        dataset = torchvision.datasets.ImageFolder(
+            traindir,
+            t.Compose([
+                t.RandomResizedCrop(224),
+                t.RandomHorizontalFlip(),
+                t.ToTensor(),
+                ToFloat16(), normalize
+            ]))
+    else:
+        dataset = torchvision.datasets.ImageFolder(
+            traindir,
+            t.Compose([
+                t.RandomResizedCrop(224),
+                t.RandomHorizontalFlip(),
+                t.ToTensor(), normalize
+            ]))
+    return dataset
+
+
+def build_eval_dataset(config):
+    normalize = t.Normalize(mean=[0.485, 0.456, 0.406],
+                            std=[0.229, 0.224, 0.225])
+    evaldir = os.path.join(config.data_dir, config.eval_data)
+    if config.fp16:
+        dataset = torchvision.datasets.ImageFolder(
+            evaldir,
+            t.Compose([
+                t.Resize(256),
+                t.CenterCrop(224),
+                t.ToTensor(),
+                ToFloat16(), normalize
+            ]))
+    else:
+        dataset = torchvision.datasets.ImageFolder(
+            evaldir,
+            t.Compose(
+                [t.Resize(256),
+                 t.CenterCrop(224),
+                 t.ToTensor(), normalize]))
+    return dataset
+
+
+def build_train_dataloader(dataset, config):
+    if config.distributed:
+        train_sampler = torch.utils.data.distributed.DistributedSampler(
+            dataset)
+    else:
+        train_sampler = torch.utils.data.RandomSampler(dataset)
+
+    data_loader = torch.utils.data.DataLoader(
+        dataset,
+        batch_size=config.train_batch_size,
+        sampler=train_sampler,
+        num_workers=config.num_workers,
+        pin_memory=True)
+    return data_loader
+
+
+def build_eval_dataloader(dataset, config):
+    if config.distributed:
+        test_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
+    else:
+        test_sampler = torch.utils.data.SequentialSampler(dataset)
+
+    data_loader = torch.utils.data.DataLoader(
+        dataset,
+        batch_size=config.eval_batch_size,
+        sampler=test_sampler,
+        num_workers=config.num_workers,
+        pin_memory=True)
+    return data_loader
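The rewritten builders only need a config carrying data_dir, train_data/eval_data, fp16, distributed, the two batch sizes, and num_workers. A smoke-test sketch under assumed values (the SimpleNamespace fields and the /data/imagenet path are made up; ImageFolder expects an ImageNet-style class-per-folder layout):

from types import SimpleNamespace

config = SimpleNamespace(data_dir="/data/imagenet", train_data="train",
                         eval_data="val", fp16=False, distributed=False,
                         train_batch_size=8, eval_batch_size=8, num_workers=4)

train_dataset = build_train_dataset(config)
train_loader = build_train_dataloader(train_dataset, config)
images, targets = next(iter(train_loader))
print(images.shape)  # torch.Size([8, 3, 224, 224])

Note that when config.distributed is set, DistributedSampler requires torch.distributed.init_process_group to have been called first, and the trainer should call set_epoch on the loader's sampler each epoch to reshuffle; when it is unset, RandomSampler supplies the shuffling that the old shuffle=(train_sampler is None) argument used to provide.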
8 changes: 3 additions & 5 deletions training/benchmarks/mobilenetv2/pytorch/model/__init__.py
100644 → 100755
@@ -1,7 +1,5 @@
-import torch
-import torchvision
+from torchvision.models import mobilenet_v2
 
 
-def create_model(config):
-    model = torchvision.models.mobilenet_v2()
-    return model
+def create_model():
+    return mobilenet_v2()
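create_model now takes no arguments and returns a randomly initialized torchvision MobileNetV2 (num_classes defaults to 1000). A quick shape check, for illustration only:

import torch

model = create_model()
logits = model(torch.randn(1, 3, 224, 224))
print(logits.shape)  # torch.Size([1, 1000])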
10 changes: 10 additions & 0 deletions training/benchmarks/mobilenetv2/pytorch/optimizers/__init__.py
@@ -0,0 +1,10 @@
+from torch.optim import SGD
+
+
+def create_optimizer(model, args):
+    params = [p for p in model.parameters() if p.requires_grad]
+    opt = SGD(params,
+              lr=args.lr,
+              momentum=args.momentum,
+              weight_decay=args.weight_decay)
+    return opt
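create_optimizer pairs with the lr_steps/lr_gamma values from _base.py if the trainer drives a step-decay schedule. A hedged sketch using MultiStepLR (the scheduler choice and the SimpleNamespace values are assumptions, not shown in this diff; note that _base.py declares lr_steps: list = 1, so a vendor config would need to override it with an actual list of epoch milestones for this to work):

import torch
from types import SimpleNamespace
from torch.optim.lr_scheduler import MultiStepLR

args = SimpleNamespace(lr=0.045, momentum=0.9, weight_decay=0.00004,
                       lr_steps=[150, 250], lr_gamma=0.98)
model = torch.nn.Linear(8, 2)  # stand-in for the real MobileNetV2
optimizer = create_optimizer(model, args)
scheduler = MultiStepLR(optimizer, milestones=args.lr_steps,
                        gamma=args.lr_gamma)

for epoch in range(300):
    # ... train one epoch with `optimizer` ...
    scheduler.step()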