refine efficientnet, add configs && results #252

Merged · 2 commits · Oct 8, 2023
12 changes: 5 additions & 7 deletions training/benchmarks/efficientnet/README.md
@@ -24,10 +24,8 @@ Neither the name of the copyright holder nor the names of its contributors may b


### Framework and chip support
| | Pytorch |Paddle|TensorFlow2|
| ---- | ---- | ---- | ---- |
| Nvidia GPU | ✅ |N/A |N/A|
| 昆仑芯 XPU | ✅ |N/A |N/A|



| | Pytorch | Paddle | TensorFlow2 |
| ------------ | ------- | ------ | ----------- |
| Nvidia GPU | ✅ | N/A | N/A |
| 昆仑芯 XPU | ✅ | N/A | N/A |
| 天数智芯 GPU | ✅ | N/A | N/A |
2 changes: 2 additions & 0 deletions training/benchmarks/efficientnet/pytorch/config/_base.py
@@ -158,3 +158,5 @@
# device
device: str = None
n_device: int = 1

distributed: bool = False
1 change: 0 additions & 1 deletion training/benchmarks/efficientnet/pytorch/model/__init__.py
@@ -1,4 +1,3 @@
import torch
import torchvision


45 changes: 31 additions & 14 deletions training/benchmarks/efficientnet/pytorch/run_pretraining.py
@@ -29,6 +29,7 @@ def main() -> Tuple[Any, Any]:
dist_pytorch.init_dist_training_env(config)
dist_pytorch.barrier(config.vendor)
model_driver.event(Event.INIT_START)
config.distributed = dist_pytorch.get_world_size() > 1

logger = model_driver.logger
init_start_time = logger.previous_log_time
@@ -75,19 +76,15 @@ def main() -> Tuple[Any, Any]:

dist_pytorch.barrier(config.vendor)
model_driver.event(Event.TRAIN_START)
raw_train_start_time = logger.previous_log_time
raw_train_start_time = time.time()

while training_state.epoch < config.epochs and \
not training_state.end_training:
trainer.train_one_epoch(train_dataloader)
training_state.epoch += 1

model_driver.event(Event.TRAIN_END)
raw_train_end_time = logger.previous_log_time

training_state.raw_train_time = (raw_train_end_time -
raw_train_start_time) / 1e+3

training_state.raw_train_time = time.time() - raw_train_start_time
return config, training_state


@@ -104,13 +101,33 @@ def main() -> Tuple[Any, Any]:
training_perf = (global_batch_size *
state.global_steps) / state.raw_train_time
finished_info = {
"e2e_time": e2e_time,
"training_images_per_second": training_perf,
"converged": state.converged,
"final_loss": state.eval_loss,
"final_acc1": state.eval_acc1,
"final_acc5": state.eval_acc5,
"raw_train_time": state.raw_train_time,
"init_time": state.init_time,
"e2e_time":
e2e_time,
"training_images_per_second":
training_perf,
"num_trained_samples":
state.num_trained_samples,
"global_steps":
state.global_steps,
"converged":
state.converged,
"final_loss":
state.eval_loss,
"final_acc1":
state.eval_acc1,
"final_acc5":
state.eval_acc5,
"raw_train_time":
state.raw_train_time,
"init_time":
state.init_time,
"pure_training_computing_time":
state.pure_compute_time,
"throughput(ips)_raw":
state.num_trained_samples / state.raw_train_time,
"throughput(ips)_no_eval":
state.num_trained_samples / state.no_eval_time,
"throughput(ips)_pure_compute":
state.num_trained_samples / state.pure_compute_time,
}
logger.log(Event.FINISHED, message=finished_info, stacklevel=0)
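
The new `finished_info` fields derive three throughput figures from three nested timing windows: `raw_train_time` (the whole training loop), `no_eval_time` (the loop minus per-epoch evaluation), and `pure_compute_time` (forward/backward only). A minimal sketch of how those windows relate — the `StateSketch` container and its numbers are invented for illustration, not the benchmark's actual state:

```python
from dataclasses import dataclass


@dataclass
class StateSketch:
    # Stand-in for TrainingState; field names mirror the ones used above,
    # the values are made up for illustration only.
    num_trained_samples: int = 1_281_167     # roughly one ImageNet epoch
    raw_train_time: float = 560.0            # whole training loop (s)
    no_eval_time: float = 545.0              # loop minus evaluation (s)
    pure_compute_time: float = 530.0         # forward/backward only (s)


state = StateSketch()

# The windows are nested (raw >= no_eval >= pure_compute),
# so the throughputs come out in the opposite order.
ips_raw = state.num_trained_samples / state.raw_train_time
ips_no_eval = state.num_trained_samples / state.no_eval_time
ips_pure = state.num_trained_samples / state.pure_compute_time

assert ips_raw <= ips_no_eval <= ips_pure
print(f"raw={ips_raw:.0f} ips, no_eval={ips_no_eval:.0f} ips, "
      f"pure_compute={ips_pure:.0f} ips")
```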
111 changes: 39 additions & 72 deletions training/benchmarks/efficientnet/pytorch/train/trainer.py
@@ -1,20 +1,19 @@
import torch
from torch.types import Device
import torch.distributed as dist
import os
import sys
import time
import math

import torch
import torch.distributed as dist
from torch.types import Device

import config
from model import create_model
from schedulers import create_scheduler

from train.evaluator import Evaluator
from train.training_state import TrainingState
from train import utils

import config

CURR_PATH = os.path.abspath(os.path.dirname(__file__))
sys.path.append(os.path.abspath(os.path.join(CURR_PATH, "../../")))
from driver import Driver, Event, dist_pytorch
@@ -75,11 +74,11 @@ def train_one_epoch(self, dataloader):
driver = self.driver
driver.event(Event.EPOCH_BEGIN, state.epoch)

step_start_time = time.time()
epoch_start_num_sample = state.num_trained_samples

if dist_pytorch.is_dist_avail_and_initialized():
dataloader.sampler.set_epoch(state.epoch)

no_eval_start_time = time.time()

for batch_idx, batch in enumerate(dataloader):

state.global_steps += 1
@@ -89,46 +88,32 @@ def train_one_epoch(self, dataloader):

driver.event(Event.STEP_BEGIN, step=state.global_steps)
self.train_one_step(batch)
info = {
"num_trained_samples": state.num_trained_samples,
"epoch": state.epoch,
"steps": state.global_steps,
}
driver.event(Event.STEP_END, step=state.global_steps, message=info)

other_state = dict()

step_end_time = time.time()
step_total_time = step_end_time - step_start_time
step_start_time = step_end_time
images_per_second = dist_pytorch.global_batch_size(
self.config) / step_total_time
other_state["img/s"] = images_per_second
if hasattr(self.optimizer, 'loss_scaler'):
loss_scale = self.optimizer.loss_scaler.loss_scale
other_state['loss_scale'] = loss_scale

eval_result = None
if self.can_do_eval(state):
eval_start = time.time()
state.eval_loss, state.eval_acc1, state.eval_acc5 = self.evaluator.evaluate(
self)
eval_end = time.time()
eval_result = dict(global_steps=state.global_steps,
eval_loss=state.eval_loss,
eval_acc1=state.eval_acc1,
eval_acc5=state.eval_acc5,
time=eval_end - eval_start)

end_training = self.detect_training_status(state)
step_info = state.to_dict(**other_state)
driver.event(Event.STEP_END,
message=step_info,
step=state.global_steps,
loss=state.loss)

if eval_result is not None:
driver.event(Event.EVALUATE, eval_result)

if end_training:
if self.detect_training_status(state):
break

epoch_start_num_sample += len(dataloader.dataset)
state.num_trained_samples = epoch_start_num_sample
state.no_eval_time += time.time() - no_eval_start_time

eval_start = time.time()
state.eval_loss, state.eval_acc1, state.eval_acc5 = self.evaluator.evaluate(
self)
eval_end = time.time()
eval_result = dict(
global_steps=state.global_steps,
eval_loss=state.eval_loss,
eval_acc1=state.eval_acc1,
eval_acc5=state.eval_acc5,
time=eval_end - eval_start,
)
print("eval_result", eval_result)



self.lr_scheduler.step()
if self.config.output_dir:
@@ -154,44 +139,26 @@ def train_one_step(self, batch):
batch = self.process_batch(batch, self.config.device)
state = self.training_state
self.model.train()

pure_compute_start_time = time.time()
state.loss, state.acc1, state.acc5 = self.forward(batch)
self.adapter.backward(self.config, state.global_steps, state.epoch,
state.loss, self.model, self.optimizer,
self.scaler)
if dist_pytorch.is_dist_avail_and_initialized():
total = torch.tensor([state.loss, state.acc1, state.acc5],
dtype=torch.float32,
device=self.config.device)
dist.all_reduce(total, dist.ReduceOp.SUM, async_op=False)
total = total / dist.get_world_size()
state.loss, state.acc1, state.acc5 = total.tolist()
self.driver.event(Event.BACKWARD, state.global_steps, state.loss,
self.optimizer, self.scaler)
self.adapter.backward(self.config, state.global_steps, state.loss,
self.model, self.optimizer, self.scaler)
state.pure_compute_time += time.time() - pure_compute_start_time

def detect_training_status(self, state):
config = self.config
if state.eval_acc1 >= config.target_acc1:
dist_pytorch.main_proc_print(
f"converged_success. eval_acc1:{state.eval_acc1} target_acc1:{config.target_acc1}"
)
state.converged_success()

if state.num_trained_samples > config.max_samples_termination:
state.end_training = True

return state.end_training

def can_do_eval(self, state):
config = self.config
do_eval = all([
config.eval_data is not None,
state.num_trained_samples >= config.eval_iter_start_samples,
state.global_steps %
math.ceil(config.eval_interval_samples /
dist_pytorch.global_batch_size(config)) == 0,
config.eval_interval_samples > 0,
state.global_steps > 1,
])

return do_eval or state.num_trained_samples >= config.max_samples_termination

def forward(self, batch):
images, target = batch
output = self.model(images)
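
In the revised `train_one_step`, per-rank `loss`/`acc1`/`acc5` are averaged across processes with a single `all_reduce` before logging. Below is a self-contained sketch of that averaging pattern; the `gloo` backend, the localhost rendezvous, and the fake metric values are assumptions for illustration, not how the benchmark's launcher actually initializes distributed training:

```python
import os

import torch
import torch.distributed as dist
import torch.multiprocessing as mp


def average_metrics(loss, acc1, acc5, device="cpu"):
    """Pack per-rank metrics into one tensor, sum across ranks,
    then divide by world size -- the same pattern as in train_one_step."""
    total = torch.tensor([loss, acc1, acc5], dtype=torch.float32, device=device)
    dist.all_reduce(total, dist.ReduceOp.SUM, async_op=False)
    total = total / dist.get_world_size()
    return total.tolist()


def worker(rank, world_size):
    os.environ["MASTER_ADDR"] = "127.0.0.1"   # assumed single-node setup
    os.environ["MASTER_PORT"] = "29501"
    dist.init_process_group("gloo", rank=rank, world_size=world_size)

    # Fake per-rank metrics, just to show the reduction.
    loss, acc1, acc5 = 1.0 + rank, 70.0 + rank, 90.0 + rank
    mean_loss, mean_acc1, mean_acc5 = average_metrics(loss, acc1, acc5)
    if rank == 0:
        print(f"averaged: loss={mean_loss:.3f} "
              f"acc1={mean_acc1:.3f} acc5={mean_acc5:.3f}")
    dist.destroy_process_group()


if __name__ == "__main__":
    mp.spawn(worker, args=(2,), nprocs=2)
```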
@@ -86,7 +86,7 @@ def create_grad_scaler(args):
return scaler


def backward(args, step: int, epoch: int, loss: torch.Tensor, model: nn.Module,
def backward(args, step: int, loss: torch.Tensor, model: nn.Module,
optimizer: Optimizer, scaler):
if scaler is not None:
scaler.scale(loss).backward()
@@ -27,6 +27,9 @@ class TrainingState:
init_time = 0
raw_train_time = 0

no_eval_time = 0
pure_compute_time = 0

def status(self):
if self.converged:
self._status = "success"
38 changes: 28 additions & 10 deletions training/nvidia/efficientnet-pytorch/README.md
@@ -13,18 +13,36 @@
- OS kernel version: 5.4.0-113-generic
- Accelerator driver version: 470.129.06
- Docker version: 20.10.16
- Training framework version: pytorch-1.8.0a0+52ea372
- Training framework version: pytorch-1.13.0a0+936e930
- Dependency software versions: none


### Results
| Training setup       | Config file     | Runtime (s) | Target accuracy | Converged accuracy | Steps  | Throughput (samples/s) |
| -------------------- | --------------- | ----------- | --------------- | ------------------ | ------ | ---------------------- |
| 1 node, 1 GPU        | config_A100x1x1 |             |                 |                    |        |                        |
| 1 node, 2 GPUs       | config_A100x1x2 |             |                 |                    |        |                        |
| 1 node, 4 GPUs       | config_A100x1x4 |             |                 |                    |        |                        |
| 1 node, 8 GPUs       | config_A100x1x8 | 328383.49   | 82.672          | 82.672             | 750600 | 2340.6                 |
| 2 nodes, 8 GPUs each | config_A100x2x8 |             |                 |                    |        |                        |

The [official accuracy](https://github.com/pytorch/vision/blob/main/torchvision/models/efficientnet.py#L669) is 84.228; training with the [official recipe](https://github.com/pytorch/vision/blob/main/references/classification/README.md) reaches 82.672 here. Further optimization is planned.

* General metrics

| Metric                      | Value                                         | Notes                                                                     |
| --------------------------- | --------------------------------------------- | ------------------------------------------------------------------------- |
| Task category               | Image Classification                          |                                                                            |
| Model                       | EfficientNet                                  |                                                                            |
| Dataset                     | ImageNet                                      |                                                                            |
| Data precision              | precision, see "Performance metrics"          | One of fp32/amp/fp16                                                       |
| Hyperparameter changes      | fix_hp, see "Performance metrics"             | Extra hyperparameters needed to saturate the hardware when measuring throughput |
| Hardware                    | nvidia A100                                   |                                                                            |
| Device memory usage         | mem (actual/total), see "Performance metrics" | Commonly called "VRAM", in GiB                                             |
| End-to-end time             | e2e_time, see "Performance metrics"           | Total time plus Perf initialization etc.                                   |
| Overall throughput          | p_whole, see "Performance metrics"            | Trained samples divided by total time (performance_whole)                  |
| Training throughput         | p_train, see "Performance metrics"            | Excludes the evaluation time at the end of each epoch                      |
| **Compute throughput**      | **p_core, see "Performance metrics"**         | Excludes data-IO time (p_core > p_train > p_whole); see the sketch after the tables |
| **Accelerator utilization** | **\*MFU**                                     | model flops utilization                                                    |
| Training result             | acc, see "Performance metrics"                | Classification accuracy                                                    |
| Additional changes          | None                                          |                                                                            |

* Performance metrics

| Config                          | precision | fix_hp        | e2e_time | p_whole | p_train | p_core | acc   | mem       |
| ------------------------------- | --------- | ------------- | -------- | ------- | ------- | ------ | ----- | --------- |
| A100 1 node, 8 GPUs (1x8)       | fp32      | /             | 320691   | 2297    | 2340    | 2367   | 82.72 | 38.9/40.0 |
| A100 1 node, 1 GPU (1x1)        | fp32      | bs=128,lr=0.1 |          | 302.7   | 308     | 310    |       | 38.1/40.0 |
| A100 2 nodes, 8 GPUs each (2x8) | fp32      | bs=128,lr=0.1 |          | 4488    | 4603    | 4686   |       | 38.3/40.0 |
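
As a quick sanity check on the A100 1x8 row above, interpreting the three throughput columns as the same sample count divided by progressively smaller time windows (an assumption based on the metric definitions in the general-metrics table):

```python
# Values copied from the A100 1x8 row above (samples/s).
p_whole, p_train, p_core = 2297, 2340, 2367

# Whole run -> no-eval -> pure compute: the time windows shrink,
# so the throughputs must be non-decreasing in that order.
assert p_whole <= p_train <= p_core

# Rough share of wall time spent outside pure computation
# (data IO + evaluation), under the assumption above.
overhead = 1 - p_whole / p_core
print(f"non-compute overhead: {overhead:.1%}")  # about 3.0%
```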

@@ -0,0 +1,5 @@
from config_common import *

train_batch_size = 128
eval_batch_size = 128
lr = 0.1
@@ -0,0 +1,5 @@
from config_common import *

train_batch_size = 128
eval_batch_size = 128
lr = 0.1
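
The two new five-line files above are the added configs (per the README they should correspond to the A100 1x1 and 2x8 entries, though the filenames are not visible in this view): each star-imports the vendor-common settings and overrides only the topology-specific batch size and learning rate. A hypothetical sketch of reading such a module-level config — not the repository's actual driver:

```python
import importlib


def load_config(name: str) -> dict:
    """Hypothetical loader: import a config module and collect its
    module-level attributes (including anything pulled in by the
    star import) into a plain dict, skipping private names."""
    mod = importlib.import_module(name)
    return {k: v for k, v in vars(mod).items() if not k.startswith("_")}


# Assumed module name taken from the README's config column; requires
# the config directory to be on sys.path.
# cfg = load_config("config_A100x1x1")
# print(cfg["train_batch_size"], cfg["lr"])
```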