From 2d1c4a45496473cc1eff6663942cfda3fee1e83e Mon Sep 17 00:00:00 2001 From: zhouyu Date: Tue, 8 Aug 2023 13:29:11 +0800 Subject: [PATCH 1/7] refine GLM --- .../benchmarks/glm/pytorch/config/_base.py | 8 ++- .../benchmarks/glm/pytorch/run_pretraining.py | 66 ++++++++----------- .../benchmarks/glm/pytorch/train/evaluator.py | 7 +- .../benchmarks/glm/pytorch/train/trainer.py | 54 +++++++-------- .../glm/pytorch/train/trainer_adapter.py | 6 -- .../glm/pytorch/train/training_state.py | 6 ++ training/nvidia/glm-pytorch/README.md | 31 +++++++-- .../glm-pytorch/config/config_A100x1x1.py | 6 +- 8 files changed, 95 insertions(+), 89 deletions(-) diff --git a/training/benchmarks/glm/pytorch/config/_base.py b/training/benchmarks/glm/pytorch/config/_base.py index a8d2651a8..4c8e2f073 100644 --- a/training/benchmarks/glm/pytorch/config/_base.py +++ b/training/benchmarks/glm/pytorch/config/_base.py @@ -1,18 +1,20 @@ -from typing import ClassVar -#from train.event.base import BaseTrainingEventInterface +# required parameters # case info # chip vendor: nvidia, kunlunxin, iluvatar, cambricon etc. key vendor is required. vendor: str = None # model name name: str = "GLM" +cudnn_benchmark: bool = False +cudnn_deterministic: bool = True +data_dir: str = None do_train = True fp16 = True # ========================================================= # data # ========================================================= -data_dir: str = "/mnt/data/glm/train/" + train_data: str = "ReCoRD/glm_train_eval_hdf5_sparse/train_hdf5/train_sparse.hdf5" eval_data: str = "ReCoRD/glm_train_eval_hdf5_sparse/eval_hdf5/eval_sparse.hdf5" output_dir: str = "" diff --git a/training/benchmarks/glm/pytorch/run_pretraining.py b/training/benchmarks/glm/pytorch/run_pretraining.py index 5aab33748..b56fcc1bc 100644 --- a/training/benchmarks/glm/pytorch/run_pretraining.py +++ b/training/benchmarks/glm/pytorch/run_pretraining.py @@ -1,41 +1,39 @@ +# Copyright (c) 2023 BAAI. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License") """GLM Pretraining""" -import time -import argparse import os import sys -import numpy as np +import time import torch -import random +import config from dataloaders import (WorkerInitializer, build_train_dataloader, build_eval_dataloaders) from train.trainer import Trainer, Evaluator from train.training_state import TrainingState -# from train.event import TrainingEventCompose, TrainingLogger, BaseTrainingEventInterface - from train import trainer_adapter CURR_PATH = os.path.abspath(os.path.dirname(__file__)) sys.path.append(os.path.abspath(os.path.join(CURR_PATH, "../../"))) -import driver -from driver import Driver, Event, dist_pytorch, check +from driver import Event, dist_pytorch, check +from driver.helper import InitHelper logger = None def main(): - import config + from config import mutable_params global logger + global config if config.use_env and 'LOCAL_RANK' in os.environ: config.local_rank = int(os.environ['LOCAL_RANK']) - glm_driver = Driver(config, config.mutable_params) - glm_driver.setup_config(argparse.ArgumentParser("Glm")) - glm_driver.setup_modules(globals(), locals()) - + init_helper = InitHelper(config) + glm_driver = init_helper.init_driver(globals(), locals()) logger = glm_driver.logger dist_pytorch.init_dist_training_env(config) @@ -54,12 +52,13 @@ def main(): else: worker_seed = worker_seeds[0] - random.seed(worker_seed) - np.random.seed(worker_seed) - torch.manual_seed(worker_seed) + init_helper.set_seed(config.seed, config.vendor) + worker_init = WorkerInitializer.default(worker_seed) + train_dataloader = build_train_dataloader(config, worker_init) + eval_dataloader = build_eval_dataloaders(config) - evaluator = Evaluator(config, None) + evaluator = Evaluator(config, eval_dataloader) training_state = TrainingState() trainer = Trainer(driver=glm_driver, adapter=trainer_adapter, @@ -72,71 +71,64 @@ def main(): dist_pytorch.barrier(config.vendor) trainer.init() - eval_dataloader = build_eval_dataloaders(config) - dist_pytorch.barrier(config.vendor) init_evaluation_start = time.time() - evaluator.dataloader = eval_dataloader score = trainer.evaluator.evaluate(trainer) training_state.eval_accuracy = score init_evaluation_end = time.time() init_evaluation_info = dict(eval_accuracy=score, time=init_evaluation_end - init_evaluation_start) - # training_event.on_init_evaluate(init_evaluation_info) glm_driver.event(Event.INIT_EVALUATION, init_evaluation_info) - train_dataloader = build_train_dataloader(config, worker_init) - if not config.do_train: return config, training_state - # training_event.on_init_end() glm_driver.event(Event.INIT_END) init_end_time = logger.previous_log_time training_state.init_time = (init_end_time - init_start_time) / 1e+3 dist_pytorch.barrier(config.vendor) - epoch = -1 - # training_event.on_train_begin() glm_driver.event(Event.TRAIN_START) - raw_train_start_time = logger.previous_log_time + raw_train_start_time = time.time() + epoch = 0 while training_state.num_trained_samples < config.max_samples_termination and not training_state.end_training: - epoch += 1 training_state.epoch = epoch - train_dataloader.sampler.set_epoch(epoch) trainer.train_one_epoch(train_dataloader) + epoch += 1 - # training_event.on_train_end() glm_driver.event(Event.TRAIN_END) - raw_train_end_time = logger.previous_log_time - training_state.raw_train_time = (raw_train_end_time - - raw_train_start_time) / 1e+3 + training_state.raw_train_time = time.time() - raw_train_start_time return config, 
training_state if __name__ == "__main__": - now = time.time() - config, state = main() + config_upadted, state = main() if not dist_pytorch.is_main_process(): sys.exit() e2e_time = time.time() - now - if config.do_train: - training_perf = (dist_pytorch.global_batch_size(config) * + if config_upadted.do_train: + training_perf = (dist_pytorch.global_batch_size(config_upadted) * state.global_steps) / state.raw_train_time finished_info = { "e2e_time": e2e_time, + "global_steps": state.global_steps, + "num_trained_samples": state.num_trained_samples, "training_sequences_per_second": training_perf, "converged": state.converged, "final_accuracy": state.eval_accuracy, "raw_train_time": state.raw_train_time, "init_time": state.init_time, + "pure_training_computing_time": state.pure_compute_time, + "throughput(ips)_raw": state.num_trained_samples / state.raw_train_time, + "throughput(ips)_no_eval": state.num_trained_samples / state.no_eval_time, + "throughput(ips)_pure_compute": state.num_trained_samples / state.pure_compute_time, } else: finished_info = {"e2e_time": e2e_time} diff --git a/training/benchmarks/glm/pytorch/train/evaluator.py b/training/benchmarks/glm/pytorch/train/evaluator.py index cfefb7c40..d471ffced 100644 --- a/training/benchmarks/glm/pytorch/train/evaluator.py +++ b/training/benchmarks/glm/pytorch/train/evaluator.py @@ -1,4 +1,6 @@ -# coding=utf-8 +# Copyright (c) 2023 BAAI. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License") import torch @@ -58,14 +60,12 @@ def multichoice_evaluate(model, dataloader, args, segment_length=10): total_score += batch_score model.train() - #config.training_event_instance.device_barrier() if torch.distributed.is_available() and torch.distributed.is_initialized(): torch.distributed.all_reduce(total_sample, op=torch.distributed.ReduceOp.SUM) torch.distributed.all_reduce(total_score, op=torch.distributed.ReduceOp.SUM) - # print(f"samples:{total_sample}, score:{total_score}") score = total_score / total_sample return score.item() @@ -77,5 +77,4 @@ def em_evaluate(predictions, labels): for pred, true_list in zip(predictions, labels): if pred in true_list: score += 1 - # score = 100.0 * score / len(predictions) return score diff --git a/training/benchmarks/glm/pytorch/train/trainer.py b/training/benchmarks/glm/pytorch/train/trainer.py index 37db435ba..f0a9ea1c5 100644 --- a/training/benchmarks/glm/pytorch/train/trainer.py +++ b/training/benchmarks/glm/pytorch/train/trainer.py @@ -1,27 +1,23 @@ -import torch -from torch.types import Device -import os -import sys +# Copyright (c) 2023 BAAI. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License") import time import math +import torch +from torch.types import Device + +import config from model import create_model from schedulers import create_scheduler - from train.evaluator import Evaluator from train.training_state import TrainingState - -import config - -CURR_PATH = os.path.abspath(os.path.dirname(__file__)) -sys.path.append(os.path.abspath(os.path.join(CURR_PATH, "../../"))) from driver import Driver, Event, dist_pytorch def process_batch(batch, device): """Process batch and produce inputs for the model.""" batch = {t: batch[t].to(device) for t in batch if t != 'answer_idx'} - return batch @@ -64,8 +60,6 @@ def _init_model(self, model, args, device): # Load the checkpoint. sd = torch.load(checkpoint_name, map_location='cpu') - # model = model.module - # Model. 
def extend_embedding_weights(state_weights, model_weights): original_length = state_weights.shape[0] @@ -111,20 +105,27 @@ def extend_embedding_weights(state_weights, model_weights): def train_one_epoch(self, dataloader): state = self.training_state driver = self.driver + dataloader.sampler.set_epoch(state.epoch) driver.event(Event.EPOCH_BEGIN, state.epoch) step_start_time = time.time() - epoch_start_num_sample = state.num_trained_samples + + no_eval_start_time = time.time() + iter_end_time = no_eval_start_time for batch_idx, batch in enumerate(dataloader): + iter_start_time = time.time() + dataload_time = iter_start_time - iter_end_time state.global_steps += 1 # TODO: Maybe we should update num_trained_samples after all epochs. state.num_trained_samples = state.global_steps * \ - dist_pytorch.global_batch_size(self.config) + dist_pytorch.global_batch_size(self.config) driver.event(Event.STEP_BEGIN, step=state.global_steps) self.train_one_step(batch) + self.training_state.no_eval_time += ( + time.time() - iter_start_time) + dataload_time other_state = dict() if state.global_steps % self.config.gradient_accumulation_steps == 0: @@ -159,43 +160,36 @@ def train_one_epoch(self, dataloader): if eval_result is not None: driver.event(Event.EVALUATE, eval_result) + iter_end_time = time.time() + if end_training: break - epoch_start_num_sample += len(dataloader.dataset) - state.num_trained_samples = epoch_start_num_sample - driver.event(Event.EPOCH_END, state.epoch) def train_one_step(self, batch): data = process_batch(batch, self.config.device) state = self.training_state - # self.training_event.on_step_begin(state.global_steps) self.model.train() + pure_compute_start_time = time.time() lm_loss, _ = self.forward(data) lm_loss /= self.config.gradient_accumulation_steps reduced_loss = lm_loss.detach().clone().view(1) - if torch.distributed.is_available( - ) and torch.distributed.is_initialized(): + if dist_pytorch.is_dist_avail_and_initialized(): torch.distributed.all_reduce(reduced_loss.data) reduced_loss.data = reduced_loss.data / (dist_pytorch.get_world_size()) state.loss = lm_loss - #lm_loss.backward() - #self.optimizer.step() self.adapter.backward(state.global_steps, lm_loss, reduced_loss, self.optimizer, self.lr_scheduler, self.model) - #self.adapter.backward(state.global_steps, state.loss, self.optimizer) - #self.adapter.backward(state.global_steps, reduced_loss, self.optimizer) - #self.adapter.backward(state.global_steps, reduced_loss, self.optimizer, self.lr_scheduler) - # self.training_event.on_backward( - # state.global_steps, lm_loss, reduced_loss, self.optimizer, self.lr_scheduler) - #self.lr_scheduler.step() + + self.training_state.pure_compute_time += time.time( + ) - pure_compute_start_time + self.driver.event(Event.BACKWARD, state.global_steps, state.loss, self.optimizer, self.grad_scaler) - #self.lr_scheduler.step() def detect_training_status(self, state): config = self.config diff --git a/training/benchmarks/glm/pytorch/train/trainer_adapter.py b/training/benchmarks/glm/pytorch/train/trainer_adapter.py index 1cb02e9d9..63ab00272 100644 --- a/training/benchmarks/glm/pytorch/train/trainer_adapter.py +++ b/training/benchmarks/glm/pytorch/train/trainer_adapter.py @@ -1,19 +1,13 @@ -import os -import sys - from torch.optim import Optimizer from torch import nn, Tensor from typing import Tuple -import optimizers try: from apex.optimizers import FusedAdam as Adam except ImportError: from torch.optim import AdamW as Adam from optimizers import FP16_Optimizer, get_optimizer_param_groups 
-CURR_PATH = os.path.abspath(os.path.dirname(__file__)) -sys.path.append(os.path.abspath(os.path.join(CURR_PATH, "../../../"))) from driver.dist_pytorch import main_proc_print diff --git a/training/benchmarks/glm/pytorch/train/training_state.py b/training/benchmarks/glm/pytorch/train/training_state.py index 08988ba5f..eea05a0d6 100644 --- a/training/benchmarks/glm/pytorch/train/training_state.py +++ b/training/benchmarks/glm/pytorch/train/training_state.py @@ -1,3 +1,6 @@ +# Copyright (c) 2023 BAAI. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License") from dataclasses import dataclass import inspect import torch @@ -23,6 +26,9 @@ class TrainingState: init_time = 0 raw_train_time = 0 + no_eval_time = 0 + pure_compute_time = 0 + def status(self): if self.converged: self._status = "success" diff --git a/training/nvidia/glm-pytorch/README.md b/training/nvidia/glm-pytorch/README.md index 8507c2b0e..aabb35e11 100644 --- a/training/nvidia/glm-pytorch/README.md +++ b/training/nvidia/glm-pytorch/README.md @@ -20,10 +20,27 @@ ### 运行情况 -| 训练资源 | 配置文件 | 运行时长(s) | 目标精度 | 收敛精度 | Steps数 | 性能(samples/s) | -| -------- | --------------- | ----------- | -------- | -------- | ------- | ---------------- | -| 单机1卡 | config_A100x1x1 | 17049.92 | 0.8 | 0.8044 | 9600 | 4.66 | -| 单机2卡 | config_A100x1x2 | 8621.41 | 0.8 | 0.8065 | 4800 | 9.24 | -| 单机4卡 | config_A100x1x4 | 5386.60 | 0.8 | 0.8052 | 3000 | 18.47 | -| 单机8卡 | config_A100x1x8 | 2755.73 | 0.8 | 0.8059 | 1500 | 36.56 | -| 两机8卡 | config_A100x2x8 | 1417.68 | 0.8 | 0.8008 | 750 | 72.73 | +* 通用指标 + +| 指标名称 | 指标值 | 特殊说明 | +| -------------- | -------------------------------------------- | ------------------------------------------- | +| 任务类别 | 自然语言理解、无条件文本生成、有条件文本生成 | | +| 模型 | GLM | | +| 数据集 | superglue | | +| 数据精度 | precision,见“性能指标” | 可选fp32/amp/fp16 | +| 超参修改 | fix_hp,见“性能指标” | 跑满硬件设备评测吞吐量所需特殊超参 | +| 硬件设备简称 | nvidia A100 | | +| 硬件存储使用 | mem(actual/total),见“性能指标” | 通常称为“显存”,单位为GiB | +| 端到端时间 | e2e_time,见“性能指标” | 总时间+Perf初始化等时间 | +| 总吞吐量 | p_whole,见“性能指标” | 实际训练样本数除以总时间(performance_whole) | +| 训练吞吐量 | p_train,见“性能指标” | 不包含每个epoch末尾的评估部分耗时 | +| **计算吞吐量** | **p_core,见“性能指标”** | 不包含数据IO部分的耗时(p3>p2>p1) | +| 训练结果 | acc,见“性能指标” | 准确率 | +| 额外修改项 | 无 | | + +* 性能指标 + +| 配置 | precision | fix_hp | e2e_time | p_whole | p_train | p_core | acc | mem | +| ------------------ | --------- | --------------- | -------- | ------- | ------- | ------ | ----- | --------- | +| A100单机8卡(1x8) | fp32 | / | 2763 | 36.5 | 42.4 | 42.4 | 0.808 | 33.0/40.0 | +| A100单机8卡(1x8) | fp32 | bs=16, lr=1e-05 | 2688 | 37.4 | 43.5 | 43.5 | 0.801 | 39.5/40.0 | diff --git a/training/nvidia/glm-pytorch/config/config_A100x1x1.py b/training/nvidia/glm-pytorch/config/config_A100x1x1.py index 2cb538394..3c1125e57 100644 --- a/training/nvidia/glm-pytorch/config/config_A100x1x1.py +++ b/training/nvidia/glm-pytorch/config/config_A100x1x1.py @@ -1,5 +1,7 @@ -train_batch_size = 8 -eval_batch_size = 8 +train_batch_size = 16 +eval_batch_size = 16 + +max_samples_termination = 24135 dist_backend = "nccl" From 540bfdade4dab0dddfb57b0ea9f2cdc5923a2747 Mon Sep 17 00:00:00 2001 From: zhouyu Date: Tue, 8 Aug 2023 13:32:08 +0800 Subject: [PATCH 2/7] style --- training/benchmarks/glm/pytorch/train/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/training/benchmarks/glm/pytorch/train/trainer.py b/training/benchmarks/glm/pytorch/train/trainer.py index f0a9ea1c5..71370917d 100644 --- a/training/benchmarks/glm/pytorch/train/trainer.py +++ 
b/training/benchmarks/glm/pytorch/train/trainer.py @@ -120,7 +120,7 @@ def train_one_epoch(self, dataloader): state.global_steps += 1 # TODO: Maybe we should update num_trained_samples after all epochs. state.num_trained_samples = state.global_steps * \ - dist_pytorch.global_batch_size(self.config) + dist_pytorch.global_batch_size(self.config) driver.event(Event.STEP_BEGIN, step=state.global_steps) self.train_one_step(batch) From 1b5ba817ec4cbf42dc725e95252cd82b7fcc1fe8 Mon Sep 17 00:00:00 2001 From: zhouyu Date: Tue, 8 Aug 2023 13:36:59 +0800 Subject: [PATCH 3/7] glm: add 1x1 --- training/nvidia/glm-pytorch/README.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/training/nvidia/glm-pytorch/README.md b/training/nvidia/glm-pytorch/README.md index aabb35e11..411d19783 100644 --- a/training/nvidia/glm-pytorch/README.md +++ b/training/nvidia/glm-pytorch/README.md @@ -40,7 +40,8 @@ * 性能指标 -| 配置 | precision | fix_hp | e2e_time | p_whole | p_train | p_core | acc | mem | -| ------------------ | --------- | --------------- | -------- | ------- | ------- | ------ | ----- | --------- | -| A100单机8卡(1x8) | fp32 | / | 2763 | 36.5 | 42.4 | 42.4 | 0.808 | 33.0/40.0 | -| A100单机8卡(1x8) | fp32 | bs=16, lr=1e-05 | 2688 | 37.4 | 43.5 | 43.5 | 0.801 | 39.5/40.0 | +| 配置 | precision | fix_hp | e2e_time | p_whole | p_train | p_core | acc | mem | +| ------------------- | --------- | --------------- | -------- | ------- | ------- | ------ | ----- | --------- | +| A100单机8卡(1x8) | fp32 | / | 2763 | 36.5 | 42.4 | 42.4 | 0.808 | 33.0/40.0 | +| A100单机8卡(1x8) | fp32 | bs=16, lr=1e-05 | 2688 | 37.4 | 43.5 | 43.5 | 0.801 | 39.5/40.0 | +| A100单机单卡(1x1) | fp32 | bs=16, lr=1e-05 | | 0.35 | 5.5 | 5.5 | | 35.0/40.0 | From 2575c0aa94a490ebca1a13c9224a7a88ebe1e634 Mon Sep 17 00:00:00 2001 From: zhouyu Date: Tue, 8 Aug 2023 13:57:54 +0800 Subject: [PATCH 4/7] add MFU --- training/nvidia/glm-pytorch/README.md | 41 ++++++++++++++------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/training/nvidia/glm-pytorch/README.md b/training/nvidia/glm-pytorch/README.md index 411d19783..0cb73db91 100644 --- a/training/nvidia/glm-pytorch/README.md +++ b/training/nvidia/glm-pytorch/README.md @@ -22,26 +22,27 @@ ### 运行情况 * 通用指标 -| 指标名称 | 指标值 | 特殊说明 | -| -------------- | -------------------------------------------- | ------------------------------------------- | -| 任务类别 | 自然语言理解、无条件文本生成、有条件文本生成 | | -| 模型 | GLM | | -| 数据集 | superglue | | -| 数据精度 | precision,见“性能指标” | 可选fp32/amp/fp16 | -| 超参修改 | fix_hp,见“性能指标” | 跑满硬件设备评测吞吐量所需特殊超参 | -| 硬件设备简称 | nvidia A100 | | -| 硬件存储使用 | mem(actual/total),见“性能指标” | 通常称为“显存”,单位为GiB | -| 端到端时间 | e2e_time,见“性能指标” | 总时间+Perf初始化等时间 | -| 总吞吐量 | p_whole,见“性能指标” | 实际训练样本数除以总时间(performance_whole) | -| 训练吞吐量 | p_train,见“性能指标” | 不包含每个epoch末尾的评估部分耗时 | -| **计算吞吐量** | **p_core,见“性能指标”** | 不包含数据IO部分的耗时(p3>p2>p1) | -| 训练结果 | acc,见“性能指标” | 准确率 | -| 额外修改项 | 无 | | +| 指标名称 | 指标值 | 特殊说明 | +| ---------------- | -------------------------------------------- | ------------------------------------------- | +| 任务类别 | 自然语言理解、无条件文本生成、有条件文本生成 | | +| 模型 | GLM | | +| 数据集 | superglue | | +| 数据精度 | precision,见“性能指标” | 可选fp32/amp/fp16 | +| 超参修改 | fix_hp,见“性能指标” | 跑满硬件设备评测吞吐量所需特殊超参 | +| 硬件设备简称 | nvidia A100 | | +| 硬件存储使用 | mem(actual/total),见“性能指标” | 通常称为“显存”,单位为GiB | +| 端到端时间 | e2e_time,见“性能指标” | 总时间+Perf初始化等时间 | +| 总吞吐量 | p_whole,见“性能指标” | 实际训练样本数除以总时间(performance_whole) | +| 训练吞吐量 | p_train,见“性能指标” | 不包含每个epoch末尾的评估部分耗时 | +| **计算吞吐量** | **p_core,见“性能指标”** | 不包含数据IO部分的耗时(p3>p2>p1) | +| **计算卡使用率** | 
**\*MFU** | model flops utilization | +| 训练结果 | acc,见“性能指标” | 准确率 | +| 额外修改项 | 无 | | * 性能指标 -| 配置 | precision | fix_hp | e2e_time | p_whole | p_train | p_core | acc | mem | -| ------------------- | --------- | --------------- | -------- | ------- | ------- | ------ | ----- | --------- | -| A100单机8卡(1x8) | fp32 | / | 2763 | 36.5 | 42.4 | 42.4 | 0.808 | 33.0/40.0 | -| A100单机8卡(1x8) | fp32 | bs=16, lr=1e-05 | 2688 | 37.4 | 43.5 | 43.5 | 0.801 | 39.5/40.0 | -| A100单机单卡(1x1) | fp32 | bs=16, lr=1e-05 | | 0.35 | 5.5 | 5.5 | | 35.0/40.0 | +| 配置 | precision | fix_hp | e2e_time | p_whole | p_train | p_core | acc | mem | MFU | +| ------------------- | --------- | --------------- | -------- | ------- | ------- | ------ | ----- | --------- | ----- | +| A100单机8卡(1x8) | fp32 | / | 2763 | 36.5 | 42.4 | 42.4 | 0.808 | 33.0/40.0 | 0.275 | +| A100单机8卡(1x8) | fp32 | bs=16, lr=1e-05 | 2688 | 37.4 | 43.5 | 43.5 | 0.801 | 39.5/40.0 | 0.282 | +| A100单机单卡(1x1) | fp32 | bs=16, lr=1e-05 | | 0.35 | 5.5 | 5.5 | | 35.0/40.0 | | From b38f10d4f6e313bde6d976c8963c4787c9851f51 Mon Sep 17 00:00:00 2001 From: zhouyu Date: Sat, 12 Aug 2023 09:38:29 +0800 Subject: [PATCH 5/7] add MFU annotation for case readme --- training/nvidia/glm-pytorch/README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/training/nvidia/glm-pytorch/README.md b/training/nvidia/glm-pytorch/README.md index 0cb73db91..3d5cf0e78 100644 --- a/training/nvidia/glm-pytorch/README.md +++ b/training/nvidia/glm-pytorch/README.md @@ -43,6 +43,8 @@ | 配置 | precision | fix_hp | e2e_time | p_whole | p_train | p_core | acc | mem | MFU | | ------------------- | --------- | --------------- | -------- | ------- | ------- | ------ | ----- | --------- | ----- | -| A100单机8卡(1x8) | fp32 | / | 2763 | 36.5 | 42.4 | 42.4 | 0.808 | 33.0/40.0 | 0.275 | -| A100单机8卡(1x8) | fp32 | bs=16, lr=1e-05 | 2688 | 37.4 | 43.5 | 43.5 | 0.801 | 39.5/40.0 | 0.282 | +| A100单机8卡(1x8) | fp32 | / | 2763 | 36.5 | 42.4 | 42.4 | 0.808 | 33.0/40.0 | 0.035 | +| A100单机8卡(1x8) | fp32 | bs=16, lr=1e-05 | 2688 | 37.4 | 43.5 | 43.5 | 0.801 | 39.5/40.0 | 0.035 | | A100单机单卡(1x1) | fp32 | bs=16, lr=1e-05 | | 0.35 | 5.5 | 5.5 | | 35.0/40.0 | | + +> 注:使用GLMForMultiTokenCloze进行forward计算你,得到MFU=0.04, 使用GLMModel模型forward计算,得到MFU=0.08. 本模型的MFU值偏低是由于原始模型的MFU较低。 \ No newline at end of file From 4c38f32526d5f534f88f325ef537c580d1d5e3da Mon Sep 17 00:00:00 2001 From: zhouyu Date: Fri, 18 Aug 2023 09:58:00 +0800 Subject: [PATCH 6/7] add e2e_time for GLM 1x1 --- training/nvidia/glm-pytorch/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/training/nvidia/glm-pytorch/README.md b/training/nvidia/glm-pytorch/README.md index 3d5cf0e78..127ca2827 100644 --- a/training/nvidia/glm-pytorch/README.md +++ b/training/nvidia/glm-pytorch/README.md @@ -45,6 +45,6 @@ | ------------------- | --------- | --------------- | -------- | ------- | ------- | ------ | ----- | --------- | ----- | | A100单机8卡(1x8) | fp32 | / | 2763 | 36.5 | 42.4 | 42.4 | 0.808 | 33.0/40.0 | 0.035 | | A100单机8卡(1x8) | fp32 | bs=16, lr=1e-05 | 2688 | 37.4 | 43.5 | 43.5 | 0.801 | 39.5/40.0 | 0.035 | -| A100单机单卡(1x1) | fp32 | bs=16, lr=1e-05 | | 0.35 | 5.5 | 5.5 | | 35.0/40.0 | | +| A100单机单卡(1x1) | fp32 | bs=16, lr=1e-05 | 1169 | 0.35 | 5.5 | 5.5 | | 35.0/40.0 | 0.036 | > 注:使用GLMForMultiTokenCloze进行forward计算你,得到MFU=0.04, 使用GLMModel模型forward计算,得到MFU=0.08. 
本模型的MFU值偏低是由于原始模型的MFU较低。 \ No newline at end of file From 3db8a46d065452080344dc3006d7a3120d765570 Mon Sep 17 00:00:00 2001 From: zhouyu Date: Fri, 18 Aug 2023 13:27:48 +0800 Subject: [PATCH 7/7] update 1x1 e2e_time to about 2h --- training/nvidia/glm-pytorch/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/training/nvidia/glm-pytorch/README.md b/training/nvidia/glm-pytorch/README.md index 127ca2827..63d72d0a7 100644 --- a/training/nvidia/glm-pytorch/README.md +++ b/training/nvidia/glm-pytorch/README.md @@ -45,6 +45,6 @@ | ------------------- | --------- | --------------- | -------- | ------- | ------- | ------ | ----- | --------- | ----- | | A100单机8卡(1x8) | fp32 | / | 2763 | 36.5 | 42.4 | 42.4 | 0.808 | 33.0/40.0 | 0.035 | | A100单机8卡(1x8) | fp32 | bs=16, lr=1e-05 | 2688 | 37.4 | 43.5 | 43.5 | 0.801 | 39.5/40.0 | 0.035 | -| A100单机单卡(1x1) | fp32 | bs=16, lr=1e-05 | 1169 | 0.35 | 5.5 | 5.5 | | 35.0/40.0 | 0.036 | +| A100单机单卡(1x1) | fp32 | bs=16, lr=1e-05 | 7695 | 4.2 | 5.5 | 5.5 | | 35.0/40.0 | 0.036 | > 注:使用GLMForMultiTokenCloze进行forward计算你,得到MFU=0.04, 使用GLMModel模型forward计算,得到MFU=0.08. 本模型的MFU值偏低是由于原始模型的MFU较低。 \ No newline at end of file
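

Note (illustrative, not part of the patch series): the three `throughput(ips)_*` entries added to `finished_info` in `run_pretraining.py` are plain ratios of `num_trained_samples` over the timing counters this series adds to `TrainingState` (`raw_train_time`, `no_eval_time`, `pure_compute_time`). The sketch below is a minimal, self-contained restatement of that arithmetic; the class name, function name, and example numbers are hypothetical and do not exist in the repository.

```python
# Illustrative sketch only -- these names and numbers are hypothetical; they
# just restate the arithmetic behind the throughput(ips)_* entries that
# run_pretraining.py adds to finished_info.
from dataclasses import dataclass


@dataclass
class TimingSummary:
    num_trained_samples: int   # global_steps * global batch size
    raw_train_time: float      # wall-clock training time in seconds (includes eval and data IO)
    no_eval_time: float        # training time in seconds, excluding per-epoch evaluation
    pure_compute_time: float   # forward/backward compute time only, in seconds


def throughputs(t: TimingSummary) -> dict:
    # Denominators shrink from raw -> no_eval -> pure_compute, so the reported
    # samples-per-second figures increase in the same order.
    return {
        "throughput(ips)_raw": t.num_trained_samples / t.raw_train_time,
        "throughput(ips)_no_eval": t.num_trained_samples / t.no_eval_time,
        "throughput(ips)_pure_compute": t.num_trained_samples / t.pure_compute_time,
    }


if __name__ == "__main__":
    example = TimingSummary(num_trained_samples=24000,
                            raw_train_time=660.0,
                            no_eval_time=600.0,
                            pure_compute_time=565.0)
    print(throughputs(example))
```

For the MFU column added to the README, the usual definition is achieved model FLOP/s divided by the accelerator's peak FLOP/s; the note above attributes the low value (about 0.035) to the original GLM model implementation rather than to the benchmark harness itself.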