From cbe59fc75f84c37a2b583ec03a8aa181278a68b8 Mon Sep 17 00:00:00 2001 From: zhangsanfeng2022 <99704344+zhangsanfeng2022@users.noreply.github.com> Date: Mon, 27 Nov 2023 13:13:39 +0800 Subject: [PATCH 01/11] [kunlunxin] Longformer update config (#329) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * 【kunlunxin】Longformer update config * update memory used for kunlunxin and solve file conflict. * update 2x8 memory used. * update kunlunxin docs. * update kunlunxin 1x1 memory docs. * Change to a larger batch size. * Remove empty line in test_config.py * Update README. * Update test_conf.py, add comma --------- Co-authored-by: zhangsanfeng2022 Co-authored-by: root Co-authored-by: Zhou Yu --- .../kunlunxin/longformer-pytorch/README.md | 46 +++++++++++++++++++ .../config/config_R300x1x1.py | 4 ++ .../config/config_R300x1x8.py | 4 ++ .../config/config_R300x2x8.py | 4 ++ .../config/config_common.py | 5 ++ .../config/environment_variables.sh | 4 ++ .../config/requirements.txt | 4 ++ .../longformer-pytorch/extern/.gitkeep | 0 training/run_benchmarks/config/test_conf.py | 3 +- 9 files changed, 73 insertions(+), 1 deletion(-) create mode 100644 training/kunlunxin/longformer-pytorch/README.md create mode 100644 training/kunlunxin/longformer-pytorch/config/config_R300x1x1.py create mode 100644 training/kunlunxin/longformer-pytorch/config/config_R300x1x8.py create mode 100644 training/kunlunxin/longformer-pytorch/config/config_R300x2x8.py create mode 100644 training/kunlunxin/longformer-pytorch/config/config_common.py create mode 100644 training/kunlunxin/longformer-pytorch/config/environment_variables.sh create mode 100644 training/kunlunxin/longformer-pytorch/config/requirements.txt create mode 100644 training/kunlunxin/longformer-pytorch/extern/.gitkeep diff --git a/training/kunlunxin/longformer-pytorch/README.md b/training/kunlunxin/longformer-pytorch/README.md new file mode 100644 index 000000000..7a7a6ba27 --- /dev/null +++ b/training/kunlunxin/longformer-pytorch/README.md @@ -0,0 +1,46 @@ +### 测试数据集下载 +[测试数据集下载](../../benchmarks/longformer/README.md#测试数据集下载) + +### 昆仑芯XPU配置与运行信息参考 +#### 环境配置 +- ##### 硬件环境 + - 机器型号: 昆仑芯AI加速器组R480-X8 + - 加速卡型号: 昆仑芯AI加速卡R300 + - 多机网络类型、带宽: InfiniBand,200Gb/s + +- ##### 软件环境 + - OS版本:Ubuntu 20.04 + - OS kernel版本: 5.4.0-26-generic + - 加速卡驱动版本:4.0.25 + - Docker镜像和版本:pytorch1.12.1-cpu-ubuntu20.04:v0.01 + - 训练框架版本:xmlir + - 训练编译器版本:xacc + - 依赖软件版本:pytorch-1.12.1+cpu + +### 运行情况 + +* 通用指标 + +| 指标名称 | 指标值 | 特殊说明 | +| -------------- | ----------------------- | ------------------------------------------- | +| 任务类别 | 情感分析,意图监测,主题分类 | | +| 模型 | longformer-base-4096 model | | +| 数据集 | longformer_train | | +| 数据精度 | precision,见“性能指标” | 可选fp32/amp/fp16 | +| 超参修改 | fix_hp,见“性能指标” | 跑满硬件设备评测吞吐量所需特殊超参 | +| 硬件设备简称 | R300 | | +| 硬件存储使用 | mem,见“性能指标” | 通常称为“显存”,单位为GiB | +| 端到端时间 | e2e_time,见“性能指标” | 总时间+Perf初始化等时间 | +| 总吞吐量 | p_whole,见“性能指标” | 实际训练样本数除以总时间(performance_whole) | +| 训练吞吐量 | p_train,见“性能指标” | 不包含每个epoch末尾的评估部分耗时 | +| 计算吞吐量 | p_core,见“性能指标” | 不包含数据IO部分的耗时(p3>p2>p1),单位为samples/s(seq_length=1024)| +| 训练结果 | acc,见“性能指标” | acc任务准确率 | | + +* 性能指标 + +| 配置 | precision | fix_hp | e2e_time | p_whole | p_train | p_core | acc | mem | +| ------------------- | --------- | ---------------- | -------- | ------- | ------- | ------ | ------- | --------- | +| R300单机单卡(1x1) | fp32 | bs=16,lr=5e-05 | | | | | | 28.0/32.0 | +| R300单机8卡(1x8) | fp32 | bs=16,lr=5e-05 | | | | | 0.64 | 30.0/32.0 | +| R300两机8卡(2x8) | fp32 | bs=16,lr=5e-05 | | | | | | 27.6/32.0 
| + diff --git a/training/kunlunxin/longformer-pytorch/config/config_R300x1x1.py b/training/kunlunxin/longformer-pytorch/config/config_R300x1x1.py new file mode 100644 index 000000000..767ddd4b3 --- /dev/null +++ b/training/kunlunxin/longformer-pytorch/config/config_R300x1x1.py @@ -0,0 +1,4 @@ +from config_common import * + +train_batch_size = 16 +gradient_accumulation_steps = 1 diff --git a/training/kunlunxin/longformer-pytorch/config/config_R300x1x8.py b/training/kunlunxin/longformer-pytorch/config/config_R300x1x8.py new file mode 100644 index 000000000..767ddd4b3 --- /dev/null +++ b/training/kunlunxin/longformer-pytorch/config/config_R300x1x8.py @@ -0,0 +1,4 @@ +from config_common import * + +train_batch_size = 16 +gradient_accumulation_steps = 1 diff --git a/training/kunlunxin/longformer-pytorch/config/config_R300x2x8.py b/training/kunlunxin/longformer-pytorch/config/config_R300x2x8.py new file mode 100644 index 000000000..767ddd4b3 --- /dev/null +++ b/training/kunlunxin/longformer-pytorch/config/config_R300x2x8.py @@ -0,0 +1,4 @@ +from config_common import * + +train_batch_size = 16 +gradient_accumulation_steps = 1 diff --git a/training/kunlunxin/longformer-pytorch/config/config_common.py b/training/kunlunxin/longformer-pytorch/config/config_common.py new file mode 100644 index 000000000..71bcad9df --- /dev/null +++ b/training/kunlunxin/longformer-pytorch/config/config_common.py @@ -0,0 +1,5 @@ +vendor = 'kunlunxin' + +fp16 = False + +dist_backend = "xccl" diff --git a/training/kunlunxin/longformer-pytorch/config/environment_variables.sh b/training/kunlunxin/longformer-pytorch/config/environment_variables.sh new file mode 100644 index 000000000..002a70e4d --- /dev/null +++ b/training/kunlunxin/longformer-pytorch/config/environment_variables.sh @@ -0,0 +1,4 @@ +export XACC_ENABLE=1 +export BKCL_PCIE_RING=1 +export XMLIR_D_FORCE_FALLBACK_STR="aten::index_add_" +export XMLIR_XPU_EAGER_LAUNCH_SYNC_MODE=true diff --git a/training/kunlunxin/longformer-pytorch/config/requirements.txt b/training/kunlunxin/longformer-pytorch/config/requirements.txt new file mode 100644 index 000000000..325729698 --- /dev/null +++ b/training/kunlunxin/longformer-pytorch/config/requirements.txt @@ -0,0 +1,4 @@ +transformers==4.34.1 +datasets==2.14.6 +psutil==5.9.6 +accelerate==0.24.0 diff --git a/training/kunlunxin/longformer-pytorch/extern/.gitkeep b/training/kunlunxin/longformer-pytorch/extern/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/training/run_benchmarks/config/test_conf.py b/training/run_benchmarks/config/test_conf.py index 1d88f659a..ebb0f3f39 100644 --- a/training/run_benchmarks/config/test_conf.py +++ b/training/run_benchmarks/config/test_conf.py @@ -103,7 +103,7 @@ # "gpt3_13B:paddle_2.5.1:TP2PP4SH1SP1A10040G:1:8:1":"/raid/dataset/gpt-3/" # kunlunxin cases - # "gpt2:pytorch:R300:1:8:1": "/raid/dataset/gpt2" + # "gpt2:pytorch:R300:1:8:1": "/raid/dataset/gpt2", # "resnet50:pytorch:R300:1:8:1": "/raid/dataset/ImageNet_1k_2012/", # "mask_rcnn:pytorch:R300:1:8:1": "/raid/dataset/coco2017/", # "retinanet:pytorch:R300:1:8:1": "/raid/dataset/coco2017/", @@ -113,6 +113,7 @@ # "glm:pytorch:R300:1:8:1": "/raid/home_datasets_ckpt/glm/train/", # "mobilenetv2:pytorch:R300:1:8:1": "/raid/dataset/ImageNet_1k_2012/", # "bert:pytorch:R300:1:8:1": "/raid/dataset/bert_large/train", + # "longformer:pytorch:R300:1:8:1": "/raid/dataset/longformer_train", # "distilbert:pytorch:R300:1:8:1": "/raid/dataset/distilbert/", # "swin_transformer:pytorch:R300:1:8:1": "/raid/dataset/ImageNet_1k_2012/" } 
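Patch 01 follows FlagPerf's vendor-config layering: config_common.py carries the vendor-wide settings (vendor, fp16, dist_backend), and each per-topology file such as config_R300x1x8.py star-imports it and then overrides the hyperparameters for that node/card count, while environment_variables.sh forces a fallback implementation for the unsupported `aten::index_add_` op via XMLIR_D_FORCE_FALLBACK_STR. The sketch below mirrors the path-based config loading that appears in run_pretraining.py later in this series; the helper name and the example path are illustrative, not part of the patch.

```python
# Illustrative sketch (not from the patch): how a FlagPerf-style runner can
# resolve one of the layered config_*.py files at runtime. The helper name
# and the example path are hypothetical.
import importlib
import os
import sys

def load_flagperf_config(path):
    """Import a config_*.py file; the file star-imports config_common itself."""
    sys.path.append(os.path.dirname(path))          # make sibling configs importable
    module_name = os.path.basename(path).split(".")[0]
    return importlib.import_module(module_name)

# cfg = load_flagperf_config(
#     "training/kunlunxin/longformer-pytorch/config/config_R300x1x8.py")
# print(cfg.vendor, cfg.dist_backend)   # kunlunxin xccl   (from config_common.py)
# print(cfg.train_batch_size)           # 16               (per-topology override)
```

Because the override is a plain star-import, anything not redefined in the topology file (here, everything except train_batch_size and gradient_accumulation_steps) falls through to config_common.py.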
From 1894ba5e2c2527ca84784838211704a32a57186b Mon Sep 17 00:00:00 2001 From: Irvin Dewees <62273738+cloud9wj@users.noreply.github.com> Date: Tue, 28 Nov 2023 16:48:19 +0800 Subject: [PATCH 02/11] [iluvatar] tacotron2 add 1x1, 2x8 configs (#341) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * update * update * update * update * update * update --------- Co-authored-by: 魏杰 --- training/iluvatar/tacotron2-pytorch/README.md | 6 ++++-- .../tacotron2-pytorch/config/config_BI-V100x1x1.py | 9 +++++++++ .../tacotron2-pytorch/config/config_BI-V100x1x8.py | 2 +- .../tacotron2-pytorch/config/config_BI-V100x2x8.py | 9 +++++++++ 4 files changed, 23 insertions(+), 3 deletions(-) create mode 100644 training/iluvatar/tacotron2-pytorch/config/config_BI-V100x1x1.py create mode 100644 training/iluvatar/tacotron2-pytorch/config/config_BI-V100x2x8.py diff --git a/training/iluvatar/tacotron2-pytorch/README.md b/training/iluvatar/tacotron2-pytorch/README.md index c996efa9b..559087d1d 100644 --- a/training/iluvatar/tacotron2-pytorch/README.md +++ b/training/iluvatar/tacotron2-pytorch/README.md @@ -27,7 +27,7 @@ | 数据集 | LJSpeech | | | 数据精度 | precision,见“性能指标” | 可选fp32/amp/fp16/tf32 | | 超参修改 | fix_hp,见“性能指标” | 跑满硬件设备评测吞吐量所需特殊超参 | -| 硬件设备简称 | nvidia A100 | | +| 硬件设备简称 | Iluvatar BI-V100 | | | 硬件存储使用 | mem,见“性能指标” | 通常称为“显存”,单位为GiB | | 端到端时间 | e2e_time,见“性能指标” | 总时间+Perf初始化等时间 | | 总吞吐量 | p_whole,见“性能指标” | 实际训练样本数除以总时间(performance_whole) | @@ -40,5 +40,7 @@ | 配置 | precision| fix_hp | e2e_time | p_whole | p_train | p_core | val_loss | mem | |--------------------| ---------| ----------------| ---------| ------- | ------- | ------ | -------- | --------- | -| BI100单机8卡(1x8) | tf32 | bs=64, lr=0.001 | 41220 | 33082 | 33289 | 33511 | 0.4833 | 18.4/32.0 | +| BI-V100单机8卡(1x8) | tf32 | bs=96, lr=0.001 | / | / | / | / | 0.4848 | 28/32.0 | +| BI-V100双机8卡(2x8) | tf32 | bs=96, lr=0.001 | / | / | / | / | / | 26.5/32.0 | +| BI-V100单机单卡(1x1) | tf32 | bs=96, lr=0.001 | / | / | / | / | / | 25.3/32.0 | diff --git a/training/iluvatar/tacotron2-pytorch/config/config_BI-V100x1x1.py b/training/iluvatar/tacotron2-pytorch/config/config_BI-V100x1x1.py new file mode 100644 index 000000000..8b4f227ff --- /dev/null +++ b/training/iluvatar/tacotron2-pytorch/config/config_BI-V100x1x1.py @@ -0,0 +1,9 @@ +from config_common import * + +train_batch_size = 96 +eval_batch_size = train_batch_size + +warmup = 0.2 +learning_rate = 1e-3 + +seed = 23333 diff --git a/training/iluvatar/tacotron2-pytorch/config/config_BI-V100x1x8.py b/training/iluvatar/tacotron2-pytorch/config/config_BI-V100x1x8.py index 41e9dd5c7..8b4f227ff 100644 --- a/training/iluvatar/tacotron2-pytorch/config/config_BI-V100x1x8.py +++ b/training/iluvatar/tacotron2-pytorch/config/config_BI-V100x1x8.py @@ -1,6 +1,6 @@ from config_common import * -train_batch_size = 64 +train_batch_size = 96 eval_batch_size = train_batch_size warmup = 0.2 diff --git a/training/iluvatar/tacotron2-pytorch/config/config_BI-V100x2x8.py b/training/iluvatar/tacotron2-pytorch/config/config_BI-V100x2x8.py new file mode 100644 index 000000000..8b4f227ff --- /dev/null +++ b/training/iluvatar/tacotron2-pytorch/config/config_BI-V100x2x8.py @@ -0,0 +1,9 @@ +from config_common import * + +train_batch_size = 96 +eval_batch_size = train_batch_size + +warmup = 0.2 +learning_rate = 1e-3 + +seed = 23333 From 47c52d765816aefd734505169de81e43cfa391f1 Mon Sep 17 00:00:00 2001 From: forestlee95 Date: Tue, 28 Nov 2023 21:43:24 +0800 Subject: [PATCH 03/11] llama2 7B case update --- 
.../llama2_7b/deepspeed/run_pretraining.py | 2 + .../docker_image/deepspeed/Dockerfile | 65 ++++++++++++++ .../deepspeed/deepspeed_install.sh | 28 ++++++ .../docker_image/deepspeed/packages/README.md | 16 ++++ .../deepspeed/sdk_installers/README.md | 8 ++ .../iluvatar/llama2_7b-deepspeed/README.md | 51 +++++++++++ .../config/config_BI-V100x1x8.py | 6 ++ .../config}/ds_config.json | 90 ++++++++++--------- .../config/requirements.txt | 2 + .../deepspeed/start_deepspeed_task.py | 7 +- 10 files changed, 234 insertions(+), 41 deletions(-) create mode 100644 training/iluvatar/docker_image/deepspeed/Dockerfile create mode 100644 training/iluvatar/docker_image/deepspeed/deepspeed_install.sh create mode 100644 training/iluvatar/docker_image/deepspeed/packages/README.md create mode 100644 training/iluvatar/docker_image/deepspeed/sdk_installers/README.md create mode 100644 training/iluvatar/llama2_7b-deepspeed/README.md create mode 100644 training/iluvatar/llama2_7b-deepspeed/config/config_BI-V100x1x8.py rename training/{benchmarks/llama2_7b/deepspeed => iluvatar/llama2_7b-deepspeed/config}/ds_config.json (53%) create mode 100644 training/iluvatar/llama2_7b-deepspeed/config/requirements.txt diff --git a/training/benchmarks/llama2_7b/deepspeed/run_pretraining.py b/training/benchmarks/llama2_7b/deepspeed/run_pretraining.py index 10ae55779..1192d1bb7 100644 --- a/training/benchmarks/llama2_7b/deepspeed/run_pretraining.py +++ b/training/benchmarks/llama2_7b/deepspeed/run_pretraining.py @@ -77,6 +77,8 @@ def get_deepspeed_engine(args, model_config_dir, flashattn): mem_efficient_linear=False, mpu=None): model = get_llama_model(model_config_dir, flashattn) + + model.gradient_checkpointing_enable() model_engine, _, _, _ = deepspeed.initialize( args=args, model=model, model_parameters=model.parameters()) diff --git a/training/iluvatar/docker_image/deepspeed/Dockerfile b/training/iluvatar/docker_image/deepspeed/Dockerfile new file mode 100644 index 000000000..aa1016c0a --- /dev/null +++ b/training/iluvatar/docker_image/deepspeed/Dockerfile @@ -0,0 +1,65 @@ +FROM ubuntu:20.04 + +# copy /etc/apt/sources.list . or choose an available one if encountering a problem with the mirror source +ADD sources.list /etc/apt/ + +RUN /bin/bash -c "source /root/.bashrc" + +ENV DEBIAN_FRONTEND=noninteractive +ENV PATH /root/miniconda/bin:$PATH + +RUN sed -i 's#http://archive.ubuntu.com/#http://mirrors.tuna.tsinghua.edu.cn/#' /etc/apt/sources.list +RUN apt-get update -y +RUN apt-get install -y --fix-missing \ + apt-utils \ + sudo \ + openssh-server \ + vim \ + git \ + curl \ + wget \ + tree \ + perl \ + kmod \ + make \ + pciutils \ + build-essential \ + python3.8-dev \ + python3-pip \ + libjpeg-dev \ + zlib1g-dev \ + unzip \ + cmake \ + bzip2 \ + cabextract \ + iputils-ping \ + pbzip2 \ + pv \ + numactl \ + ninja-build \ + gcc-7 \ + g++-7 \ + libncursesw5 + + +# Configure anaconda +RUN wget https://mirrors.tuna.tsinghua.edu.cn/anaconda/miniconda/Miniconda3-py38_4.10.3-Linux-x86_64.sh && \ + bash ./Miniconda3-py38_4.10.3-Linux-x86_64.sh -b -p /root/miniconda && \ + /root/miniconda/bin/conda clean -tipsy && \ + ln -s /root/miniconda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \ + echo ". 
/root/miniconda/etc/profile.d/conda.sh" >> ~/.bashrc && \ + echo "conda activate base" >> ~/.bashrc && \ + conda config --set always_yes yes --set changeps1 no && \ + echo 'LD_LIBRARY_PATH="/usr/local/corex/lib:${LD_LIBRARY_PATH}"' >> ~/.bashrc && \ + echo 'PATH="/usr/local/corex/bin:${PATH}"' >> ~/.bashrc + + +RUN /bin/bash -c "apt-get install -y linux-headers-`uname -r`" + +RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-7 10 --slave /usr/bin/g++ g++ /usr/bin/g++-7 + +RUN /bin/bash -c "pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple" + +ENV LD_LIBRARY_PATH="/usr/local/corex/lib:${LD_LIBRARY_PATH}" +ENV PATH="/usr/local/corex/bin:${PATH}" +ENV NVCC_ARGUMENTS="-U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -ftemplate-depth=1024" diff --git a/training/iluvatar/docker_image/deepspeed/deepspeed_install.sh b/training/iluvatar/docker_image/deepspeed/deepspeed_install.sh new file mode 100644 index 000000000..0f82fbb21 --- /dev/null +++ b/training/iluvatar/docker_image/deepspeed/deepspeed_install.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +SDK_DIR="/workspace/docker_image/sdk_installers" +PKG_DIR="/workspace/docker_image/packages" + +search_cuda_results=`find ${SDK_DIR} -name "*cuda*10.2*.run"` +for installer in $search_cuda_results; do + echo "Install ${installer}" + sh "${installer}" -- --silent --toolkit +done + +search_sdk_results=`find ${SDK_DIR} -name "corex*.run"` +for installer in $search_sdk_results; do + echo "Install ${installer}" + sh "${installer}" -- --silent --toolkit +done + + +torch_packages_results=`find ${PKG_DIR} -name "torch-*.whl"` +if [ -n "$torch_packages_results" ]; then + pip3 install "$torch_packages_results" +fi + +search_packages_results=`find ${PKG_DIR} -name "*.whl"` +for pkg in $search_packages_results; do + echo "Install ${pkg}" + pip3 install "${pkg}" +done diff --git a/training/iluvatar/docker_image/deepspeed/packages/README.md b/training/iluvatar/docker_image/deepspeed/packages/README.md new file mode 100644 index 000000000..76ff33044 --- /dev/null +++ b/training/iluvatar/docker_image/deepspeed/packages/README.md @@ -0,0 +1,16 @@ +# 以下软件包需联系天数智芯获取 +# The iluvatar deepspeed backend is still in the development stage now. + +>联系邮箱: contact-us@iluvatar.com + +apex-0.1+corex.3.2.0.20231126.1581-cp38-cp38-linux_x86_64.whl + +deepspeed-0.10.0+corex.3.2.0.20231126.1581-cp38-cp38-linux_x86_64.whl + +flash_attn-2.0.1+corex.3.2.0.20231126.1581-cp38-cp38-linux_x86_64.whl + +torch-1.13.1+corex.3.2.0.20231126.1581-cp38-cp38-linux_x86_64.whl + +torchtext-0.14.1+corex.3.2.0.20231126.1581-cp38-cp38-linux_x86_64.whl + +torchvision-0.14.1+corex.3.2.0.20231126.1581-cp38-cp38-linux_x86_64.whl \ No newline at end of file diff --git a/training/iluvatar/docker_image/deepspeed/sdk_installers/README.md b/training/iluvatar/docker_image/deepspeed/sdk_installers/README.md new file mode 100644 index 000000000..86cd46498 --- /dev/null +++ b/training/iluvatar/docker_image/deepspeed/sdk_installers/README.md @@ -0,0 +1,8 @@ +# 以下软件包需联系天数智芯获取 +# The iluvatar deepspeed backend is still in the development stage now. 
+ +>联系邮箱: contact-us@iluvatar.com + +corex-installer-linux64-3.2.0.20231126.1581_x86_64_10.2.run + +cuda_10.2.89_440.33.01_linux.run \ No newline at end of file diff --git a/training/iluvatar/llama2_7b-deepspeed/README.md b/training/iluvatar/llama2_7b-deepspeed/README.md new file mode 100644 index 000000000..a5149ee79 --- /dev/null +++ b/training/iluvatar/llama2_7b-deepspeed/README.md @@ -0,0 +1,51 @@ +### Iluvatar GPU配置与运行信息参考 +#### 环境配置 +- ##### 硬件环境 + - 机器、加速卡型号: Iluvatar BI-V100 32GB + + +- ##### 软件环境 + - OS版本:Ubuntu 20.04 + - OS kernel版本: 5.4.0-126-generic + - 加速卡驱动版本:470.141.10 + - Docker 版本:20.10.18 + - 训练框架版本:deepspeed 0.9.2 + - 依赖软件版本:sentencepiece + +- ##### 并行策略 + + - 并行技术:sharded data parallel + - 实施者:deepspeed ZeRO-DP + - 实施细节:ZeRO-DP O3, DP_SIZE=8 + +- ##### 优化策略 + + - flash attention + +### 运行情况 + +* 输入批尺寸 + 1. local_batchsize(micro_batchsize),简写为LBS,即实际进入模型的张量批尺寸,为config_BI-V100x1x8.py中所写,在本case中默认为3 + 2. seqlength(max_position_embedding),简写为MPE,即实际进入模型的序列长度,为config_BI-V100x1x8.py中所写,在本case中默认为1024 + 3. gradient_accumulate_steps,简写为GAS,即梯度累加步数,为ds_config.json中所写,在本case中默认为1 + 4. global_batchsize恒等于local_batchsize\*gradient_accumulate_steps\*data_parallel_size,简写为GBS。在本case中,只存在数据并行,因此data_parallel_size=world_size。 + +* 通用指标 + +| 指标名称 | 指标值 | 特殊说明 | +| ------------ | -------------------------- | ---------------------------------- | +| 任务类别 | 自然语言理解 | | +| 模型 | llama2_7b | | +| 数据集 | openwebtext | 如无特殊说明,训练前1亿个token | +| 数据精度 | amp | | +| 超参修改 | fix_hp,见“性能指标” | 运行必要特殊超参,例如需要改小seqlength避免OOM | +| 硬件设备简称 | Iluvatar BI-V100 | | +| 硬件存储使用 | mem,见“性能指标” | 通常称为“显存”,单位为GiB | +| 计算使用率 | MFU,见“性能指标” | 参见PaLM论文定义 | +| **吞吐量** | **token/p/s,见“性能指标”** | 平均单卡每秒处理的token数 | + +* 性能指标 + +| 配置 | fix_hp | token/p/s | loss | mem | MFU | +| ------------------- | ---------------- | ------ | ------- | --------- | --------- | +| BI-V100单机8卡(1x8) | MPE=2048 LBS=4 | / | 5.16 | 30/32 | / | \ No newline at end of file diff --git a/training/iluvatar/llama2_7b-deepspeed/config/config_BI-V100x1x8.py b/training/iluvatar/llama2_7b-deepspeed/config/config_BI-V100x1x8.py new file mode 100644 index 000000000..fcc6a3488 --- /dev/null +++ b/training/iluvatar/llama2_7b-deepspeed/config/config_BI-V100x1x8.py @@ -0,0 +1,6 @@ +seqlength = 2048 +batchsize = 4 +datafilename = "openwebtext_llama2_100M.npy" +theoryflops = 312000000000000.0 +epochs = 1 +flashattn = True diff --git a/training/benchmarks/llama2_7b/deepspeed/ds_config.json b/training/iluvatar/llama2_7b-deepspeed/config/ds_config.json similarity index 53% rename from training/benchmarks/llama2_7b/deepspeed/ds_config.json rename to training/iluvatar/llama2_7b-deepspeed/config/ds_config.json index 27ac45041..1da4b1301 100644 --- a/training/benchmarks/llama2_7b/deepspeed/ds_config.json +++ b/training/iluvatar/llama2_7b-deepspeed/config/ds_config.json @@ -1,40 +1,50 @@ -{ - "gradient_accumulation_steps": 1, - "train_micro_batch_size_per_gpu": 1, - "prescale_gradients": false, - "zero_allow_untested_optimizer": true, - "optimizer": { - "type": "AdamW", - "params": { - "lr": 1e-5, - "weight_decay": 0.1, - "betas": [ - 0.9, - 0.95 - ], - "eps": 1e-5 - } - }, - "zero_optimization": { - "stage": 3, - "stage3_max_live_parameters": 1e9, - "stage3_max_reuse_distance": 1e9, - "stage3_prefetch_bucket_size": 1e7, - "sub_group_size": 1e9, - "contiguous_gradients": true, - "allgather_bucket_size": 1e8, - "reduce_bucket_size": 1e7, - "overlap_comm": true, - "reduce_scatter": true - }, - "steps_per_print": 50, - "gradient_clipping": 1.0, - "wall_clock_breakdown": false, - 
"bf16": { - "enabled": true - }, - "activation_checkpointing": { - "partition_activations": true, - "contiguous_memory_optimization": false - } -} +{ + "gradient_accumulation_steps": 1, + "train_micro_batch_size_per_gpu": 1, + "prescale_gradients": false, + "zero_allow_untested_optimizer": true, + "optimizer": { + "type": "AdamW", + "params": { + "lr": 1e-5, + "weight_decay": 0.1, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-5 + } + }, + "zero_optimization": { + "stage": 3, + "overlap_comm": true, + "contiguous_gradients": true, + "stage3_max_live_parameters": 5e8, + "stage3_max_reuse_distance": 5e8, + "stage3_prefetch_bucket_size": 1e8, + "sub_group_size": 8e8, + "allgather_bucket_size": 2e8, + "reduce_bucket_size": 2e8, + "reduce_scatter": true, + "stage3_gather_16bit_weights_on_model_save": false + }, + "steps_per_print": 10, + "gradient_clipping": 1.0, + "wall_clock_breakdown": false, + "bf16": { + "enabled": true + }, + "fp16": { + "enabled": false, + "auto_cast": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 1, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": true, + "contiguous_memory_optimization": false + } +} \ No newline at end of file diff --git a/training/iluvatar/llama2_7b-deepspeed/config/requirements.txt b/training/iluvatar/llama2_7b-deepspeed/config/requirements.txt new file mode 100644 index 000000000..9c6eb827b --- /dev/null +++ b/training/iluvatar/llama2_7b-deepspeed/config/requirements.txt @@ -0,0 +1,2 @@ +sentencepiece +transformers==4.34.1 diff --git a/training/run_benchmarks/deepspeed/start_deepspeed_task.py b/training/run_benchmarks/deepspeed/start_deepspeed_task.py index f98c8ed37..154b3cd2f 100644 --- a/training/run_benchmarks/deepspeed/start_deepspeed_task.py +++ b/training/run_benchmarks/deepspeed/start_deepspeed_task.py @@ -114,11 +114,16 @@ def main(): train_script_path = helper.get_train_script_path(task_args) config_dir, config_file = helper.get_config_dir_file(task_args) config_file = os.path.join(config_dir, config_file) + ds_config_file = os.path.join(config_dir, "ds_config.json") exec_cmd = "cd " + os.path.dirname(train_script_path) + ";" exec_cmd = exec_cmd + "deepspeed --num_gpus=" + str( task_args.nproc) + " run_pretraining.py" - exec_cmd = exec_cmd + " --deepspeed --deepspeed_config ds_config.json --data_dir " + task_args.data_dir + + exec_cmd = exec_cmd + " --deepspeed --deepspeed_config " + exec_cmd = exec_cmd + ds_config_file + exec_cmd = exec_cmd + " --data_dir " + task_args.data_dir + exec_cmd = exec_cmd + " --flagperf_config " + config_file exec_cmd = exec_cmd + " --nproc " + str( task_args.nproc) + " --nnodes " + str(task_args.nnodes) From 3b23005b1f51f78ae589bb07046badd692e455e3 Mon Sep 17 00:00:00 2001 From: forestlee95 Date: Tue, 28 Nov 2023 21:47:14 +0800 Subject: [PATCH 04/11] fix iluvatar deepspeed readme --- training/iluvatar/llama2_7b-deepspeed/README.md | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/training/iluvatar/llama2_7b-deepspeed/README.md b/training/iluvatar/llama2_7b-deepspeed/README.md index a5149ee79..3a60cf0cb 100644 --- a/training/iluvatar/llama2_7b-deepspeed/README.md +++ b/training/iluvatar/llama2_7b-deepspeed/README.md @@ -6,10 +6,8 @@ - ##### 软件环境 - OS版本:Ubuntu 20.04 - - OS kernel版本: 5.4.0-126-generic - - 加速卡驱动版本:470.141.10 - Docker 版本:20.10.18 - - 训练框架版本:deepspeed 0.9.2 + - 训练框架版本:deepspeed 0.10.0 - 依赖软件版本:sentencepiece - ##### 并行策略 @@ -20,7 +18,7 @@ - ##### 优化策略 - - flash attention + - flash attention 2 
### 运行情况 From 761acd00009ae76df362e3cdf407ef8d186f1803 Mon Sep 17 00:00:00 2001 From: forestlee95 Date: Wed, 29 Nov 2023 11:08:45 +0800 Subject: [PATCH 05/11] fix readme && test_conf --- .../iluvatar/llama2_7b-deepspeed/README.md | 2 +- training/run_benchmarks/config/test_conf.py | 33 +++++++++++++++---- 2 files changed, 27 insertions(+), 8 deletions(-) diff --git a/training/iluvatar/llama2_7b-deepspeed/README.md b/training/iluvatar/llama2_7b-deepspeed/README.md index 3a60cf0cb..eaa162db3 100644 --- a/training/iluvatar/llama2_7b-deepspeed/README.md +++ b/training/iluvatar/llama2_7b-deepspeed/README.md @@ -14,7 +14,7 @@ - 并行技术:sharded data parallel - 实施者:deepspeed ZeRO-DP - - 实施细节:ZeRO-DP O3, DP_SIZE=8 + - 实施细节:ZeRO-DP O3 - ##### 优化策略 diff --git a/training/run_benchmarks/config/test_conf.py b/training/run_benchmarks/config/test_conf.py index ebb0f3f39..4ff671dfe 100644 --- a/training/run_benchmarks/config/test_conf.py +++ b/training/run_benchmarks/config/test_conf.py @@ -3,7 +3,7 @@ # Set accelerator's vendor name, e.g. iluvatar, cambricon, kunlunxin and ascend. # We will run benchmarks in training/ -VENDOR = "nvidia" +VENDOR = "iluvatar" # Accelerator options for docker. TODO FIXME support more accelerators. # possible value of ACCE_CONTAINER_OPT are: @@ -19,7 +19,7 @@ # "--device=/dev/davinciX --device=/dev/davinci_manager + \ # --device=/dev/devmm_svm --device=/dev/hisi_hdc + \ # -v /usr/local/Ascend/driver -v /usr/local/dcmi -v /usr/local/bin/npu-smi" -ACCE_CONTAINER_OPT = " --gpus all" +ACCE_CONTAINER_OPT = ' -v /lib/modules:/lib/modules ' # XXX_VISIBLE_DEVICE item name in env # possible value of ACCE_VISIBLE_DEVICE_ENV_NAME are: # CUDA_VISIBLE_DEVICES for nvidia, iluvatar @@ -29,12 +29,12 @@ ACCE_VISIBLE_DEVICE_ENV_NAME = "CUDA_VISIBLE_DEVICES" # Set pip source, which will be used in preparing envs in container -PIP_SOURCE = "https://mirror.baidu.com/pypi/simple" +PIP_SOURCE = "https://pypi.tuna.tsinghua.edu.cn/simple" # The path that flagperf deploy in the cluster. # Users must set FLAGPERF_PATH to where flagperf deploy # You can assume the preset "/home/FlagPerf/training" points to Null -FLAGPERF_PATH = "/home/FlagPerf/training" +FLAGPERF_PATH = "/home/yangzhichao/workspace/FlagPerf/training" # Set log path on the host here. 
FLAGPERF_LOG_PATH = FLAGPERF_PATH + "/result/" @@ -54,9 +54,9 @@ ''' CASES = { # nvidia cases - "bert:pytorch_1.8:A100:1:8:1": "/raid/home_datasets_ckpt/bert/train/", - "glm:pytorch_1.8:A100:1:8:1": "/raid/home_datasets_ckpt/glm/train/", - "cpm:pytorch_1.8:A100:1:8:1": "/raid/home_datasets_ckpt/cpm/train/", + # "bert:pytorch_1.8:A100:1:8:1": "/raid/home_datasets_ckpt/bert/train/", + # "glm:pytorch_1.8:A100:1:8:1": "/raid/home_datasets_ckpt/glm/train/", + # "cpm:pytorch_1.8:A100:1:8:1": "/raid/home_datasets_ckpt/cpm/train/", # "mobilenetv2:pytorch_1.8:A100:1:8:1": "/raid/dataset/ImageNet_1k_2012/", # "vit:pytorch_1.13:A100:1:8:1": "/raid/dataset/ImageNet_1k_2012/", @@ -116,5 +116,24 @@ # "longformer:pytorch:R300:1:8:1": "/raid/dataset/longformer_train", # "distilbert:pytorch:R300:1:8:1": "/raid/dataset/distilbert/", # "swin_transformer:pytorch:R300:1:8:1": "/raid/dataset/ImageNet_1k_2012/" + + # iluvatar cases + # "resnet50:pytorch:BI-V100:1:8:1": "/raid/dataset/ImageNet_1k_2012/", + # "bert:pytorch:BI-V100:1:8:1": "/raid/home_datasets_ckpt/bert/train/", + # "mask_rcnn:pytorch:BI-V100:1:8:1": "/raid/dataset/coco2017/", + # "bigtransfer:pytorch:BI-V100:1:8:1": "/raid/dataset/ImageNet_1k_2012/", + # "cpm:pytorch:BI-V100:1:8:1": "/raid/home_datasets_ckpt/cpm/train/", + # "efficientnet:pytorch:BI-V100:1:8:1": "/raid/dataset/ImageNet_1k_2012/", + # "faster_rcnn:pytorch:BI-V100:1:8:1": "/raid/dataset/coco2017", + # "glm:pytorch:BI-V100:1:8:1": "/raid/dataset/glm/train/", + # "mask_rcnn:pytorch:BI-V100:1:8:1": "/raid/dataset/maskrcnn/coco2017", + # "mobilenetv2:pytorch:BI-V100:1:8:1": "/raid/dataset/ImageNet_1k_2012/", + # "retinanet:pytorch:BI-V100:1:8:1": "/raid/dataset/coco2017/", + # "swin_transformer:pytorch:BI-V100:1:8:1": "/raid/dataset/ImageNet_1k_2012/", + # "tacotron2:pytorch:BI-V100:1:8:1": "/raid/dataset/tacotron2/LJSpeech/", + # "transformer:pytorch:BI-V100:1:8:1": "/raid/dataset/transformer/train/", + # "vit:pytorch:BI-V100:1:8:1": "/raid/dataset/ImageNet_1k_2012/", + "llama2_7b:deepspeed:BI-V100:1:8:1":"/data1/" + } From a235d0ca37bc9be04e14a6c2b6b270251df38187 Mon Sep 17 00:00:00 2001 From: clveryang <1953129318@qq.com> Date: Wed, 29 Nov 2023 11:30:49 +0800 Subject: [PATCH 06/11] fix loss number --- training/iluvatar/llama2_7b-deepspeed/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/training/iluvatar/llama2_7b-deepspeed/README.md b/training/iluvatar/llama2_7b-deepspeed/README.md index eaa162db3..1f41658d0 100644 --- a/training/iluvatar/llama2_7b-deepspeed/README.md +++ b/training/iluvatar/llama2_7b-deepspeed/README.md @@ -46,4 +46,4 @@ | 配置 | fix_hp | token/p/s | loss | mem | MFU | | ------------------- | ---------------- | ------ | ------- | --------- | --------- | -| BI-V100单机8卡(1x8) | MPE=2048 LBS=4 | / | 5.16 | 30/32 | / | \ No newline at end of file +| BI-V100单机8卡(1x8) | MPE=2048 LBS=4 | / | 5.1612 | 30/32 | / | \ No newline at end of file From 2f95601f045b4af7bc4732ef1774e81d6d8db199 Mon Sep 17 00:00:00 2001 From: clveryang <1953129318@qq.com> Date: Wed, 29 Nov 2023 15:32:13 +0800 Subject: [PATCH 07/11] delete personal FLAGPERF_PATH --- training/run_benchmarks/config/test_conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/training/run_benchmarks/config/test_conf.py b/training/run_benchmarks/config/test_conf.py index 4ff671dfe..dc011cf20 100644 --- a/training/run_benchmarks/config/test_conf.py +++ b/training/run_benchmarks/config/test_conf.py @@ -34,7 +34,7 @@ # The path that flagperf deploy in the cluster. 
# Users must set FLAGPERF_PATH to where flagperf deploy # You can assume the preset "/home/FlagPerf/training" points to Null -FLAGPERF_PATH = "/home/yangzhichao/workspace/FlagPerf/training" +FLAGPERF_PATH = "/home/FlagPerf/training" # Set log path on the host here. FLAGPERF_LOG_PATH = FLAGPERF_PATH + "/result/" From d945107f84e89c482726f6ad7125d524048da043 Mon Sep 17 00:00:00 2001 From: clveryang <1953129318@qq.com> Date: Wed, 29 Nov 2023 17:21:43 +0800 Subject: [PATCH 08/11] fix config && readme --- training/iluvatar/llama2_7b-deepspeed/README.md | 2 +- .../iluvatar/llama2_7b-deepspeed/config/config_BI-V100x1x8.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/training/iluvatar/llama2_7b-deepspeed/README.md b/training/iluvatar/llama2_7b-deepspeed/README.md index 1f41658d0..303b9bc63 100644 --- a/training/iluvatar/llama2_7b-deepspeed/README.md +++ b/training/iluvatar/llama2_7b-deepspeed/README.md @@ -6,7 +6,7 @@ - ##### 软件环境 - OS版本:Ubuntu 20.04 - - Docker 版本:20.10.18 + - Docker 版本:20.10.21 - 训练框架版本:deepspeed 0.10.0 - 依赖软件版本:sentencepiece diff --git a/training/iluvatar/llama2_7b-deepspeed/config/config_BI-V100x1x8.py b/training/iluvatar/llama2_7b-deepspeed/config/config_BI-V100x1x8.py index fcc6a3488..4f001d02a 100644 --- a/training/iluvatar/llama2_7b-deepspeed/config/config_BI-V100x1x8.py +++ b/training/iluvatar/llama2_7b-deepspeed/config/config_BI-V100x1x8.py @@ -1,6 +1,6 @@ seqlength = 2048 batchsize = 4 datafilename = "openwebtext_llama2_100M.npy" -theoryflops = 312000000000000.0 +theoryflops = 128000000000000.0 epochs = 1 flashattn = True From f82068de4678995f65f092f79c0a58ec07c45a45 Mon Sep 17 00:00:00 2001 From: clveryang <1953129318@qq.com> Date: Thu, 30 Nov 2023 14:26:04 +0800 Subject: [PATCH 09/11] add new optimization method in readme --- training/iluvatar/llama2_7b-deepspeed/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/training/iluvatar/llama2_7b-deepspeed/README.md b/training/iluvatar/llama2_7b-deepspeed/README.md index 303b9bc63..1c86cd0ba 100644 --- a/training/iluvatar/llama2_7b-deepspeed/README.md +++ b/training/iluvatar/llama2_7b-deepspeed/README.md @@ -19,6 +19,7 @@ - ##### 优化策略 - flash attention 2 + - checkpointing ### 运行情况 From 87f9eaf7d52b0e9683f802e7240ab9aaec6b741d Mon Sep 17 00:00:00 2001 From: clveryang Date: Fri, 15 Dec 2023 10:54:35 +0800 Subject: [PATCH 10/11] add new seqlength and batchsize config --- .../llama2_7b/deepspeed/run_pretraining.py | 14 +++++++------- .../docker_image/deepspeed/packages/README.md | 12 ++++++------ .../deepspeed/sdk_installers/README.md | 2 +- training/iluvatar/llama2_7b-deepspeed/README.md | 3 ++- .../config/config_BI-V100x1x8.py | 8 +++++--- .../llama2_7b-deepspeed/config/ds_config.json | 10 +++++----- 6 files changed, 26 insertions(+), 23 deletions(-) diff --git a/training/benchmarks/llama2_7b/deepspeed/run_pretraining.py b/training/benchmarks/llama2_7b/deepspeed/run_pretraining.py index 1192d1bb7..dbbf4e599 100644 --- a/training/benchmarks/llama2_7b/deepspeed/run_pretraining.py +++ b/training/benchmarks/llama2_7b/deepspeed/run_pretraining.py @@ -63,7 +63,6 @@ def train(model_engine, dataloader): loss = model_engine(input_ids=input_ids, labels=labels).loss model_engine.backward(loss) model_engine.step() - ave_loss += loss if step % 10 == 0 and args.local_rank == 0: print('Step {}/{}, Loss: {}'.format(step, len(dataloader), @@ -71,14 +70,14 @@ def train(model_engine, dataloader): ave_loss = 0.0 -def get_deepspeed_engine(args, model_config_dir, flashattn): +def get_deepspeed_engine(args, 
model_config_dir, flashattn, gradient_checkpointing): with deepspeed.zero.Init(config_dict_or_path=args.deepspeed_config, enabled=True, mem_efficient_linear=False, mpu=None): model = get_llama_model(model_config_dir, flashattn) - - model.gradient_checkpointing_enable() + if gradient_checkpointing: + model.gradient_checkpointing_enable() model_engine, _, _, _ = deepspeed.initialize( args=args, model=model, model_parameters=model.parameters()) @@ -96,7 +95,6 @@ def get_metric(texts): arg_parser = get_argument_parser() arg_parser = deepspeed.add_config_arguments(arg_parser) args = arg_parser.parse_args() - flagperf_config = {} sys.path.append(os.path.dirname(args.flagperf_config)) config_file = os.path.basename(args.flagperf_config).split('.')[0] @@ -109,10 +107,12 @@ def get_metric(texts): theoryflops = getattr(module, 'theoryflops') epochs = getattr(module, 'epochs') flashattn = getattr(module, 'flashattn') - + gradient_checkpointing = getattr(module, 'gradient_checkpointing') + deepspeed.init_distributed() model_engine = get_deepspeed_engine(args, os.path.join("llama2_7b_hf"), - flashattn) + flashattn, gradient_checkpointing) + dataset = get_llama_dataset(args, seqlength, datafilename) logger = logging.getLogger("DeepSpeed") diff --git a/training/iluvatar/docker_image/deepspeed/packages/README.md b/training/iluvatar/docker_image/deepspeed/packages/README.md index 76ff33044..9cb6a3d2c 100644 --- a/training/iluvatar/docker_image/deepspeed/packages/README.md +++ b/training/iluvatar/docker_image/deepspeed/packages/README.md @@ -3,14 +3,14 @@ >联系邮箱: contact-us@iluvatar.com -apex-0.1+corex.3.2.0.20231126.1581-cp38-cp38-linux_x86_64.whl +apex-0.1+corex.3.2.0.20231211.1602-cp38-cp38-linux_x86_64.whl -deepspeed-0.10.0+corex.3.2.0.20231126.1581-cp38-cp38-linux_x86_64.whl +deepspeed-0.10.0+corex.3.2.0.20231211.1602-cp38-cp38-linux_x86_64.whl -flash_attn-2.0.1+corex.3.2.0.20231126.1581-cp38-cp38-linux_x86_64.whl +flash_attn-2.0.1+corex.3.2.0.20231211.1602-cp38-cp38-linux_x86_64.whl -torch-1.13.1+corex.3.2.0.20231126.1581-cp38-cp38-linux_x86_64.whl +torch-1.13.1+corex.3.2.0.20231211.1602-cp38-cp38-linux_x86_64.whl -torchtext-0.14.1+corex.3.2.0.20231126.1581-cp38-cp38-linux_x86_64.whl +torchtext-0.14.1+corex.3.2.0.20231211.1602-cp38-cp38-linux_x86_64.whl -torchvision-0.14.1+corex.3.2.0.20231126.1581-cp38-cp38-linux_x86_64.whl \ No newline at end of file +torchvision-0.14.1+corex.3.2.0.20231211.1602-cp38-cp38-linux_x86_64.whl \ No newline at end of file diff --git a/training/iluvatar/docker_image/deepspeed/sdk_installers/README.md b/training/iluvatar/docker_image/deepspeed/sdk_installers/README.md index 86cd46498..36e2f06f8 100644 --- a/training/iluvatar/docker_image/deepspeed/sdk_installers/README.md +++ b/training/iluvatar/docker_image/deepspeed/sdk_installers/README.md @@ -3,6 +3,6 @@ >联系邮箱: contact-us@iluvatar.com -corex-installer-linux64-3.2.0.20231126.1581_x86_64_10.2.run +corex-installer-linux64-3.2.0.20231211.1602_x86_64_10.2.run cuda_10.2.89_440.33.01_linux.run \ No newline at end of file diff --git a/training/iluvatar/llama2_7b-deepspeed/README.md b/training/iluvatar/llama2_7b-deepspeed/README.md index 1c86cd0ba..d88322743 100644 --- a/training/iluvatar/llama2_7b-deepspeed/README.md +++ b/training/iluvatar/llama2_7b-deepspeed/README.md @@ -47,4 +47,5 @@ | 配置 | fix_hp | token/p/s | loss | mem | MFU | | ------------------- | ---------------- | ------ | ------- | --------- | --------- | -| BI-V100单机8卡(1x8) | MPE=2048 LBS=4 | / | 5.1612 | 30/32 | / | \ No newline at end of file +| BI-V100单机8卡(1x8) 
| MPE=2048 LBS=10 | / | 5.59 | 31/32 | / | +| BI-V100单机8卡(1x8) | MPE=4096 LBS=5 | / | 5.67 | 31/32 | / | \ No newline at end of file diff --git a/training/iluvatar/llama2_7b-deepspeed/config/config_BI-V100x1x8.py b/training/iluvatar/llama2_7b-deepspeed/config/config_BI-V100x1x8.py index 4f001d02a..df543c61e 100644 --- a/training/iluvatar/llama2_7b-deepspeed/config/config_BI-V100x1x8.py +++ b/training/iluvatar/llama2_7b-deepspeed/config/config_BI-V100x1x8.py @@ -1,6 +1,8 @@ -seqlength = 2048 -batchsize = 4 +seqlength = 4096 +batchsize = 5 datafilename = "openwebtext_llama2_100M.npy" -theoryflops = 128000000000000.0 +theoryflops = 64000000000000.0 epochs = 1 flashattn = True +gradient_checkpointing = True +use_cache = false \ No newline at end of file diff --git a/training/iluvatar/llama2_7b-deepspeed/config/ds_config.json b/training/iluvatar/llama2_7b-deepspeed/config/ds_config.json index 1da4b1301..7d6a30d1f 100644 --- a/training/iluvatar/llama2_7b-deepspeed/config/ds_config.json +++ b/training/iluvatar/llama2_7b-deepspeed/config/ds_config.json @@ -17,12 +17,12 @@ }, "zero_optimization": { "stage": 3, - "overlap_comm": true, + "overlap_comm": false, "contiguous_gradients": true, - "stage3_max_live_parameters": 5e8, - "stage3_max_reuse_distance": 5e8, - "stage3_prefetch_bucket_size": 1e8, - "sub_group_size": 8e8, + "stage3_max_live_parameters": 5e7, + "stage3_max_reuse_distance": 5e7, + "stage3_prefetch_bucket_size": 1e7, + "sub_group_size": 8e7, "allgather_bucket_size": 2e8, "reduce_bucket_size": 2e8, "reduce_scatter": true, From 4c547de131868753d7c3d27f64855fb2d2d92454 Mon Sep 17 00:00:00 2001 From: clveryang Date: Fri, 15 Dec 2023 14:51:47 +0800 Subject: [PATCH 11/11] fix config error and add new test method about tps/tflops --- training/benchmarks/llama2_7b/deepspeed/run_pretraining.py | 3 +++ .../iluvatar/llama2_7b-deepspeed/config/config_BI-V100x1x8.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/training/benchmarks/llama2_7b/deepspeed/run_pretraining.py b/training/benchmarks/llama2_7b/deepspeed/run_pretraining.py index dbbf4e599..bae5472e0 100644 --- a/training/benchmarks/llama2_7b/deepspeed/run_pretraining.py +++ b/training/benchmarks/llama2_7b/deepspeed/run_pretraining.py @@ -140,4 +140,7 @@ def get_metric(texts): chip_tps = whole_tps / args.nproc * args.nnodes print("System tokens per second: ", whole_tps) print("Tokens/p/s: ", chip_tps) + TFLOPS = int(theoryflops/1000000000000) + print("Theory TFLOPS: ", TFLOPS) + print("Tokens/TFLOPS: ", chip_tps / TFLOPS) print("MFU: ", chip_tps * 7000000000.0 * 6 / theoryflops) diff --git a/training/iluvatar/llama2_7b-deepspeed/config/config_BI-V100x1x8.py b/training/iluvatar/llama2_7b-deepspeed/config/config_BI-V100x1x8.py index df543c61e..b8844f19e 100644 --- a/training/iluvatar/llama2_7b-deepspeed/config/config_BI-V100x1x8.py +++ b/training/iluvatar/llama2_7b-deepspeed/config/config_BI-V100x1x8.py @@ -5,4 +5,4 @@ epochs = 1 flashattn = True gradient_checkpointing = True -use_cache = false \ No newline at end of file +use_cache = False \ No newline at end of file
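Patch 11 closes the series by reporting normalized throughput alongside MFU. Below is a minimal sketch of the end-of-run metrics; the formulas (6 FLOPs per parameter per trained token, 7e9 parameters for llama2_7b, per-chip tokens/s from `whole_tps / nproc * nnodes`) come from run_pretraining.py as patched, but the sample inputs are purely hypothetical, not measured BI-V100 results.

```python
# Minimal sketch of the metrics printed at the end of run_pretraining.py
# after patch 11. Formulas follow the patch; sample numbers are hypothetical.
def report_throughput(whole_tps, nproc, nnodes, theoryflops):
    chip_tps = whole_tps / nproc * nnodes      # tokens per second per chip
    tflops = int(theoryflops / 1_000_000_000_000)
    print("System tokens per second:", whole_tps)
    print("Tokens/p/s:", chip_tps)
    print("Theory TFLOPS:", tflops)
    print("Tokens/TFLOPS:", chip_tps / tflops)
    # MFU: ~6 FLOPs per parameter per trained token, 7e9 params for llama2_7b
    print("MFU:", chip_tps * 7_000_000_000.0 * 6 / theoryflops)

# Hypothetical single-node run at theoryflops = 64 TFLOPS per card,
# matching config_BI-V100x1x8.py after patch 11:
report_throughput(whole_tps=4000, nproc=8, nnodes=1, theoryflops=64e12)
# -> Tokens/p/s: 500.0, Tokens/TFLOPS: 7.8125, MFU: 0.328125
```

Note also that patch 10 cuts the ZeRO-3 working-set knobs by 10x (stage3_max_live_parameters 5e8 to 5e7, stage3_prefetch_bucket_size 1e8 to 1e7, sub_group_size 8e8 to 8e7) and disables overlap_comm, presumably trading prefetch aggressiveness for memory headroom once seqlength grows to 4096 (the README reports 31/32 GiB at MPE=4096, LBS=5).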