[Iluvatar] llama2 7b case #343

Merged 13 commits on Dec 18, 2023
15 changes: 10 additions & 5 deletions training/benchmarks/llama2_7b/deepspeed/run_pretraining.py
@@ -63,20 +63,21 @@ def train(model_engine, dataloader):
        loss = model_engine(input_ids=input_ids, labels=labels).loss
        model_engine.backward(loss)
        model_engine.step()

        ave_loss += loss
        if step % 10 == 0 and args.local_rank == 0:
            print('Step {}/{}, Loss: {}'.format(step, len(dataloader),
                                                ave_loss / 10))
            ave_loss = 0.0


def get_deepspeed_engine(args, model_config_dir, flashattn):
def get_deepspeed_engine(args, model_config_dir, flashattn, gradient_checkpointing):
    with deepspeed.zero.Init(config_dict_or_path=args.deepspeed_config,
                             enabled=True,
                             mem_efficient_linear=False,
                             mpu=None):
        model = get_llama_model(model_config_dir, flashattn)
        if gradient_checkpointing:
            model.gradient_checkpointing_enable()

    model_engine, _, _, _ = deepspeed.initialize(
        args=args, model=model, model_parameters=model.parameters())
@@ -94,7 +95,6 @@ def get_metric(texts):
arg_parser = get_argument_parser()
arg_parser = deepspeed.add_config_arguments(arg_parser)
args = arg_parser.parse_args()

flagperf_config = {}
sys.path.append(os.path.dirname(args.flagperf_config))
config_file = os.path.basename(args.flagperf_config).split('.')[0]
@@ -107,10 +107,12 @@ def get_metric(texts):
theoryflops = getattr(module, 'theoryflops')
epochs = getattr(module, 'epochs')
flashattn = getattr(module, 'flashattn')

gradient_checkpointing = getattr(module, 'gradient_checkpointing')

deepspeed.init_distributed()
model_engine = get_deepspeed_engine(args, os.path.join("llama2_7b_hf"),
                                    flashattn)
                                    flashattn, gradient_checkpointing)

dataset = get_llama_dataset(args, seqlength, datafilename)

logger = logging.getLogger("DeepSpeed")
@@ -138,4 +140,7 @@ def get_metric(texts):
chip_tps = whole_tps / args.nproc * args.nnodes
print("System tokens per second: ", whole_tps)
print("Tokens/p/s: ", chip_tps)
TFLOPS = int(theoryflops/1000000000000)
print("Theory TFLOPS: ", TFLOPS)
print("Tokens/TFLOPS: ", chip_tps / TFLOPS)
print("MFU: ", chip_tps * 7000000000.0 * 6 / theoryflops)
65 changes: 65 additions & 0 deletions training/iluvatar/docker_image/deepspeed/Dockerfile
@@ -0,0 +1,65 @@
FROM ubuntu:20.04

# Copy your /etc/apt/sources.list into this directory, or substitute another available one if the default mirror source causes problems
ADD sources.list /etc/apt/

RUN /bin/bash -c "source /root/.bashrc"

ENV DEBIAN_FRONTEND=noninteractive
ENV PATH /root/miniconda/bin:$PATH

RUN sed -i 's#http://archive.ubuntu.com/#http://mirrors.tuna.tsinghua.edu.cn/#' /etc/apt/sources.list
RUN apt-get update -y
RUN apt-get install -y --fix-missing \
    apt-utils \
    sudo \
    openssh-server \
    vim \
    git \
    curl \
    wget \
    tree \
    perl \
    kmod \
    make \
    pciutils \
    build-essential \
    python3.8-dev \
    python3-pip \
    libjpeg-dev \
    zlib1g-dev \
    unzip \
    cmake \
    bzip2 \
    cabextract \
    iputils-ping \
    pbzip2 \
    pv \
    numactl \
    ninja-build \
    gcc-7 \
    g++-7 \
    libncursesw5


# Configure anaconda
RUN wget https://mirrors.tuna.tsinghua.edu.cn/anaconda/miniconda/Miniconda3-py38_4.10.3-Linux-x86_64.sh && \
    bash ./Miniconda3-py38_4.10.3-Linux-x86_64.sh -b -p /root/miniconda && \
    /root/miniconda/bin/conda clean -tipsy && \
    ln -s /root/miniconda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \
    echo ". /root/miniconda/etc/profile.d/conda.sh" >> ~/.bashrc && \
    echo "conda activate base" >> ~/.bashrc && \
    conda config --set always_yes yes --set changeps1 no && \
    echo 'LD_LIBRARY_PATH="/usr/local/corex/lib:${LD_LIBRARY_PATH}"' >> ~/.bashrc && \
    echo 'PATH="/usr/local/corex/bin:${PATH}"' >> ~/.bashrc


RUN /bin/bash -c "apt-get install -y linux-headers-`uname -r`"

RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-7 10 --slave /usr/bin/g++ g++ /usr/bin/g++-7

RUN /bin/bash -c "pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple"

ENV LD_LIBRARY_PATH="/usr/local/corex/lib:${LD_LIBRARY_PATH}"
ENV PATH="/usr/local/corex/bin:${PATH}"
ENV NVCC_ARGUMENTS="-U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -ftemplate-depth=1024"
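As a usage note, a minimal sketch of building this image by hand follows. The image tag is hypothetical, and in practice FlagPerf's run scripts build and name vendor images themselves, so treat this as illustrative only.

```bash
# Hypothetical manual build; requires the sources.list file referenced by the ADD instruction above.
cd training/iluvatar/docker_image/deepspeed
docker build -t flagperf-iluvatar-deepspeed:dev .
```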
28 changes: 28 additions & 0 deletions training/iluvatar/docker_image/deepspeed/deepspeed_install.sh
@@ -0,0 +1,28 @@
#!/bin/bash

SDK_DIR="/workspace/docker_image/sdk_installers"
PKG_DIR="/workspace/docker_image/packages"

# Install the CUDA 10.2 toolkit installer shipped alongside the SDK
search_cuda_results=`find ${SDK_DIR} -name "*cuda*10.2*.run"`
for installer in $search_cuda_results; do
    echo "Install ${installer}"
    sh "${installer}" -- --silent --toolkit
done

# Install the Iluvatar CoreX SDK
search_sdk_results=`find ${SDK_DIR} -name "corex*.run"`
for installer in $search_sdk_results; do
    echo "Install ${installer}"
    sh "${installer}" -- --silent --toolkit
done


# Install torch first: the remaining wheels (apex, deepspeed, flash_attn, ...) depend on it
torch_packages_results=`find ${PKG_DIR} -name "torch-*.whl"`
if [ -n "$torch_packages_results" ]; then
    pip3 install "$torch_packages_results"
fi

# Install all remaining Python wheels provided by Iluvatar
search_packages_results=`find ${PKG_DIR} -name "*.whl"`
for pkg in $search_packages_results; do
    echo "Install ${pkg}"
    pip3 install "${pkg}"
done
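A hedged usage sketch for the script above: the SDK_DIR/PKG_DIR paths are the ones hard-coded in the script, and the file names come from the two README files below. How FlagPerf mounts these directories into the container is not shown in this diff, so the copy step is only illustrative.

```bash
# Illustrative only: stage the Iluvatar-provided artifacts where the script expects them.
mkdir -p /workspace/docker_image/sdk_installers /workspace/docker_image/packages
cp cuda_10.2.89_440.33.01_linux.run corex-installer-linux64-*.run \
   /workspace/docker_image/sdk_installers/
cp torch-*.whl apex-*.whl deepspeed-*.whl flash_attn-*.whl torchtext-*.whl torchvision-*.whl \
   /workspace/docker_image/packages/
bash deepspeed_install.sh
```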
16 changes: 16 additions & 0 deletions training/iluvatar/docker_image/deepspeed/packages/README.md
@@ -0,0 +1,16 @@
# The following packages must be obtained by contacting Iluvatar CoreX
# The Iluvatar DeepSpeed backend is still under development.

>Contact email: [email protected]

apex-0.1+corex.3.2.0.20231211.1602-cp38-cp38-linux_x86_64.whl

deepspeed-0.10.0+corex.3.2.0.20231211.1602-cp38-cp38-linux_x86_64.whl

flash_attn-2.0.1+corex.3.2.0.20231211.1602-cp38-cp38-linux_x86_64.whl

torch-1.13.1+corex.3.2.0.20231211.1602-cp38-cp38-linux_x86_64.whl

torchtext-0.14.1+corex.3.2.0.20231211.1602-cp38-cp38-linux_x86_64.whl

torchvision-0.14.1+corex.3.2.0.20231211.1602-cp38-cp38-linux_x86_64.whl
@@ -0,0 +1,8 @@
# The following installers must be obtained by contacting Iluvatar CoreX
# The Iluvatar DeepSpeed backend is still under development.

>Contact email: [email protected]

corex-installer-linux64-3.2.0.20231211.1602_x86_64_10.2.run

cuda_10.2.89_440.33.01_linux.run
51 changes: 51 additions & 0 deletions training/iluvatar/llama2_7b-deepspeed/README.md
@@ -0,0 +1,51 @@
### Iluvatar GPU Configuration and Run Information Reference
#### Environment Configuration
- ##### Hardware
  - Machine and accelerator model: Iluvatar BI-V100 32GB


- ##### Software
  - OS version: Ubuntu 20.04
  - Docker version: 20.10.21
  - Training framework version: deepspeed 0.10.0
  - Dependency versions: sentencepiece

- ##### Parallelism Strategy

  - Parallel technique: sharded data parallel
  - Implemented by: deepspeed ZeRO-DP
  - Implementation details: ZeRO-DP O3

- ##### Optimization Strategies

  - flash attention 2
  - gradient checkpointing

### Run Information

* Input batch sizes
1. local_batchsize (micro_batchsize), abbreviated LBS: the batch size actually fed to the model per device, as set in config_BI-V100x1x8.py; 3 by default in this case
2. seqlength (max_position_embedding), abbreviated MPE: the sequence length actually fed to the model, as set in config_BI-V100x1x8.py; 1024 by default in this case
3. gradient_accumulate_steps, abbreviated GAS: the number of gradient accumulation steps, as set in ds_config.json; 1 by default in this case
4. global_batchsize, abbreviated GBS, always equals local_batchsize\*gradient_accumulate_steps\*data_parallel_size. Only data parallelism is used in this case, so data_parallel_size = world_size (see the worked example below).
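A worked example of the GBS formula in item 4, using the MPE=4096, LBS=5 row from the performance table below and GAS=1 from ds_config.json; this is an illustrative sketch, not part of the benchmark code.

```python
# GBS = LBS * GAS * data_parallel_size, with data_parallel_size = world_size here.
LBS = 5                      # local_batchsize for the MPE=4096 run
GAS = 1                      # gradient_accumulation_steps in ds_config.json
world_size = 1 * 8           # 1 node x 8 BI-V100 GPUs
GBS = LBS * GAS * world_size
print(GBS)                   # 40 sequences per step, i.e. 40 * 4096 tokens
```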

* Common metrics

| Metric | Value | Notes |
| ------------ | -------------------------- | ---------------------------------- |
| Task category | natural language understanding | |
| Model | llama2_7b | |
| Dataset | openwebtext | unless otherwise noted, training uses the first 100M tokens |
| Precision | amp | |
| Hyperparameter changes | fix_hp, see "Performance metrics" | special hyperparameters required to run, e.g. reducing seqlength to avoid OOM |
| Hardware | Iluvatar BI-V100 | |
| Device memory usage | mem, see "Performance metrics" | commonly called "VRAM", in GiB |
| Compute utilization | MFU, see "Performance metrics" | as defined in the PaLM paper |
| **Throughput** | **token/p/s, see "Performance metrics"** | average number of tokens processed per GPU per second |

* Performance metrics

| Config | fix_hp | token/p/s | loss | mem | MFU |
| ------------------- | ---------------- | ------ | ------- | --------- | --------- |
| BI-V100 single node, 8 GPUs (1x8) | MPE=2048 LBS=10 | / | 5.59 | 31/32 | / |
| BI-V100 single node, 8 GPUs (1x8) | MPE=4096 LBS=5 | / | 5.67 | 31/32 | / |
@@ -0,0 +1,8 @@
seqlength = 4096
batchsize = 5
datafilename = "openwebtext_llama2_100M.npy"
theoryflops = 64000000000000.0
epochs = 1
flashattn = True
gradient_checkpointing = True
use_cache = False
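A minimal sketch of how the `flashattn`, `gradient_checkpointing`, and `use_cache` flags above are typically consumed for a HuggingFace-style llama model. The gradient-checkpointing part mirrors the run_pretraining.py change earlier in this diff; how `use_cache` is applied is not shown in the diff, so that line is an assumption (KV caching is normally disabled during checkpointed training).

```python
# Hedged sketch, not the PR's exact wiring; get_llama_model is the benchmark's own helper.
flashattn = True                 # from the config above
gradient_checkpointing = True    # from the config above
use_cache = False                # from the config above

model = get_llama_model("llama2_7b_hf", flashattn)   # same call as in run_pretraining.py
if gradient_checkpointing:
    model.gradient_checkpointing_enable()   # recompute activations in the backward pass
if not use_cache:
    model.config.use_cache = False          # assumed: disable the KV cache, which conflicts with checkpointing
```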
@@ -1,40 +1,50 @@
{
    "gradient_accumulation_steps": 1,
    "train_micro_batch_size_per_gpu": 1,
    "prescale_gradients": false,
    "zero_allow_untested_optimizer": true,
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": 1e-5,
            "weight_decay": 0.1,
            "betas": [
                0.9,
                0.95
            ],
            "eps": 1e-5
        }
    },
    "zero_optimization": {
        "stage": 3,
        "stage3_max_live_parameters": 1e9,
        "stage3_max_reuse_distance": 1e9,
        "stage3_prefetch_bucket_size": 1e7,
        "sub_group_size": 1e9,
        "contiguous_gradients": true,
        "allgather_bucket_size": 1e8,
        "reduce_bucket_size": 1e7,
        "overlap_comm": true,
        "reduce_scatter": true
    },
    "steps_per_print": 50,
    "gradient_clipping": 1.0,
    "wall_clock_breakdown": false,
    "bf16": {
        "enabled": true
    },
    "activation_checkpointing": {
        "partition_activations": true,
        "contiguous_memory_optimization": false
    }
}
{
    "gradient_accumulation_steps": 1,
    "train_micro_batch_size_per_gpu": 1,
    "prescale_gradients": false,
    "zero_allow_untested_optimizer": true,
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": 1e-5,
            "weight_decay": 0.1,
            "betas": [
                0.9,
                0.95
            ],
            "eps": 1e-5
        }
    },
    "zero_optimization": {
        "stage": 3,
        "overlap_comm": false,
        "contiguous_gradients": true,
        "stage3_max_live_parameters": 5e7,
        "stage3_max_reuse_distance": 5e7,
        "stage3_prefetch_bucket_size": 1e7,
        "sub_group_size": 8e7,
        "allgather_bucket_size": 2e8,
        "reduce_bucket_size": 2e8,
        "reduce_scatter": true,
        "stage3_gather_16bit_weights_on_model_save": false
    },
    "steps_per_print": 10,
    "gradient_clipping": 1.0,
    "wall_clock_breakdown": false,
    "bf16": {
        "enabled": true
    },
    "fp16": {
        "enabled": false,
        "auto_cast": true,
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 1,
        "min_loss_scale": 1
    },
    "activation_checkpointing": {
        "partition_activations": true,
        "contiguous_memory_optimization": false
    }
}
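For context, a hedged sketch of how a DeepSpeed config like this is normally passed on the command line. The `--deepspeed`/`--deepspeed_config` flags come from `deepspeed.add_config_arguments` used in run_pretraining.py; the launcher invocation and the `--flagperf_config` flag name are assumptions based on the argument names in that script, and other benchmark-specific flags are omitted.

```bash
# Illustrative only; FlagPerf drives this through its own run scripts.
deepspeed --num_gpus=8 run_pretraining.py \
    --deepspeed --deepspeed_config ds_config.json \
    --flagperf_config config/config_BI-V100x1x8.py
```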
2 changes: 2 additions & 0 deletions training/iluvatar/llama2_7b-deepspeed/config/requirements.txt
@@ -0,0 +1,2 @@
sentencepiece
transformers==4.34.1
6 changes: 4 additions & 2 deletions training/iluvatar/tacotron2-pytorch/README.md
@@ -27,7 +27,7 @@
| Dataset | LJSpeech | |
| Precision | precision, see "Performance metrics" | one of fp32/amp/fp16/tf32 |
| Hyperparameter changes | fix_hp, see "Performance metrics" | special hyperparameters needed to saturate the hardware when measuring throughput |
| Hardware | nvidia A100 | |
| Hardware | Iluvatar BI-V100 | |
| Device memory usage | mem, see "Performance metrics" | commonly called "VRAM", in GiB |
| End-to-end time | e2e_time, see "Performance metrics" | total time plus Perf initialization time, etc. |
| Total throughput | p_whole, see "Performance metrics" | actual number of training samples divided by total time (performance_whole) |
@@ -40,5 +40,7 @@

| Config | precision | fix_hp | e2e_time | p_whole | p_train | p_core | val_loss | mem |
|--------------------| ---------| ----------------| ---------| ------- | ------- | ------ | -------- | --------- |
| BI100 single node, 8 GPUs (1x8) | tf32 | bs=64, lr=0.001 | 41220 | 33082 | 33289 | 33511 | 0.4833 | 18.4/32.0 |
| BI-V100 single node, 8 GPUs (1x8) | tf32 | bs=96, lr=0.001 | / | / | / | / | 0.4848 | 28/32.0 |
| BI-V100 two nodes, 8 GPUs each (2x8) | tf32 | bs=96, lr=0.001 | / | / | / | / | / | 26.5/32.0 |
| BI-V100 single node, 1 GPU (1x1) | tf32 | bs=96, lr=0.001 | / | / | / | / | / | 25.3/32.0 |

@@ -0,0 +1,9 @@
from config_common import *

train_batch_size = 96
eval_batch_size = train_batch_size

warmup = 0.2
learning_rate = 1e-3

seed = 23333
@@ -1,6 +1,6 @@
from config_common import *

train_batch_size = 64
train_batch_size = 96
eval_batch_size = train_batch_size

warmup = 0.2
@@ -0,0 +1,9 @@
from config_common import *

train_batch_size = 96
eval_batch_size = train_batch_size

warmup = 0.2
learning_rate = 1e-3

seed = 23333