From cbe59fc75f84c37a2b583ec03a8aa181278a68b8 Mon Sep 17 00:00:00 2001 From: zhangsanfeng2022 <99704344+zhangsanfeng2022@users.noreply.github.com> Date: Mon, 27 Nov 2023 13:13:39 +0800 Subject: [PATCH 01/11] [kunlunxin] Longformer update config (#329) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * 【kunlunxin】Longformer update config * update memory used for kunlunxin and solve file conflict. * update 2x8 memory used. * update kunlunxin docs. * update kunlunxin 1x1 memory docs. * Change to a larger batch size. * Remove empty line in test_config.py * Update README. * Update test_conf.py, add comma --------- Co-authored-by: zhangsanfeng2022 Co-authored-by: root Co-authored-by: Zhou Yu --- .../kunlunxin/longformer-pytorch/README.md | 46 +++++++++++++++++++ .../config/config_R300x1x1.py | 4 ++ .../config/config_R300x1x8.py | 4 ++ .../config/config_R300x2x8.py | 4 ++ .../config/config_common.py | 5 ++ .../config/environment_variables.sh | 4 ++ .../config/requirements.txt | 4 ++ .../longformer-pytorch/extern/.gitkeep | 0 training/run_benchmarks/config/test_conf.py | 3 +- 9 files changed, 73 insertions(+), 1 deletion(-) create mode 100644 training/kunlunxin/longformer-pytorch/README.md create mode 100644 training/kunlunxin/longformer-pytorch/config/config_R300x1x1.py create mode 100644 training/kunlunxin/longformer-pytorch/config/config_R300x1x8.py create mode 100644 training/kunlunxin/longformer-pytorch/config/config_R300x2x8.py create mode 100644 training/kunlunxin/longformer-pytorch/config/config_common.py create mode 100644 training/kunlunxin/longformer-pytorch/config/environment_variables.sh create mode 100644 training/kunlunxin/longformer-pytorch/config/requirements.txt create mode 100644 training/kunlunxin/longformer-pytorch/extern/.gitkeep diff --git a/training/kunlunxin/longformer-pytorch/README.md b/training/kunlunxin/longformer-pytorch/README.md new file mode 100644 index 000000000..7a7a6ba27 --- /dev/null +++ b/training/kunlunxin/longformer-pytorch/README.md @@ -0,0 +1,46 @@ +### 测试数据集下载 +[测试数据集下载](../../benchmarks/longformer/README.md#测试数据集下载) + +### 昆仑芯XPU配置与运行信息参考 +#### 环境配置 +- ##### 硬件环境 + - 机器型号: 昆仑芯AI加速器组R480-X8 + - 加速卡型号: 昆仑芯AI加速卡R300 + - 多机网络类型、带宽: InfiniBand,200Gb/s + +- ##### 软件环境 + - OS版本:Ubuntu 20.04 + - OS kernel版本: 5.4.0-26-generic + - 加速卡驱动版本:4.0.25 + - Docker镜像和版本:pytorch1.12.1-cpu-ubuntu20.04:v0.01 + - 训练框架版本:xmlir + - 训练编译器版本:xacc + - 依赖软件版本:pytorch-1.12.1+cpu + +### 运行情况 + +* 通用指标 + +| 指标名称 | 指标值 | 特殊说明 | +| -------------- | ----------------------- | ------------------------------------------- | +| 任务类别 | 情感分析,意图监测,主题分类 | | +| 模型 | longformer-base-4096 model | | +| 数据集 | longformer_train | | +| 数据精度 | precision,见“性能指标” | 可选fp32/amp/fp16 | +| 超参修改 | fix_hp,见“性能指标” | 跑满硬件设备评测吞吐量所需特殊超参 | +| 硬件设备简称 | R300 | | +| 硬件存储使用 | mem,见“性能指标” | 通常称为“显存”,单位为GiB | +| 端到端时间 | e2e_time,见“性能指标” | 总时间+Perf初始化等时间 | +| 总吞吐量 | p_whole,见“性能指标” | 实际训练样本数除以总时间(performance_whole) | +| 训练吞吐量 | p_train,见“性能指标” | 不包含每个epoch末尾的评估部分耗时 | +| 计算吞吐量 | p_core,见“性能指标” | 不包含数据IO部分的耗时(p3>p2>p1),单位为samples/s(seq_length=1024)| +| 训练结果 | acc,见“性能指标” | acc任务准确率 | | + +* 性能指标 + +| 配置 | precision | fix_hp | e2e_time | p_whole | p_train | p_core | acc | mem | +| ------------------- | --------- | ---------------- | -------- | ------- | ------- | ------ | ------- | --------- | +| R300单机单卡(1x1) | fp32 | bs=16,lr=5e-05 | | | | | | 28.0/32.0 | +| R300单机8卡(1x8) | fp32 | bs=16,lr=5e-05 | | | | | 0.64 | 30.0/32.0 | +| R300两机8卡(2x8) | fp32 | bs=16,lr=5e-05 | | | | | | 27.6/32.0 
| + diff --git a/training/kunlunxin/longformer-pytorch/config/config_R300x1x1.py b/training/kunlunxin/longformer-pytorch/config/config_R300x1x1.py new file mode 100644 index 000000000..767ddd4b3 --- /dev/null +++ b/training/kunlunxin/longformer-pytorch/config/config_R300x1x1.py @@ -0,0 +1,4 @@ +from config_common import * + +train_batch_size = 16 +gradient_accumulation_steps = 1 diff --git a/training/kunlunxin/longformer-pytorch/config/config_R300x1x8.py b/training/kunlunxin/longformer-pytorch/config/config_R300x1x8.py new file mode 100644 index 000000000..767ddd4b3 --- /dev/null +++ b/training/kunlunxin/longformer-pytorch/config/config_R300x1x8.py @@ -0,0 +1,4 @@ +from config_common import * + +train_batch_size = 16 +gradient_accumulation_steps = 1 diff --git a/training/kunlunxin/longformer-pytorch/config/config_R300x2x8.py b/training/kunlunxin/longformer-pytorch/config/config_R300x2x8.py new file mode 100644 index 000000000..767ddd4b3 --- /dev/null +++ b/training/kunlunxin/longformer-pytorch/config/config_R300x2x8.py @@ -0,0 +1,4 @@ +from config_common import * + +train_batch_size = 16 +gradient_accumulation_steps = 1 diff --git a/training/kunlunxin/longformer-pytorch/config/config_common.py b/training/kunlunxin/longformer-pytorch/config/config_common.py new file mode 100644 index 000000000..71bcad9df --- /dev/null +++ b/training/kunlunxin/longformer-pytorch/config/config_common.py @@ -0,0 +1,5 @@ +vendor = 'kunlunxin' + +fp16 = False + +dist_backend = "xccl" diff --git a/training/kunlunxin/longformer-pytorch/config/environment_variables.sh b/training/kunlunxin/longformer-pytorch/config/environment_variables.sh new file mode 100644 index 000000000..002a70e4d --- /dev/null +++ b/training/kunlunxin/longformer-pytorch/config/environment_variables.sh @@ -0,0 +1,4 @@ +export XACC_ENABLE=1 +export BKCL_PCIE_RING=1 +export XMLIR_D_FORCE_FALLBACK_STR="aten::index_add_" +export XMLIR_XPU_EAGER_LAUNCH_SYNC_MODE=true diff --git a/training/kunlunxin/longformer-pytorch/config/requirements.txt b/training/kunlunxin/longformer-pytorch/config/requirements.txt new file mode 100644 index 000000000..325729698 --- /dev/null +++ b/training/kunlunxin/longformer-pytorch/config/requirements.txt @@ -0,0 +1,4 @@ +transformers==4.34.1 +datasets==2.14.6 +psutil==5.9.6 +accelerate==0.24.0 diff --git a/training/kunlunxin/longformer-pytorch/extern/.gitkeep b/training/kunlunxin/longformer-pytorch/extern/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/training/run_benchmarks/config/test_conf.py b/training/run_benchmarks/config/test_conf.py index 1d88f659a..ebb0f3f39 100644 --- a/training/run_benchmarks/config/test_conf.py +++ b/training/run_benchmarks/config/test_conf.py @@ -103,7 +103,7 @@ # "gpt3_13B:paddle_2.5.1:TP2PP4SH1SP1A10040G:1:8:1":"/raid/dataset/gpt-3/" # kunlunxin cases - # "gpt2:pytorch:R300:1:8:1": "/raid/dataset/gpt2" + # "gpt2:pytorch:R300:1:8:1": "/raid/dataset/gpt2", # "resnet50:pytorch:R300:1:8:1": "/raid/dataset/ImageNet_1k_2012/", # "mask_rcnn:pytorch:R300:1:8:1": "/raid/dataset/coco2017/", # "retinanet:pytorch:R300:1:8:1": "/raid/dataset/coco2017/", @@ -113,6 +113,7 @@ # "glm:pytorch:R300:1:8:1": "/raid/home_datasets_ckpt/glm/train/", # "mobilenetv2:pytorch:R300:1:8:1": "/raid/dataset/ImageNet_1k_2012/", # "bert:pytorch:R300:1:8:1": "/raid/dataset/bert_large/train", + # "longformer:pytorch:R300:1:8:1": "/raid/dataset/longformer_train", # "distilbert:pytorch:R300:1:8:1": "/raid/dataset/distilbert/", # "swin_transformer:pytorch:R300:1:8:1": "/raid/dataset/ImageNet_1k_2012/" } 
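Patch 01 follows FlagPerf's vendor-config layering: config_common.py carries the vendor-wide settings (vendor, fp16, dist_backend), and each per-topology file such as config_R300x1x8.py star-imports it and then overrides the hyperparameters for that node/card count, while environment_variables.sh forces a fallback implementation for the unsupported `aten::index_add_` op via XMLIR_D_FORCE_FALLBACK_STR. The sketch below mirrors the path-based config loading that appears in run_pretraining.py later in this series; the helper name and the example path are illustrative, not part of the patch.

```python
# Illustrative sketch (not from the patch): how a FlagPerf-style runner can
# resolve one of the layered config_*.py files at runtime. The helper name
# and the example path are hypothetical.
import importlib
import os
import sys

def load_flagperf_config(path):
    """Import a config_*.py file; the file star-imports config_common itself."""
    sys.path.append(os.path.dirname(path))          # make sibling configs importable
    module_name = os.path.basename(path).split(".")[0]
    return importlib.import_module(module_name)

# cfg = load_flagperf_config(
#     "training/kunlunxin/longformer-pytorch/config/config_R300x1x8.py")
# print(cfg.vendor, cfg.dist_backend)   # kunlunxin xccl   (from config_common.py)
# print(cfg.train_batch_size)           # 16               (per-topology override)
```

Because the override is a plain star-import, anything not redefined in the topology file (here, everything except train_batch_size and gradient_accumulation_steps) falls through to config_common.py.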
From 1894ba5e2c2527ca84784838211704a32a57186b Mon Sep 17 00:00:00 2001 From: Irvin Dewees <62273738+cloud9wj@users.noreply.github.com> Date: Tue, 28 Nov 2023 16:48:19 +0800 Subject: [PATCH 02/11] [iluvatar] tacotron2 add 1x1, 2x8 configs (#341) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * update * update * update * update * update * update --------- Co-authored-by: 魏杰 --- training/iluvatar/tacotron2-pytorch/README.md | 6 ++++-- .../tacotron2-pytorch/config/config_BI-V100x1x1.py | 9 +++++++++ .../tacotron2-pytorch/config/config_BI-V100x1x8.py | 2 +- .../tacotron2-pytorch/config/config_BI-V100x2x8.py | 9 +++++++++ 4 files changed, 23 insertions(+), 3 deletions(-) create mode 100644 training/iluvatar/tacotron2-pytorch/config/config_BI-V100x1x1.py create mode 100644 training/iluvatar/tacotron2-pytorch/config/config_BI-V100x2x8.py diff --git a/training/iluvatar/tacotron2-pytorch/README.md b/training/iluvatar/tacotron2-pytorch/README.md index c996efa9b..559087d1d 100644 --- a/training/iluvatar/tacotron2-pytorch/README.md +++ b/training/iluvatar/tacotron2-pytorch/README.md @@ -27,7 +27,7 @@ | 数据集 | LJSpeech | | | 数据精度 | precision,见“性能指标” | 可选fp32/amp/fp16/tf32 | | 超参修改 | fix_hp,见“性能指标” | 跑满硬件设备评测吞吐量所需特殊超参 | -| 硬件设备简称 | nvidia A100 | | +| 硬件设备简称 | Iluvatar BI-V100 | | | 硬件存储使用 | mem,见“性能指标” | 通常称为“显存”,单位为GiB | | 端到端时间 | e2e_time,见“性能指标” | 总时间+Perf初始化等时间 | | 总吞吐量 | p_whole,见“性能指标” | 实际训练样本数除以总时间(performance_whole) | @@ -40,5 +40,7 @@ | 配置 | precision| fix_hp | e2e_time | p_whole | p_train | p_core | val_loss | mem | |--------------------| ---------| ----------------| ---------| ------- | ------- | ------ | -------- | --------- | -| BI100单机8卡(1x8) | tf32 | bs=64, lr=0.001 | 41220 | 33082 | 33289 | 33511 | 0.4833 | 18.4/32.0 | +| BI-V100单机8卡(1x8) | tf32 | bs=96, lr=0.001 | / | / | / | / | 0.4848 | 28/32.0 | +| BI-V100双机8卡(2x8) | tf32 | bs=96, lr=0.001 | / | / | / | / | / | 26.5/32.0 | +| BI-V100单机单卡(1x1) | tf32 | bs=96, lr=0.001 | / | / | / | / | / | 25.3/32.0 | diff --git a/training/iluvatar/tacotron2-pytorch/config/config_BI-V100x1x1.py b/training/iluvatar/tacotron2-pytorch/config/config_BI-V100x1x1.py new file mode 100644 index 000000000..8b4f227ff --- /dev/null +++ b/training/iluvatar/tacotron2-pytorch/config/config_BI-V100x1x1.py @@ -0,0 +1,9 @@ +from config_common import * + +train_batch_size = 96 +eval_batch_size = train_batch_size + +warmup = 0.2 +learning_rate = 1e-3 + +seed = 23333 diff --git a/training/iluvatar/tacotron2-pytorch/config/config_BI-V100x1x8.py b/training/iluvatar/tacotron2-pytorch/config/config_BI-V100x1x8.py index 41e9dd5c7..8b4f227ff 100644 --- a/training/iluvatar/tacotron2-pytorch/config/config_BI-V100x1x8.py +++ b/training/iluvatar/tacotron2-pytorch/config/config_BI-V100x1x8.py @@ -1,6 +1,6 @@ from config_common import * -train_batch_size = 64 +train_batch_size = 96 eval_batch_size = train_batch_size warmup = 0.2 diff --git a/training/iluvatar/tacotron2-pytorch/config/config_BI-V100x2x8.py b/training/iluvatar/tacotron2-pytorch/config/config_BI-V100x2x8.py new file mode 100644 index 000000000..8b4f227ff --- /dev/null +++ b/training/iluvatar/tacotron2-pytorch/config/config_BI-V100x2x8.py @@ -0,0 +1,9 @@ +from config_common import * + +train_batch_size = 96 +eval_batch_size = train_batch_size + +warmup = 0.2 +learning_rate = 1e-3 + +seed = 23333 From 47c52d765816aefd734505169de81e43cfa391f1 Mon Sep 17 00:00:00 2001 From: forestlee95 Date: Tue, 28 Nov 2023 21:43:24 +0800 Subject: [PATCH 03/11] llama2 7B case update --- 
.../llama2_7b/deepspeed/run_pretraining.py | 2 + .../docker_image/deepspeed/Dockerfile | 65 ++++++++++++++ .../deepspeed/deepspeed_install.sh | 28 ++++++ .../docker_image/deepspeed/packages/README.md | 16 ++++ .../deepspeed/sdk_installers/README.md | 8 ++ .../iluvatar/llama2_7b-deepspeed/README.md | 51 +++++++++++ .../config/config_BI-V100x1x8.py | 6 ++ .../config}/ds_config.json | 90 ++++++++++--------- .../config/requirements.txt | 2 + .../deepspeed/start_deepspeed_task.py | 7 +- 10 files changed, 234 insertions(+), 41 deletions(-) create mode 100644 training/iluvatar/docker_image/deepspeed/Dockerfile create mode 100644 training/iluvatar/docker_image/deepspeed/deepspeed_install.sh create mode 100644 training/iluvatar/docker_image/deepspeed/packages/README.md create mode 100644 training/iluvatar/docker_image/deepspeed/sdk_installers/README.md create mode 100644 training/iluvatar/llama2_7b-deepspeed/README.md create mode 100644 training/iluvatar/llama2_7b-deepspeed/config/config_BI-V100x1x8.py rename training/{benchmarks/llama2_7b/deepspeed => iluvatar/llama2_7b-deepspeed/config}/ds_config.json (53%) create mode 100644 training/iluvatar/llama2_7b-deepspeed/config/requirements.txt diff --git a/training/benchmarks/llama2_7b/deepspeed/run_pretraining.py b/training/benchmarks/llama2_7b/deepspeed/run_pretraining.py index 10ae55779..1192d1bb7 100644 --- a/training/benchmarks/llama2_7b/deepspeed/run_pretraining.py +++ b/training/benchmarks/llama2_7b/deepspeed/run_pretraining.py @@ -77,6 +77,8 @@ def get_deepspeed_engine(args, model_config_dir, flashattn): mem_efficient_linear=False, mpu=None): model = get_llama_model(model_config_dir, flashattn) + + model.gradient_checkpointing_enable() model_engine, _, _, _ = deepspeed.initialize( args=args, model=model, model_parameters=model.parameters()) diff --git a/training/iluvatar/docker_image/deepspeed/Dockerfile b/training/iluvatar/docker_image/deepspeed/Dockerfile new file mode 100644 index 000000000..aa1016c0a --- /dev/null +++ b/training/iluvatar/docker_image/deepspeed/Dockerfile @@ -0,0 +1,65 @@ +FROM ubuntu:20.04 + +# copy /etc/apt/sources.list . or choose an available one if encountering a problem with the mirror source +ADD sources.list /etc/apt/ + +RUN /bin/bash -c "source /root/.bashrc" + +ENV DEBIAN_FRONTEND=noninteractive +ENV PATH /root/miniconda/bin:$PATH + +RUN sed -i 's#http://archive.ubuntu.com/#http://mirrors.tuna.tsinghua.edu.cn/#' /etc/apt/sources.list +RUN apt-get update -y +RUN apt-get install -y --fix-missing \ + apt-utils \ + sudo \ + openssh-server \ + vim \ + git \ + curl \ + wget \ + tree \ + perl \ + kmod \ + make \ + pciutils \ + build-essential \ + python3.8-dev \ + python3-pip \ + libjpeg-dev \ + zlib1g-dev \ + unzip \ + cmake \ + bzip2 \ + cabextract \ + iputils-ping \ + pbzip2 \ + pv \ + numactl \ + ninja-build \ + gcc-7 \ + g++-7 \ + libncursesw5 + + +# Configure anaconda +RUN wget https://mirrors.tuna.tsinghua.edu.cn/anaconda/miniconda/Miniconda3-py38_4.10.3-Linux-x86_64.sh && \ + bash ./Miniconda3-py38_4.10.3-Linux-x86_64.sh -b -p /root/miniconda && \ + /root/miniconda/bin/conda clean -tipsy && \ + ln -s /root/miniconda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \ + echo ". 
/root/miniconda/etc/profile.d/conda.sh" >> ~/.bashrc && \ + echo "conda activate base" >> ~/.bashrc && \ + conda config --set always_yes yes --set changeps1 no && \ + echo 'LD_LIBRARY_PATH="/usr/local/corex/lib:${LD_LIBRARY_PATH}"' >> ~/.bashrc && \ + echo 'PATH="/usr/local/corex/bin:${PATH}"' >> ~/.bashrc + + +RUN /bin/bash -c "apt-get install -y linux-headers-`uname -r`" + +RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-7 10 --slave /usr/bin/g++ g++ /usr/bin/g++-7 + +RUN /bin/bash -c "pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple" + +ENV LD_LIBRARY_PATH="/usr/local/corex/lib:${LD_LIBRARY_PATH}" +ENV PATH="/usr/local/corex/bin:${PATH}" +ENV NVCC_ARGUMENTS="-U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -ftemplate-depth=1024" diff --git a/training/iluvatar/docker_image/deepspeed/deepspeed_install.sh b/training/iluvatar/docker_image/deepspeed/deepspeed_install.sh new file mode 100644 index 000000000..0f82fbb21 --- /dev/null +++ b/training/iluvatar/docker_image/deepspeed/deepspeed_install.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +SDK_DIR="/workspace/docker_image/sdk_installers" +PKG_DIR="/workspace/docker_image/packages" + +search_cuda_results=`find ${SDK_DIR} -name "*cuda*10.2*.run"` +for installer in $search_cuda_results; do + echo "Install ${installer}" + sh "${installer}" -- --silent --toolkit +done + +search_sdk_results=`find ${SDK_DIR} -name "corex*.run"` +for installer in $search_sdk_results; do + echo "Install ${installer}" + sh "${installer}" -- --silent --toolkit +done + + +torch_packages_results=`find ${PKG_DIR} -name "torch-*.whl"` +if [ -n "$torch_packages_results" ]; then + pip3 install "$torch_packages_results" +fi + +search_packages_results=`find ${PKG_DIR} -name "*.whl"` +for pkg in $search_packages_results; do + echo "Install ${pkg}" + pip3 install "${pkg}" +done diff --git a/training/iluvatar/docker_image/deepspeed/packages/README.md b/training/iluvatar/docker_image/deepspeed/packages/README.md new file mode 100644 index 000000000..76ff33044 --- /dev/null +++ b/training/iluvatar/docker_image/deepspeed/packages/README.md @@ -0,0 +1,16 @@ +# 以下软件包需联系天数智芯获取 +# The iluvatar deepspeed backend is still in the development stage now. + +>联系邮箱: contact-us@iluvatar.com + +apex-0.1+corex.3.2.0.20231126.1581-cp38-cp38-linux_x86_64.whl + +deepspeed-0.10.0+corex.3.2.0.20231126.1581-cp38-cp38-linux_x86_64.whl + +flash_attn-2.0.1+corex.3.2.0.20231126.1581-cp38-cp38-linux_x86_64.whl + +torch-1.13.1+corex.3.2.0.20231126.1581-cp38-cp38-linux_x86_64.whl + +torchtext-0.14.1+corex.3.2.0.20231126.1581-cp38-cp38-linux_x86_64.whl + +torchvision-0.14.1+corex.3.2.0.20231126.1581-cp38-cp38-linux_x86_64.whl \ No newline at end of file diff --git a/training/iluvatar/docker_image/deepspeed/sdk_installers/README.md b/training/iluvatar/docker_image/deepspeed/sdk_installers/README.md new file mode 100644 index 000000000..86cd46498 --- /dev/null +++ b/training/iluvatar/docker_image/deepspeed/sdk_installers/README.md @@ -0,0 +1,8 @@ +# 以下软件包需联系天数智芯获取 +# The iluvatar deepspeed backend is still in the development stage now. 
+ +>联系邮箱: contact-us@iluvatar.com + +corex-installer-linux64-3.2.0.20231126.1581_x86_64_10.2.run + +cuda_10.2.89_440.33.01_linux.run \ No newline at end of file diff --git a/training/iluvatar/llama2_7b-deepspeed/README.md b/training/iluvatar/llama2_7b-deepspeed/README.md new file mode 100644 index 000000000..a5149ee79 --- /dev/null +++ b/training/iluvatar/llama2_7b-deepspeed/README.md @@ -0,0 +1,51 @@ +### Iluvatar GPU配置与运行信息参考 +#### 环境配置 +- ##### 硬件环境 + - 机器、加速卡型号: Iluvatar BI-V100 32GB + + +- ##### 软件环境 + - OS版本:Ubuntu 20.04 + - OS kernel版本: 5.4.0-126-generic + - 加速卡驱动版本:470.141.10 + - Docker 版本:20.10.18 + - 训练框架版本:deepspeed 0.9.2 + - 依赖软件版本:sentencepiece + +- ##### 并行策略 + + - 并行技术:sharded data parallel + - 实施者:deepspeed ZeRO-DP + - 实施细节:ZeRO-DP O3, DP_SIZE=8 + +- ##### 优化策略 + + - flash attention + +### 运行情况 + +* 输入批尺寸 + 1. local_batchsize(micro_batchsize),简写为LBS,即实际进入模型的张量批尺寸,为config_BI-V100x1x8.py中所写,在本case中默认为3 + 2. seqlength(max_position_embedding),简写为MPE,即实际进入模型的序列长度,为config_BI-V100x1x8.py中所写,在本case中默认为1024 + 3. gradient_accumulate_steps,简写为GAS,即梯度累加步数,为ds_config.json中所写,在本case中默认为1 + 4. global_batchsize恒等于local_batchsize\*gradient_accumulate_steps\*data_parallel_size,简写为GBS。在本case中,只存在数据并行,因此data_parallel_size=world_size。 + +* 通用指标 + +| 指标名称 | 指标值 | 特殊说明 | +| ------------ | -------------------------- | ---------------------------------- | +| 任务类别 | 自然语言理解 | | +| 模型 | llama2_7b | | +| 数据集 | openwebtext | 如无特殊说明,训练前1亿个token | +| 数据精度 | amp | | +| 超参修改 | fix_hp,见“性能指标” | 运行必要特殊超参,例如需要改小seqlength避免OOM | +| 硬件设备简称 | Iluvatar BI-V100 | | +| 硬件存储使用 | mem,见“性能指标” | 通常称为“显存”,单位为GiB | +| 计算使用率 | MFU,见“性能指标” | 参见PaLM论文定义 | +| **吞吐量** | **token/p/s,见“性能指标”** | 平均单卡每秒处理的token数 | + +* 性能指标 + +| 配置 | fix_hp | token/p/s | loss | mem | MFU | +| ------------------- | ---------------- | ------ | ------- | --------- | --------- | +| BI-V100单机8卡(1x8) | MPE=2048 LBS=4 | / | 5.16 | 30/32 | / | \ No newline at end of file diff --git a/training/iluvatar/llama2_7b-deepspeed/config/config_BI-V100x1x8.py b/training/iluvatar/llama2_7b-deepspeed/config/config_BI-V100x1x8.py new file mode 100644 index 000000000..fcc6a3488 --- /dev/null +++ b/training/iluvatar/llama2_7b-deepspeed/config/config_BI-V100x1x8.py @@ -0,0 +1,6 @@ +seqlength = 2048 +batchsize = 4 +datafilename = "openwebtext_llama2_100M.npy" +theoryflops = 312000000000000.0 +epochs = 1 +flashattn = True diff --git a/training/benchmarks/llama2_7b/deepspeed/ds_config.json b/training/iluvatar/llama2_7b-deepspeed/config/ds_config.json similarity index 53% rename from training/benchmarks/llama2_7b/deepspeed/ds_config.json rename to training/iluvatar/llama2_7b-deepspeed/config/ds_config.json index 27ac45041..1da4b1301 100644 --- a/training/benchmarks/llama2_7b/deepspeed/ds_config.json +++ b/training/iluvatar/llama2_7b-deepspeed/config/ds_config.json @@ -1,40 +1,50 @@ -{ - "gradient_accumulation_steps": 1, - "train_micro_batch_size_per_gpu": 1, - "prescale_gradients": false, - "zero_allow_untested_optimizer": true, - "optimizer": { - "type": "AdamW", - "params": { - "lr": 1e-5, - "weight_decay": 0.1, - "betas": [ - 0.9, - 0.95 - ], - "eps": 1e-5 - } - }, - "zero_optimization": { - "stage": 3, - "stage3_max_live_parameters": 1e9, - "stage3_max_reuse_distance": 1e9, - "stage3_prefetch_bucket_size": 1e7, - "sub_group_size": 1e9, - "contiguous_gradients": true, - "allgather_bucket_size": 1e8, - "reduce_bucket_size": 1e7, - "overlap_comm": true, - "reduce_scatter": true - }, - "steps_per_print": 50, - "gradient_clipping": 1.0, - "wall_clock_breakdown": false, - 
"bf16": { - "enabled": true - }, - "activation_checkpointing": { - "partition_activations": true, - "contiguous_memory_optimization": false - } -} +{ + "gradient_accumulation_steps": 1, + "train_micro_batch_size_per_gpu": 1, + "prescale_gradients": false, + "zero_allow_untested_optimizer": true, + "optimizer": { + "type": "AdamW", + "params": { + "lr": 1e-5, + "weight_decay": 0.1, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-5 + } + }, + "zero_optimization": { + "stage": 3, + "overlap_comm": true, + "contiguous_gradients": true, + "stage3_max_live_parameters": 5e8, + "stage3_max_reuse_distance": 5e8, + "stage3_prefetch_bucket_size": 1e8, + "sub_group_size": 8e8, + "allgather_bucket_size": 2e8, + "reduce_bucket_size": 2e8, + "reduce_scatter": true, + "stage3_gather_16bit_weights_on_model_save": false + }, + "steps_per_print": 10, + "gradient_clipping": 1.0, + "wall_clock_breakdown": false, + "bf16": { + "enabled": true + }, + "fp16": { + "enabled": false, + "auto_cast": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 1, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": true, + "contiguous_memory_optimization": false + } +} \ No newline at end of file diff --git a/training/iluvatar/llama2_7b-deepspeed/config/requirements.txt b/training/iluvatar/llama2_7b-deepspeed/config/requirements.txt new file mode 100644 index 000000000..9c6eb827b --- /dev/null +++ b/training/iluvatar/llama2_7b-deepspeed/config/requirements.txt @@ -0,0 +1,2 @@ +sentencepiece +transformers==4.34.1 diff --git a/training/run_benchmarks/deepspeed/start_deepspeed_task.py b/training/run_benchmarks/deepspeed/start_deepspeed_task.py index f98c8ed37..154b3cd2f 100644 --- a/training/run_benchmarks/deepspeed/start_deepspeed_task.py +++ b/training/run_benchmarks/deepspeed/start_deepspeed_task.py @@ -114,11 +114,16 @@ def main(): train_script_path = helper.get_train_script_path(task_args) config_dir, config_file = helper.get_config_dir_file(task_args) config_file = os.path.join(config_dir, config_file) + ds_config_file = os.path.join(config_dir, "ds_config.json") exec_cmd = "cd " + os.path.dirname(train_script_path) + ";" exec_cmd = exec_cmd + "deepspeed --num_gpus=" + str( task_args.nproc) + " run_pretraining.py" - exec_cmd = exec_cmd + " --deepspeed --deepspeed_config ds_config.json --data_dir " + task_args.data_dir + + exec_cmd = exec_cmd + " --deepspeed --deepspeed_config " + exec_cmd = exec_cmd + ds_config_file + exec_cmd = exec_cmd + " --data_dir " + task_args.data_dir + exec_cmd = exec_cmd + " --flagperf_config " + config_file exec_cmd = exec_cmd + " --nproc " + str( task_args.nproc) + " --nnodes " + str(task_args.nnodes) From 3b23005b1f51f78ae589bb07046badd692e455e3 Mon Sep 17 00:00:00 2001 From: forestlee95 Date: Tue, 28 Nov 2023 21:47:14 +0800 Subject: [PATCH 04/11] fix iluvatar deepspeed readme --- training/iluvatar/llama2_7b-deepspeed/README.md | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/training/iluvatar/llama2_7b-deepspeed/README.md b/training/iluvatar/llama2_7b-deepspeed/README.md index a5149ee79..3a60cf0cb 100644 --- a/training/iluvatar/llama2_7b-deepspeed/README.md +++ b/training/iluvatar/llama2_7b-deepspeed/README.md @@ -6,10 +6,8 @@ - ##### 软件环境 - OS版本:Ubuntu 20.04 - - OS kernel版本: 5.4.0-126-generic - - 加速卡驱动版本:470.141.10 - Docker 版本:20.10.18 - - 训练框架版本:deepspeed 0.9.2 + - 训练框架版本:deepspeed 0.10.0 - 依赖软件版本:sentencepiece - ##### 并行策略 @@ -20,7 +18,7 @@ - ##### 优化策略 - - flash attention + - flash attention 2 
### 运行情况 From 761acd00009ae76df362e3cdf407ef8d186f1803 Mon Sep 17 00:00:00 2001 From: forestlee95 Date: Wed, 29 Nov 2023 11:08:45 +0800 Subject: [PATCH 05/11] fix readme && test_conf --- .../iluvatar/llama2_7b-deepspeed/README.md | 2 +- training/run_benchmarks/config/test_conf.py | 33 +++++++++++++++---- 2 files changed, 27 insertions(+), 8 deletions(-) diff --git a/training/iluvatar/llama2_7b-deepspeed/README.md b/training/iluvatar/llama2_7b-deepspeed/README.md index 3a60cf0cb..eaa162db3 100644 --- a/training/iluvatar/llama2_7b-deepspeed/README.md +++ b/training/iluvatar/llama2_7b-deepspeed/README.md @@ -14,7 +14,7 @@ - 并行技术:sharded data parallel - 实施者:deepspeed ZeRO-DP - - 实施细节:ZeRO-DP O3, DP_SIZE=8 + - 实施细节:ZeRO-DP O3 - ##### 优化策略 diff --git a/training/run_benchmarks/config/test_conf.py b/training/run_benchmarks/config/test_conf.py index ebb0f3f39..4ff671dfe 100644 --- a/training/run_benchmarks/config/test_conf.py +++ b/training/run_benchmarks/config/test_conf.py @@ -3,7 +3,7 @@ # Set accelerator's vendor name, e.g. iluvatar, cambricon, kunlunxin and ascend. # We will run benchmarks in training/ -VENDOR = "nvidia" +VENDOR = "iluvatar" # Accelerator options for docker. TODO FIXME support more accelerators. # possible value of ACCE_CONTAINER_OPT are: @@ -19,7 +19,7 @@ # "--device=/dev/davinciX --device=/dev/davinci_manager + \ # --device=/dev/devmm_svm --device=/dev/hisi_hdc + \ # -v /usr/local/Ascend/driver -v /usr/local/dcmi -v /usr/local/bin/npu-smi" -ACCE_CONTAINER_OPT = " --gpus all" +ACCE_CONTAINER_OPT = ' -v /lib/modules:/lib/modules ' # XXX_VISIBLE_DEVICE item name in env # possible value of ACCE_VISIBLE_DEVICE_ENV_NAME are: # CUDA_VISIBLE_DEVICES for nvidia, iluvatar @@ -29,12 +29,12 @@ ACCE_VISIBLE_DEVICE_ENV_NAME = "CUDA_VISIBLE_DEVICES" # Set pip source, which will be used in preparing envs in container -PIP_SOURCE = "https://mirror.baidu.com/pypi/simple" +PIP_SOURCE = "https://pypi.tuna.tsinghua.edu.cn/simple" # The path that flagperf deploy in the cluster. # Users must set FLAGPERF_PATH to where flagperf deploy # You can assume the preset "/home/FlagPerf/training" points to Null -FLAGPERF_PATH = "/home/FlagPerf/training" +FLAGPERF_PATH = "/home/yangzhichao/workspace/FlagPerf/training" # Set log path on the host here. 
FLAGPERF_LOG_PATH = FLAGPERF_PATH + "/result/" @@ -54,9 +54,9 @@ ''' CASES = { # nvidia cases - "bert:pytorch_1.8:A100:1:8:1": "/raid/home_datasets_ckpt/bert/train/", - "glm:pytorch_1.8:A100:1:8:1": "/raid/home_datasets_ckpt/glm/train/", - "cpm:pytorch_1.8:A100:1:8:1": "/raid/home_datasets_ckpt/cpm/train/", + # "bert:pytorch_1.8:A100:1:8:1": "/raid/home_datasets_ckpt/bert/train/", + # "glm:pytorch_1.8:A100:1:8:1": "/raid/home_datasets_ckpt/glm/train/", + # "cpm:pytorch_1.8:A100:1:8:1": "/raid/home_datasets_ckpt/cpm/train/", # "mobilenetv2:pytorch_1.8:A100:1:8:1": "/raid/dataset/ImageNet_1k_2012/", # "vit:pytorch_1.13:A100:1:8:1": "/raid/dataset/ImageNet_1k_2012/", @@ -116,5 +116,24 @@ # "longformer:pytorch:R300:1:8:1": "/raid/dataset/longformer_train", # "distilbert:pytorch:R300:1:8:1": "/raid/dataset/distilbert/", # "swin_transformer:pytorch:R300:1:8:1": "/raid/dataset/ImageNet_1k_2012/" + + # iluvatar cases + # "resnet50:pytorch:BI-V100:1:8:1": "/raid/dataset/ImageNet_1k_2012/", + # "bert:pytorch:BI-V100:1:8:1": "/raid/home_datasets_ckpt/bert/train/", + # "mask_rcnn:pytorch:BI-V100:1:8:1": "/raid/dataset/coco2017/", + # "bigtransfer:pytorch:BI-V100:1:8:1": "/raid/dataset/ImageNet_1k_2012/", + # "cpm:pytorch:BI-V100:1:8:1": "/raid/home_datasets_ckpt/cpm/train/", + # "efficientnet:pytorch:BI-V100:1:8:1": "/raid/dataset/ImageNet_1k_2012/", + # "faster_rcnn:pytorch:BI-V100:1:8:1": "/raid/dataset/coco2017", + # "glm:pytorch:BI-V100:1:8:1": "/raid/dataset/glm/train/", + # "mask_rcnn:pytorch:BI-V100:1:8:1": "/raid/dataset/maskrcnn/coco2017", + # "mobilenetv2:pytorch:BI-V100:1:8:1": "/raid/dataset/ImageNet_1k_2012/", + # "retinanet:pytorch:BI-V100:1:8:1": "/raid/dataset/coco2017/", + # "swin_transformer:pytorch:BI-V100:1:8:1": "/raid/dataset/ImageNet_1k_2012/", + # "tacotron2:pytorch:BI-V100:1:8:1": "/raid/dataset/tacotron2/LJSpeech/", + # "transformer:pytorch:BI-V100:1:8:1": "/raid/dataset/transformer/train/", + # "vit:pytorch:BI-V100:1:8:1": "/raid/dataset/ImageNet_1k_2012/", + "llama2_7b:deepspeed:BI-V100:1:8:1":"/data1/" + } From a235d0ca37bc9be04e14a6c2b6b270251df38187 Mon Sep 17 00:00:00 2001 From: clveryang <1953129318@qq.com> Date: Wed, 29 Nov 2023 11:30:49 +0800 Subject: [PATCH 06/11] fix loss number --- training/iluvatar/llama2_7b-deepspeed/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/training/iluvatar/llama2_7b-deepspeed/README.md b/training/iluvatar/llama2_7b-deepspeed/README.md index eaa162db3..1f41658d0 100644 --- a/training/iluvatar/llama2_7b-deepspeed/README.md +++ b/training/iluvatar/llama2_7b-deepspeed/README.md @@ -46,4 +46,4 @@ | 配置 | fix_hp | token/p/s | loss | mem | MFU | | ------------------- | ---------------- | ------ | ------- | --------- | --------- | -| BI-V100单机8卡(1x8) | MPE=2048 LBS=4 | / | 5.16 | 30/32 | / | \ No newline at end of file +| BI-V100单机8卡(1x8) | MPE=2048 LBS=4 | / | 5.1612 | 30/32 | / | \ No newline at end of file From 2f95601f045b4af7bc4732ef1774e81d6d8db199 Mon Sep 17 00:00:00 2001 From: clveryang <1953129318@qq.com> Date: Wed, 29 Nov 2023 15:32:13 +0800 Subject: [PATCH 07/11] delete personal FLAGPERF_PATH --- training/run_benchmarks/config/test_conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/training/run_benchmarks/config/test_conf.py b/training/run_benchmarks/config/test_conf.py index 4ff671dfe..dc011cf20 100644 --- a/training/run_benchmarks/config/test_conf.py +++ b/training/run_benchmarks/config/test_conf.py @@ -34,7 +34,7 @@ # The path that flagperf deploy in the cluster. 
# Users must set FLAGPERF_PATH to where flagperf deploy # You can assume the preset "/home/FlagPerf/training" points to Null -FLAGPERF_PATH = "/home/yangzhichao/workspace/FlagPerf/training" +FLAGPERF_PATH = "/home/FlagPerf/training" # Set log path on the host here. FLAGPERF_LOG_PATH = FLAGPERF_PATH + "/result/" From d945107f84e89c482726f6ad7125d524048da043 Mon Sep 17 00:00:00 2001 From: clveryang <1953129318@qq.com> Date: Wed, 29 Nov 2023 17:21:43 +0800 Subject: [PATCH 08/11] fix config && readme --- training/iluvatar/llama2_7b-deepspeed/README.md | 2 +- .../iluvatar/llama2_7b-deepspeed/config/config_BI-V100x1x8.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/training/iluvatar/llama2_7b-deepspeed/README.md b/training/iluvatar/llama2_7b-deepspeed/README.md index 1f41658d0..303b9bc63 100644 --- a/training/iluvatar/llama2_7b-deepspeed/README.md +++ b/training/iluvatar/llama2_7b-deepspeed/README.md @@ -6,7 +6,7 @@ - ##### 软件环境 - OS版本:Ubuntu 20.04 - - Docker 版本:20.10.18 + - Docker 版本:20.10.21 - 训练框架版本:deepspeed 0.10.0 - 依赖软件版本:sentencepiece diff --git a/training/iluvatar/llama2_7b-deepspeed/config/config_BI-V100x1x8.py b/training/iluvatar/llama2_7b-deepspeed/config/config_BI-V100x1x8.py index fcc6a3488..4f001d02a 100644 --- a/training/iluvatar/llama2_7b-deepspeed/config/config_BI-V100x1x8.py +++ b/training/iluvatar/llama2_7b-deepspeed/config/config_BI-V100x1x8.py @@ -1,6 +1,6 @@ seqlength = 2048 batchsize = 4 datafilename = "openwebtext_llama2_100M.npy" -theoryflops = 312000000000000.0 +theoryflops = 128000000000000.0 epochs = 1 flashattn = True From f82068de4678995f65f092f79c0a58ec07c45a45 Mon Sep 17 00:00:00 2001 From: clveryang <1953129318@qq.com> Date: Thu, 30 Nov 2023 14:26:04 +0800 Subject: [PATCH 09/11] add new optimization method in readme --- training/iluvatar/llama2_7b-deepspeed/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/training/iluvatar/llama2_7b-deepspeed/README.md b/training/iluvatar/llama2_7b-deepspeed/README.md index 303b9bc63..1c86cd0ba 100644 --- a/training/iluvatar/llama2_7b-deepspeed/README.md +++ b/training/iluvatar/llama2_7b-deepspeed/README.md @@ -19,6 +19,7 @@ - ##### 优化策略 - flash attention 2 + - checkpointing ### 运行情况 From 87f9eaf7d52b0e9683f802e7240ab9aaec6b741d Mon Sep 17 00:00:00 2001 From: clveryang Date: Fri, 15 Dec 2023 10:54:35 +0800 Subject: [PATCH 10/11] add new seqlength and batchsize config --- .../llama2_7b/deepspeed/run_pretraining.py | 14 +++++++------- .../docker_image/deepspeed/packages/README.md | 12 ++++++------ .../deepspeed/sdk_installers/README.md | 2 +- training/iluvatar/llama2_7b-deepspeed/README.md | 3 ++- .../config/config_BI-V100x1x8.py | 8 +++++--- .../llama2_7b-deepspeed/config/ds_config.json | 10 +++++----- 6 files changed, 26 insertions(+), 23 deletions(-) diff --git a/training/benchmarks/llama2_7b/deepspeed/run_pretraining.py b/training/benchmarks/llama2_7b/deepspeed/run_pretraining.py index 1192d1bb7..dbbf4e599 100644 --- a/training/benchmarks/llama2_7b/deepspeed/run_pretraining.py +++ b/training/benchmarks/llama2_7b/deepspeed/run_pretraining.py @@ -63,7 +63,6 @@ def train(model_engine, dataloader): loss = model_engine(input_ids=input_ids, labels=labels).loss model_engine.backward(loss) model_engine.step() - ave_loss += loss if step % 10 == 0 and args.local_rank == 0: print('Step {}/{}, Loss: {}'.format(step, len(dataloader), @@ -71,14 +70,14 @@ def train(model_engine, dataloader): ave_loss = 0.0 -def get_deepspeed_engine(args, model_config_dir, flashattn): +def get_deepspeed_engine(args, 
model_config_dir, flashattn, gradient_checkpointing): with deepspeed.zero.Init(config_dict_or_path=args.deepspeed_config, enabled=True, mem_efficient_linear=False, mpu=None): model = get_llama_model(model_config_dir, flashattn) - - model.gradient_checkpointing_enable() + if gradient_checkpointing: + model.gradient_checkpointing_enable() model_engine, _, _, _ = deepspeed.initialize( args=args, model=model, model_parameters=model.parameters()) @@ -96,7 +95,6 @@ def get_metric(texts): arg_parser = get_argument_parser() arg_parser = deepspeed.add_config_arguments(arg_parser) args = arg_parser.parse_args() - flagperf_config = {} sys.path.append(os.path.dirname(args.flagperf_config)) config_file = os.path.basename(args.flagperf_config).split('.')[0] @@ -109,10 +107,12 @@ def get_metric(texts): theoryflops = getattr(module, 'theoryflops') epochs = getattr(module, 'epochs') flashattn = getattr(module, 'flashattn') - + gradient_checkpointing = getattr(module, 'gradient_checkpointing') + deepspeed.init_distributed() model_engine = get_deepspeed_engine(args, os.path.join("llama2_7b_hf"), - flashattn) + flashattn, gradient_checkpointing) + dataset = get_llama_dataset(args, seqlength, datafilename) logger = logging.getLogger("DeepSpeed") diff --git a/training/iluvatar/docker_image/deepspeed/packages/README.md b/training/iluvatar/docker_image/deepspeed/packages/README.md index 76ff33044..9cb6a3d2c 100644 --- a/training/iluvatar/docker_image/deepspeed/packages/README.md +++ b/training/iluvatar/docker_image/deepspeed/packages/README.md @@ -3,14 +3,14 @@ >联系邮箱: contact-us@iluvatar.com -apex-0.1+corex.3.2.0.20231126.1581-cp38-cp38-linux_x86_64.whl +apex-0.1+corex.3.2.0.20231211.1602-cp38-cp38-linux_x86_64.whl -deepspeed-0.10.0+corex.3.2.0.20231126.1581-cp38-cp38-linux_x86_64.whl +deepspeed-0.10.0+corex.3.2.0.20231211.1602-cp38-cp38-linux_x86_64.whl -flash_attn-2.0.1+corex.3.2.0.20231126.1581-cp38-cp38-linux_x86_64.whl +flash_attn-2.0.1+corex.3.2.0.20231211.1602-cp38-cp38-linux_x86_64.whl -torch-1.13.1+corex.3.2.0.20231126.1581-cp38-cp38-linux_x86_64.whl +torch-1.13.1+corex.3.2.0.20231211.1602-cp38-cp38-linux_x86_64.whl -torchtext-0.14.1+corex.3.2.0.20231126.1581-cp38-cp38-linux_x86_64.whl +torchtext-0.14.1+corex.3.2.0.20231211.1602-cp38-cp38-linux_x86_64.whl -torchvision-0.14.1+corex.3.2.0.20231126.1581-cp38-cp38-linux_x86_64.whl \ No newline at end of file +torchvision-0.14.1+corex.3.2.0.20231211.1602-cp38-cp38-linux_x86_64.whl \ No newline at end of file diff --git a/training/iluvatar/docker_image/deepspeed/sdk_installers/README.md b/training/iluvatar/docker_image/deepspeed/sdk_installers/README.md index 86cd46498..36e2f06f8 100644 --- a/training/iluvatar/docker_image/deepspeed/sdk_installers/README.md +++ b/training/iluvatar/docker_image/deepspeed/sdk_installers/README.md @@ -3,6 +3,6 @@ >联系邮箱: contact-us@iluvatar.com -corex-installer-linux64-3.2.0.20231126.1581_x86_64_10.2.run +corex-installer-linux64-3.2.0.20231211.1602_x86_64_10.2.run cuda_10.2.89_440.33.01_linux.run \ No newline at end of file diff --git a/training/iluvatar/llama2_7b-deepspeed/README.md b/training/iluvatar/llama2_7b-deepspeed/README.md index 1c86cd0ba..d88322743 100644 --- a/training/iluvatar/llama2_7b-deepspeed/README.md +++ b/training/iluvatar/llama2_7b-deepspeed/README.md @@ -47,4 +47,5 @@ | 配置 | fix_hp | token/p/s | loss | mem | MFU | | ------------------- | ---------------- | ------ | ------- | --------- | --------- | -| BI-V100单机8卡(1x8) | MPE=2048 LBS=4 | / | 5.1612 | 30/32 | / | \ No newline at end of file +| BI-V100单机8卡(1x8) 
| MPE=2048 LBS=10 | / | 5.59 | 31/32 | / | +| BI-V100单机8卡(1x8) | MPE=4096 LBS=5 | / | 5.67 | 31/32 | / | \ No newline at end of file diff --git a/training/iluvatar/llama2_7b-deepspeed/config/config_BI-V100x1x8.py b/training/iluvatar/llama2_7b-deepspeed/config/config_BI-V100x1x8.py index 4f001d02a..df543c61e 100644 --- a/training/iluvatar/llama2_7b-deepspeed/config/config_BI-V100x1x8.py +++ b/training/iluvatar/llama2_7b-deepspeed/config/config_BI-V100x1x8.py @@ -1,6 +1,8 @@ -seqlength = 2048 -batchsize = 4 +seqlength = 4096 +batchsize = 5 datafilename = "openwebtext_llama2_100M.npy" -theoryflops = 128000000000000.0 +theoryflops = 64000000000000.0 epochs = 1 flashattn = True +gradient_checkpointing = True +use_cache = false \ No newline at end of file diff --git a/training/iluvatar/llama2_7b-deepspeed/config/ds_config.json b/training/iluvatar/llama2_7b-deepspeed/config/ds_config.json index 1da4b1301..7d6a30d1f 100644 --- a/training/iluvatar/llama2_7b-deepspeed/config/ds_config.json +++ b/training/iluvatar/llama2_7b-deepspeed/config/ds_config.json @@ -17,12 +17,12 @@ }, "zero_optimization": { "stage": 3, - "overlap_comm": true, + "overlap_comm": false, "contiguous_gradients": true, - "stage3_max_live_parameters": 5e8, - "stage3_max_reuse_distance": 5e8, - "stage3_prefetch_bucket_size": 1e8, - "sub_group_size": 8e8, + "stage3_max_live_parameters": 5e7, + "stage3_max_reuse_distance": 5e7, + "stage3_prefetch_bucket_size": 1e7, + "sub_group_size": 8e7, "allgather_bucket_size": 2e8, "reduce_bucket_size": 2e8, "reduce_scatter": true, From 4c547de131868753d7c3d27f64855fb2d2d92454 Mon Sep 17 00:00:00 2001 From: clveryang Date: Fri, 15 Dec 2023 14:51:47 +0800 Subject: [PATCH 11/11] fix config error and add new test method about tps/tflops --- training/benchmarks/llama2_7b/deepspeed/run_pretraining.py | 3 +++ .../iluvatar/llama2_7b-deepspeed/config/config_BI-V100x1x8.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/training/benchmarks/llama2_7b/deepspeed/run_pretraining.py b/training/benchmarks/llama2_7b/deepspeed/run_pretraining.py index dbbf4e599..bae5472e0 100644 --- a/training/benchmarks/llama2_7b/deepspeed/run_pretraining.py +++ b/training/benchmarks/llama2_7b/deepspeed/run_pretraining.py @@ -140,4 +140,7 @@ def get_metric(texts): chip_tps = whole_tps / args.nproc * args.nnodes print("System tokens per second: ", whole_tps) print("Tokens/p/s: ", chip_tps) + TFLOPS = int(theoryflops/1000000000000) + print("Theory TFLOPS: ", TFLOPS) + print("Tokens/TFLOPS: ", chip_tps / TFLOPS) print("MFU: ", chip_tps * 7000000000.0 * 6 / theoryflops) diff --git a/training/iluvatar/llama2_7b-deepspeed/config/config_BI-V100x1x8.py b/training/iluvatar/llama2_7b-deepspeed/config/config_BI-V100x1x8.py index df543c61e..b8844f19e 100644 --- a/training/iluvatar/llama2_7b-deepspeed/config/config_BI-V100x1x8.py +++ b/training/iluvatar/llama2_7b-deepspeed/config/config_BI-V100x1x8.py @@ -5,4 +5,4 @@ epochs = 1 flashattn = True gradient_checkpointing = True -use_cache = false \ No newline at end of file +use_cache = False \ No newline at end of file
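Patch 11 closes the series by reporting normalized throughput alongside MFU. Below is a minimal sketch of the end-of-run metrics; the formulas (6 FLOPs per parameter per trained token, 7e9 parameters for llama2_7b, per-chip tokens/s from `whole_tps / nproc * nnodes`) come from run_pretraining.py as patched, but the sample inputs are purely hypothetical, not measured BI-V100 results.

```python
# Minimal sketch of the metrics printed at the end of run_pretraining.py
# after patch 11. Formulas follow the patch; sample numbers are hypothetical.
def report_throughput(whole_tps, nproc, nnodes, theoryflops):
    chip_tps = whole_tps / nproc * nnodes      # tokens per second per chip
    tflops = int(theoryflops / 1_000_000_000_000)
    print("System tokens per second:", whole_tps)
    print("Tokens/p/s:", chip_tps)
    print("Theory TFLOPS:", tflops)
    print("Tokens/TFLOPS:", chip_tps / tflops)
    # MFU: ~6 FLOPs per parameter per trained token, 7e9 params for llama2_7b
    print("MFU:", chip_tps * 7_000_000_000.0 * 6 / theoryflops)

# Hypothetical single-node run at theoryflops = 64 TFLOPS per card,
# matching config_BI-V100x1x8.py after patch 11:
report_throughput(whole_tps=4000, nproc=8, nnodes=1, theoryflops=64e12)
# -> Tokens/p/s: 500.0, Tokens/TFLOPS: 7.8125, MFU: 0.328125
```

Note also that patch 10 cuts the ZeRO-3 working-set knobs by 10x (stage3_max_live_parameters 5e8 to 5e7, stage3_prefetch_bucket_size 1e8 to 1e7, sub_group_size 8e8 to 8e7) and disables overlap_comm, presumably trading prefetch aggressiveness for memory headroom once seqlength grows to 4096 (the README reports 31/32 GiB at MPE=4096, LBS=5).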