[Iluvatar] llama2 7b case #343

Merged 13 commits on Dec 18, 2023
15 changes: 10 additions & 5 deletions training/benchmarks/llama2_7b/deepspeed/run_pretraining.py
@@ -63,20 +63,21 @@ def train(model_engine, dataloader):
        loss = model_engine(input_ids=input_ids, labels=labels).loss
        model_engine.backward(loss)
        model_engine.step()

        ave_loss += loss
        if step % 10 == 0 and args.local_rank == 0:
            print('Step {}/{}, Loss: {}'.format(step, len(dataloader),
                                                ave_loss / 10))
            ave_loss = 0.0


def get_deepspeed_engine(args, model_config_dir, flashattn):
def get_deepspeed_engine(args, model_config_dir, flashattn, gradient_checkpointing):
    with deepspeed.zero.Init(config_dict_or_path=args.deepspeed_config,
                             enabled=True,
                             mem_efficient_linear=False,
                             mpu=None):
        model = get_llama_model(model_config_dir, flashattn)
        if gradient_checkpointing:
            model.gradient_checkpointing_enable()

    model_engine, _, _, _ = deepspeed.initialize(
        args=args, model=model, model_parameters=model.parameters())
@@ -94,7 +95,6 @@ def get_metric(texts):
arg_parser = get_argument_parser()
arg_parser = deepspeed.add_config_arguments(arg_parser)
args = arg_parser.parse_args()

flagperf_config = {}
sys.path.append(os.path.dirname(args.flagperf_config))
config_file = os.path.basename(args.flagperf_config).split('.')[0]
@@ -107,10 +107,12 @@ def get_metric(texts):
theoryflops = getattr(module, 'theoryflops')
epochs = getattr(module, 'epochs')
flashattn = getattr(module, 'flashattn')

gradient_checkpointing = getattr(module, 'gradient_checkpointing')

deepspeed.init_distributed()
model_engine = get_deepspeed_engine(args, os.path.join("llama2_7b_hf"),
                                    flashattn)
                                    flashattn, gradient_checkpointing)

dataset = get_llama_dataset(args, seqlength, datafilename)

logger = logging.getLogger("DeepSpeed")
@@ -138,4 +140,7 @@ def get_metric(texts):
chip_tps = whole_tps / args.nproc * args.nnodes
print("System tokens per second: ", whole_tps)
print("Tokens/p/s: ", chip_tps)
TFLOPS = int(theoryflops/1000000000000)
print("Theory TFLOPS: ", TFLOPS)
print("Tokens/TFLOPS: ", chip_tps / TFLOPS)
print("MFU: ", chip_tps * 7000000000.0 * 6 / theoryflops)
65 changes: 65 additions & 0 deletions training/iluvatar/docker_image/deepspeed/Dockerfile
@@ -0,0 +1,65 @@
FROM ubuntu:20.04

# Copy your /etc/apt/sources.list into this directory, or substitute another available one if the default mirror source causes problems
ADD sources.list /etc/apt/

RUN /bin/bash -c "source /root/.bashrc"

ENV DEBIAN_FRONTEND=noninteractive
ENV PATH /root/miniconda/bin:$PATH

RUN sed -i 's#http://archive.ubuntu.com/#http://mirrors.tuna.tsinghua.edu.cn/#' /etc/apt/sources.list
RUN apt-get update -y
RUN apt-get install -y --fix-missing \
    apt-utils \
    sudo \
    openssh-server \
    vim \
    git \
    curl \
    wget \
    tree \
    perl \
    kmod \
    make \
    pciutils \
    build-essential \
    python3.8-dev \
    python3-pip \
    libjpeg-dev \
    zlib1g-dev \
    unzip \
    cmake \
    bzip2 \
    cabextract \
    iputils-ping \
    pbzip2 \
    pv \
    numactl \
    ninja-build \
    gcc-7 \
    g++-7 \
    libncursesw5


# Configure anaconda
RUN wget https://mirrors.tuna.tsinghua.edu.cn/anaconda/miniconda/Miniconda3-py38_4.10.3-Linux-x86_64.sh && \
    bash ./Miniconda3-py38_4.10.3-Linux-x86_64.sh -b -p /root/miniconda && \
    /root/miniconda/bin/conda clean -tipsy && \
    ln -s /root/miniconda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \
    echo ". /root/miniconda/etc/profile.d/conda.sh" >> ~/.bashrc && \
    echo "conda activate base" >> ~/.bashrc && \
    conda config --set always_yes yes --set changeps1 no && \
    echo 'LD_LIBRARY_PATH="/usr/local/corex/lib:${LD_LIBRARY_PATH}"' >> ~/.bashrc && \
    echo 'PATH="/usr/local/corex/bin:${PATH}"' >> ~/.bashrc


RUN /bin/bash -c "apt-get install -y linux-headers-`uname -r`"

RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-7 10 --slave /usr/bin/g++ g++ /usr/bin/g++-7

RUN /bin/bash -c "pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple"

ENV LD_LIBRARY_PATH="/usr/local/corex/lib:${LD_LIBRARY_PATH}"
ENV PATH="/usr/local/corex/bin:${PATH}"
ENV NVCC_ARGUMENTS="-U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -ftemplate-depth=1024"
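As a usage note, a minimal sketch of building this image by hand follows. The image tag is hypothetical, and in practice FlagPerf's run scripts build and name vendor images themselves, so treat this as illustrative only.

```bash
# Hypothetical manual build; requires the sources.list file referenced by the ADD instruction above.
cd training/iluvatar/docker_image/deepspeed
docker build -t flagperf-iluvatar-deepspeed:dev .
```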
28 changes: 28 additions & 0 deletions training/iluvatar/docker_image/deepspeed/deepspeed_install.sh
@@ -0,0 +1,28 @@
#!/bin/bash

SDK_DIR="/workspace/docker_image/sdk_installers"
PKG_DIR="/workspace/docker_image/packages"

# Install the CUDA 10.2 toolkit installer shipped alongside the SDK
search_cuda_results=`find ${SDK_DIR} -name "*cuda*10.2*.run"`
for installer in $search_cuda_results; do
    echo "Install ${installer}"
    sh "${installer}" -- --silent --toolkit
done

# Install the Iluvatar CoreX SDK
search_sdk_results=`find ${SDK_DIR} -name "corex*.run"`
for installer in $search_sdk_results; do
    echo "Install ${installer}"
    sh "${installer}" -- --silent --toolkit
done


# Install torch first: the remaining wheels (apex, deepspeed, flash_attn, ...) depend on it
torch_packages_results=`find ${PKG_DIR} -name "torch-*.whl"`
if [ -n "$torch_packages_results" ]; then
    pip3 install "$torch_packages_results"
fi

# Install all remaining Python wheels provided by Iluvatar
search_packages_results=`find ${PKG_DIR} -name "*.whl"`
for pkg in $search_packages_results; do
    echo "Install ${pkg}"
    pip3 install "${pkg}"
done
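A hedged usage sketch for the script above: the SDK_DIR/PKG_DIR paths are the ones hard-coded in the script, and the file names come from the two README files below. How FlagPerf mounts these directories into the container is not shown in this diff, so the copy step is only illustrative.

```bash
# Illustrative only: stage the Iluvatar-provided artifacts where the script expects them.
mkdir -p /workspace/docker_image/sdk_installers /workspace/docker_image/packages
cp cuda_10.2.89_440.33.01_linux.run corex-installer-linux64-*.run \
   /workspace/docker_image/sdk_installers/
cp torch-*.whl apex-*.whl deepspeed-*.whl flash_attn-*.whl torchtext-*.whl torchvision-*.whl \
   /workspace/docker_image/packages/
bash deepspeed_install.sh
```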
16 changes: 16 additions & 0 deletions training/iluvatar/docker_image/deepspeed/packages/README.md
@@ -0,0 +1,16 @@
# The following packages must be obtained by contacting Iluvatar CoreX
# The Iluvatar DeepSpeed backend is still under development.

>Contact email: [email protected]

apex-0.1+corex.3.2.0.20231211.1602-cp38-cp38-linux_x86_64.whl

deepspeed-0.10.0+corex.3.2.0.20231211.1602-cp38-cp38-linux_x86_64.whl

flash_attn-2.0.1+corex.3.2.0.20231211.1602-cp38-cp38-linux_x86_64.whl

torch-1.13.1+corex.3.2.0.20231211.1602-cp38-cp38-linux_x86_64.whl

torchtext-0.14.1+corex.3.2.0.20231211.1602-cp38-cp38-linux_x86_64.whl

torchvision-0.14.1+corex.3.2.0.20231211.1602-cp38-cp38-linux_x86_64.whl
@@ -0,0 +1,8 @@
# The following installers must be obtained by contacting Iluvatar CoreX
# The Iluvatar DeepSpeed backend is still under development.

>Contact email: [email protected]

corex-installer-linux64-3.2.0.20231211.1602_x86_64_10.2.run

cuda_10.2.89_440.33.01_linux.run
51 changes: 51 additions & 0 deletions training/iluvatar/llama2_7b-deepspeed/README.md
@@ -0,0 +1,51 @@
### Iluvatar GPU Configuration and Run Information Reference
#### Environment Configuration
- ##### Hardware
  - Machine and accelerator model: Iluvatar BI-V100 32GB


- ##### Software
  - OS version: Ubuntu 20.04
  - Docker version: 20.10.21
  - Training framework version: deepspeed 0.10.0
  - Dependency versions: sentencepiece

- ##### Parallelism Strategy

  - Parallel technique: sharded data parallel
  - Implemented by: deepspeed ZeRO-DP
  - Implementation details: ZeRO-DP O3

- ##### Optimization Strategies

  - flash attention 2
  - gradient checkpointing

### Run Information

* Input batch sizes
1. local_batchsize (micro_batchsize), abbreviated LBS: the batch size actually fed to the model per device, as set in config_BI-V100x1x8.py; 3 by default in this case
2. seqlength (max_position_embedding), abbreviated MPE: the sequence length actually fed to the model, as set in config_BI-V100x1x8.py; 1024 by default in this case
3. gradient_accumulate_steps, abbreviated GAS: the number of gradient accumulation steps, as set in ds_config.json; 1 by default in this case
4. global_batchsize, abbreviated GBS, always equals local_batchsize\*gradient_accumulate_steps\*data_parallel_size. Only data parallelism is used in this case, so data_parallel_size = world_size (see the worked example below).
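A worked example of the GBS formula in item 4, using the MPE=4096, LBS=5 row from the performance table below and GAS=1 from ds_config.json; this is an illustrative sketch, not part of the benchmark code.

```python
# GBS = LBS * GAS * data_parallel_size, with data_parallel_size = world_size here.
LBS = 5                      # local_batchsize for the MPE=4096 run
GAS = 1                      # gradient_accumulation_steps in ds_config.json
world_size = 1 * 8           # 1 node x 8 BI-V100 GPUs
GBS = LBS * GAS * world_size
print(GBS)                   # 40 sequences per step, i.e. 40 * 4096 tokens
```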

* Common metrics

| Metric | Value | Notes |
| ------------ | -------------------------- | ---------------------------------- |
| Task category | natural language understanding | |
| Model | llama2_7b | |
| Dataset | openwebtext | unless otherwise noted, training uses the first 100M tokens |
| Precision | amp | |
| Hyperparameter changes | fix_hp, see "Performance metrics" | special hyperparameters required to run, e.g. reducing seqlength to avoid OOM |
| Hardware | Iluvatar BI-V100 | |
| Device memory usage | mem, see "Performance metrics" | commonly called "VRAM", in GiB |
| Compute utilization | MFU, see "Performance metrics" | as defined in the PaLM paper |
| **Throughput** | **token/p/s, see "Performance metrics"** | average number of tokens processed per GPU per second |

* Performance metrics

| Config | fix_hp | token/p/s | loss | mem | MFU |
| ------------------- | ---------------- | ------ | ------- | --------- | --------- |
| BI-V100 single node, 8 GPUs (1x8) | MPE=2048 LBS=10 | / | 5.59 | 31/32 | / |
| BI-V100 single node, 8 GPUs (1x8) | MPE=4096 LBS=5 | / | 5.67 | 31/32 | / |
@@ -0,0 +1,8 @@
seqlength = 4096
batchsize = 5
datafilename = "openwebtext_llama2_100M.npy"
theoryflops = 64000000000000.0
epochs = 1
flashattn = True
gradient_checkpointing = True
use_cache = False
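A minimal sketch of how the `flashattn`, `gradient_checkpointing`, and `use_cache` flags above are typically consumed for a HuggingFace-style llama model. The gradient-checkpointing part mirrors the run_pretraining.py change earlier in this diff; how `use_cache` is applied is not shown in the diff, so that line is an assumption (KV caching is normally disabled during checkpointed training).

```python
# Hedged sketch, not the PR's exact wiring; get_llama_model is the benchmark's own helper.
flashattn = True                 # from the config above
gradient_checkpointing = True    # from the config above
use_cache = False                # from the config above

model = get_llama_model("llama2_7b_hf", flashattn)   # same call as in run_pretraining.py
if gradient_checkpointing:
    model.gradient_checkpointing_enable()   # recompute activations in the backward pass
if not use_cache:
    model.config.use_cache = False          # assumed: disable the KV cache, which conflicts with checkpointing
```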
@@ -1,40 +1,50 @@
{
    "gradient_accumulation_steps": 1,
    "train_micro_batch_size_per_gpu": 1,
    "prescale_gradients": false,
    "zero_allow_untested_optimizer": true,
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": 1e-5,
            "weight_decay": 0.1,
            "betas": [
                0.9,
                0.95
            ],
            "eps": 1e-5
        }
    },
    "zero_optimization": {
        "stage": 3,
        "stage3_max_live_parameters": 1e9,
        "stage3_max_reuse_distance": 1e9,
        "stage3_prefetch_bucket_size": 1e7,
        "sub_group_size": 1e9,
        "contiguous_gradients": true,
        "allgather_bucket_size": 1e8,
        "reduce_bucket_size": 1e7,
        "overlap_comm": true,
        "reduce_scatter": true
    },
    "steps_per_print": 50,
    "gradient_clipping": 1.0,
    "wall_clock_breakdown": false,
    "bf16": {
        "enabled": true
    },
    "activation_checkpointing": {
        "partition_activations": true,
        "contiguous_memory_optimization": false
    }
}
{
    "gradient_accumulation_steps": 1,
    "train_micro_batch_size_per_gpu": 1,
    "prescale_gradients": false,
    "zero_allow_untested_optimizer": true,
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": 1e-5,
            "weight_decay": 0.1,
            "betas": [
                0.9,
                0.95
            ],
            "eps": 1e-5
        }
    },
    "zero_optimization": {
        "stage": 3,
        "overlap_comm": false,
        "contiguous_gradients": true,
        "stage3_max_live_parameters": 5e7,
        "stage3_max_reuse_distance": 5e7,
        "stage3_prefetch_bucket_size": 1e7,
        "sub_group_size": 8e7,
        "allgather_bucket_size": 2e8,
        "reduce_bucket_size": 2e8,
        "reduce_scatter": true,
        "stage3_gather_16bit_weights_on_model_save": false
    },
    "steps_per_print": 10,
    "gradient_clipping": 1.0,
    "wall_clock_breakdown": false,
    "bf16": {
        "enabled": true
    },
    "fp16": {
        "enabled": false,
        "auto_cast": true,
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 1,
        "min_loss_scale": 1
    },
    "activation_checkpointing": {
        "partition_activations": true,
        "contiguous_memory_optimization": false
    }
}
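For context, a hedged sketch of how a DeepSpeed config like this is normally passed on the command line. The `--deepspeed`/`--deepspeed_config` flags come from `deepspeed.add_config_arguments` used in run_pretraining.py; the launcher invocation and the `--flagperf_config` flag name are assumptions based on the argument names in that script, and other benchmark-specific flags are omitted.

```bash
# Illustrative only; FlagPerf drives this through its own run scripts.
deepspeed --num_gpus=8 run_pretraining.py \
    --deepspeed --deepspeed_config ds_config.json \
    --flagperf_config config/config_BI-V100x1x8.py
```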
2 changes: 2 additions & 0 deletions training/iluvatar/llama2_7b-deepspeed/config/requirements.txt
@@ -0,0 +1,2 @@
sentencepiece
transformers==4.34.1
6 changes: 4 additions & 2 deletions training/iluvatar/tacotron2-pytorch/README.md
@@ -27,7 +27,7 @@
| Dataset | LJSpeech | |
| Precision | precision, see "Performance metrics" | one of fp32/amp/fp16/tf32 |
| Hyperparameter changes | fix_hp, see "Performance metrics" | special hyperparameters needed to saturate the hardware when measuring throughput |
| Hardware | nvidia A100 | |
| Hardware | Iluvatar BI-V100 | |
| Device memory usage | mem, see "Performance metrics" | commonly called "VRAM", in GiB |
| End-to-end time | e2e_time, see "Performance metrics" | total time plus Perf initialization time, etc. |
| Total throughput | p_whole, see "Performance metrics" | actual number of training samples divided by total time (performance_whole) |
@@ -40,5 +40,7 @@

| Config | precision | fix_hp | e2e_time | p_whole | p_train | p_core | val_loss | mem |
|--------------------| ---------| ----------------| ---------| ------- | ------- | ------ | -------- | --------- |
| BI100 single node, 8 GPUs (1x8) | tf32 | bs=64, lr=0.001 | 41220 | 33082 | 33289 | 33511 | 0.4833 | 18.4/32.0 |
| BI-V100 single node, 8 GPUs (1x8) | tf32 | bs=96, lr=0.001 | / | / | / | / | 0.4848 | 28/32.0 |
| BI-V100 two nodes, 8 GPUs each (2x8) | tf32 | bs=96, lr=0.001 | / | / | / | / | / | 26.5/32.0 |
| BI-V100 single node, 1 GPU (1x1) | tf32 | bs=96, lr=0.001 | / | / | / | / | / | 25.3/32.0 |

@@ -0,0 +1,9 @@
from config_common import *

train_batch_size = 96
eval_batch_size = train_batch_size

warmup = 0.2
learning_rate = 1e-3

seed = 23333
@@ -1,6 +1,6 @@
from config_common import *

train_batch_size = 64
train_batch_size = 96
eval_batch_size = train_batch_size

warmup = 0.2
@@ -0,0 +1,9 @@
from config_common import *

train_batch_size = 96
eval_batch_size = train_batch_size

warmup = 0.2
learning_rate = 1e-3

seed = 23333