From 7ac42c1a5804226ef91c27514b0bb90655b96bc0 Mon Sep 17 00:00:00 2001 From: root Date: Thu, 28 Sep 2023 15:20:40 +0800 Subject: [PATCH 1/2] Add kunlunxin mask-rcnn --- .../kunlunxin/mask_rcnn-pytorch/README.md | 48 +++++++++++++++++++ .../config/config_R300x1x1.py | 5 ++ .../config/config_R300x1x8.py | 4 ++ .../config/config_R300x2x8.py | 4 ++ .../config/environment_variables.sh | 7 +++ .../mask_rcnn-pytorch/extern/.gitkeep | 0 6 files changed, 68 insertions(+) create mode 100644 training/kunlunxin/mask_rcnn-pytorch/README.md create mode 100644 training/kunlunxin/mask_rcnn-pytorch/config/config_R300x1x1.py create mode 100644 training/kunlunxin/mask_rcnn-pytorch/config/config_R300x1x8.py create mode 100644 training/kunlunxin/mask_rcnn-pytorch/config/config_R300x2x8.py create mode 100644 training/kunlunxin/mask_rcnn-pytorch/config/environment_variables.sh create mode 100644 training/kunlunxin/mask_rcnn-pytorch/extern/.gitkeep diff --git a/training/kunlunxin/mask_rcnn-pytorch/README.md b/training/kunlunxin/mask_rcnn-pytorch/README.md new file mode 100644 index 000000000..b1299ef58 --- /dev/null +++ b/training/kunlunxin/mask_rcnn-pytorch/README.md @@ -0,0 +1,48 @@ +### 模型Checkpoint下载 +[模型Checkpoint下载](../../benchmarks/mask_rcnn/README.md#模型checkpoint) +### 测试数据集下载 +[测试数据集下载](../../benchmarks/mask_rcnn/README.md#数据集下载地址) + +### 昆仑芯XPU配置与运行信息参考 +#### 环境配置 +- ##### 硬件环境 + - 机器型号: 昆仑芯AI加速器组R480-X8 + - 加速卡型号: 昆仑芯AI加速卡R300 + - 多机网络类型、带宽: InfiniBand,200Gb/s + +- ##### 软件环境 + - OS版本:Ubuntu 20.04 + - OS kernel版本: 5.4.0-26-generic + - 加速卡驱动版本:4.0.25 + - Docker镜像和版本:pytorch1.12.1-cpu-ubuntu20.04:v0.01 + - 训练框架版本:xmlir + - 训练编译器版本:xacc + - 依赖软件版本:pytorch-1.12.1+cpu + + +### 运行情况 +* 通用指标 + +| 指标名称 | 指标值 | 特殊说明 | +| -------------- | ----------------------- | ------------------------------------------- | +| 任务类别 | 图像目标检测及语义分割 | | +| 模型 | mask_rcnn | | +| 数据集 | coco2017 | | +| 数据精度 | precision,见“性能指标” | 可选fp32/amp/fp16 | +| 超参修改 | fix_hp,见“性能指标” | 跑满硬件设备评测吞吐量所需特殊超参 | +| 硬件设备简称 | R300 | | +| 硬件存储使用 | mem,见“性能指标” | 通常称为“显存”,单位为GiB | +| 端到端时间 | e2e_time,见“性能指标” | 总时间+Perf初始化等时间 | +| 总吞吐量 | p_whole,见“性能指标” | 实际训练图片数除以总时间(performance_whole) | +| 训练吞吐量 | p_train,见“性能指标” | 不包含每个epoch末尾的评估部分耗时 | +| **计算吞吐量** | **p_core,见“性能指标”** | 不包含数据IO部分的耗时(p3>p2>p1) | +| 训练结果 | map,见“性能指标” | 单位为平均目标检测正确率 | +| 额外修改项 | 无 | | + +* 性能指标 + +| 配置 | precision | fix_hp | e2e_time | p_whole | p_train | p_core | map_bbox && map_segm | mem | +| ------------------- | --------- | ------------- | -------- | ------- | ------- | ------ | -------------------- | --------- | +| R300单机单卡(1x1) | fp32 | | | | | | | | +| R300单机8卡(1x8) | fp32 | bs=8,lr=0.16 | | | | | | 11.3/32.0 | +| R300两机8卡(2x8) | fp32 | | | | | | | | diff --git a/training/kunlunxin/mask_rcnn-pytorch/config/config_R300x1x1.py b/training/kunlunxin/mask_rcnn-pytorch/config/config_R300x1x1.py new file mode 100644 index 000000000..9f3610c27 --- /dev/null +++ b/training/kunlunxin/mask_rcnn-pytorch/config/config_R300x1x1.py @@ -0,0 +1,5 @@ +vendor: str = "kunlunxin" +train_batch_size = 8 +eval_batch_size = 8 +lr = 0.16 +distributed=False \ No newline at end of file diff --git a/training/kunlunxin/mask_rcnn-pytorch/config/config_R300x1x8.py b/training/kunlunxin/mask_rcnn-pytorch/config/config_R300x1x8.py new file mode 100644 index 000000000..40e072856 --- /dev/null +++ b/training/kunlunxin/mask_rcnn-pytorch/config/config_R300x1x8.py @@ -0,0 +1,4 @@ +vendor: str = "kunlunxin" +train_batch_size = 8 +eval_batch_size = 8 +lr = 0.16 \ No newline at end of file diff --git a/training/kunlunxin/mask_rcnn-pytorch/config/config_R300x2x8.py b/training/kunlunxin/mask_rcnn-pytorch/config/config_R300x2x8.py new file mode 100644 index 000000000..b4605e753 --- /dev/null +++ b/training/kunlunxin/mask_rcnn-pytorch/config/config_R300x2x8.py @@ -0,0 +1,4 @@ +vendor: str = "kunlunxin" +train_batch_size = 16 +eval_batch_size = 16 +lr = 0.16 \ No newline at end of file diff --git a/training/kunlunxin/mask_rcnn-pytorch/config/environment_variables.sh b/training/kunlunxin/mask_rcnn-pytorch/config/environment_variables.sh new file mode 100644 index 000000000..0d501a839 --- /dev/null +++ b/training/kunlunxin/mask_rcnn-pytorch/config/environment_variables.sh @@ -0,0 +1,7 @@ +# ================================================= +# Export variables +# ================================================= + + +export XACC_ENABLE=1 +export BKCL_PCIE_RING=1 \ No newline at end of file diff --git a/training/kunlunxin/mask_rcnn-pytorch/extern/.gitkeep b/training/kunlunxin/mask_rcnn-pytorch/extern/.gitkeep new file mode 100644 index 000000000..e69de29bb From 77d0893360532658f57edd503f776d74eb218a2e Mon Sep 17 00:00:00 2001 From: root Date: Mon, 9 Oct 2023 10:13:24 +0800 Subject: [PATCH 2/2] Refine mask-rcnn --- training/kunlunxin/mask_rcnn-pytorch/README.md | 2 +- .../kunlunxin/mask_rcnn-pytorch/config/config_R300x1x1.py | 6 ++++-- .../kunlunxin/mask_rcnn-pytorch/config/config_R300x1x8.py | 3 +++ .../kunlunxin/mask_rcnn-pytorch/config/config_R300x2x8.py | 7 +++++-- .../mask_rcnn-pytorch/config/environment_variables.sh | 4 +++- .../kunlunxin/mask_rcnn-pytorch/config/requirements.txt | 4 ++++ 6 files changed, 20 insertions(+), 6 deletions(-) create mode 100644 training/kunlunxin/mask_rcnn-pytorch/config/requirements.txt diff --git a/training/kunlunxin/mask_rcnn-pytorch/README.md b/training/kunlunxin/mask_rcnn-pytorch/README.md index b1299ef58..afa45f0f2 100644 --- a/training/kunlunxin/mask_rcnn-pytorch/README.md +++ b/training/kunlunxin/mask_rcnn-pytorch/README.md @@ -44,5 +44,5 @@ | 配置 | precision | fix_hp | e2e_time | p_whole | p_train | p_core | map_bbox && map_segm | mem | | ------------------- | --------- | ------------- | -------- | ------- | ------- | ------ | -------------------- | --------- | | R300单机单卡(1x1) | fp32 | | | | | | | | -| R300单机8卡(1x8) | fp32 | bs=8,lr=0.16 | | | | | | 11.3/32.0 | +| R300单机8卡(1x8) | fp32 | bs=8,lr=0.16 | | | | | 38.4 && 34.6 | 11.3/32.0 | | R300两机8卡(2x8) | fp32 | | | | | | | | diff --git a/training/kunlunxin/mask_rcnn-pytorch/config/config_R300x1x1.py b/training/kunlunxin/mask_rcnn-pytorch/config/config_R300x1x1.py index 9f3610c27..d4a667fd4 100644 --- a/training/kunlunxin/mask_rcnn-pytorch/config/config_R300x1x1.py +++ b/training/kunlunxin/mask_rcnn-pytorch/config/config_R300x1x1.py @@ -1,5 +1,7 @@ vendor: str = "kunlunxin" + +distributed = False + train_batch_size = 8 eval_batch_size = 8 -lr = 0.16 -distributed=False \ No newline at end of file +lr = 0.16 \ No newline at end of file diff --git a/training/kunlunxin/mask_rcnn-pytorch/config/config_R300x1x8.py b/training/kunlunxin/mask_rcnn-pytorch/config/config_R300x1x8.py index 40e072856..9be93f5b8 100644 --- a/training/kunlunxin/mask_rcnn-pytorch/config/config_R300x1x8.py +++ b/training/kunlunxin/mask_rcnn-pytorch/config/config_R300x1x8.py @@ -1,4 +1,7 @@ vendor: str = "kunlunxin" + +dist_backend = "xccl" + train_batch_size = 8 eval_batch_size = 8 lr = 0.16 \ No newline at end of file diff --git a/training/kunlunxin/mask_rcnn-pytorch/config/config_R300x2x8.py b/training/kunlunxin/mask_rcnn-pytorch/config/config_R300x2x8.py index b4605e753..9be93f5b8 100644 --- a/training/kunlunxin/mask_rcnn-pytorch/config/config_R300x2x8.py +++ b/training/kunlunxin/mask_rcnn-pytorch/config/config_R300x2x8.py @@ -1,4 +1,7 @@ vendor: str = "kunlunxin" -train_batch_size = 16 -eval_batch_size = 16 + +dist_backend = "xccl" + +train_batch_size = 8 +eval_batch_size = 8 lr = 0.16 \ No newline at end of file diff --git a/training/kunlunxin/mask_rcnn-pytorch/config/environment_variables.sh b/training/kunlunxin/mask_rcnn-pytorch/config/environment_variables.sh index 0d501a839..79911276a 100644 --- a/training/kunlunxin/mask_rcnn-pytorch/config/environment_variables.sh +++ b/training/kunlunxin/mask_rcnn-pytorch/config/environment_variables.sh @@ -4,4 +4,6 @@ export XACC_ENABLE=1 -export BKCL_PCIE_RING=1 \ No newline at end of file +export BKCL_PCIE_RING=1 +export XMLIR_D_XPU_L3_SIZE=66060288 +export XDNN_CONV_GEMM_DTYPE="int16" \ No newline at end of file diff --git a/training/kunlunxin/mask_rcnn-pytorch/config/requirements.txt b/training/kunlunxin/mask_rcnn-pytorch/config/requirements.txt new file mode 100644 index 000000000..84b402e9e --- /dev/null +++ b/training/kunlunxin/mask_rcnn-pytorch/config/requirements.txt @@ -0,0 +1,4 @@ +https://download.pytorch.org/whl/cpu/torchvision-0.13.1%2Bcpu-cp38-cp38-linux_x86_64.whl +pycocotools==2.0.5 +psutil==5.9.5 +numpy==1.23.5 \ No newline at end of file