-
Notifications
You must be signed in to change notification settings - Fork 150
/
Copy pathDockerfile.ubuntu
124 lines (109 loc) · 3.38 KB
/
Dockerfile.ubuntu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
## Base docker image.
## These two ARGs are declared before FROM and are consumed by the FROM line;
## override them with --build-arg to build on a different ROCm image.
## NOTE(review): the default tag is "latest", so two builds of this file may
## start from different base images — consider passing a fixed ROCM_IMAGE_TAG.
ARG ROCM_IMAGE_NAME=rocm/dev-ubuntu-22.04
ARG ROCM_IMAGE_TAG=latest
FROM "${ROCM_IMAGE_NAME}:${ROCM_IMAGE_TAG}"
## RCCL repository and branch that get cloned and built below
ARG RCCL_REPO=https://github.com/ROCm/rccl
ARG RCCL_BRANCH=develop
## rccl-tests repository and branch that get cloned and built below
ARG RCCL_TESTS_REPO=https://github.com/ROCm/rccl-tests
ARG RCCL_TESTS_BRANCH=develop
## AMD GPU target(s) passed to both the RCCL and the rccl-tests build
ARG GPU_TARGETS=gfx942
## Scratch space: the sources cloned and built below live under this directory.
## WORKDIR creates the directory when it does not exist, so no separate
## `RUN mkdir -p` layer is needed. The path stays exported as $WORKDIR because
## RCCL_INSTALL_PREFIX is derived from it.
ENV WORKDIR=/workspace
WORKDIR ${WORKDIR}
## Install OS-level build and runtime dependencies.
## The list is kept in alphabetical order, one package per line, so duplicates
## are easy to spot and diffs stay small. update + install + cleanup happen in a
## single layer so the apt package lists never end up in the image.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        chrpath \
        curl \
        doxygen \
        gfortran \
        git \
        lcov \
        less \
        libbfd-dev \
        libboost-all-dev \
        liblzma-dev \
        libncursesw5-dev \
        libnuma-dev \
        libnuma1 \
        libomp-dev \
        libomp5 \
        libpthread-stubs0-dev \
        libssl-dev \
        libzstd-dev \
        lshw \
        make \
        ninja-build \
        pkg-config \
        python3-dev \
        python3-pip \
        python3-setuptools \
        python3-tk \
        python3-venv \
        python3-yaml \
        rocm-cmake \
        unzip \
        vim \
        wget \
        xz-utils \
        zip \
        zlib1g-dev \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*
## Install CMake into /usr from the upstream Kitware self-extracting installer.
## CMAKE_VERSION can be overridden with --build-arg. Before the installer is
## executed it is checked against the SHA-256 list published with the same
## release; this catches truncated or corrupted downloads (the list comes from
## the same host, so it is an integrity check, not an authenticity check).
ARG CMAKE_VERSION=3.28.0
RUN wget -nv \
        "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.sh" \
        "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-SHA-256.txt" \
    # --ignore-missing: the list covers every release asset, only the installer is present
    && sha256sum --check --ignore-missing "cmake-${CMAKE_VERSION}-SHA-256.txt" \
    && bash "./cmake-${CMAKE_VERSION}-linux-x86_64.sh" --prefix=/usr --exclude-subdir --skip-license \
    && rm "cmake-${CMAKE_VERSION}-linux-x86_64.sh" "cmake-${CMAKE_VERSION}-SHA-256.txt"
## Set ROCm path
ENV ROCM_PATH=/opt/rocm
## Install UCX, configured against the ROCm installation at ROCM_PATH.
## UCX_VERSION can be overridden with --build-arg; it is the single source for
## the download URL, the tarball name and the cleanup step. The source tree and
## the tarball are removed in the same layer that created them.
ENV UCX_INSTALL_PREFIX=/opt/ucx
ARG UCX_VERSION=1.16.0
RUN wget -nv "https://github.com/openucx/ucx/releases/download/v${UCX_VERSION}/ucx-${UCX_VERSION}.tar.gz" \
    && mkdir -p ucx \
    && tar -zxf "ucx-${UCX_VERSION}.tar.gz" -C ucx --strip-components=1 \
    && mkdir ucx/build \
    && cd ucx/build \
    && ../configure --prefix="${UCX_INSTALL_PREFIX}" --with-rocm="${ROCM_PATH}" \
    && make -j16 install \
    && cd ../.. \
    && rm -rf ucx "ucx-${UCX_VERSION}.tar.gz"
## Install OpenMPI, configured against the UCX installation at UCX_INSTALL_PREFIX.
## OMPI_VERSION can be overridden with --build-arg. The download site groups
## releases by series directory (v<major>.<minor>), which is derived from the
## version by stripping its last dot-separated component (4.1.6 -> v4.1).
ENV MPI_INSTALL_PREFIX=/opt/ompi
ARG OMPI_VERSION=4.1.6
RUN wget -nv "https://download.open-mpi.org/release/open-mpi/v${OMPI_VERSION%.*}/openmpi-${OMPI_VERSION}.tar.gz" \
    && mkdir -p ompi4 \
    && tar -zxf "openmpi-${OMPI_VERSION}.tar.gz" -C ompi4 --strip-components=1 \
    && mkdir ompi4/build \
    && cd ompi4/build \
    && ../configure \
        --prefix="${MPI_INSTALL_PREFIX}" \
        --with-ucx="${UCX_INSTALL_PREFIX}" \
        --disable-oshmem \
        --disable-mpi-fortran \
        --enable-orterun-prefix-by-default \
    && make -j16 install \
    && cd ../.. \
    && rm -rf ompi4 "openmpi-${OMPI_VERSION}.tar.gz"
## Build RCCL from source.
## Clones RCCL_BRANCH of RCCL_REPO (with submodules) into ./rccl_develop and runs
## the repository's install.sh for the requested GPU targets.
## RCCL_INSTALL_PREFIX is exported because LD_LIBRARY_PATH is built from it later.
## GPU_TARGETS is quoted so the value reaches install.sh as one argument and is
## never word-split or glob-expanded by the shell.
ENV RCCL_INSTALL_PREFIX=${WORKDIR}/rccl_develop/build/release
RUN git clone --recurse-submodules --branch "${RCCL_BRANCH}" "${RCCL_REPO}" ./rccl_develop \
    && cd ./rccl_develop \
    && ./install.sh --amdgpu_targets="${GPU_TARGETS}"
## Build rccl-tests with MPI support.
## Clones RCCL_TESTS_BRANCH of RCCL_TESTS_REPO into ./rccl-tests, configures an
## out-of-source Release build in ./rccl-tests/build with the ROCm amdclang++
## compiler and the OpenMPI installation, then builds with 16 parallel jobs.
## All expansions are quoted so paths and the GPU target list are each passed as
## a single word.
RUN git clone --branch "${RCCL_TESTS_BRANCH}" "${RCCL_TESTS_REPO}" ./rccl-tests \
    && cd ./rccl-tests \
    && CXX="${ROCM_PATH}/bin/amdclang++" MPI_HOME="${MPI_INSTALL_PREFIX}" \
        cmake -S . -B build \
            -DCMAKE_BUILD_TYPE=Release \
            -DUSE_MPI=ON \
            -DAMDGPU_TARGETS="${GPU_TARGETS}" \
    && cmake --build build --parallel 16
## Runtime environment variables.
## Put the OpenMPI and ROCm binaries ahead of the inherited PATH.
ENV PATH="${MPI_INSTALL_PREFIX}/bin:${ROCM_PATH}/bin:${PATH}"
## Library search path: RCCL build output, then OpenMPI, then ROCm.
## The ${VAR:+...} form appends the inherited LD_LIBRARY_PATH only when it is
## set and non-empty. A plain ":${LD_LIBRARY_PATH}" would leave a trailing empty
## entry when the base image defines none, and the dynamic loader interprets an
## empty entry as the current working directory.
ENV LD_LIBRARY_PATH="${RCCL_INSTALL_PREFIX}:${MPI_INSTALL_PREFIX}/lib:${ROCM_PATH}/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
## Tool defaults for UCX, OpenMPI (both *_RUN_AS_ROOT variables are set
## together) and NCCL/RCCL debug output.
ENV UCX_WARN_UNUSED_ENV_VARS=n \
    OMPI_ALLOW_RUN_AS_ROOT=1 \
    OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 \
    NCCL_DEBUG=VERSION