#!/bin/bash
#SBATCH -N 1 # number of nodes
#SBATCH -t 12:00:00 # wall time
#SBATCH -J image_classification # job name
#SBATCH --exclusive # exclusive node access
#SBATCH --mem=0 # all mem avail
#SBATCH --mail-type=FAIL # only send email on failure
#SBATCH --threads-per-core=2 # HT is on
#SBATCH --cores-per-socket=20 # 20 cores on each socket
#SBATCH --overcommit
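# Example submission (illustrative; the values shown are just this script's
# own defaults, override them as needed for your site):
#   DGXSYSTEM=DGX1 DATADIR=/raid/data/imagenet/train-val-recordio-passthrough \
#     LOGDIR=/raid/results/image_classification sbatch run.sub
# Outside of SLURM, the script detects the missing job environment (see the
# INSLURM check below) and can be run directly on a single node: ./run.sub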
DATESTAMP=${DATESTAMP:-$(date +'%y%m%d%H%M%S%N')}
## Data, container and volumes
BENCHMARK=${BENCHMARK:-"image_classification"}
BENCHMARK_NAME="RESNET"
CONT=${CONT:-"mlperf-nvidia:$BENCHMARK"}
DATADIR=${DATADIR:-"/raid/data/imagenet/train-val-recordio-passthrough"}
LOGDIR=${LOGDIR:-"/raid/results/$BENCHMARK"}
SEED=${SEED:-$(od -A n -t d -N 3 /dev/urandom)} # Allow passing SEED in (useful when NEXP=1); for NEXP>1 a fresh seed is drawn for each subsequent run
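# (od here reads 3 bytes from /dev/urandom and prints them as a decimal
# integer, i.e. a pseudo-random seed drawn from 24 bits of entropy.)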
#CODEDIR=${CODEDIR:-$(readlink -f $(dirname ${BASH_SOURCE[0]}))} # SLURM normally copies these files to someplace in /var by default, so best to set this explicitly
#SRCDIR=${SRCDIR:-${CODEDIR}/${BENCHMARK}/mxnet}
DOCKER_MEMORY=${DOCKER_MEMORY:-""}
echo "---------------------------------------"
echo "MLBOX - DOCKER_MEMORY - '${DOCKER_MEMORY}'"
echo "---------------------------------------"
# FIXME: having the container running as non-root isn't yet tested
CONTAINER_UID=0
CONTAINER_GID=0
# MLPerf-specific stuff
NEXP=${NEXP:-5} # Default number of times to run the benchmark
CLEAR_CACHES=1
SYSLOGGING=1
LOG_CLEAR_CACHES="'from mlperf_compliance import constants, mlperf_log as log; import os; log.mlperf_print(key=constants.CACHE_CLEAR, value=True, benchmark=constants.${BENCHMARK_NAME}, stack_offset=0, root_dir=\"`pwd`\", extra_print=False)'"
SYS_LOG_GET="'import mlperf_compliance; from mlperf_log_utils import mlperf_submission_log; mlperf_submission_log(mlperf_compliance.constants.$BENCHMARK_NAME)'"
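# Both one-liners above are run later with "python -c" inside the container;
# they use the mlperf_compliance package to emit required MLPerf log records
# (the cache-clear event and the system description, respectively).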
################################################################################
## DO NOT CHANGE ANYTHING BELOW -- DL params are in run_and_time.sh and config_<system>.sh files
################################################################################
## Load system-specific parameters for benchmark
DGXSYSTEM=${DGXSYSTEM:-"DGX1"}
if [[ ! -f "config_${DGXSYSTEM}.sh" ]]; then
    echo "ERROR: Unknown system config ${DGXSYSTEM}"
    exit 1
fi
source config_${DGXSYSTEM}.sh
export DGXSYSTEM
IBDEVICES=${IBDEVICES:-$DGXIBDEVICES}
## Check whether we are running in a slurm env
INSLURM=1
if [[ -z "$SLURM_JOB_ID" ]]; then
    INSLURM=0
    export SLURM_JOB_ID="${DATESTAMP}"
    export SLURM_JOB_NUM_NODES=1
    export OMPI_COMM_WORLD_LOCAL_RANK=0
fi
if [[ -z "$SLURM_NTASKS_PER_NODE" ]]; then
    export SLURM_NTASKS_PER_NODE="${DGXNGPU}"
fi
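# From this point on, SLURM-style variables can be assumed to exist whether or
# not the script was actually launched through sbatch.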
# Create results directory
LOGFILE_BASE="${LOGDIR}/${DATESTAMP}"
mkdir -m 777 -p $(dirname "${LOGFILE_BASE}")
## Docker volume setup
export VOLS="-v $DATADIR:/data -v $LOGDIR:/results"
#for i in $(ls run.sub run_and_time.sh config_*.sh ompi_bind_*.sh *.py); do
# VOLS+=" -v $SRCDIR/$i:/workspace/$BENCHMARK/$i"
#done
export MLPERF_HOST_OS="$(head -1 /etc/issue | cut -f1-3 -d" ") / $(grep -E "DGX_PRETTY_NAME|DGX_OTA_VERSION" /etc/dgx-release | cut -f2 -d= | cut -f2 -d '"' | paste -sd' ')"
# Preset env vars to pass through 'docker run', 'docker exec', and 'mpirun'
VARS=(
    -e "CONT=${CONT}"
    -e "SEED"
    -e "MLPERF_HOST_OS"
    -e "DGXSYSTEM=${DGXSYSTEM}"
    -e "NUMEPOCHS=${NUMEPOCHS}"
    -e "SLURM_JOB_NUM_NODES=${SLURM_JOB_NUM_NODES}"
    -e "SLURM_NTASKS_PER_NODE=${SLURM_NTASKS_PER_NODE}"
    -e "OMPI_MCA_mca_base_param_files=/dev/shm/mpi/${SLURM_JOB_ID}/mca_params.conf"
)
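# These are docker-style environment flags; a bare "-e NAME" entry forwards
# the variable's current value from this shell into the container.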
## Set container name to be used on all nodes
export CONTNAME="mpi_${SLURM_JOB_ID}"
## Check whether we're in a multi-node configuration
if [[ $SLURM_JOB_NUM_NODES -eq 1 ]]; then
    hosts=( $(hostname) )
    IBDEVICES="" # don't need IB if not multi-node
    SRUN=""
    SRUNl=""
else
    hosts=( $(scontrol show hostname | tr "\n" " ") )
    SRUN="srun --mem=0 -n $SLURM_JOB_NUM_NODES --ntasks-per-node=1"
    SRUNl="$SRUN -l"
fi
if [[ ! $KVSTORE =~ "horovod" ]]; then
    MPIRUN=( )
else
    [[ "$DEBUG" != "0" ]] && TAGOUTPUT="--tag-output"
    MPIRUN=( mpirun --allow-run-as-root --bind-to none $TAGOUTPUT ${VARS[@]//-e/-x} --launch-agent "docker exec $CONTNAME orted" )
fi
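# ${VARS[@]//-e/-x} above rewrites every docker-style "-e" flag into mpirun's
# "-x" form, and --launch-agent makes mpirun start its orted daemons inside
# the already-running containers rather than on the bare hosts.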
## Pull image on all nodes by default
[[ "${PULL}" -ne "0" ]] && $SRUNl docker pull $CONT
##########################################
## Configure multinode
##########################################
if [[ $SLURM_JOB_NUM_NODES -gt 1 || $KVSTORE =~ "horovod" ]]; then
    # systemd needs to be told not to wipe /dev/shm when the user context ends.
    #$SRUNl grep -iR RemoveIPC /etc/systemd
    # 1. Prepare run dir on all nodes
    mkdir -p /dev/shm/mpi/${SLURM_JOB_ID}; chmod 700 /dev/shm/mpi/${SLURM_JOB_ID}
    # 2. Create mpi hostlist
    rm -f /dev/shm/mpi/${SLURM_JOB_ID}/mpi_hosts
    for hostn in ${hosts[@]}; do
        echo "$hostn slots=${SLURM_NTASKS_PER_NODE}" >> /dev/shm/mpi/${SLURM_JOB_ID}/mpi_hosts
    done
    # 3. Create mpi config file
    cat > /dev/shm/mpi/${SLURM_JOB_ID}/mca_params.conf <<EOF
plm_rsh_agent = /usr/bin/ssh
plm_rsh_args = -oStrictHostKeyChecking=no -oUserKnownHostsFile=/dev/null -oLogLevel=ERROR -oBatchMode=yes -l ${USER}
orte_default_hostfile = /dev/shm/mpi/${SLURM_JOB_ID}/mpi_hosts
btl_openib_warn_default_gid_prefix = 0
mpi_warn_on_fork = 0
EOF
    # 4. Distribute config file and host list to all nodes
    tar zcPf /dev/shm/mpi/${SLURM_JOB_ID}.tgz /dev/shm/mpi/${SLURM_JOB_ID}/
    cat /dev/shm/mpi/${SLURM_JOB_ID}.tgz | $SRUNl tar zxPf -
    rm -f /dev/shm/mpi/${SLURM_JOB_ID}.tgz
    # 5. Grab SSH keys and configs on each node
    $SRUNl cp -pr ~/.ssh /dev/shm/mpi/${SLURM_JOB_ID}/
fi
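# The tar pipe in step 4 fans the per-job /dev/shm directory out to every
# node: the archive is built once locally, streamed over srun's stdin, and
# unpacked by tar on each node (-P preserves the absolute paths).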
##########################################
## Start up container on each node
##########################################
DOCKERRUN_ARGS=(
    --rm
    --net=host
    --uts=host
    --ipc=host
    --ulimit stack=67108864
    --ulimit memlock=-1
    --security-opt seccomp=unconfined
    -u $CONTAINER_UID:$CONTAINER_GID
    $IBDEVICES
)
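# Host networking, UTS and IPC namespaces plus unlimited memlock are the usual
# requirements for MPI/NCCL and InfiniBand inside containers; $IBDEVICES adds
# whatever IB device mappings the system config defined.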
echo
echo "Creating containers:"
$SRUNl bash -c "echo -n 'Launching on node ' && hostname"
# DOCKER_MEMORY is intentionally unquoted so an empty value expands to nothing
# instead of being passed to docker as an empty argument
$SRUNl docker run --init -d "${DOCKERRUN_ARGS[@]}" ${DOCKER_MEMORY} --name $CONTNAME $VOLS "${VARS[@]}" $CONT bash -c 'rm -f /etc/shinit && sleep infinity' ; rv=$?
[[ $rv -ne 0 ]] && echo "ERR: Container launch failed." && exit $rv
sleep 30
docker logs $CONTNAME
if [[ $SLURM_JOB_NUM_NODES -gt 1 ]] || [[ $DGXSYSTEM =~ "_multi_" ]]; then
    # For multinode, each container needs access to the host system user's SSH keys & config
    # FIXME: this probably doesn't work unless the UID/GID matches (untested) and/or the container is running as root (default)
    $SRUNl docker exec "${VARS[@]}" $CONTNAME bash -c "cp -pr /dev/shm/mpi/${SLURM_JOB_ID}/.ssh ~/ ; chown -R $CONTAINER_UID:$CONTAINER_GID ~/.ssh/ ; chmod 700 ~/.ssh/"
fi
##########################################
## Launch app into running container(s)
##########################################
export SEED
rvs=0
for nrun in $(seq 1 $NEXP); do
    (
        echo "Beginning trial $nrun of $NEXP"
        echo "Run vars: id $SLURM_JOB_ID"
        if [[ $SYSLOGGING -eq 1 ]]; then
            # Emit the MLPerf system-description log from inside the container
            VARS_STR="${VARS[@]}"
            bash -c "echo -n 'Gathering sys log on ' && hostname && docker exec $VARS_STR $CONTNAME python -c ${SYS_LOG_GET}"
            if [[ $? -ne 0 ]]; then
                echo "ERR: Sys log gathering failed."
                exit 1
            fi
        fi
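        # vm.drop_caches=3 below drops the page cache plus dentries and inodes;
        # the preceding sync flushes dirty pages so no pending writes are lost.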
        if [[ $CLEAR_CACHES -eq 1 ]]; then
            # Clear RAM page cache, dentries and inodes
            if [[ $INSLURM -eq 1 ]]; then
                VARS_STR="${VARS[@]}"
                $SRUN bash -c "echo -n 'Clearing cache on ' && hostname && sync && sudo /sbin/sysctl vm.drop_caches=3 && docker exec $VARS_STR $CONTNAME python -c ${LOG_CLEAR_CACHES}" \
                    || echo "ERR: Cache clearing failed."
            else
                echo "Clearing cache on $(hostname)"
                docker run --init --rm --privileged --entrypoint bash "${VARS[@]}" $CONT -c "sync && echo 3 > /proc/sys/vm/drop_caches && python -c ${LOG_CLEAR_CACHES} || exit 1" \
                    || echo "ERR: Cache clearing failed."
            fi
        fi
        # Launch the app
        echo
        echo "Launching user script on master node:"
        docker exec "${VARS[@]}" $CONTNAME "${MPIRUN[@]}" ./run_and_time.sh ; exit $?
    ) |& tee ${LOGFILE_BASE}_$nrun.log
    rv=${PIPESTATUS[0]}
    [[ $rv -ne 0 ]] && echo "ERR: User script failed (nrun=$nrun; rv=$rv)." && let rvs+=$rv
    ## SEED update: draw a fresh seed for the next run
    export SEED=$(od -A n -t d -N 3 /dev/urandom)
done
##########################################
## Clean up
##########################################
# Shut down containers and clean up /dev/shm
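# timeout gives "docker rm -f" 10 seconds to exit cleanly, then escalates to
# SIGKILL 20 seconds later, so a wedged Docker daemon cannot hang the cleanup.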
$SRUNl timeout -k 20 10 docker rm -f $CONTNAME >/dev/null
$SRUNl rm -rf /dev/shm/mpi/${SLURM_JOB_ID}/
# Finish
[[ $rvs -ne 0 ]] && echo "ERR: User script failed." && exit $rvs
exit 0