Switch to docker volumes in model generation #7910

Merged (26 commits) on Jan 2, 2025

Changes from 1 commit
Progress fixation. Update script scenario.

mc-nv committed Dec 30, 2024
commit 0117e9add512451e0cd24fc194778e3bd773f6c2
qa/common/gen_qa_custom_ops_volume (22 changes: 11 additions & 11 deletions)
@@ -99,25 +99,25 @@ TF_LFLAGS=\$(python -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get
 # No CUDA
 cd /tmp
 cp /opt/tensorflow/tensorflow-source/tensorflow/examples/adding_an_op/zero_out_op_kernel_1.cc .
-g++ -std=${STD_FLAG} -O2 -shared -fPIC zero_out_op_kernel_1.cc -o $DESTDIR/libzeroout.so \${TF_CFLAGS[@]} \${TF_LFLAGS[@]}
+g++ -std=${STD_FLAG} -O2 -shared -fPIC zero_out_op_kernel_1.cc -o \$DESTDIR/libzeroout.so \${TF_CFLAGS[@]} \${TF_LFLAGS[@]}

 # CUDA. Need to patch so that we can build it outside of bazel/TF
 cp /opt/tensorflow/tensorflow-source/tensorflow/examples/adding_an_op/cuda_op_kernel.cc .
 cp /opt/tensorflow/tensorflow-source/tensorflow/examples/adding_an_op/cuda_op_kernel.cu.cc .
 patch -i $VOLUME_SRCDIR/cuda_op_kernel.cu.cc.patch cuda_op_kernel.cu.cc
 nvcc --expt-relaxed-constexpr -std=${STD_FLAG} -O2 -c -arch=all -o cuda_op_kernel.cu.o cuda_op_kernel.cu.cc \${TF_CFLAGS[@]} -D GOOGLE_CUDA=1 -x cu -Xcompiler -fPIC
-g++ -std=${STD_FLAG} -shared -o $DESTDIR/libcudaop.so cuda_op_kernel.cc cuda_op_kernel.cu.o \${TF_CFLAGS[@]} -fPIC -L/usr/local/cuda/lib64 -lcudart \${TF_LFLAGS[@]}
+g++ -std=${STD_FLAG} -shared -o \$DESTDIR/libcudaop.so cuda_op_kernel.cc cuda_op_kernel.cu.o \${TF_CFLAGS[@]} -fPIC -L/usr/local/cuda/lib64 -lcudart \${TF_LFLAGS[@]}

 cp $VOLUME_SRCDIR/busy_op_kernel.cc .
 cp $VOLUME_SRCDIR/busy_op_kernel.cu.cc .
 nvcc --expt-relaxed-constexpr -std=${STD_FLAG} -O2 -c -arch=all -o busy_op_kernel.cu.o busy_op_kernel.cu.cc \${TF_CFLAGS[@]} -D GOOGLE_CUDA=1 -x cu -Xcompiler -fPIC
-g++ -std=${STD_FLAG} -shared -o $DESTDIR/libbusyop.so busy_op_kernel.cc busy_op_kernel.cu.o \${TF_CFLAGS[@]} -fPIC -L/usr/local/cuda/lib64 -lcudart \${TF_LFLAGS[@]}
+g++ -std=${STD_FLAG} -shared -o \$DESTDIR/libbusyop.so busy_op_kernel.cc busy_op_kernel.cu.o \${TF_CFLAGS[@]} -fPIC -L/usr/local/cuda/lib64 -lcudart \${TF_LFLAGS[@]}

 python3 $VOLUME_SRCDIR/gen_qa_custom_ops_models.py --graphdef --savedmodel \
---models_dir=$DESTDIR --zero_out_lib_path=$DESTDIR/libzeroout.so \
---cuda_op_lib_path=$DESTDIR/libcudaop.so \
---busy_op_lib_path=$DESTDIR/libbusyop.so
-chmod -R 777 $DESTDIR
+--models_dir=\$DESTDIR --zero_out_lib_path=\$DESTDIR/libzeroout.so \
+--cuda_op_lib_path=\$DESTDIR/libcudaop.so \
+--busy_op_lib_path=\$DESTDIR/libbusyop.so
+chmod -R 777 \$DESTDIR
 EOF

 chmod a+x $TFSCRIPT
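Note: the common thread in this hunk is heredoc expansion timing. $TFSCRIPT is written through an unquoted heredoc, so variables that must be resolved only later, when the script runs inside the container (DESTDIR), have to be escaped as \$DESTDIR, while generator-time values such as $VOLUME_SRCDIR and $STD_FLAG stay unescaped and get baked into the script. A minimal sketch of that rule, using a hypothetical demo.sh:

VOLUME_SRCDIR=/mnt/src            # known when the script is generated
cat > demo.sh <<EOF
echo "baked in at generation time: $VOLUME_SRCDIR"
echo "resolved at run time: \$DESTDIR"
EOF
DESTDIR=/mnt/out bash demo.sh     # prints /mnt/src, then /mnt/out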
@@ -146,9 +146,9 @@ nvidia-smi -L || true
 nvidia-smi || true
 set -e
 export TORCH_EXTENSIONS_DIR="/root/.cache/torch_extensions/"
-python3 $VOLUME_SRCDIR/gen_qa_custom_ops_models.py --libtorch --models_dir=$DESTDIR
-cp \${TORCH_EXTENSIONS_DIR}/custom_modulo/custom_modulo.so $DESTDIR/libtorch_modulo/.
-chmod -R 777 $DESTDIR
+python3 $VOLUME_SRCDIR/gen_qa_custom_ops_models.py --libtorch --models_dir=\$DESTDIR
+cp \${TORCH_EXTENSIONS_DIR}/custom_modulo/custom_modulo.so \$DESTDIR/libtorch_modulo/.
+chmod -R 777 \$DESTDIR
 EOF

 chmod a+x $PYTSCRIPT
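Note: this hunk appears to lean on torch's JIT extension cache: the generator builds the modulo op with torch.utils.cpp_extension, and the resulting custom_modulo.so lands under the exported TORCH_EXTENSIONS_DIR, which is why the script can copy it from a fixed path afterwards. A minimal sketch of that behavior (demo_ext and the cache path are hypothetical, and the exact cache layout varies across torch versions):

export TORCH_EXTENSIONS_DIR=/tmp/torch_ext_cache
python3 - <<'PY'
from torch.utils.cpp_extension import load_inline
# JIT-compile a trivial extension; the built .so is cached under TORCH_EXTENSIONS_DIR
ext = load_inline(name="demo_ext",
                  cpp_sources="int answer() { return 42; }",
                  functions=["answer"])
print(ext.answer())
PY
find "$TORCH_EXTENSIONS_DIR" -name '*.so'   # shows where the cached library ended up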
@@ -160,7 +160,7 @@ fi
 docker cp $PYTSCRIPT $DOCKER_VOLUME:$VOLUME_SRCDIR/$PYTSCRIPT

 docker pull $PYTORCH_IMAGE
-docker run $DOCKER_GPU_ARGS --rm -v $DOCKER_VOLUME:/mnt -e $VOLUME_DESTDIR/libtorch_custom_ops $PYTORCH_IMAGE bash -xe $VOLUME_SRCDIR/$PYTSCRIPT
+docker run $DOCKER_GPU_ARGS --rm -v $DOCKER_VOLUME:/mnt -e DESTDIR=$VOLUME_DESTDIR/libtorch_custom_ops $PYTORCH_IMAGE bash -xe $VOLUME_SRCDIR/$PYTSCRIPT

 if [ $? -ne 0 ]; then
 echo -e "Failed"
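Note: the docker run change is the counterpart of the heredoc escaping above: \$DESTDIR inside the generated script only has a value because the container is now started with an explicit -e DESTDIR=... assignment; the previous -e $VOLUME_DESTDIR/libtorch_custom_ops passed a bare path with no variable name, leaving DESTDIR undefined in the container. A minimal sketch of the volume-plus-env pattern (volume name, path, and image are hypothetical):

docker volume create qa_scratch
docker run --rm -v qa_scratch:/mnt \
  -e DESTDIR=/mnt/models \
  ubuntu:22.04 bash -c 'mkdir -p "$DESTDIR" && touch "$DESTDIR/ok"'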