From c3276eb49ede1b6922240b177583e36555a7415d Mon Sep 17 00:00:00 2001 From: krishung5 Date: Thu, 11 Jul 2024 12:08:07 -0700 Subject: [PATCH 1/5] Downgrade ucx and mpi versions --- build.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/build.py b/build.py index a996892b9f..d3b30b137c 100755 --- a/build.py +++ b/build.py @@ -1086,18 +1086,24 @@ def create_dockerfile_linux( # Remove contents that are not needed in runtime # Setuptools has breaking changes in version 70.0.0, so fix it to 69.5.1 # The generated code in grpc_service_pb2_grpc.py depends on grpcio>=1.64.0, so fix it to 1.64.0 -RUN ldconfig && \ - ARCH="$(uname -i)" && \ - rm -fr ${TRT_ROOT}/bin ${TRT_ROOT}/targets/${ARCH}-linux-gnu/bin ${TRT_ROOT}/data && \ - rm -fr ${TRT_ROOT}/doc ${TRT_ROOT}/onnx_graphsurgeon ${TRT_ROOT}/python && \ - rm -fr ${TRT_ROOT}/samples ${TRT_ROOT}/targets/${ARCH}-linux-gnu/samples && \ - python3 -m pip install --upgrade pip && \ - pip3 install --no-cache-dir transformers && \ - find /usr -name libtensorrt_llm.so -exec dirname {} \; > /etc/ld.so.conf.d/tensorrt-llm.conf && \ - find /opt/tritonserver -name libtritonserver.so -exec dirname {} \; > /etc/ld.so.conf.d/triton-tensorrtllm-worker.conf && \ +RUN ldconfig && \\ + ARCH="$(uname -i)" && \\ + rm -fr ${TRT_ROOT}/bin ${TRT_ROOT}/targets/${ARCH}-linux-gnu/bin ${TRT_ROOT}/data && \\ + rm -fr ${TRT_ROOT}/doc ${TRT_ROOT}/onnx_graphsurgeon ${TRT_ROOT}/python && \\ + rm -fr ${TRT_ROOT}/samples ${TRT_ROOT}/targets/${ARCH}-linux-gnu/samples && \\ + python3 -m pip install --upgrade pip && \\ + pip3 install --no-cache-dir transformers && \\ + find /usr -name libtensorrt_llm.so -exec dirname {} \; > /etc/ld.so.conf.d/tensorrt-llm.conf && \\ + find /opt/tritonserver -name libtritonserver.so -exec dirname {} \; > /etc/ld.so.conf.d/triton-tensorrtllm-worker.conf && \\ pip3 install --no-cache-dir setuptools==69.5.1 grpcio-tools==1.64.0 ENV LD_LIBRARY_PATH=/usr/local/tensorrt/lib/:/opt/tritonserver/backends/tensorrtllm:$LD_LIBRARY_PATH + +# There are some ucc issues when spawning mpi processes with mpi 4.1.7a and +# ucx 1.16.0. Downgrade to mpi 4.1.5rc2 and ucx 1.15.0 to avoid the issue. +RUN rm -fr /opt/hpcx/ompi /opt/hpcx/ucx +COPY --from=nvcr.io/nvidia/tritonserver:24.02-py3-min /opt/hpcx/ompi /opt/hpcx/ompi +COPY --from=nvcr.io/nvidia/tritonserver:24.02-py3-min /opt/hpcx/ucx /opt/hpcx/ucx """ with open(os.path.join(ddir, dockerfile_name), "w") as dfile: dfile.write(df) From ba04514ce648459d7d4c033e47765e0f5480dc94 Mon Sep 17 00:00:00 2001 From: krishung5 Date: Thu, 11 Jul 2024 13:57:39 -0700 Subject: [PATCH 2/5] Copy ucc --- build.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/build.py b/build.py index d3b30b137c..48205c881f 100755 --- a/build.py +++ b/build.py @@ -1101,9 +1101,10 @@ def create_dockerfile_linux( # There are some ucc issues when spawning mpi processes with mpi 4.1.7a and # ucx 1.16.0. Downgrade to mpi 4.1.5rc2 and ucx 1.15.0 to avoid the issue. -RUN rm -fr /opt/hpcx/ompi /opt/hpcx/ucx +RUN rm -fr /opt/hpcx/ompi /opt/hpcx/ucx /opt/hpcx/ucc COPY --from=nvcr.io/nvidia/tritonserver:24.02-py3-min /opt/hpcx/ompi /opt/hpcx/ompi COPY --from=nvcr.io/nvidia/tritonserver:24.02-py3-min /opt/hpcx/ucx /opt/hpcx/ucx +COPY --from=nvcr.io/nvidia/tritonserver:24.02-py3-min /opt/hpcx/ucc /opt/hpcx/ucc """ with open(os.path.join(ddir, dockerfile_name), "w") as dfile: dfile.write(df) From d6815f7483ef5f62fb9ee2c8988117f275bff2ae Mon Sep 17 00:00:00 2001 From: krishung5 Date: Thu, 11 Jul 2024 15:25:26 -0700 Subject: [PATCH 3/5] Only replace ompi folder --- build.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/build.py b/build.py index 48205c881f..48ab19402d 100755 --- a/build.py +++ b/build.py @@ -1101,10 +1101,8 @@ def create_dockerfile_linux( # There are some ucc issues when spawning mpi processes with mpi 4.1.7a and # ucx 1.16.0. Downgrade to mpi 4.1.5rc2 and ucx 1.15.0 to avoid the issue. -RUN rm -fr /opt/hpcx/ompi /opt/hpcx/ucx /opt/hpcx/ucc +RUN rm -fr /opt/hpcx/ompi COPY --from=nvcr.io/nvidia/tritonserver:24.02-py3-min /opt/hpcx/ompi /opt/hpcx/ompi -COPY --from=nvcr.io/nvidia/tritonserver:24.02-py3-min /opt/hpcx/ucx /opt/hpcx/ucx -COPY --from=nvcr.io/nvidia/tritonserver:24.02-py3-min /opt/hpcx/ucc /opt/hpcx/ucc """ with open(os.path.join(ddir, dockerfile_name), "w") as dfile: dfile.write(df) From 18db9f33270e4080dcffacc22850f89cb9658562 Mon Sep 17 00:00:00 2001 From: krishung5 Date: Thu, 11 Jul 2024 15:39:03 -0700 Subject: [PATCH 4/5] Update comment --- build.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/build.py b/build.py index 48ab19402d..8ef1aafc1b 100755 --- a/build.py +++ b/build.py @@ -1099,8 +1099,8 @@ def create_dockerfile_linux( ENV LD_LIBRARY_PATH=/usr/local/tensorrt/lib/:/opt/tritonserver/backends/tensorrtllm:$LD_LIBRARY_PATH -# There are some ucc issues when spawning mpi processes with mpi 4.1.7a and -# ucx 1.16.0. Downgrade to mpi 4.1.5rc2 and ucx 1.15.0 to avoid the issue. +# There are some ucc issues when spawning mpi processes with mpi 4.1.7a. +# Downgrade to mpi 4.1.5rc2 to avoid the issue. RUN rm -fr /opt/hpcx/ompi COPY --from=nvcr.io/nvidia/tritonserver:24.02-py3-min /opt/hpcx/ompi /opt/hpcx/ompi """ From 2540384d6f76ec2d66ecd99f58ad99abf0039e18 Mon Sep 17 00:00:00 2001 From: krishung5 Date: Thu, 11 Jul 2024 16:05:12 -0700 Subject: [PATCH 5/5] Update comments --- build.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/build.py b/build.py index 8ef1aafc1b..9fe677aeab 100755 --- a/build.py +++ b/build.py @@ -1099,8 +1099,8 @@ def create_dockerfile_linux( ENV LD_LIBRARY_PATH=/usr/local/tensorrt/lib/:/opt/tritonserver/backends/tensorrtllm:$LD_LIBRARY_PATH -# There are some ucc issues when spawning mpi processes with mpi 4.1.7a. -# Downgrade to mpi 4.1.5rc2 to avoid the issue. +# There are some ucc issues when spawning mpi processes with ompi v4.1.7a1. +# Downgrade to ompi v4.1.5rc2 to avoid the issue. RUN rm -fr /opt/hpcx/ompi COPY --from=nvcr.io/nvidia/tritonserver:24.02-py3-min /opt/hpcx/ompi /opt/hpcx/ompi """