Update README and versions for 2.43.0 / 24.02 (#6886)
* Update README and versions for 2.43.0 / 24.02

* Update Dockerfile to reduce image size.

* Update path in patch file for model generation

Update README.md post-24.02
mc-nv committed Mar 2, 2024
1 parent ad25365 commit 3bc6863
Showing 28 changed files with 106 additions and 101 deletions.
2 changes: 1 addition & 1 deletion Dockerfile.sdk
@@ -29,7 +29,7 @@
#

# Base image on the minimum Triton container
ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.01-py3-min
ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.02-py3-min

ARG TRITON_CLIENT_REPO_SUBDIR=clientrepo
ARG TRITON_COMMON_REPO_TAG=main
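A minimal build sketch for this file, assuming the repository root as the build context and an arbitrary local image tag; the `--build-arg` is only needed when overriding the default `BASE_IMAGE` set above:

```bash
# Sketch only: build the SDK image against the 24.02 minimal base (tag name is illustrative)
docker build -f Dockerfile.sdk \
  --build-arg BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.02-py3-min \
  -t tritonserver_sdk .
```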
115 changes: 60 additions & 55 deletions Dockerfile.win10.min
@@ -28,10 +28,55 @@

ARG BASE_IMAGE=mcr.microsoft.com/windows:10.0.19042.1889

FROM ${BASE_IMAGE}
FROM ${BASE_IMAGE} as dependency_base

RUN powershell.exe Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope LocalMachine
RUN powershell.exe [Net.ServicePointManager]::Expect100Continue=$true;[Net.ServicePointManager]::SecurityProtocol=[Net.SecurityProtocolType]::Tls,[Net.SecurityProtocolType]::Tls11,[Net.SecurityProtocolType]::Tls12,[Net.SecurityProtocolType]::Ssl3;Invoke-Expression( New-Object System.Net.WebClient ).DownloadString('https://chocolatey.org/install.ps1')
RUN choco install unzip -y

#
# Installing TensorRT
#
ARG TENSORRT_VERSION
ARG TENSORRT_ZIP="TensorRT-${TENSORRT_VERSION}.Windows10.x86_64.cuda-12.0.zip"
ARG TENSORRT_SOURCE=${TENSORRT_ZIP}
# COPY ${TENSORRT_ZIP} /tmp/${TENSORRT_ZIP}
ADD ${TENSORRT_SOURCE} /tmp/${TENSORRT_ZIP}
RUN unzip /tmp/%TENSORRT_ZIP%
RUN move TensorRT-* TensorRT

LABEL TENSORRT_VERSION="${TENSORRT_VERSION}"


#
# Installing cuDNN
#
ARG CUDNN_VERSION
ARG CUDNN_ZIP=cudnn-windows-x86_64-${CUDNN_VERSION}_cuda12-archive.zip
ARG CUDNN_SOURCE=${CUDNN_ZIP}
ADD ${CUDNN_SOURCE} /tmp/${CUDNN_ZIP}
RUN unzip /tmp/%CUDNN_ZIP%
RUN move cudnn-* cudnn

LABEL CUDNN_VERSION="${CUDNN_VERSION}"


FROM ${BASE_IMAGE} as build_base

SHELL ["cmd", "/S", "/C"]

ARG CUDNN_VERSION
ENV CUDNN_VERSION ${CUDNN_VERSION}
COPY --from=dependency_base /cudnn /cudnn
RUN setx PATH "c:\cudnn\bin;c:\cudnn\lib\x64;c:\cudnn\include;%PATH%"
LABEL CUDNN_VERSION="${CUDNN_VERSION}"

ARG TENSORRT_VERSION
ENV TRT_VERSION ${TENSORRT_VERSION}
COPY --from=dependency_base /TensorRT /TensorRT
RUN setx PATH "c:\TensorRT\lib;%PATH%"
LABEL TENSORRT_VERSION="${TENSORRT_VERSION}"

RUN mkdir c:\tmp
WORKDIR /tmp

@@ -40,33 +85,30 @@ RUN powershell.exe [Net.ServicePointManager]::Expect100Continue=$true;[Net.Servi
RUN choco install git docker unzip -y

#
# Installing CMake
# Installing python
#
ARG CMAKE_VERSION=3.27.1
ARG CMAKE_FILE=cmake-${CMAKE_VERSION}-windows-x86_64
ARG CMAKE_SOURCE=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/${CMAKE_FILE}.zip
ARG PYTHON_VERSION=3.10.11
ARG PYTHON_SOURCE=https://www.python.org/ftp/python/${PYTHON_VERSION}/python-${PYTHON_VERSION}-amd64.exe
ADD ${PYTHON_SOURCE} python-${PYTHON_VERSION}-amd64.exe
RUN python-%PYTHON_VERSION%-amd64.exe /quiet InstallAllUsers=1 PrependPath=1 Include_doc=0 TargetDir="C:\python%PYTHON_VERSION%"
RUN mklink "C:\python%PYTHON_VERSION%\python3.exe" "C:\python%PYTHON_VERSION%\python.exe"
RUN pip install --upgrade wheel setuptools docker
RUN pip install grpcio-tools

ADD ${CMAKE_SOURCE} ${CMAKE_FILE}.zip
RUN unzip %CMAKE_FILE%.zip
RUN move %CMAKE_FILE% "c:\CMake"
RUN setx PATH "c:\CMake\bin;%PATH%"
LABEL PYTHON_VERSION=${PYTHON_VERSION}

#
# Installing CMake
#
ARG CMAKE_VERSION=3.27.1
RUN pip install cmake==%CMAKE_VERSION%
ENV CMAKE_TOOLCHAIN_FILE /vcpkg/scripts/buildsystems/vcpkg.cmake
ENV VCPKG_TARGET_TRIPLET x64-windows

LABEL CMAKE_VERSION=${CMAKE_VERSION}

# Be aware that pip can interact badly with VS cmd shell so need to pip install before
# vsdevcmd.bat (see https://bugs.python.org/issue38989)
ARG PYTHON_VERSION=3.8.10
ARG PYTHON_SOURCE=https://www.python.org/ftp/python/${PYTHON_VERSION}/python-${PYTHON_VERSION}-amd64.exe
ADD ${PYTHON_SOURCE} python-${PYTHON_VERSION}-amd64.exe
RUN python-%PYTHON_VERSION%-amd64.exe /quiet InstallAllUsers=1 PrependPath=1 Include_doc=0 TargetDir="C:\python%PYTHON_VERSION%"
RUN mklink "C:\python%PYTHON_VERSION%\python3.exe" "C:\python%PYTHON_VERSION%\python.exe"
RUN pip install --upgrade wheel setuptools docker
RUN pip install grpcio-tools

LABEL PYTHON_VERSION=${PYTHON_VERSION}

#
# Installing Visual Studio BuildTools: VS17 2022
@@ -149,43 +191,6 @@ RUN copy "%CUDA_INSTALL_ROOT_WP%\extras\visual_studio_integration\MSBuildExtensi
RUN setx PATH "%CUDA_INSTALL_ROOT_WP%\bin;%PATH%"

LABEL CUDA_VERSION="${CUDA_VERSION}"

#
# Installing TensorRT
#
ARG TENSORRT_VERSION=8.6.1.6
ARG TENSORRT_ZIP="TensorRT-${TENSORRT_VERSION}.Windows10.x86_64.cuda-12.0.zip"
ARG TENSORRT_SOURCE=${TENSORRT_ZIP}
# COPY ${TENSORRT_ZIP} /tmp/${TENSORRT_ZIP}
ADD ${TENSORRT_SOURCE} /tmp/${TENSORRT_ZIP}

RUN unzip /tmp/%TENSORRT_ZIP%
RUN move TensorRT-* TensorRT
ENV TRT_VERSION ${TENSORRT_VERSION}

RUN setx PATH "c:\TensorRT\lib;%PATH%"

LABEL TENSORRT_VERSION="${TENSORRT_VERSION}"


#
# Installing cuDNN
#
ARG CUDNN_VERSION=8.9.7.29
ARG CUDNN_ZIP=cudnn-windows-x86_64-${CUDNN_VERSION}_cuda12-archive.zip
ARG CUDNN_SOURCE=${CUDNN_ZIP}

ADD ${CUDNN_SOURCE} /tmp/${CUDNN_ZIP}

RUN unzip /tmp/%CUDNN_ZIP%
RUN move cudnn-* cudnn
RUN copy cudnn\bin\cudnn*.dll "%CUDA_INSTALL_ROOT_WP%\bin\."
RUN copy cudnn\lib\x64\cudnn*.lib "%CUDA_INSTALL_ROOT_WP%\lib\x64\."
RUN copy cudnn\include\cudnn*.h "%CUDA_INSTALL_ROOT_WP%\include\."

ENV CUDNN_VERSION ${CUDNN_VERSION}

LABEL CUDNN_VERSION="${CUDNN_VERSION}"
# It is important that the entrypoint initialize VisualStudio
# environment otherwise the build will fail. Also set
# CMAKE_TOOLCHAIN_FILE and VCPKG_TARGET_TRIPLET so
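As shown in this hunk, `TENSORRT_VERSION` and `CUDNN_VERSION` no longer carry defaults in the new `dependency_base` stage, so they have to be supplied at build time. A hedged sketch, reusing the version numbers that were previously hard-coded and assuming the TensorRT and cuDNN zip archives are present in the build context (the `*_SOURCE` args default to those zip names):

```bash
# Sketch only: build the Windows min image, passing the dependency versions explicitly
docker build -f Dockerfile.win10.min \
  --build-arg TENSORRT_VERSION=8.6.1.6 \
  --build-arg CUDNN_VERSION=8.9.7.29 \
  -t tritonserver_win10_min .
```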
8 changes: 4 additions & 4 deletions README.md
@@ -33,7 +33,7 @@
> [!WARNING]
> ##### LATEST RELEASE
> You are currently on the `main` branch which tracks under-development progress towards the next release.
> The current release is version [2.42.0](https://github.com/triton-inference-server/server/releases/latest) and corresponds to the 24.01 container release on NVIDIA GPU Cloud (NGC).
> The current release is version [2.43.0](https://github.com/triton-inference-server/server/releases/latest) and corresponds to the 24.02 container release on NVIDIA GPU Cloud (NGC).
Triton Inference Server is an open source inference serving software that
streamlines AI inferencing. Triton enables teams to deploy any AI model from
@@ -91,16 +91,16 @@ Inference Server with the

```bash
# Step 1: Create the example model repository
git clone -b r24.01 https://github.com/triton-inference-server/server.git
git clone -b r24.02 https://github.com/triton-inference-server/server.git
cd server/docs/examples
./fetch_models.sh

# Step 2: Launch triton from the NGC Triton container
docker run --gpus=1 --rm --net=host -v ${PWD}/model_repository:/models nvcr.io/nvidia/tritonserver:24.01-py3 tritonserver --model-repository=/models
docker run --gpus=1 --rm --net=host -v ${PWD}/model_repository:/models nvcr.io/nvidia/tritonserver:24.02-py3 tritonserver --model-repository=/models

# Step 3: Sending an Inference Request
# In a separate console, launch the image_client example from the NGC Triton SDK container
docker run -it --rm --net=host nvcr.io/nvidia/tritonserver:24.01-py3-sdk
docker run -it --rm --net=host nvcr.io/nvidia/tritonserver:24.02-py3-sdk
/workspace/install/bin/image_client -m densenet_onnx -c 3 -s INCEPTION /workspace/images/mug.jpg

# Inference should return the following
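Not part of the change, but a quick way to confirm the server from Step 2 is up before running Step 3 is to poll Triton's readiness endpoint (assuming the default HTTP port 8000 is reachable on the host):

```bash
# Optional readiness check; returns HTTP 200 once the server and models are ready
curl -v localhost:8000/v2/health/ready
```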
4 changes: 2 additions & 2 deletions build.py
@@ -72,7 +72,7 @@
TRITON_VERSION_MAP = {
"2.44.0dev": (
"24.03dev", # triton container
"24.01", # upstream container
"24.02", # upstream container
"1.16.3", # ORT
"2023.3.0", # ORT OpenVINO
"2023.3.0", # Standalone OpenVINO
@@ -1337,7 +1337,7 @@ def add_cpu_libs_to_linux_dockerfile(backends, target_machine):
COPY --from=min_container /opt/hpcx/ucx/lib/libucs.so.0 /opt/hpcx/ucx/lib/libucs.so.0
COPY --from=min_container /opt/hpcx/ucx/lib/libuct.so.0 /opt/hpcx/ucx/lib/libuct.so.0
COPY --from=min_container /usr/lib/{libs_arch}-linux-gnu/libcudnn.so.8 /usr/lib/{libs_arch}-linux-gnu/libcudnn.so.8
COPY --from=min_container /usr/lib/{libs_arch}-linux-gnu/libcudnn.so.9 /usr/lib/{libs_arch}-linux-gnu/libcudnn.so.9
# patchelf is needed to add deps of libcublasLt.so.12 to libtorch_cuda.so
RUN apt-get update && \
2 changes: 1 addition & 1 deletion deploy/aws/values.yaml
@@ -27,7 +27,7 @@
replicaCount: 1

image:
imageName: nvcr.io/nvidia/tritonserver:24.01-py3
imageName: nvcr.io/nvidia/tritonserver:24.02-py3
pullPolicy: IfNotPresent
modelRepositoryPath: s3://triton-inference-server-repository/model_repository
numGpus: 1
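For reference, the image tag can also be overridden at install time instead of editing `values.yaml`; a hypothetical sketch, assuming the chart is installed from the `deploy/aws` directory with release name `triton`:

```bash
# Hypothetical usage: pin the 24.02 image via --set rather than editing values.yaml
helm install triton ./deploy/aws \
  --set image.imageName=nvcr.io/nvidia/tritonserver:24.02-py3
```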
2 changes: 1 addition & 1 deletion deploy/fleetcommand/Chart.yaml
@@ -26,7 +26,7 @@

apiVersion: v1
# appVersion is the Triton version; update when changing release
appVersion: "2.42.0"
appVersion: "2.43.0"
description: Triton Inference Server (Fleet Command)
name: triton-inference-server
# version is the Chart version; update when changing anything in the chart
6 changes: 3 additions & 3 deletions deploy/fleetcommand/values.yaml
@@ -27,7 +27,7 @@
replicaCount: 1

image:
imageName: nvcr.io/nvidia/tritonserver:24.01-py3
imageName: nvcr.io/nvidia/tritonserver:24.02-py3
pullPolicy: IfNotPresent
numGpus: 1
serverCommand: tritonserver
@@ -46,13 +46,13 @@ image:
# Model Control Mode (Optional, default: none)
#
# To set model control mode, uncomment and configure below
# See https://github.com/triton-inference-server/server/blob/r24.01/docs/model_management.md
# See https://github.com/triton-inference-server/server/blob/r24.02/docs/model_management.md
# for more details
#- --model-control-mode=explicit|poll|none
#
# Additional server args
#
# see https://github.com/triton-inference-server/server/blob/r24.01/README.md
# see https://github.com/triton-inference-server/server/blob/r24.02/README.md
# for more details

service:
2 changes: 1 addition & 1 deletion deploy/gcp/values.yaml
@@ -27,7 +27,7 @@
replicaCount: 1

image:
imageName: nvcr.io/nvidia/tritonserver:24.01-py3
imageName: nvcr.io/nvidia/tritonserver:24.02-py3
pullPolicy: IfNotPresent
modelRepositoryPath: gs://triton-inference-server-repository/model_repository
numGpus: 1
@@ -33,7 +33,7 @@ metadata:
namespace: default
spec:
containers:
- image: nvcr.io/nvidia/tritonserver:24.01-py3-sdk
- image: nvcr.io/nvidia/tritonserver:24.02-py3-sdk
imagePullPolicy: Always
name: nv-triton-client
securityContext:
4 changes: 2 additions & 2 deletions deploy/gke-marketplace-app/server-deployer/build_and_push.sh
@@ -28,8 +28,8 @@
export REGISTRY=gcr.io/$(gcloud config get-value project | tr ':' '/')
export APP_NAME=tritonserver
export MAJOR_VERSION=2.41
export MINOR_VERSION=2.42.0
export NGC_VERSION=24.01-py3
export MINOR_VERSION=2.43.0
export NGC_VERSION=24.02-py3

docker pull nvcr.io/nvidia/$APP_NAME:$NGC_VERSION

@@ -28,4 +28,4 @@ apiVersion: v1
appVersion: "2.41"
description: Triton Inference Server
name: triton-inference-server
version: 2.42.0
version: 2.43.0
@@ -32,13 +32,13 @@ tritonProtocol: HTTP
# HPA GPU utilization autoscaling target
HPATargetAverageValue: 85
modelRepositoryPath: gs://triton_sample_models/23_12
publishedVersion: '2.42.0'
publishedVersion: '2.43.0'
gcpMarketplace: true

image:
registry: gcr.io
repository: nvidia-ngc-public/tritonserver
tag: 24.01-py3
tag: 24.02-py3
pullPolicy: IfNotPresent
# modify the model repository here to match your GCP storage bucket
numGpus: 1
@@ -27,7 +27,7 @@
x-google-marketplace:
schemaVersion: v2
applicationApiVersion: v1beta1
publishedVersion: '2.42.0'
publishedVersion: '2.43.0'
publishedVersionMetadata:
releaseNote: >-
Initial release.
2 changes: 1 addition & 1 deletion deploy/gke-marketplace-app/server-deployer/schema.yaml
@@ -27,7 +27,7 @@
x-google-marketplace:
schemaVersion: v2
applicationApiVersion: v1beta1
publishedVersion: '2.42.0'
publishedVersion: '2.43.0'
publishedVersionMetadata:
releaseNote: >-
Initial release.
2 changes: 1 addition & 1 deletion deploy/gke-marketplace-app/trt-engine/README.md
@@ -33,7 +33,7 @@
```
docker run --gpus all -it --network host \
--shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 \
-v ~:/scripts nvcr.io/nvidia/tensorrt:24.01-py3
-v ~:/scripts nvcr.io/nvidia/tensorrt:24.02-py3
pip install onnx six torch tf2onnx tensorflow
2 changes: 1 addition & 1 deletion deploy/k8s-onprem/values.yaml
@@ -29,7 +29,7 @@ tags:
loadBalancing: true

image:
imageName: nvcr.io/nvidia/tritonserver:24.01-py3
imageName: nvcr.io/nvidia/tritonserver:24.02-py3
pullPolicy: IfNotPresent
modelRepositoryServer: < Replace with the IP Address of your file server >
modelRepositoryPath: /srv/models
2 changes: 1 addition & 1 deletion deploy/oci/values.yaml
@@ -27,7 +27,7 @@
replicaCount: 1

image:
imageName: nvcr.io/nvidia/tritonserver:24.01-py3
imageName: nvcr.io/nvidia/tritonserver:24.02-py3
pullPolicy: IfNotPresent
modelRepositoryPath: s3://https://<OCI_NAMESPACE>.compat.objectstorage.<OCI_REGION>.oraclecloud.com:443/triton-inference-server-repository
numGpus: 1
6 changes: 3 additions & 3 deletions docs/customization_guide/build.md
@@ -173,7 +173,7 @@ $ ./build.py ... --repo-tag=common:<container tag> --repo-tag=core:<container ta

If you are building on a release branch then `<container tag>` will
default to the branch name. For example, if you are building on the
r24.01 branch, `<container tag>` will default to r24.01. If you are
r24.02 branch, `<container tag>` will default to r24.02. If you are
building on any other branch (including the *main* branch) then
`<container tag>` will default to "main". Therefore, you typically do
not need to provide `<container tag>` at all (nor the preceding
@@ -334,8 +334,8 @@ python build.py --cmake-dir=<path/to/repo>/build --build-dir=/tmp/citritonbuild
If you are building on *main* branch then '<container tag>' will
default to "main". If you are building on a release branch then
'<container tag>' will default to the branch name. For example, if you
are building on the r24.01 branch, '<container tag>' will default to
r24.01. Therefore, you typically do not need to provide '<container
are building on the r24.02 branch, '<container tag>' will default to
r24.02. Therefore, you typically do not need to provide '<container
tag>' at all (nor the preceding colon). You can use a different
'<container tag>' for a component to instead use the corresponding
branch/tag in the build. For example, if you have a branch called
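As the paragraphs above describe, `<container tag>` defaults to the branch name, so on `r24.02` the defaults are normally fine. A hedged sketch of pinning the component repos explicitly instead (only the `common` and `core` flags appear in the excerpt above; the `r24.02` tags in those repos are assumed to exist):

```bash
# Sketch only: override the default <container tag> for specific component repos
./build.py ... --repo-tag=common:r24.02 --repo-tag=core:r24.02
```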
14 changes: 7 additions & 7 deletions docs/customization_guide/compose.md
@@ -44,8 +44,8 @@ from source to get more exact customization.
The `compose.py` script can be found in the [server repository](https://github.com/triton-inference-server/server).
Simply clone the repository and run `compose.py` to create a custom container.
Note: Created container version will depend on the branch that was cloned.
For example branch [r24.01](https://github.com/triton-inference-server/server/tree/r24.01)
should be used to create an image based on the NGC 24.01 Triton release.
For example branch [r24.02](https://github.com/triton-inference-server/server/tree/r24.02)
should be used to create an image based on the NGC 24.02 Triton release.

`compose.py` provides `--backend`, `--repoagent` options that allow you to
specify which backends and repository agents to include in the custom image.
@@ -76,19 +76,19 @@ For example, running
```
python3 compose.py --backend tensorflow1 --repoagent checksum
```
on branch [r24.01](https://github.com/triton-inference-server/server/tree/r24.01) pulls:
- `min` container `nvcr.io/nvidia/tritonserver:24.01-py3-min`
- `full` container `nvcr.io/nvidia/tritonserver:24.01-py3`
on branch [r24.02](https://github.com/triton-inference-server/server/tree/r24.02) pulls:
- `min` container `nvcr.io/nvidia/tritonserver:24.02-py3-min`
- `full` container `nvcr.io/nvidia/tritonserver:24.02-py3`

Alternatively, users can specify the version of Triton container to pull from any branch by either:
1. Adding flag `--container-version <container version>` to branch
```
python3 compose.py --backend tensorflow1 --repoagent checksum --container-version 24.01
python3 compose.py --backend tensorflow1 --repoagent checksum --container-version 24.02
```
2. Specifying `--image min,<min container image name> --image full,<full container image name>`.
The user is responsible for specifying compatible `min` and `full` containers.
```
python3 compose.py --backend tensorflow1 --repoagent checksum --image min,nvcr.io/nvidia/tritonserver:24.01-py3-min --image full,nvcr.io/nvidia/tritonserver:24.01-py3
python3 compose.py --backend tensorflow1 --repoagent checksum --image min,nvcr.io/nvidia/tritonserver:24.02-py3-min --image full,nvcr.io/nvidia/tritonserver:24.02-py3
```
Method 1 and 2 will result in the same composed container. Furthermore, `--image` flag overrides the `--container-version` flag when both are specified.
