diff --git a/Dockerfile.sdk b/Dockerfile.sdk index 3019114930..19d0afe5f7 100644 --- a/Dockerfile.sdk +++ b/Dockerfile.sdk @@ -29,7 +29,7 @@ # # Base image on the minimum Triton container -ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.11-py3-min +ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.12-py3-min ARG TRITON_CLIENT_REPO_SUBDIR=clientrepo ARG TRITON_PA_REPO_SUBDIR=perfanalyzerrepo diff --git a/Dockerfile.win10.min b/Dockerfile.win10.min index dec972eaf3..9147d70718 100644 --- a/Dockerfile.win10.min +++ b/Dockerfile.win10.min @@ -37,9 +37,9 @@ RUN choco install unzip -y # # Installing TensorRT # -ARG TENSORRT_VERSION=10.4.0.26 +ARG TENSORRT_VERSION=10.7.0.23 ARG TENSORRT_ZIP="TensorRT-${TENSORRT_VERSION}.Windows.win10.cuda-12.6.zip" -ARG TENSORRT_SOURCE=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.4.0/zip/TensorRT-10.4.0.26.Windows.win10.cuda-12.6.zip +ARG TENSORRT_SOURCE=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.7.0/zip/TensorRT-10.7.0.23.Windows.win10.cuda-12.6.zip # COPY ${TENSORRT_ZIP} /tmp/${TENSORRT_ZIP} ADD ${TENSORRT_SOURCE} /tmp/${TENSORRT_ZIP} RUN unzip /tmp/%TENSORRT_ZIP% @@ -51,9 +51,9 @@ LABEL TENSORRT_VERSION="${TENSORRT_VERSION}" # # Installing cuDNN # -ARG CUDNN_VERSION=9.4.0.58 +ARG CUDNN_VERSION=9.6.0.74 ARG CUDNN_ZIP=cudnn-windows-x86_64-${CUDNN_VERSION}_cuda12-archive.zip -ARG CUDNN_SOURCE=https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/windows-x86_64/cudnn-windows-x86_64-9.4.0.58_cuda12-archive.zip +ARG CUDNN_SOURCE=https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/windows-x86_64/cudnn-windows-x86_64-9.6.0.74_cuda12-archive.zip ADD ${CUDNN_SOURCE} /tmp/${CUDNN_ZIP} RUN unzip /tmp/%CUDNN_ZIP% RUN move cudnn-* cudnn @@ -75,20 +75,19 @@ RUN choco install git docker unzip -y # # Installing python # -ARG PYTHON_VERSION=3.10.11 +ARG PYTHON_VERSION=3.12.3 ARG PYTHON_SOURCE=https://www.python.org/ftp/python/${PYTHON_VERSION}/python-${PYTHON_VERSION}-amd64.exe ADD ${PYTHON_SOURCE} python-${PYTHON_VERSION}-amd64.exe RUN python-%PYTHON_VERSION%-amd64.exe /quiet InstallAllUsers=1 PrependPath=1 Include_doc=0 TargetDir="C:\python%PYTHON_VERSION%" RUN mklink "C:\python%PYTHON_VERSION%\python3.exe" "C:\python%PYTHON_VERSION%\python.exe" RUN pip install --upgrade wheel setuptools docker -RUN pip install grpcio-tools psutil LABEL PYTHON_VERSION=${PYTHON_VERSION} # # Installing CMake # -ARG CMAKE_VERSION=3.30.0 +ARG CMAKE_VERSION=3.30.5 RUN pip install cmake==%CMAKE_VERSION% ENV CMAKE_TOOLCHAIN_FILE /vcpkg/scripts/buildsystems/vcpkg.cmake @@ -101,14 +100,16 @@ LABEL CMAKE_VERSION=${CMAKE_VERSION} # # Installing Visual Studio BuildTools: VS17 2022 # -ARG BUILDTOOLS_VERSION=17.10.35201.131 # Download collect.exe in case of an install failure. ADD https://aka.ms/vscollect.exe "C:\tmp\collect.exe" # Use the latest release channel. For more control, specify the location of an internal layout. # Download the Build Tools bootstrapper. 
# ARG BUILD_TOOLS_SOURCE=https://aka.ms/vs/17/release/vs_buildtools.exe -ARG BUILD_TOOLS_SOURCE=https://download.visualstudio.microsoft.com/download/pr/28626b4b-f88f-4b55-a0cf-f3eaa2c643fb/e6c43d4dfb36338d954cdb3ad9010ab2a479e712088f4f6b016eadcc721bab28/vs_BuildTools.exe + +ARG BUILDTOOLS_VERSION=17.12.35506.116 +ARG BUILD_TOOLS_SOURCE=https://download.visualstudio.microsoft.com/download/pr/5536698c-711c-4834-876f-2817d31a2ef2/58894fc272e86d3c3a6d85bf3a1df1e5a0685be8b9ab65d9f3cc5c2a8c6921cc/vs_BuildTools.exe + ADD ${BUILD_TOOLS_SOURCE} vs_buildtools.exe # Install Build Tools with the Microsoft.VisualStudio.Workload.VCTools workload, including recommended. ARG VS_INSTALL_PATH_WP="C:\BuildTools" @@ -149,12 +150,13 @@ WORKDIR / # Installing CUDA # ARG CUDA_MAJOR=12 -ARG CUDA_MINOR=5 -ARG CUDA_PATCH=1 +ARG CUDA_MINOR=6 +ARG CUDA_PATCH=3 ARG CUDA_VERSION=${CUDA_MAJOR}.${CUDA_MINOR}.${CUDA_PATCH} ARG CUDA_PACKAGES="nvcc_${CUDA_MAJOR}.${CUDA_MINOR} \ cudart_${CUDA_MAJOR}.${CUDA_MINOR} \ nvml_dev_${CUDA_MAJOR}.${CUDA_MINOR} \ + nvrtc_${CUDA_MAJOR}.${CUDA_MINOR} nvrtc_dev_${CUDA_MAJOR}.${CUDA_MINOR} \ cublas_${CUDA_MAJOR}.${CUDA_MINOR} cublas_dev_${CUDA_MAJOR}.${CUDA_MINOR} \ cufft_${CUDA_MAJOR}.${CUDA_MINOR} cufft_dev_${CUDA_MAJOR}.${CUDA_MINOR} \ curand_${CUDA_MAJOR}.${CUDA_MINOR} curand_dev_${CUDA_MAJOR}.${CUDA_MINOR} \ @@ -175,7 +177,10 @@ RUN copy "%CUDA_INSTALL_ROOT_WP%\extras\visual_studio_integration\MSBuildExtensi RUN setx PATH "%CUDA_INSTALL_ROOT_WP%\bin;%PATH%" -ARG CUDNN_VERSION=9.4.0.58 +ENV CUDA_VERSION=${CUDA_VERSION} +LABEL CUDA_VERSION="${CUDA_VERSION}" + +ARG CUDNN_VERSION=9.6.0.74 ENV CUDNN_VERSION ${CUDNN_VERSION} COPY --from=dependency_base /cudnn /cudnn RUN copy cudnn\bin\cudnn*.dll "%CUDA_INSTALL_ROOT_WP%\bin\." @@ -183,13 +188,12 @@ RUN copy cudnn\lib\x64\cudnn*.lib "%CUDA_INSTALL_ROOT_WP%\lib\x64\." RUN copy cudnn\include\cudnn*.h "%CUDA_INSTALL_ROOT_WP%\include\." LABEL CUDNN_VERSION="${CUDNN_VERSION}" -ARG TENSORRT_VERSION=10.4.0.26 +ARG TENSORRT_VERSION=10.7.0.23 ENV TRT_VERSION ${TENSORRT_VERSION} COPY --from=dependency_base /TensorRT /TensorRT RUN setx PATH "c:\TensorRT\lib;%PATH%" LABEL TENSORRT_VERSION="${TENSORRT_VERSION}" -LABEL CUDA_VERSION="${CUDA_VERSION}" # It is important that the entrypoint initialize VisualStudio # environment otherwise the build will fail. Also set # CMAKE_TOOLCHAIN_FILE and VCPKG_TARGET_TRIPLET so diff --git a/README.md b/README.md index 79e572b97d..cad845449c 100644 --- a/README.md +++ b/README.md @@ -30,235 +30,5 @@ [![License](https://img.shields.io/badge/License-BSD3-lightgrey.svg)](https://opensource.org/licenses/BSD-3-Clause) ->[!WARNING] ->You are currently on the `main` branch which tracks under-development progress ->towards the next release. The current release is version [2.52.0](https://github.com/triton-inference-server/server/releases/latest) ->and corresponds to the 24.11 container release on NVIDIA GPU Cloud (NGC). - -Triton Inference Server is an open source inference serving software that -streamlines AI inferencing. Triton enables teams to deploy any AI model from -multiple deep learning and machine learning frameworks, including TensorRT, -TensorFlow, PyTorch, ONNX, OpenVINO, Python, RAPIDS FIL, and more. Triton -Inference Server supports inference across cloud, data center, edge and embedded -devices on NVIDIA GPUs, x86 and ARM CPU, or AWS Inferentia. Triton Inference -Server delivers optimized performance for many query types, including real time, -batched, ensembles and audio/video streaming. 
Triton inference Server is part of -[NVIDIA AI Enterprise](https://www.nvidia.com/en-us/data-center/products/ai-enterprise/), -a software platform that accelerates the data science pipeline and streamlines -the development and deployment of production AI. - -Major features include: - -- [Supports multiple deep learning - frameworks](https://github.com/triton-inference-server/backend#where-can-i-find-all-the-backends-that-are-available-for-triton) -- [Supports multiple machine learning - frameworks](https://github.com/triton-inference-server/fil_backend) -- [Concurrent model - execution](docs/user_guide/architecture.md#concurrent-model-execution) -- [Dynamic batching](docs/user_guide/model_configuration.md#dynamic-batcher) -- [Sequence batching](docs/user_guide/model_configuration.md#sequence-batcher) and - [implicit state management](docs/user_guide/architecture.md#implicit-state-management) - for stateful models -- Provides [Backend API](https://github.com/triton-inference-server/backend) that - allows adding custom backends and pre/post processing operations -- Supports writing custom backends in python, a.k.a. - [Python-based backends.](https://github.com/triton-inference-server/backend/blob/main/docs/python_based_backends.md#python-based-backends) -- Model pipelines using - [Ensembling](docs/user_guide/architecture.md#ensemble-models) or [Business - Logic Scripting - (BLS)](https://github.com/triton-inference-server/python_backend#business-logic-scripting) -- [HTTP/REST and GRPC inference - protocols](docs/customization_guide/inference_protocols.md) based on the community - developed [KServe - protocol](https://github.com/kserve/kserve/tree/master/docs/predict-api/v2) -- A [C API](docs/customization_guide/inference_protocols.md#in-process-triton-server-api) and - [Java API](docs/customization_guide/inference_protocols.md#java-bindings-for-in-process-triton-server-api) - allow Triton to link directly into your application for edge and other in-process use cases -- [Metrics](docs/user_guide/metrics.md) indicating GPU utilization, server - throughput, server latency, and more - -**New to Triton Inference Server?** Make use of -[these tutorials](https://github.com/triton-inference-server/tutorials) -to begin your Triton journey! - -Join the [Triton and TensorRT community](https://www.nvidia.com/en-us/deep-learning-ai/triton-tensorrt-newsletter/) and -stay current on the latest product updates, bug fixes, content, best practices, -and more. Need enterprise support? NVIDIA global support is available for Triton -Inference Server with the -[NVIDIA AI Enterprise software suite](https://www.nvidia.com/en-us/data-center/products/ai-enterprise/). 
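The protocol and metrics bullets above correspond to a small set of documented default endpoints (HTTP/REST on port 8000, gRPC on 8001, Prometheus metrics on 8002). As a minimal sketch, assuming a server is already running locally with default ports and the `densenet_onnx` model from the quickstart loaded, the HTTP API can be exercised with plain `curl`:

```bash
# Server readiness (default HTTP port 8000); prints 200 when the server is ready
curl -s -o /dev/null -w "%{http_code}\n" localhost:8000/v2/health/ready

# Metadata for one model; the model name here comes from the quickstart example
curl -s localhost:8000/v2/models/densenet_onnx | python3 -m json.tool

# Prometheus-format metrics (GPU utilization, request counts, latencies)
curl -s localhost:8002/metrics | head
```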
- -## Serve a Model in 3 Easy Steps - -```bash -# Step 1: Create the example model repository -git clone -b r24.11 https://github.com/triton-inference-server/server.git -cd server/docs/examples -./fetch_models.sh - -# Step 2: Launch triton from the NGC Triton container -docker run --gpus=1 --rm --net=host -v ${PWD}/model_repository:/models nvcr.io/nvidia/tritonserver:24.11-py3 tritonserver --model-repository=/models - -# Step 3: Sending an Inference Request -# In a separate console, launch the image_client example from the NGC Triton SDK container -docker run -it --rm --net=host nvcr.io/nvidia/tritonserver:24.11-py3-sdk -/workspace/install/bin/image_client -m densenet_onnx -c 3 -s INCEPTION /workspace/images/mug.jpg - -# Inference should return the following -Image '/workspace/images/mug.jpg': - 15.346230 (504) = COFFEE MUG - 13.224326 (968) = CUP - 10.422965 (505) = COFFEEPOT -``` -Please read the [QuickStart](docs/getting_started/quickstart.md) guide for additional information -regarding this example. The quickstart guide also contains an example of how to launch Triton on [CPU-only systems](docs/getting_started/quickstart.md#run-on-cpu-only-system). New to Triton and wondering where to get started? Watch the [Getting Started video](https://youtu.be/NQDtfSi5QF4). - -## Examples and Tutorials - -Check out [NVIDIA LaunchPad](https://www.nvidia.com/en-us/data-center/products/ai-enterprise-suite/trial/) -for free access to a set of hands-on labs with Triton Inference Server hosted on -NVIDIA infrastructure. - -Specific end-to-end examples for popular models, such as ResNet, BERT, and DLRM -are located in the -[NVIDIA Deep Learning Examples](https://github.com/NVIDIA/DeepLearningExamples) -page on GitHub. The -[NVIDIA Developer Zone](https://developer.nvidia.com/nvidia-triton-inference-server) -contains additional documentation, presentations, and examples. - -## Documentation - -### Build and Deploy - -The recommended way to build and use Triton Inference Server is with Docker -images. - -- [Install Triton Inference Server with Docker containers](docs/customization_guide/build.md#building-with-docker) (*Recommended*) -- [Install Triton Inference Server without Docker containers](docs/customization_guide/build.md#building-without-docker) -- [Build a custom Triton Inference Server Docker container](docs/customization_guide/compose.md) -- [Build Triton Inference Server from source](docs/customization_guide/build.md#building-on-unsupported-platforms) -- [Build Triton Inference Server for Windows 10](docs/customization_guide/build.md#building-for-windows-10) -- Examples for deploying Triton Inference Server with Kubernetes and Helm on [GCP](deploy/gcp/README.md), - [AWS](deploy/aws/README.md), and [NVIDIA FleetCommand](deploy/fleetcommand/README.md) -- [Secure Deployment Considerations](docs/customization_guide/deploy.md) - -### Using Triton - -#### Preparing Models for Triton Inference Server - -The first step in using Triton to serve your models is to place one or -more models into a [model repository](docs/user_guide/model_repository.md). Depending on -the type of the model and on what Triton capabilities you want to enable for -the model, you may need to create a [model -configuration](docs/user_guide/model_configuration.md) for the model. 
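As an illustrative sketch of that layout (the model name, tensor names, and shapes below are placeholders rather than anything defined in this repository), a single-model repository with an explicit configuration can be assembled like this:

```bash
# Minimal layout: <repository>/<model-name>/<version>/<model-file>
mkdir -p model_repository/my_model/1
cp my_model.onnx model_repository/my_model/1/model.onnx

# Optional explicit configuration; some backends (ONNX Runtime, TensorFlow
# SavedModel, TensorRT) can generate a minimal one automatically.
cat > model_repository/my_model/config.pbtxt <<'EOF'
name: "my_model"
backend: "onnxruntime"
max_batch_size: 8
input [
  {
    name: "INPUT0"
    data_type: TYPE_FP32
    dims: [ 3, 224, 224 ]
  }
]
output [
  {
    name: "OUTPUT0"
    data_type: TYPE_FP32
    dims: [ 1000 ]
  }
]
EOF

tritonserver --model-repository=$PWD/model_repository
```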
- -- [Add custom operations to Triton if needed by your model](docs/user_guide/custom_operations.md) -- Enable model pipelining with [Model Ensemble](docs/user_guide/architecture.md#ensemble-models) - and [Business Logic Scripting (BLS)](https://github.com/triton-inference-server/python_backend#business-logic-scripting) -- Optimize your models setting [scheduling and batching](docs/user_guide/architecture.md#models-and-schedulers) - parameters and [model instances](docs/user_guide/model_configuration.md#instance-groups). -- Use the [Model Analyzer tool](https://github.com/triton-inference-server/model_analyzer) - to help optimize your model configuration with profiling -- Learn how to [explicitly manage what models are available by loading and - unloading models](docs/user_guide/model_management.md) - -#### Configure and Use Triton Inference Server - -- Read the [Quick Start Guide](docs/getting_started/quickstart.md) to run Triton Inference - Server on both GPU and CPU -- Triton supports multiple execution engines, called - [backends](https://github.com/triton-inference-server/backend#where-can-i-find-all-the-backends-that-are-available-for-triton), including - [TensorRT](https://github.com/triton-inference-server/tensorrt_backend), - [TensorFlow](https://github.com/triton-inference-server/tensorflow_backend), - [PyTorch](https://github.com/triton-inference-server/pytorch_backend), - [ONNX](https://github.com/triton-inference-server/onnxruntime_backend), - [OpenVINO](https://github.com/triton-inference-server/openvino_backend), - [Python](https://github.com/triton-inference-server/python_backend), and more -- Not all the above backends are supported on every platform supported by Triton. - Look at the - [Backend-Platform Support Matrix](https://github.com/triton-inference-server/backend/blob/main/docs/backend_platform_support_matrix.md) - to learn which backends are supported on your target platform. -- Learn how to [optimize performance](docs/user_guide/optimization.md) using the - [Performance Analyzer](https://github.com/triton-inference-server/perf_analyzer/blob/main/README.md) - and - [Model Analyzer](https://github.com/triton-inference-server/model_analyzer) -- Learn how to [manage loading and unloading models](docs/user_guide/model_management.md) in - Triton -- Send requests directly to Triton with the [HTTP/REST JSON-based - or gRPC protocols](docs/customization_guide/inference_protocols.md#httprest-and-grpc-protocols) - -#### Client Support and Examples - -A Triton *client* application sends inference and other requests to Triton. The -[Python and C++ client libraries](https://github.com/triton-inference-server/client) -provide APIs to simplify this communication. - -- Review client examples for [C++](https://github.com/triton-inference-server/client/blob/main/src/c%2B%2B/examples), - [Python](https://github.com/triton-inference-server/client/blob/main/src/python/examples), - and [Java](https://github.com/triton-inference-server/client/blob/main/src/java/src/main/java/triton/client/examples) -- Configure [HTTP](https://github.com/triton-inference-server/client#http-options) - and [gRPC](https://github.com/triton-inference-server/client#grpc-options) - client options -- Send input data (e.g. 
a jpeg image) directly to Triton in the [body of an HTTP - request without any additional metadata](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_binary_data.md#raw-binary-request) - -### Extend Triton - -[Triton Inference Server's architecture](docs/user_guide/architecture.md) is specifically -designed for modularity and flexibility - -- [Customize Triton Inference Server container](docs/customization_guide/compose.md) for your use case -- [Create custom backends](https://github.com/triton-inference-server/backend) - in either [C/C++](https://github.com/triton-inference-server/backend/blob/main/README.md#triton-backend-api) - or [Python](https://github.com/triton-inference-server/python_backend) -- Create [decoupled backends and models](docs/user_guide/decoupled_models.md) that can send - multiple responses for a request or not send any responses for a request -- Use a [Triton repository agent](docs/customization_guide/repository_agents.md) to add functionality - that operates when a model is loaded and unloaded, such as authentication, - decryption, or conversion -- Deploy Triton on [Jetson and JetPack](docs/user_guide/jetson.md) -- [Use Triton on AWS - Inferentia](https://github.com/triton-inference-server/python_backend/tree/main/inferentia) - -### Additional Documentation - -- [FAQ](docs/user_guide/faq.md) -- [User Guide](docs/README.md#user-guide) -- [Customization Guide](docs/README.md#customization-guide) -- [Release Notes](https://docs.nvidia.com/deeplearning/triton-inference-server/release-notes/index.html) -- [GPU, Driver, and CUDA Support -Matrix](https://docs.nvidia.com/deeplearning/dgx/support-matrix/index.html) - -## Contributing - -Contributions to Triton Inference Server are more than welcome. To -contribute please review the [contribution -guidelines](CONTRIBUTING.md). If you have a backend, client, -example or similar contribution that is not modifying the core of -Triton, then you should file a PR in the [contrib -repo](https://github.com/triton-inference-server/contrib). - -## Reporting problems, asking questions - -We appreciate any feedback, questions or bug reporting regarding this project. -When posting [issues in GitHub](https://github.com/triton-inference-server/server/issues), -follow the process outlined in the [Stack Overflow document](https://stackoverflow.com/help/mcve). -Ensure posted examples are: -- minimal – use as little code as possible that still produces the - same problem -- complete – provide all parts needed to reproduce the problem. Check - if you can strip external dependencies and still show the problem. The - less time we spend on reproducing problems the more time we have to - fix it -- verifiable – test the code you're about to provide to make sure it - reproduces the problem. Remove all other problems that are not - related to your request/question. - -For issues, please use the provided bug report and feature request templates. - -For questions, we recommend posting in our community -[GitHub Discussions.](https://github.com/triton-inference-server/server/discussions) - -## For more information - -Please refer to the [NVIDIA Developer Triton page](https://developer.nvidia.com/nvidia-triton-inference-server) -for more information. +> [!WARNING] +> You are currently on the `24.12` branch which tracks under-development and unreleased features. 
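For readers on this branch, the three-step example above carries over with the release tags bumped; a minimal sketch, assuming the `r24.12` branch and the 24.12 images are published on NGC:

```bash
# Step 1: Create the example model repository
git clone -b r24.12 https://github.com/triton-inference-server/server.git
cd server/docs/examples
./fetch_models.sh

# Step 2: Launch Triton from the NGC Triton container
docker run --gpus=1 --rm --net=host -v ${PWD}/model_repository:/models \
  nvcr.io/nvidia/tritonserver:24.12-py3 tritonserver --model-repository=/models

# Step 3: In a separate console, send an inference request from the SDK container
docker run -it --rm --net=host nvcr.io/nvidia/tritonserver:24.12-py3-sdk \
  /workspace/install/bin/image_client -m densenet_onnx -c 3 -s INCEPTION /workspace/images/mug.jpg
```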
\ No newline at end of file diff --git a/TRITON_VERSION b/TRITON_VERSION index 7eb4ffb28a..261d95596f 100644 --- a/TRITON_VERSION +++ b/TRITON_VERSION @@ -1 +1 @@ -2.53.0dev +2.53.0 diff --git a/build.py b/build.py index ae77cccb86..509b2c4eaa 100755 --- a/build.py +++ b/build.py @@ -71,10 +71,10 @@ # DEFAULT_TRITON_VERSION_MAP = { - "release_version": "2.53.0dev", - "triton_container_version": "24.12dev", - "upstream_container_version": "24.11", - "ort_version": "1.19.2", + "release_version": "2.53.0", + "triton_container_version": "24.12", + "upstream_container_version": "24.12", + "ort_version": "1.20.1", "ort_openvino_version": "2024.4.0", "standalone_openvino_version": "2024.4.0", "dcgm_version": "3.3.6", diff --git a/deploy/aws/values.yaml b/deploy/aws/values.yaml index 4fcdd14bdb..be118becce 100644 --- a/deploy/aws/values.yaml +++ b/deploy/aws/values.yaml @@ -27,7 +27,7 @@ replicaCount: 1 image: - imageName: nvcr.io/nvidia/tritonserver:24.11-py3 + imageName: nvcr.io/nvidia/tritonserver:24.12-py3 pullPolicy: IfNotPresent modelRepositoryPath: s3://triton-inference-server-repository/model_repository numGpus: 1 diff --git a/deploy/fleetcommand/Chart.yaml b/deploy/fleetcommand/Chart.yaml index 4e3c87c387..aac221acd0 100644 --- a/deploy/fleetcommand/Chart.yaml +++ b/deploy/fleetcommand/Chart.yaml @@ -26,7 +26,7 @@ apiVersion: v1 # appVersion is the Triton version; update when changing release -appVersion: "2.51.0" +appVersion: "2.53.0" description: Triton Inference Server (Fleet Command) name: triton-inference-server # version is the Chart version; update when changing anything in the chart diff --git a/deploy/fleetcommand/values.yaml b/deploy/fleetcommand/values.yaml index ff5513c7d7..30b1c331d1 100644 --- a/deploy/fleetcommand/values.yaml +++ b/deploy/fleetcommand/values.yaml @@ -27,7 +27,7 @@ replicaCount: 1 image: - imageName: nvcr.io/nvidia/tritonserver:24.11-py3 + imageName: nvcr.io/nvidia/tritonserver:24.12-py3 pullPolicy: IfNotPresent numGpus: 1 serverCommand: tritonserver @@ -47,13 +47,13 @@ image: # # To set model control mode, uncomment and configure below # TODO: Fix the following url, it is invalid - # See https://github.com/triton-inference-server/server/blob/r24.11/docs/model_management.md + # See https://github.com/triton-inference-server/server/blob/r24.12/docs/model_management.md # for more details #- --model-control-mode=explicit|poll|none # # Additional server args # - # see https://github.com/triton-inference-server/server/blob/r24.11/README.md + # see https://github.com/triton-inference-server/server/blob/r24.12/README.md # for more details service: diff --git a/deploy/gcp/values.yaml b/deploy/gcp/values.yaml index f79cb75134..7a27c61efa 100644 --- a/deploy/gcp/values.yaml +++ b/deploy/gcp/values.yaml @@ -27,7 +27,7 @@ replicaCount: 1 image: - imageName: nvcr.io/nvidia/tritonserver:24.11-py3 + imageName: nvcr.io/nvidia/tritonserver:24.12-py3 pullPolicy: IfNotPresent modelRepositoryPath: gs://triton-inference-server-repository/model_repository numGpus: 1 diff --git a/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/triton_client.yaml b/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/triton_client.yaml index c27a327e2f..6712d7d381 100644 --- a/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/triton_client.yaml +++ b/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/triton_client.yaml @@ -33,7 +33,7 @@ metadata: namespace: default spec: containers: - - image: nvcr.io/nvidia/tritonserver:24.11-py3-sdk + - image: 
nvcr.io/nvidia/tritonserver:24.12-py3-sdk imagePullPolicy: Always name: nv-triton-client securityContext: diff --git a/deploy/gke-marketplace-app/server-deployer/build_and_push.sh b/deploy/gke-marketplace-app/server-deployer/build_and_push.sh index a0c9762865..8c26ee5ed0 100755 --- a/deploy/gke-marketplace-app/server-deployer/build_and_push.sh +++ b/deploy/gke-marketplace-app/server-deployer/build_and_push.sh @@ -27,9 +27,9 @@ export REGISTRY=gcr.io/$(gcloud config get-value project | tr ':' '/') export APP_NAME=tritonserver -export MAJOR_VERSION=2.51 -export MINOR_VERSION=2.51.0 -export NGC_VERSION=24.11-py3 +export MAJOR_VERSION=2.53 +export MINOR_VERSION=2.53.0 +export NGC_VERSION=24.12-py3 docker pull nvcr.io/nvidia/$APP_NAME:$NGC_VERSION diff --git a/deploy/gke-marketplace-app/server-deployer/chart/triton/Chart.yaml b/deploy/gke-marketplace-app/server-deployer/chart/triton/Chart.yaml index 027deb1d2f..7ad8ba851b 100644 --- a/deploy/gke-marketplace-app/server-deployer/chart/triton/Chart.yaml +++ b/deploy/gke-marketplace-app/server-deployer/chart/triton/Chart.yaml @@ -25,7 +25,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. apiVersion: v1 -appVersion: "2.51" +appVersion: "2.53" description: Triton Inference Server name: triton-inference-server -version: 2.51.0 +version: 2.53.0 diff --git a/deploy/gke-marketplace-app/server-deployer/chart/triton/values.yaml b/deploy/gke-marketplace-app/server-deployer/chart/triton/values.yaml index dfb992a543..673ec6acb3 100644 --- a/deploy/gke-marketplace-app/server-deployer/chart/triton/values.yaml +++ b/deploy/gke-marketplace-app/server-deployer/chart/triton/values.yaml @@ -31,14 +31,14 @@ maxReplicaCount: 3 tritonProtocol: HTTP # HPA GPU utilization autoscaling target HPATargetAverageValue: 85 -modelRepositoryPath: gs://triton_sample_models/24.11 -publishedVersion: '2.51.0' +modelRepositoryPath: gs://triton_sample_models/24.12 +publishedVersion: '2.53.0' gcpMarketplace: true image: registry: gcr.io repository: nvidia-ngc-public/tritonserver - tag: 24.11-py3 + tag: 24.12-py3 pullPolicy: IfNotPresent # modify the model repository here to match your GCP storage bucket numGpus: 1 diff --git a/deploy/gke-marketplace-app/server-deployer/data-test/schema.yaml b/deploy/gke-marketplace-app/server-deployer/data-test/schema.yaml index be46874dba..eefb209efb 100644 --- a/deploy/gke-marketplace-app/server-deployer/data-test/schema.yaml +++ b/deploy/gke-marketplace-app/server-deployer/data-test/schema.yaml @@ -27,7 +27,7 @@ x-google-marketplace: schemaVersion: v2 applicationApiVersion: v1beta1 - publishedVersion: '2.51.0' + publishedVersion: '2.53.0' publishedVersionMetadata: releaseNote: >- Initial release. diff --git a/deploy/gke-marketplace-app/server-deployer/schema.yaml b/deploy/gke-marketplace-app/server-deployer/schema.yaml index 699fa04a68..1defe7ca42 100644 --- a/deploy/gke-marketplace-app/server-deployer/schema.yaml +++ b/deploy/gke-marketplace-app/server-deployer/schema.yaml @@ -27,7 +27,7 @@ x-google-marketplace: schemaVersion: v2 applicationApiVersion: v1beta1 - publishedVersion: '2.51.0' + publishedVersion: '2.53.0' publishedVersionMetadata: releaseNote: >- Initial release. @@ -89,7 +89,7 @@ properties: modelRepositoryPath: type: string title: Bucket where models are stored. Please make sure the user/service account to create the GKE app has permission to this GCS bucket. Read Triton documentation on configs and formatting details, supporting TensorRT, TensorFlow, Pytorch, Onnx ... etc. 
- default: gs://triton_sample_models/24.11 + default: gs://triton_sample_models/24.12 image.ldPreloadPath: type: string title: Leave this empty by default. Triton allows users to create custom layers for backend such as TensorRT plugin or Tensorflow custom ops, the compiled shared library must be provided via LD_PRELOAD environment variable. diff --git a/deploy/gke-marketplace-app/trt-engine/README.md b/deploy/gke-marketplace-app/trt-engine/README.md index 6a16fc9523..bdf655b2b0 100644 --- a/deploy/gke-marketplace-app/trt-engine/README.md +++ b/deploy/gke-marketplace-app/trt-engine/README.md @@ -33,7 +33,7 @@ ``` docker run --gpus all -it --network host \ --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 \ - -v ~:/scripts nvcr.io/nvidia/tensorrt:24.11-py3 + -v ~:/scripts nvcr.io/nvidia/tensorrt:24.12-py3 pip install onnx six torch tf2onnx tensorflow @@ -57,7 +57,7 @@ mkdir -p engines python3 builder.py -m models/fine-tuned/bert_tf_ckpt_large_qa_squad2_amp_128_v19.03.1/model.ckpt -o engines/bert_large_int8_bs1_s128.engine -b 1 -s 128 -c models/fine-tuned/bert_tf_ckpt_large_qa_squad2_amp_128_v19.03.1/ -v models/fine-tuned/bert_tf_ckpt_large_qa_squad2_amp_128_v19.03.1/vocab.txt --int8 --fp16 --strict --calib-num 1 -iln -imh -gsutil cp bert_large_int8_bs1_s128.engine gs://triton_sample_models/24.11/bert/1/model.plan +gsutil cp bert_large_int8_bs1_s128.engine gs://triton_sample_models/24.12/bert/1/model.plan ``` -For each Triton upgrade, container version used to generate the model, and the model path in GCS `gs://triton_sample_models/24.11/` should be updated accordingly with the correct version. +For each Triton upgrade, container version used to generate the model, and the model path in GCS `gs://triton_sample_models/24.12/` should be updated accordingly with the correct version. diff --git a/deploy/k8s-onprem/values.yaml b/deploy/k8s-onprem/values.yaml index 77f1b47c5b..8e2fdcda6d 100644 --- a/deploy/k8s-onprem/values.yaml +++ b/deploy/k8s-onprem/values.yaml @@ -29,7 +29,7 @@ tags: loadBalancing: true image: - imageName: nvcr.io/nvidia/tritonserver:24.11-py3 + imageName: nvcr.io/nvidia/tritonserver:24.12-py3 pullPolicy: IfNotPresent modelRepositoryServer: < Replace with the IP Address of your file server > modelRepositoryPath: /srv/models diff --git a/deploy/oci/values.yaml b/deploy/oci/values.yaml index 1a62e52e7a..716ac24400 100644 --- a/deploy/oci/values.yaml +++ b/deploy/oci/values.yaml @@ -27,7 +27,7 @@ replicaCount: 1 image: - imageName: nvcr.io/nvidia/tritonserver:24.11-py3 + imageName: nvcr.io/nvidia/tritonserver:24.12-py3 pullPolicy: IfNotPresent modelRepositoryPath: s3://https://.compat.objectstorage..oraclecloud.com:443/triton-inference-server-repository numGpus: 1 diff --git a/docs/customization_guide/build.md b/docs/customization_guide/build.md index 0622414609..fcb4ce14e9 100644 --- a/docs/customization_guide/build.md +++ b/docs/customization_guide/build.md @@ -173,7 +173,7 @@ $ ./build.py ... --repo-tag=common: --repo-tag=core:` will default to the branch name. For example, if you are building on the -r24.11 branch, `` will default to r24.11. If you are +r24.12 branch, `` will default to r24.12. If you are building on any other branch (including the *main* branch) then `` will default to "main". Therefore, you typically do not need to provide `` at all (nor the preceding @@ -334,8 +334,8 @@ python build.py --cmake-dir=/build --build-dir=/tmp/citritonbuild If you are building on *main* branch then `` will default to "main". 
If you are building on a release branch then `` will default to the branch name. For example, if you -are building on the r24.11 branch, `` will default to -r24.11. Therefore, you typically do not need to provide `` will default to +r24.12. Therefore, you typically do not need to provide `` at all (nor the preceding colon). You can use a different `` for a component to instead use the corresponding branch/tag in the build. For example, if you have a branch called diff --git a/docs/customization_guide/compose.md b/docs/customization_guide/compose.md index 8bddd46aeb..9f20a05347 100644 --- a/docs/customization_guide/compose.md +++ b/docs/customization_guide/compose.md @@ -46,8 +46,8 @@ The `compose.py` script can be found in the Simply clone the repository and run `compose.py` to create a custom container. Note: Created container version will depend on the branch that was cloned. For example branch - [r24.11](https://github.com/triton-inference-server/server/tree/r24.11) -should be used to create a image based on the NGC 24.11 Triton release. + [r24.12](https://github.com/triton-inference-server/server/tree/r24.12) +should be used to create a image based on the NGC 24.12 Triton release. `compose.py` provides `--backend`, `--repoagent` options that allow you to specify which backends and repository agents to include in the custom image. @@ -79,20 +79,20 @@ For example, running ``` python3 compose.py --backend pytorch --repoagent checksum ``` -on branch [r24.11](https://github.com/triton-inference-server/server/tree/r24.11) pulls: -- `min` container `nvcr.io/nvidia/tritonserver:24.11-py3-min` -- `full` container `nvcr.io/nvidia/tritonserver:24.11-py3` +on branch [r24.12](https://github.com/triton-inference-server/server/tree/r24.12) pulls: +- `min` container `nvcr.io/nvidia/tritonserver:24.12-py3-min` +- `full` container `nvcr.io/nvidia/tritonserver:24.12-py3` Alternatively, users can specify the version of Triton container to pull from any branch by either: 1. Adding flag `--container-version ` to branch ``` -python3 compose.py --backend pytorch --repoagent checksum --container-version 24.11 +python3 compose.py --backend pytorch --repoagent checksum --container-version 24.12 ``` 2. Specifying `--image min, --image full,`. The user is responsible for specifying compatible `min` and `full` containers. ``` -python3 compose.py --backend pytorch --repoagent checksum --image min,nvcr.io/nvidia/tritonserver:24.11-py3-min --image full,nvcr.io/nvidia/tritonserver:24.11-py3 +python3 compose.py --backend pytorch --repoagent checksum --image min,nvcr.io/nvidia/tritonserver:24.12-py3-min --image full,nvcr.io/nvidia/tritonserver:24.12-py3 ``` Method 1 and 2 will result in the same composed container. Furthermore, `--image` flag overrides the `--container-version` flag when both are specified. @@ -103,8 +103,8 @@ Note: 2. vLLM and TensorRT-LLM backends are currently not supported backends for `compose.py`. If you want to build additional backends on top of these backends, it would be better to [build it yourself](#build-it-yourself) by using -`nvcr.io/nvidia/tritonserver:24.11-vllm-python-py3` or -`nvcr.io/nvidia/tritonserver:24.11-trtllm-python-py3` as a `min` container. +`nvcr.io/nvidia/tritonserver:24.12-vllm-python-py3` or +`nvcr.io/nvidia/tritonserver:24.12-trtllm-python-py3` as a `min` container. 
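A quick way to sanity-check the result of the composition example above is to list what actually landed in the image; the tag `tritonserver_custom` and the `/opt/tritonserver` install layout are assumptions about how the composed image was named and structured, not guarantees from `compose.py`:

```bash
# Expect the backend(s) requested with --backend, e.g. "pytorch"
docker run --rm tritonserver_custom ls /opt/tritonserver/backends

# Expect the agent(s) requested with --repoagent, e.g. "checksum"
docker run --rm tritonserver_custom ls /opt/tritonserver/repoagents
```

If an expected entry is missing, re-run `compose.py` with the corresponding `--backend` or `--repoagent` flag.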
### CPU-only container composition diff --git a/docs/customization_guide/test.md b/docs/customization_guide/test.md index 39891b3177..a85a10f48b 100644 --- a/docs/customization_guide/test.md +++ b/docs/customization_guide/test.md @@ -49,7 +49,7 @@ $ ./gen_qa_custom_ops ``` This will create multiple model repositories in /tmp/\/qa_* -(for example /tmp/24.11/qa_model_repository). The TensorRT models +(for example /tmp/24.12/qa_model_repository). The TensorRT models will be created for the GPU on the system that CUDA considers device 0 (zero). If you have multiple GPUs on your system see the documentation in the scripts for how to target a specific GPU. diff --git a/docs/generate_docs.py b/docs/generate_docs.py index 9bc3fd0878..fde15402c6 100755 --- a/docs/generate_docs.py +++ b/docs/generate_docs.py @@ -43,11 +43,11 @@ """ TODO: Needs to handle cross-branch linkage. -For example, server/docs/user_guide/architecture.md on branch 24.11 links to +For example, server/docs/user_guide/architecture.md on branch 24.12 links to server/docs/user_guide/model_analyzer.md on main branch. In this case, the hyperlink of model_analyzer.md should be a URL instead of relative path. -Another example can be server/docs/user_guide/model_analyzer.md on branch 24.11 +Another example can be server/docs/user_guide/model_analyzer.md on branch 24.12 links to a file in server repo with relative path. Currently all URLs are hardcoded to main branch. We need to make sure that the URL actually points to the correct branch. We also need to handle cases like deprecated or removed files from diff --git a/docs/user_guide/custom_operations.md b/docs/user_guide/custom_operations.md index 3787a89a60..faf66de25c 100644 --- a/docs/user_guide/custom_operations.md +++ b/docs/user_guide/custom_operations.md @@ -64,7 +64,7 @@ simple way to ensure you are using the correct version of TensorRT is to use the [NGC TensorRT container](https://ngc.nvidia.com/catalog/containers/nvidia:tensorrt) corresponding to the Triton container. For example, if you are using -the 24.11 version of Triton, use the 24.11 version of the TensorRT +the 24.12 version of Triton, use the 24.12 version of the TensorRT container. ## TensorFlow @@ -123,7 +123,7 @@ simple way to ensure you are using the correct version of TensorFlow is to use the [NGC TensorFlow container](https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow) corresponding to the Triton container. For example, if you are using -the 24.11 version of Triton, use the 24.11 version of the TensorFlow +the 24.12 version of Triton, use the 24.12 version of the TensorFlow container. ## PyTorch @@ -167,7 +167,7 @@ simple way to ensure you are using the correct version of PyTorch is to use the [NGC PyTorch container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) corresponding to the Triton container. For example, if you are using -the 24.11 version of Triton, use the 24.11 version of the PyTorch +the 24.12 version of Triton, use the 24.12 version of the PyTorch container. ## ONNX diff --git a/docs/user_guide/performance_tuning.md b/docs/user_guide/performance_tuning.md index ff21175bbe..4d1f067662 100644 --- a/docs/user_guide/performance_tuning.md +++ b/docs/user_guide/performance_tuning.md @@ -235,7 +235,7 @@ with a `tritonserver` binary. 
```bash # Start server container -docker run -ti --rm --gpus=all --network=host -v $PWD:/mnt --name triton-server nvcr.io/nvidia/tritonserver:24.11-py3 +docker run -ti --rm --gpus=all --network=host -v $PWD:/mnt --name triton-server nvcr.io/nvidia/tritonserver:24.12-py3 # Start serving your models tritonserver --model-repository=/mnt/models @@ -284,7 +284,7 @@ by setting the `-u` flag, such as `perf_analyzer -m densenet_onnx -u ```bash # Start the SDK container interactively -docker run -ti --rm --gpus=all --network=host -v $PWD:/mnt --name triton-client nvcr.io/nvidia/tritonserver:24.11-py3-sdk +docker run -ti --rm --gpus=all --network=host -v $PWD:/mnt --name triton-client nvcr.io/nvidia/tritonserver:24.12-py3-sdk # Benchmark model being served from step 3 perf_analyzer -m densenet_onnx --concurrency-range 1:4 diff --git a/qa/common/gen_jetson_trt_models b/qa/common/gen_jetson_trt_models index b44631ae3a..4e23a9347b 100755 --- a/qa/common/gen_jetson_trt_models +++ b/qa/common/gen_jetson_trt_models @@ -34,7 +34,7 @@ # Make all generated files accessible outside of container umask 0000 # Set the version of the models -TRITON_VERSION=${TRITON_VERSION:=24.11} +TRITON_VERSION=${TRITON_VERSION:=24.12} # Set the CUDA device to use CUDA_DEVICE=${RUNNER_ID:=0} # Set TensorRT image diff --git a/qa/common/gen_qa_custom_ops b/qa/common/gen_qa_custom_ops index 9898283737..fa7a8a18a1 100755 --- a/qa/common/gen_qa_custom_ops +++ b/qa/common/gen_qa_custom_ops @@ -37,7 +37,7 @@ ## ############################################################################ -TRITON_VERSION=${TRITON_VERSION:=24.11} +TRITON_VERSION=${TRITON_VERSION:=24.12} NVIDIA_UPSTREAM_VERSION=${NVIDIA_UPSTREAM_VERSION:=$TRITON_VERSION} TENSORFLOW_IMAGE=${TENSORFLOW_IMAGE:=nvcr.io/nvidia/tensorflow:$NVIDIA_UPSTREAM_VERSION-tf2-py3} PYTORCH_IMAGE=${PYTORCH_IMAGE:=nvcr.io/nvidia/pytorch:$NVIDIA_UPSTREAM_VERSION-py3} diff --git a/qa/common/gen_qa_model_repository b/qa/common/gen_qa_model_repository index 40052788ac..f4cdc2b6a2 100755 --- a/qa/common/gen_qa_model_repository +++ b/qa/common/gen_qa_model_repository @@ -48,7 +48,7 @@ ## ############################################################################ -TRITON_VERSION=${TRITON_VERSION:=24.11} +TRITON_VERSION=${TRITON_VERSION:=24.12} # ONNX. Use ONNX_OPSET 0 to use the default for ONNX version ONNX_VERSION=1.16.1
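Because these scripts take their defaults from `${TRITON_VERSION:=...}`-style assignments, the version can be overridden per invocation instead of editing the files; a small sketch, with output paths following the `/tmp/<version>/qa_*` convention referenced in the test documentation above:

```bash
# Use the baked-in default (24.12 after this change)
./gen_qa_model_repository

# Or pin a release explicitly through the environment
TRITON_VERSION=24.12 ./gen_qa_custom_ops

# Generated repositories land under /tmp/<version>/qa_*, for example:
ls /tmp/24.12/qa_model_repository
```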