diff --git a/Dockerfile.sdk b/Dockerfile.sdk index d30a0ac5ff..e1dd354889 100644 --- a/Dockerfile.sdk +++ b/Dockerfile.sdk @@ -29,7 +29,7 @@ # # Base image on the minimum Triton container -ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.01-py3-min +ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.02-py3-min ARG TRITON_CLIENT_REPO_SUBDIR=clientrepo ARG TRITON_COMMON_REPO_TAG=main diff --git a/Dockerfile.win10.min b/Dockerfile.win10.min index dfa3706280..27d7f7c00f 100644 --- a/Dockerfile.win10.min +++ b/Dockerfile.win10.min @@ -28,10 +28,55 @@ ARG BASE_IMAGE=mcr.microsoft.com/windows:10.0.19042.1889 -FROM ${BASE_IMAGE} +FROM ${BASE_IMAGE} as dependency_base + +RUN powershell.exe Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope LocalMachine +RUN powershell.exe [Net.ServicePointManager]::Expect100Continue=$true;[Net.ServicePointManager]::SecurityProtocol=[Net.SecurityProtocolType]::Tls,[Net.SecurityProtocolType]::Tls11,[Net.SecurityProtocolType]::Tls12,[Net.SecurityProtocolType]::Ssl3;Invoke-Expression( New-Object System.Net.WebClient ).DownloadString('https://chocolatey.org/install.ps1') +RUN choco install unzip -y + +# +# Installing TensorRT +# +ARG TENSORRT_VERSION +ARG TENSORRT_ZIP="TensorRT-${TENSORRT_VERSION}.Windows10.x86_64.cuda-12.0.zip" +ARG TENSORRT_SOURCE=${TENSORRT_ZIP} +# COPY ${TENSORRT_ZIP} /tmp/${TENSORRT_ZIP} +ADD ${TENSORRT_SOURCE} /tmp/${TENSORRT_ZIP} +RUN unzip /tmp/%TENSORRT_ZIP% +RUN move TensorRT-* TensorRT + +LABEL TENSORRT_VERSION="${TENSORRT_VERSION}" + + +# +# Installing cuDNN +# +ARG CUDNN_VERSION +ARG CUDNN_ZIP=cudnn-windows-x86_64-${CUDNN_VERSION}_cuda12-archive.zip +ARG CUDNN_SOURCE=${CUDNN_ZIP} +ADD ${CUDNN_SOURCE} /tmp/${CUDNN_ZIP} +RUN unzip /tmp/%CUDNN_ZIP% +RUN move cudnn-* cudnn + +LABEL CUDNN_VERSION="${CUDNN_VERSION}" + + +FROM ${BASE_IMAGE} as build_base SHELL ["cmd", "/S", "/C"] +ARG CUDNN_VERSION +ENV CUDNN_VERSION ${CUDNN_VERSION} +COPY --from=dependency_base /cudnn /cudnn +RUN setx PATH "c:\cudnn\bin;c:\cudnn\lib\x64;c:\cudnn\include;%PATH%" +LABEL CUDNN_VERSION="${CUDNN_VERSION}" + +ARG TENSORRT_VERSION +ENV TRT_VERSION ${TENSORRT_VERSION} +COPY --from=dependency_base /TensorRT /TensorRT +RUN setx PATH "c:\TensorRT\lib;%PATH%" +LABEL TENSORRT_VERSION="${TENSORRT_VERSION}" + RUN mkdir c:\tmp WORKDIR /tmp @@ -40,33 +85,30 @@ RUN powershell.exe [Net.ServicePointManager]::Expect100Continue=$true;[Net.Servi RUN choco install git docker unzip -y # -# Installing CMake +# Installing python # -ARG CMAKE_VERSION=3.27.1 -ARG CMAKE_FILE=cmake-${CMAKE_VERSION}-windows-x86_64 -ARG CMAKE_SOURCE=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/${CMAKE_FILE}.zip +ARG PYTHON_VERSION=3.10.11 +ARG PYTHON_SOURCE=https://www.python.org/ftp/python/${PYTHON_VERSION}/python-${PYTHON_VERSION}-amd64.exe +ADD ${PYTHON_SOURCE} python-${PYTHON_VERSION}-amd64.exe +RUN python-%PYTHON_VERSION%-amd64.exe /quiet InstallAllUsers=1 PrependPath=1 Include_doc=0 TargetDir="C:\python%PYTHON_VERSION%" +RUN mklink "C:\python%PYTHON_VERSION%\python3.exe" "C:\python%PYTHON_VERSION%\python.exe" +RUN pip install --upgrade wheel setuptools docker +RUN pip install grpcio-tools -ADD ${CMAKE_SOURCE} ${CMAKE_FILE}.zip -RUN unzip %CMAKE_FILE%.zip -RUN move %CMAKE_FILE% "c:\CMake" -RUN setx PATH "c:\CMake\bin;%PATH%" +LABEL PYTHON_VERSION=${PYTHON_VERSION} +# +# Installing CMake +# +ARG CMAKE_VERSION=3.27.1 +RUN pip install cmake==%CMAKE_VERSION% ENV CMAKE_TOOLCHAIN_FILE /vcpkg/scripts/buildsystems/vcpkg.cmake ENV VCPKG_TARGET_TRIPLET x64-windows - LABEL 
CMAKE_VERSION=${CMAKE_VERSION} # Be aware that pip can interact badly with VS cmd shell so need to pip install before # vsdevcmd.bat (see https://bugs.python.org/issue38989) -ARG PYTHON_VERSION=3.8.10 -ARG PYTHON_SOURCE=https://www.python.org/ftp/python/${PYTHON_VERSION}/python-${PYTHON_VERSION}-amd64.exe -ADD ${PYTHON_SOURCE} python-${PYTHON_VERSION}-amd64.exe -RUN python-%PYTHON_VERSION%-amd64.exe /quiet InstallAllUsers=1 PrependPath=1 Include_doc=0 TargetDir="C:\python%PYTHON_VERSION%" -RUN mklink "C:\python%PYTHON_VERSION%\python3.exe" "C:\python%PYTHON_VERSION%\python.exe" -RUN pip install --upgrade wheel setuptools docker -RUN pip install grpcio-tools -LABEL PYTHON_VERSION=${PYTHON_VERSION} # # Installing Visual Studio BuildTools: VS17 2022 @@ -149,43 +191,6 @@ RUN copy "%CUDA_INSTALL_ROOT_WP%\extras\visual_studio_integration\MSBuildExtensi RUN setx PATH "%CUDA_INSTALL_ROOT_WP%\bin;%PATH%" LABEL CUDA_VERSION="${CUDA_VERSION}" - -# -# Installing TensorRT -# -ARG TENSORRT_VERSION=8.6.1.6 -ARG TENSORRT_ZIP="TensorRT-${TENSORRT_VERSION}.Windows10.x86_64.cuda-12.0.zip" -ARG TENSORRT_SOURCE=${TENSORRT_ZIP} -# COPY ${TENSORRT_ZIP} /tmp/${TENSORRT_ZIP} -ADD ${TENSORRT_SOURCE} /tmp/${TENSORRT_ZIP} - -RUN unzip /tmp/%TENSORRT_ZIP% -RUN move TensorRT-* TensorRT -ENV TRT_VERSION ${TENSORRT_VERSION} - -RUN setx PATH "c:\TensorRT\lib;%PATH%" - -LABEL TENSORRT_VERSION="${TENSORRT_VERSION}" - - -# -# Installing cuDNN -# -ARG CUDNN_VERSION=8.9.7.29 -ARG CUDNN_ZIP=cudnn-windows-x86_64-${CUDNN_VERSION}_cuda12-archive.zip -ARG CUDNN_SOURCE=${CUDNN_ZIP} - -ADD ${CUDNN_SOURCE} /tmp/${CUDNN_ZIP} - -RUN unzip /tmp/%CUDNN_ZIP% -RUN move cudnn-* cudnn -RUN copy cudnn\bin\cudnn*.dll "%CUDA_INSTALL_ROOT_WP%\bin\." -RUN copy cudnn\lib\x64\cudnn*.lib "%CUDA_INSTALL_ROOT_WP%\lib\x64\." -RUN copy cudnn\include\cudnn*.h "%CUDA_INSTALL_ROOT_WP%\include\." - -ENV CUDNN_VERSION ${CUDNN_VERSION} - -LABEL CUDNN_VERSION="${CUDNN_VERSION}" # It is important that the entrypoint initialize VisualStudio # environment otherwise the build will fail. Also set # CMAKE_TOOLCHAIN_FILE and VCPKG_TARGET_TRIPLET so diff --git a/README.md b/README.md index 48e7157a04..ea95c8534f 100644 --- a/README.md +++ b/README.md @@ -31,234 +31,4 @@ [![License](https://img.shields.io/badge/License-BSD3-lightgrey.svg)](https://opensource.org/licenses/BSD-3-Clause) > [!WARNING] -> ##### LATEST RELEASE -> You are currently on the `main` branch which tracks under-development progress towards the next release. -> The current release is version [2.42.0](https://github.com/triton-inference-server/server/releases/latest) and corresponds to the 24.01 container release on NVIDIA GPU Cloud (NGC). - -Triton Inference Server is an open source inference serving software that -streamlines AI inferencing. Triton enables teams to deploy any AI model from -multiple deep learning and machine learning frameworks, including TensorRT, -TensorFlow, PyTorch, ONNX, OpenVINO, Python, RAPIDS FIL, and more. Triton -Inference Server supports inference across cloud, data center, edge and embedded -devices on NVIDIA GPUs, x86 and ARM CPU, or AWS Inferentia. Triton Inference -Server delivers optimized performance for many query types, including real time, -batched, ensembles and audio/video streaming. Triton inference Server is part of -[NVIDIA AI Enterprise](https://www.nvidia.com/en-us/data-center/products/ai-enterprise/), -a software platform that accelerates the data science pipeline and streamlines -the development and deployment of production AI. 
- -Major features include: - -- [Supports multiple deep learning - frameworks](https://github.com/triton-inference-server/backend#where-can-i-find-all-the-backends-that-are-available-for-triton) -- [Supports multiple machine learning - frameworks](https://github.com/triton-inference-server/fil_backend) -- [Concurrent model - execution](docs/user_guide/architecture.md#concurrent-model-execution) -- [Dynamic batching](docs/user_guide/model_configuration.md#dynamic-batcher) -- [Sequence batching](docs/user_guide/model_configuration.md#sequence-batcher) and - [implicit state management](docs/user_guide/architecture.md#implicit-state-management) - for stateful models -- Provides [Backend API](https://github.com/triton-inference-server/backend) that - allows adding custom backends and pre/post processing operations -- Supports writing custom backends in python, a.k.a. - [Python-based backends.](https://github.com/triton-inference-server/backend/blob/main/docs/python_based_backends.md#python-based-backends) -- Model pipelines using - [Ensembling](docs/user_guide/architecture.md#ensemble-models) or [Business - Logic Scripting - (BLS)](https://github.com/triton-inference-server/python_backend#business-logic-scripting) -- [HTTP/REST and GRPC inference - protocols](docs/customization_guide/inference_protocols.md) based on the community - developed [KServe - protocol](https://github.com/kserve/kserve/tree/master/docs/predict-api/v2) -- A [C API](docs/customization_guide/inference_protocols.md#in-process-triton-server-api) and - [Java API](docs/customization_guide/inference_protocols.md#java-bindings-for-in-process-triton-server-api) - allow Triton to link directly into your application for edge and other in-process use cases -- [Metrics](docs/user_guide/metrics.md) indicating GPU utilization, server - throughput, server latency, and more - -**New to Triton Inference Server?** Make use of -[these tutorials](https://github.com/triton-inference-server/tutorials) -to begin your Triton journey! - -Join the [Triton and TensorRT community](https://www.nvidia.com/en-us/deep-learning-ai/triton-tensorrt-newsletter/) and -stay current on the latest product updates, bug fixes, content, best practices, -and more. Need enterprise support? NVIDIA global support is available for Triton -Inference Server with the -[NVIDIA AI Enterprise software suite](https://www.nvidia.com/en-us/data-center/products/ai-enterprise/). - -## Serve a Model in 3 Easy Steps - -```bash -# Step 1: Create the example model repository -git clone -b r24.01 https://github.com/triton-inference-server/server.git -cd server/docs/examples -./fetch_models.sh - -# Step 2: Launch triton from the NGC Triton container -docker run --gpus=1 --rm --net=host -v ${PWD}/model_repository:/models nvcr.io/nvidia/tritonserver:24.01-py3 tritonserver --model-repository=/models - -# Step 3: Sending an Inference Request -# In a separate console, launch the image_client example from the NGC Triton SDK container -docker run -it --rm --net=host nvcr.io/nvidia/tritonserver:24.01-py3-sdk -/workspace/install/bin/image_client -m densenet_onnx -c 3 -s INCEPTION /workspace/images/mug.jpg - -# Inference should return the following -Image '/workspace/images/mug.jpg': - 15.346230 (504) = COFFEE MUG - 13.224326 (968) = CUP - 10.422965 (505) = COFFEEPOT -``` -Please read the [QuickStart](docs/getting_started/quickstart.md) guide for additional information -regarding this example. 
The quickstart guide also contains an example of how to launch Triton on [CPU-only systems](docs/getting_started/quickstart.md#run-on-cpu-only-system). New to Triton and wondering where to get started? Watch the [Getting Started video](https://youtu.be/NQDtfSi5QF4). - -## Examples and Tutorials - -Check out [NVIDIA LaunchPad](https://www.nvidia.com/en-us/data-center/products/ai-enterprise-suite/trial/) -for free access to a set of hands-on labs with Triton Inference Server hosted on -NVIDIA infrastructure. - -Specific end-to-end examples for popular models, such as ResNet, BERT, and DLRM -are located in the -[NVIDIA Deep Learning Examples](https://github.com/NVIDIA/DeepLearningExamples) -page on GitHub. The -[NVIDIA Developer Zone](https://developer.nvidia.com/nvidia-triton-inference-server) -contains additional documentation, presentations, and examples. - -## Documentation - -### Build and Deploy - -The recommended way to build and use Triton Inference Server is with Docker -images. - -- [Install Triton Inference Server with Docker containers](docs/customization_guide/build.md#building-with-docker) (*Recommended*) -- [Install Triton Inference Server without Docker containers](docs/customization_guide/build.md#building-without-docker) -- [Build a custom Triton Inference Server Docker container](docs/customization_guide/compose.md) -- [Build Triton Inference Server from source](docs/customization_guide/build.md#building-on-unsupported-platforms) -- [Build Triton Inference Server for Windows 10](docs/customization_guide/build.md#building-for-windows-10) -- Examples for deploying Triton Inference Server with Kubernetes and Helm on [GCP](deploy/gcp/README.md), - [AWS](deploy/aws/README.md), and [NVIDIA FleetCommand](deploy/fleetcommand/README.md) -- [Secure Deployment Considerations](docs/customization_guide/deploy.md) - -### Using Triton - -#### Preparing Models for Triton Inference Server - -The first step in using Triton to serve your models is to place one or -more models into a [model repository](docs/user_guide/model_repository.md). Depending on -the type of the model and on what Triton capabilities you want to enable for -the model, you may need to create a [model -configuration](docs/user_guide/model_configuration.md) for the model. - -- [Add custom operations to Triton if needed by your model](docs/user_guide/custom_operations.md) -- Enable model pipelining with [Model Ensemble](docs/user_guide/architecture.md#ensemble-models) - and [Business Logic Scripting (BLS)](https://github.com/triton-inference-server/python_backend#business-logic-scripting) -- Optimize your models setting [scheduling and batching](docs/user_guide/architecture.md#models-and-schedulers) - parameters and [model instances](docs/user_guide/model_configuration.md#instance-groups). 
-- Use the [Model Analyzer tool](https://github.com/triton-inference-server/model_analyzer) - to help optimize your model configuration with profiling -- Learn how to [explicitly manage what models are available by loading and - unloading models](docs/user_guide/model_management.md) - -#### Configure and Use Triton Inference Server - -- Read the [Quick Start Guide](docs/getting_started/quickstart.md) to run Triton Inference - Server on both GPU and CPU -- Triton supports multiple execution engines, called - [backends](https://github.com/triton-inference-server/backend#where-can-i-find-all-the-backends-that-are-available-for-triton), including - [TensorRT](https://github.com/triton-inference-server/tensorrt_backend), - [TensorFlow](https://github.com/triton-inference-server/tensorflow_backend), - [PyTorch](https://github.com/triton-inference-server/pytorch_backend), - [ONNX](https://github.com/triton-inference-server/onnxruntime_backend), - [OpenVINO](https://github.com/triton-inference-server/openvino_backend), - [Python](https://github.com/triton-inference-server/python_backend), and more -- Not all the above backends are supported on every platform supported by Triton. - Look at the - [Backend-Platform Support Matrix](https://github.com/triton-inference-server/backend/blob/main/docs/backend_platform_support_matrix.md) - to learn which backends are supported on your target platform. -- Learn how to [optimize performance](docs/user_guide/optimization.md) using the - [Performance Analyzer](https://github.com/triton-inference-server/client/blob/main/src/c++/perf_analyzer/README.md) - and - [Model Analyzer](https://github.com/triton-inference-server/model_analyzer) -- Learn how to [manage loading and unloading models](docs/user_guide/model_management.md) in - Triton -- Send requests directly to Triton with the [HTTP/REST JSON-based - or gRPC protocols](docs/customization_guide/inference_protocols.md#httprest-and-grpc-protocols) - -#### Client Support and Examples - -A Triton *client* application sends inference and other requests to Triton. The -[Python and C++ client libraries](https://github.com/triton-inference-server/client) -provide APIs to simplify this communication. - -- Review client examples for [C++](https://github.com/triton-inference-server/client/blob/main/src/c%2B%2B/examples), - [Python](https://github.com/triton-inference-server/client/blob/main/src/python/examples), - and [Java](https://github.com/triton-inference-server/client/blob/main/src/java/src/main/java/triton/client/examples) -- Configure [HTTP](https://github.com/triton-inference-server/client#http-options) - and [gRPC](https://github.com/triton-inference-server/client#grpc-options) - client options -- Send input data (e.g. 
a jpeg image) directly to Triton in the [body of an HTTP - request without any additional metadata](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_binary_data.md#raw-binary-request) - -### Extend Triton - -[Triton Inference Server's architecture](docs/user_guide/architecture.md) is specifically -designed for modularity and flexibility - -- [Customize Triton Inference Server container](docs/customization_guide/compose.md) for your use case -- [Create custom backends](https://github.com/triton-inference-server/backend) - in either [C/C++](https://github.com/triton-inference-server/backend/blob/main/README.md#triton-backend-api) - or [Python](https://github.com/triton-inference-server/python_backend) -- Create [decoupled backends and models](docs/user_guide/decoupled_models.md) that can send - multiple responses for a request or not send any responses for a request -- Use a [Triton repository agent](docs/customization_guide/repository_agents.md) to add functionality - that operates when a model is loaded and unloaded, such as authentication, - decryption, or conversion -- Deploy Triton on [Jetson and JetPack](docs/user_guide/jetson.md) -- [Use Triton on AWS - Inferentia](https://github.com/triton-inference-server/python_backend/tree/main/inferentia) - -### Additional Documentation - -- [FAQ](docs/user_guide/faq.md) -- [User Guide](docs/README.md#user-guide) -- [Customization Guide](docs/README.md#customization-guide) -- [Release Notes](https://docs.nvidia.com/deeplearning/triton-inference-server/release-notes/index.html) -- [GPU, Driver, and CUDA Support -Matrix](https://docs.nvidia.com/deeplearning/dgx/support-matrix/index.html) - -## Contributing - -Contributions to Triton Inference Server are more than welcome. To -contribute please review the [contribution -guidelines](CONTRIBUTING.md). If you have a backend, client, -example or similar contribution that is not modifying the core of -Triton, then you should file a PR in the [contrib -repo](https://github.com/triton-inference-server/contrib). - -## Reporting problems, asking questions - -We appreciate any feedback, questions or bug reporting regarding this project. -When posting [issues in GitHub](https://github.com/triton-inference-server/server/issues), -follow the process outlined in the [Stack Overflow document](https://stackoverflow.com/help/mcve). -Ensure posted examples are: -- minimal – use as little code as possible that still produces the - same problem -- complete – provide all parts needed to reproduce the problem. Check - if you can strip external dependencies and still show the problem. The - less time we spend on reproducing problems the more time we have to - fix it -- verifiable – test the code you're about to provide to make sure it - reproduces the problem. Remove all other problems that are not - related to your request/question. - -For issues, please use the provided bug report and feature request templates. - -For questions, we recommend posting in our community -[GitHub Discussions.](https://github.com/triton-inference-server/server/discussions) - -## For more information - -Please refer to the [NVIDIA Developer Triton page](https://developer.nvidia.com/nvidia-triton-inference-server) -for more information. 
+> THIS BRANCH IS UNDER DEVELOPMENT AND IS NOT YET STABLE. \ No newline at end of file diff --git a/TRITON_VERSION b/TRITON_VERSION index 8a3555062d..5b9cd9afd5 100644 --- a/TRITON_VERSION +++ b/TRITON_VERSION @@ -1 +1 @@ -2.43.0dev +2.43.0 diff --git a/build.py b/build.py index 60ef90e441..325f0904e4 100755 --- a/build.py +++ b/build.py @@ -70,9 +70,9 @@ # incorrectly load the other version of the openvino libraries. # TRITON_VERSION_MAP = { - "2.43.0dev": ( - "24.02dev", # triton container - "24.01", # upstream container + "2.43.0": ( + "24.02", # triton container + "24.02", # upstream container "1.16.3", # ORT "2023.0.0", # ORT OpenVINO "2023.0.0", # Standalone OpenVINO @@ -1337,7 +1337,7 @@ def add_cpu_libs_to_linux_dockerfile(backends, target_machine): COPY --from=min_container /opt/hpcx/ucx/lib/libucs.so.0 /opt/hpcx/ucx/lib/libucs.so.0 COPY --from=min_container /opt/hpcx/ucx/lib/libuct.so.0 /opt/hpcx/ucx/lib/libuct.so.0 -COPY --from=min_container /usr/lib/{libs_arch}-linux-gnu/libcudnn.so.8 /usr/lib/{libs_arch}-linux-gnu/libcudnn.so.8 +COPY --from=min_container /usr/lib/{libs_arch}-linux-gnu/libcudnn.so.9 /usr/lib/{libs_arch}-linux-gnu/libcudnn.so.9 # patchelf is needed to add deps of libcublasLt.so.12 to libtorch_cuda.so RUN apt-get update && \ diff --git a/deploy/aws/values.yaml b/deploy/aws/values.yaml index aa15a50085..16ed8323d7 100644 --- a/deploy/aws/values.yaml +++ b/deploy/aws/values.yaml @@ -27,7 +27,7 @@ replicaCount: 1 image: - imageName: nvcr.io/nvidia/tritonserver:24.01-py3 + imageName: nvcr.io/nvidia/tritonserver:24.02-py3 pullPolicy: IfNotPresent modelRepositoryPath: s3://triton-inference-server-repository/model_repository numGpus: 1 diff --git a/deploy/fleetcommand/Chart.yaml b/deploy/fleetcommand/Chart.yaml index 04998a3254..ad83541baf 100644 --- a/deploy/fleetcommand/Chart.yaml +++ b/deploy/fleetcommand/Chart.yaml @@ -26,7 +26,7 @@ apiVersion: v1 # appVersion is the Triton version; update when changing release -appVersion: "2.42.0" +appVersion: "2.43.0" description: Triton Inference Server (Fleet Command) name: triton-inference-server # version is the Chart version; update when changing anything in the chart diff --git a/deploy/fleetcommand/values.yaml b/deploy/fleetcommand/values.yaml index 35a778bfc4..655185c6a9 100644 --- a/deploy/fleetcommand/values.yaml +++ b/deploy/fleetcommand/values.yaml @@ -27,7 +27,7 @@ replicaCount: 1 image: - imageName: nvcr.io/nvidia/tritonserver:24.01-py3 + imageName: nvcr.io/nvidia/tritonserver:24.02-py3 pullPolicy: IfNotPresent numGpus: 1 serverCommand: tritonserver @@ -46,13 +46,13 @@ image: # Model Control Mode (Optional, default: none) # # To set model control mode, uncomment and configure below - # See https://github.com/triton-inference-server/server/blob/r24.01/docs/model_management.md + # See https://github.com/triton-inference-server/server/blob/r24.02/docs/model_management.md # for more details #- --model-control-mode=explicit|poll|none # # Additional server args # - # see https://github.com/triton-inference-server/server/blob/r24.01/README.md + # see https://github.com/triton-inference-server/server/blob/r24.02/README.md # for more details service: diff --git a/deploy/gcp/values.yaml b/deploy/gcp/values.yaml index 680bc266bc..264005b539 100644 --- a/deploy/gcp/values.yaml +++ b/deploy/gcp/values.yaml @@ -27,7 +27,7 @@ replicaCount: 1 image: - imageName: nvcr.io/nvidia/tritonserver:24.01-py3 + imageName: nvcr.io/nvidia/tritonserver:24.02-py3 pullPolicy: IfNotPresent modelRepositoryPath:
gs://triton-inference-server-repository/model_repository numGpus: 1 diff --git a/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/triton_client.yaml b/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/triton_client.yaml index 80b4013645..a0d931f42d 100644 --- a/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/triton_client.yaml +++ b/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/triton_client.yaml @@ -33,7 +33,7 @@ metadata: namespace: default spec: containers: - - image: nvcr.io/nvidia/tritonserver:24.01-py3-sdk + - image: nvcr.io/nvidia/tritonserver:24.02-py3-sdk imagePullPolicy: Always name: nv-triton-client securityContext: diff --git a/deploy/gke-marketplace-app/server-deployer/build_and_push.sh b/deploy/gke-marketplace-app/server-deployer/build_and_push.sh index 8c55ca8b90..952498c53f 100755 --- a/deploy/gke-marketplace-app/server-deployer/build_and_push.sh +++ b/deploy/gke-marketplace-app/server-deployer/build_and_push.sh @@ -28,8 +28,8 @@ export REGISTRY=gcr.io/$(gcloud config get-value project | tr ':' '/') export APP_NAME=tritonserver export MAJOR_VERSION=2.41 -export MINOR_VERSION=2.42.0 -export NGC_VERSION=24.01-py3 +export MINOR_VERSION=2.43.0 +export NGC_VERSION=24.02-py3 docker pull nvcr.io/nvidia/$APP_NAME:$NGC_VERSION diff --git a/deploy/gke-marketplace-app/server-deployer/chart/triton/Chart.yaml b/deploy/gke-marketplace-app/server-deployer/chart/triton/Chart.yaml index bdb2725d7c..d973852daf 100644 --- a/deploy/gke-marketplace-app/server-deployer/chart/triton/Chart.yaml +++ b/deploy/gke-marketplace-app/server-deployer/chart/triton/Chart.yaml @@ -28,4 +28,4 @@ apiVersion: v1 appVersion: "2.41" description: Triton Inference Server name: triton-inference-server -version: 2.42.0 +version: 2.43.0 diff --git a/deploy/gke-marketplace-app/server-deployer/chart/triton/values.yaml b/deploy/gke-marketplace-app/server-deployer/chart/triton/values.yaml index 23728a6d91..3890f2b2f2 100644 --- a/deploy/gke-marketplace-app/server-deployer/chart/triton/values.yaml +++ b/deploy/gke-marketplace-app/server-deployer/chart/triton/values.yaml @@ -32,13 +32,13 @@ tritonProtocol: HTTP # HPA GPU utilization autoscaling target HPATargetAverageValue: 85 modelRepositoryPath: gs://triton_sample_models/23_12 -publishedVersion: '2.42.0' +publishedVersion: '2.43.0' gcpMarketplace: true image: registry: gcr.io repository: nvidia-ngc-public/tritonserver - tag: 24.01-py3 + tag: 24.02-py3 pullPolicy: IfNotPresent # modify the model repository here to match your GCP storage bucket numGpus: 1 diff --git a/deploy/gke-marketplace-app/server-deployer/data-test/schema.yaml b/deploy/gke-marketplace-app/server-deployer/data-test/schema.yaml index 54e6498202..57ec3e892d 100644 --- a/deploy/gke-marketplace-app/server-deployer/data-test/schema.yaml +++ b/deploy/gke-marketplace-app/server-deployer/data-test/schema.yaml @@ -27,7 +27,7 @@ x-google-marketplace: schemaVersion: v2 applicationApiVersion: v1beta1 - publishedVersion: '2.42.0' + publishedVersion: '2.43.0' publishedVersionMetadata: releaseNote: >- Initial release. 
diff --git a/deploy/gke-marketplace-app/server-deployer/schema.yaml b/deploy/gke-marketplace-app/server-deployer/schema.yaml index f2f93f12de..1af8f82928 100644 --- a/deploy/gke-marketplace-app/server-deployer/schema.yaml +++ b/deploy/gke-marketplace-app/server-deployer/schema.yaml @@ -27,7 +27,7 @@ x-google-marketplace: schemaVersion: v2 applicationApiVersion: v1beta1 - publishedVersion: '2.42.0' + publishedVersion: '2.43.0' publishedVersionMetadata: releaseNote: >- Initial release. diff --git a/deploy/gke-marketplace-app/trt-engine/README.md b/deploy/gke-marketplace-app/trt-engine/README.md index cba0095594..8367057a33 100644 --- a/deploy/gke-marketplace-app/trt-engine/README.md +++ b/deploy/gke-marketplace-app/trt-engine/README.md @@ -33,7 +33,7 @@ ``` docker run --gpus all -it --network host \ --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 \ - -v ~:/scripts nvcr.io/nvidia/tensorrt:24.01-py3 + -v ~:/scripts nvcr.io/nvidia/tensorrt:24.02-py3 pip install onnx six torch tf2onnx tensorflow diff --git a/deploy/k8s-onprem/values.yaml b/deploy/k8s-onprem/values.yaml index 54f0480d7f..6b0fd27acd 100644 --- a/deploy/k8s-onprem/values.yaml +++ b/deploy/k8s-onprem/values.yaml @@ -29,7 +29,7 @@ tags: loadBalancing: true image: - imageName: nvcr.io/nvidia/tritonserver:24.01-py3 + imageName: nvcr.io/nvidia/tritonserver:24.02-py3 pullPolicy: IfNotPresent modelRepositoryServer: < Replace with the IP Address of your file server > modelRepositoryPath: /srv/models diff --git a/deploy/oci/values.yaml b/deploy/oci/values.yaml index dad31bc412..b8db949363 100644 --- a/deploy/oci/values.yaml +++ b/deploy/oci/values.yaml @@ -27,7 +27,7 @@ replicaCount: 1 image: - imageName: nvcr.io/nvidia/tritonserver:24.01-py3 + imageName: nvcr.io/nvidia/tritonserver:24.02-py3 pullPolicy: IfNotPresent modelRepositoryPath: s3://https://.compat.objectstorage..oraclecloud.com:443/triton-inference-server-repository numGpus: 1 diff --git a/docs/customization_guide/build.md b/docs/customization_guide/build.md index 70a258e60e..4c1cf44e78 100644 --- a/docs/customization_guide/build.md +++ b/docs/customization_guide/build.md @@ -173,7 +173,7 @@ $ ./build.py ... --repo-tag=common: --repo-tag=core:` will default to the branch name. For example, if you are building on the -r24.01 branch, `` will default to r24.01. If you are +r24.02 branch, `` will default to r24.02. If you are building on any other branch (including the *main* branch) then `` will default to "main". Therefore, you typically do not need to provide `` at all (nor the preceding @@ -334,8 +334,8 @@ python build.py --cmake-dir=/build --build-dir=/tmp/citritonbuild If you are building on *main* branch then '' will default to "main". If you are building on a release branch then '' will default to the branch name. For example, if you -are building on the r24.01 branch, '' will default to -r24.01. Therefore, you typically do not need to provide '' will default to +r24.02. Therefore, you typically do not need to provide '' at all (nor the preceding colon). You can use a different '' for a component to instead use the corresponding branch/tag in the build. For example, if you have a branch called diff --git a/docs/customization_guide/compose.md b/docs/customization_guide/compose.md index b632535a98..859ce91eba 100644 --- a/docs/customization_guide/compose.md +++ b/docs/customization_guide/compose.md @@ -44,8 +44,8 @@ from source to get more exact customization. 
The `compose.py` script can be found in the [server repository](https://github.com/triton-inference-server/server). Simply clone the repository and run `compose.py` to create a custom container. Note: Created container version will depend on the branch that was cloned. -For example branch [r24.01](https://github.com/triton-inference-server/server/tree/r24.01) -should be used to create a image based on the NGC 24.01 Triton release. +For example branch [r24.02](https://github.com/triton-inference-server/server/tree/r24.02) +should be used to create an image based on the NGC 24.02 Triton release. `compose.py` provides `--backend`, `--repoagent` options that allow you to specify which backends and repository agents to include in the custom image. @@ -76,19 +76,19 @@ For example, running ``` python3 compose.py --backend tensorflow1 --repoagent checksum ``` -on branch [r24.01](https://github.com/triton-inference-server/server/tree/r24.01) pulls: -- `min` container `nvcr.io/nvidia/tritonserver:24.01-py3-min` -- `full` container `nvcr.io/nvidia/tritonserver:24.01-py3` +on branch [r24.02](https://github.com/triton-inference-server/server/tree/r24.02) pulls: +- `min` container `nvcr.io/nvidia/tritonserver:24.02-py3-min` +- `full` container `nvcr.io/nvidia/tritonserver:24.02-py3` Alternatively, users can specify the version of Triton container to pull from any branch by either: 1. Adding flag `--container-version ` to branch ``` -python3 compose.py --backend tensorflow1 --repoagent checksum --container-version 24.01 +python3 compose.py --backend tensorflow1 --repoagent checksum --container-version 24.02 ``` 2. Specifying `--image min, --image full,`. The user is responsible for specifying compatible `min` and `full` containers. ``` -python3 compose.py --backend tensorflow1 --repoagent checksum --image min,nvcr.io/nvidia/tritonserver:24.01-py3-min --image full,nvcr.io/nvidia/tritonserver:24.01-py3 +python3 compose.py --backend tensorflow1 --repoagent checksum --image min,nvcr.io/nvidia/tritonserver:24.02-py3-min --image full,nvcr.io/nvidia/tritonserver:24.02-py3 ``` Method 1 and 2 will result in the same composed container. Furthermore, `--image` flag overrides the `--container-version` flag when both are specified. diff --git a/docs/customization_guide/test.md b/docs/customization_guide/test.md index 43144d6180..baa2676b6a 100644 --- a/docs/customization_guide/test.md +++ b/docs/customization_guide/test.md @@ -49,7 +49,7 @@ $ ./gen_qa_custom_ops ``` This will create multiple model repositories in /tmp//qa_* -(for example /tmp/24.01/qa_model_repository). The TensorRT models +(for example /tmp/24.02/qa_model_repository). The TensorRT models will be created for the GPU on the system that CUDA considers device 0 (zero). If you have multiple GPUs on your system see the documentation in the scripts for how to target a specific GPU. diff --git a/docs/user_guide/custom_operations.md b/docs/user_guide/custom_operations.md index 048860a24d..c82760544e 100644 --- a/docs/user_guide/custom_operations.md +++ b/docs/user_guide/custom_operations.md @@ -64,7 +64,7 @@ simple way to ensure you are using the correct version of TensorRT is to use the [NGC TensorRT container](https://ngc.nvidia.com/catalog/containers/nvidia:tensorrt) corresponding to the Triton container. For example, if you are using -the 24.01 version of Triton, use the 24.01 version of the TensorRT +the 24.02 version of Triton, use the 24.02 version of the TensorRT container.
## TensorFlow @@ -123,7 +123,7 @@ simple way to ensure you are using the correct version of TensorFlow is to use the [NGC TensorFlow container](https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow) corresponding to the Triton container. For example, if you are using -the 24.01 version of Triton, use the 24.01 version of the TensorFlow +the 24.02 version of Triton, use the 24.02 version of the TensorFlow container. ## PyTorch @@ -167,7 +167,7 @@ simple way to ensure you are using the correct version of PyTorch is to use the [NGC PyTorch container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) corresponding to the Triton container. For example, if you are using -the 24.01 version of Triton, use the 24.01 version of the PyTorch +the 24.02 version of Triton, use the 24.02 version of the PyTorch container. ## ONNX diff --git a/docs/user_guide/metrics.md b/docs/user_guide/metrics.md index d79e0aa256..e226d02763 100644 --- a/docs/user_guide/metrics.md +++ b/docs/user_guide/metrics.md @@ -285,7 +285,7 @@ If building Triton locally, the `TRITON_ENABLE_METRICS_CPU` CMake build flag can ## Pinned Memory Metrics -Starting in 24.01, Triton offers Pinned Memory metrics to monitor the utilization of the Pinned Memory pool. +Starting in 24.02, Triton offers Pinned Memory metrics to monitor the utilization of the Pinned Memory pool. |Category |Metric |Metric Name |Description |Granularity|Frequency | |----------------|------------------|----------------------------|-------------------------------------------------------|-----------|-------------| diff --git a/docs/user_guide/performance_tuning.md b/docs/user_guide/performance_tuning.md index 6f0d3d24e8..b118eb3953 100644 --- a/docs/user_guide/performance_tuning.md +++ b/docs/user_guide/performance_tuning.md @@ -235,7 +235,7 @@ with a `tritonserver` binary. ```bash # Start server container -docker run -ti --rm --gpus=all --network=host -v $PWD:/mnt --name triton-server nvcr.io/nvidia/tritonserver:24.01-py3 +docker run -ti --rm --gpus=all --network=host -v $PWD:/mnt --name triton-server nvcr.io/nvidia/tritonserver:24.02-py3 # Start serving your models tritonserver --model-repository=/mnt/models @@ -284,7 +284,7 @@ by setting the `-u` flag, such as `perf_analyzer -m densenet_onnx -u ```bash # Start the SDK container interactively -docker run -ti --rm --gpus=all --network=host -v $PWD:/mnt --name triton-client nvcr.io/nvidia/tritonserver:24.01-py3-sdk +docker run -ti --rm --gpus=all --network=host -v $PWD:/mnt --name triton-client nvcr.io/nvidia/tritonserver:24.02-py3-sdk # Benchmark model being served from step 3 perf_analyzer -m densenet_onnx --concurrency-range 1:4 diff --git a/docs/user_guide/trace.md b/docs/user_guide/trace.md index 1abe019ea1..516135fcc4 100644 --- a/docs/user_guide/trace.md +++ b/docs/user_guide/trace.md @@ -595,7 +595,7 @@ The following table shows available OpenTelemetry trace APIs settings for ### OpenTelemetry Context Propagation Triton supports [context propagation](https://opentelemetry.io/docs/concepts/context-propagation/) -in OpenTelemetry mode starting in version 24.01. Note, that every request +in OpenTelemetry mode starting in version 24.02. Note, that every request with propagated OpenTelemetry context will be traced, regardless of `rate` and `count` trace settings. 
If a user wishes to trace only those requests, for which OpenTelemetry context was injected on the client side, please start Triton with diff --git a/qa/common/cuda_op_kernel.cu.cc.patch b/qa/common/cuda_op_kernel.cu.cc.patch index 24d915aa20..617521a0f9 100644 --- a/qa/common/cuda_op_kernel.cu.cc.patch +++ b/qa/common/cuda_op_kernel.cu.cc.patch @@ -7,7 +7,7 @@ index a9d66f9..a92e218 100644 #if GOOGLE_CUDA -#define EIGEN_USE_GPU --#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +-#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive -#include "tensorflow/core/util/gpu_kernel_helper.h" -#include "tensorflow/core/util/gpu_launch_config.h" +//#define EIGEN_USE_GPU diff --git a/qa/common/gen_jetson_trt_models b/qa/common/gen_jetson_trt_models index 97b613cdc7..7d1416598a 100755 --- a/qa/common/gen_jetson_trt_models +++ b/qa/common/gen_jetson_trt_models @@ -34,7 +34,7 @@ # Make all generated files accessible outside of container umask 0000 # Set the version of the models -TRITON_VERSION=${TRITON_VERSION:=24.01} +TRITON_VERSION=${TRITON_VERSION:=24.02} # Set the CUDA device to use CUDA_DEVICE=${RUNNER_ID:=0} # Set TensorRT image diff --git a/qa/common/gen_qa_custom_ops b/qa/common/gen_qa_custom_ops index ab7caf4bc2..d13cff36af 100755 --- a/qa/common/gen_qa_custom_ops +++ b/qa/common/gen_qa_custom_ops @@ -37,7 +37,7 @@ ## ############################################################################ -TRITON_VERSION=${TRITON_VERSION:=24.01} +TRITON_VERSION=${TRITON_VERSION:=24.02} NVIDIA_UPSTREAM_VERSION=${NVIDIA_UPSTREAM_VERSION:=$TRITON_VERSION} TENSORFLOW_IMAGE=${TENSORFLOW_IMAGE:=nvcr.io/nvidia/tensorflow:$NVIDIA_UPSTREAM_VERSION-tf2-py3} PYTORCH_IMAGE=${PYTORCH_IMAGE:=nvcr.io/nvidia/pytorch:$NVIDIA_UPSTREAM_VERSION-py3} diff --git a/qa/common/gen_qa_model_repository b/qa/common/gen_qa_model_repository index ad6ebcd827..8c7f958e25 100755 --- a/qa/common/gen_qa_model_repository +++ b/qa/common/gen_qa_model_repository @@ -48,7 +48,7 @@ ## ############################################################################ -TRITON_VERSION=${TRITON_VERSION:=24.01} +TRITON_VERSION=${TRITON_VERSION:=24.02} # ONNX. Use ONNX_OPSET 0 to use the default for ONNX version ONNX_VERSION=1.13.0