Merge branch 'main' of github.com:triton-inference-server/server into…

… yinggeh-upgrade-openvino-model-version-24.12
triton-inference-server · Dec 20, 2024 · d2bdd8d · d2bdd8d
2 parents 1abd8fc + 0194c3d
commit d2bdd8d
Show file tree

Hide file tree

Showing 50 changed files with 6,229 additions and 1,006 deletions.
diff --git a/build.py b/build.py
@@ -565,7 +565,7 @@ def backend_cmake_args(images, components, be, install_dir, library_paths):
     elif be == "tensorflow":
         args = tensorflow_cmake_args(images, library_paths)
     elif be == "python":
-        args = []
+        args = python_cmake_args()
     elif be == "dali":
         args = dali_cmake_args()
     elif be == "pytorch":
@@ -631,6 +631,18 @@ def backend_cmake_args(images, components, be, install_dir, library_paths):
     return cargs
 
 
+def python_cmake_args():
+    cargs = []
+    if target_platform() == "rhel":
+        cargs.append(
+            cmake_backend_arg(
+                "python", "PYBIND11_PYTHON_VERSION", "STRING", FLAGS.rhel_py_version
+            )
+        )
+
+    return cargs
+
+
 def pytorch_cmake_args(images):
     if "pytorch" in images:
         image = images["pytorch"]
@@ -924,6 +936,7 @@ def create_dockerfile_buildbase_rhel(ddir, dockerfile_name, argmap):
 
 ARG TRITON_VERSION
 ARG TRITON_CONTAINER_VERSION
+ENV PIP_BREAK_SYSTEM_PACKAGES=1
 """
     df += """
 # Install docker docker buildx
@@ -957,6 +970,10 @@ def create_dockerfile_buildbase_rhel(ddir, dockerfile_name, argmap):
             pkg-config \\
             unzip \\
             wget \\
+            ncurses-devel \\
+            readline-devel \\
+            xz-devel \\
+            bzip2-devel \\
             zlib-devel \\
             libarchive-devel \\
             libxml2-devel \\
@@ -1025,6 +1042,7 @@ def create_dockerfile_buildbase(ddir, dockerfile_name, argmap):
 
 ARG TRITON_VERSION
 ARG TRITON_CONTAINER_VERSION
+ENV PIP_BREAK_SYSTEM_PACKAGES=1
 """
     # Install the windows- or linux-specific buildbase dependencies
     if target_platform() == "windows":
@@ -1035,7 +1053,6 @@ def create_dockerfile_buildbase(ddir, dockerfile_name, argmap):
         df += """
 # Ensure apt-get won't prompt for selecting options
 ENV DEBIAN_FRONTEND=noninteractive
-ENV PIP_BREAK_SYSTEM_PACKAGES=1
 
 # Install docker docker buildx
 RUN apt-get update \\
@@ -1159,6 +1176,7 @@ def create_dockerfile_cibase(ddir, dockerfile_name, argmap):
 
 ENV TRITON_SERVER_VERSION ${TRITON_VERSION}
 ENV NVIDIA_TRITON_SERVER_VERSION ${TRITON_CONTAINER_VERSION}
+ENV PIP_BREAK_SYSTEM_PACKAGES=1
 """
 
     with open(os.path.join(ddir, dockerfile_name), "w") as dfile:
@@ -1198,6 +1216,8 @@ def create_dockerfile_linux(
 ##  Production stage: Create container with just inference server executable
 ############################################################################
 FROM ${BASE_IMAGE}
+
+ENV PIP_BREAK_SYSTEM_PACKAGES=1
 """
 
     df += dockerfile_prepare_container_linux(
@@ -1399,7 +1419,6 @@ def dockerfile_prepare_container_linux(argmap, backends, enable_gpu, target_mach
     if "python" in backends:
         if target_platform() == "rhel":
             df += """
-ENV PIP_BREAK_SYSTEM_PACKAGES=1
 # python3, python3-pip and some pip installs required for the python backend
 RUN yum install -y \\
         libarchive-devel \\
@@ -1418,7 +1437,6 @@ def dockerfile_prepare_container_linux(argmap, backends, enable_gpu, target_mach
 """
         else:
             df += """
-ENV PIP_BREAK_SYSTEM_PACKAGES=1
 # python3, python3-pip and some pip installs required for the python backend
 RUN apt-get update \\
       && apt-get install -y --no-install-recommends \\
@@ -1542,7 +1560,7 @@ def add_cpu_libs_to_linux_dockerfile(backends, target_machine):
 
 
 def change_default_python_version_rhel(version):
-    df = """
+    df = f"""
 # The python library version available for install via 'yum install python3.X-devel' does not
 # match the version of python inside the RHEL base container. This means that python packages
 # installed within the container will not be picked up by the python backend stub process pybind
@@ -1551,21 +1569,17 @@ def change_default_python_version_rhel(version):
 RUN curl https://pyenv.run | bash
 ENV PATH="${{PYENV_ROOT}}/bin:$PATH"
 RUN eval "$(pyenv init -)"
-RUN CONFIGURE_OPTS=\"--with-openssl=/usr/lib64\" && pyenv install {} \\
-    && cp ${{PYENV_ROOT}}/versions/{}/lib/libpython3* /usr/lib64/""".format(
-        version, version
-    )
-    df += """
+RUN CONFIGURE_OPTS=\"--with-openssl=/usr/lib64\" && pyenv install {version} \\
+    && cp ${{PYENV_ROOT}}/versions/{version}/lib/libpython3* /usr/lib64/
+
 # RHEL image has several python versions. It's important
 # to set the correct version, otherwise, packages that are
 # pip installed will not be found during testing.
-ENV PYVER={} PYTHONPATH=/opt/python/v
+ENV PYVER={version} PYTHONPATH=/opt/python/v
 RUN ln -sf ${{PYENV_ROOT}}/versions/${{PYVER}}* ${{PYTHONPATH}}
 ENV PYBIN=${{PYTHONPATH}}/bin
 ENV PYTHON_BIN_PATH=${{PYBIN}}/python${{PYVER}} PATH=${{PYBIN}}:${{PATH}}
-""".format(
-        version
-    )
+"""
     return df
 
 

diff --git a/docs/Dockerfile.docs b/docs/Dockerfile.docs
@@ -1,4 +1,4 @@
-# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -59,6 +59,7 @@ RUN pip3 install \
       breathe \
       docutils \
       exhale \
+      httplib2 \
       ipython \
       myst-nb \
       nbclient \
@@ -73,6 +74,12 @@ RUN pip3 install \
       sphinx-tabs \
       sphinxcontrib-bibtex
 
+
+# install nvidia-sphinx-theme
+RUN pip3 install \
+      --index-url https://urm.nvidia.com/artifactory/api/pypi/ct-omniverse-pypi/simple/ \
+      nvidia-sphinx-theme
+
 # Set visitor script to be included on every HTML page
 ENV VISITS_COUNTING_SCRIPT="//assets.adobedtm.com/b92787824f2e0e9b68dc2e993f9bd995339fe417/satelliteLib-7ba51e58dc61bcb0e9311aadd02a0108ab24cc6c.js"
 
diff --git a/docs/README.md b/docs/README.md
@@ -124,9 +124,9 @@ Triton supports batching individual inference requests to improve compute resour
   - [Queuing Policies](user_guide/model_configuration.md#queue-policy)
   - [Ragged Batching](user_guide/ragged_batching.md)
 - [Sequence Batcher](user_guide/model_configuration.md#sequence-batcher)
-  - [Stateful Models](user_guide/architecture.md#stateful-models)
-  - [Control Inputs](user_guide/architecture.md#control-inputs)
-  - [Implicit State - Stateful Inference Using a Stateless Model](user_guide/architecture.md#implicit-state-management)
+  - [Stateful Models](user_guide/model_execution.md#stateful-models)
+  - [Control Inputs](user_guide/model_execution.md#control-inputs)
+  - [Implicit State - Stateful Inference Using a Stateless Model](user_guide/implicit_state_management.md#implicit-state-management)
   - [Sequence Scheduling Strategies](user_guide/architecture.md#scheduling-strategies)
     - [Direct](user_guide/architecture.md#direct)
     - [Oldest](user_guide/architecture.md#oldest)

diff --git a/docs/backend_guide/vllm.rst b/docs/backend_guide/vllm.rst
@@ -0,0 +1,11 @@
+########
+vLLM
+########
+
+.. toctree::
+    :hidden:
+    :caption: vLLM
+    :maxdepth: 2
+
+    ../vllm_backend/README
+    Multi-LoRA <../vllm_backend/docs/llama_multi_lora_tutorial>
diff --git a/docs/client_guide/api_reference.rst b/docs/client_guide/api_reference.rst
@@ -0,0 +1,10 @@
+#############
+API Reference
+#############
+
+.. toctree::
+   :maxdepth: 1
+   :hidden:
+
+   OpenAI API <openai_readme.md>
+   kserve
diff --git a/docs/client_guide/in_process.rst b/docs/client_guide/in_process.rst
@@ -0,0 +1,39 @@
+############################
+In-Process Triton Server API
+############################
+
+
+The Triton Inference Server provides a backwards-compatible C API, Python bindings, and Java bindings that
+allow Triton to be linked directly into a C/C++, Java, or Python application. This API
+is called the "Triton Server API" or just "Server API" for short. The
+API is implemented in the Triton shared library which is built from
+source contained in the `core
+repository <https://github.com/triton-inference-server/core>`__. On Linux
+this library is libtritonserver.so and on Windows it is
+tritonserver.dll. In the Triton Docker image the shared library is
+found in /opt/tritonserver/lib. The header file that defines and
+documents the Server API is
+`tritonserver.h <https://github.com/triton-inference-server/core/blob/main/include/triton/core/tritonserver.h>`__.
+`Java bindings for In-Process Triton Server API <../customization_guide/inprocess_java_api.html#java-bindings-for-in-process-triton-server-api>`__
+are built on top of `tritonserver.h` and can be used for Java applications that
+need to use Tritonserver in-process.
+
+All capabilities of Triton server are encapsulated in the shared
+library and are exposed via the Server API. The `tritonserver`
+executable implements HTTP/REST and GRPC endpoints and uses the Server
+API to communicate with core Triton logic. The primary source files
+for the endpoints are `grpc_server.cc <https://github.com/triton-inference-server/server/blob/main/src/grpc/grpc_server.cc>`__ and
+`http_server.cc <https://github.com/triton-inference-server/server/blob/main/src/http_server.cc>`__. In these source files you can
+see the Server API being used.
+
+You can use the Server API in your own application as well. A simple
+example using the Server API can be found in
+`simple.cc <https://github.com/triton-inference-server/server/blob/main/src/simple.cc>`__.
+
+.. toctree::
+   :maxdepth: 1
+   :hidden:
+
+   C/C++ <../customization_guide/inprocess_c_api.md>
+   python
+   Java <../customization_guide/inprocess_java_api.md>
diff --git a/docs/client_guide/kserve.rst b/docs/client_guide/kserve.rst
@@ -0,0 +1,15 @@
+##########
+KServe API
+##########
+
+
+Triton uses the
+`KServe community standard inference protocols <https://github.com/kserve/kserve/tree/master/docs/predict-api/v2>`__
+to define HTTP/REST and GRPC APIs plus several extensions.
+
+.. toctree::
+   :maxdepth: 1
+   :hidden:
+
+   HTTP/REST and GRPC Protocol <../customization_guide/inference_protocols.md>
+   kserve_extension
diff --git a/docs/client_guide/kserve_extension.rst b/docs/client_guide/kserve_extension.rst
@@ -0,0 +1,24 @@
+##########
+Extensions
+##########
+
+To fully enable all capabilities,
+Triton also implements `HTTP/REST and GRPC
+extensions <https://github.com/triton-inference-server/server/tree/main/docs/protocol>`__
+to the KServe inference protocol.
+
+.. toctree::
+   :maxdepth: 1
+   :hidden:
+
+   Binary tensor data extension <../protocol/extension_binary_data.md>
+   Classification extension <../protocol/extension_classification.md>
+   Schedule policy extension <../protocol/extension_schedule_policy.md>
+   Sequence extension <../protocol/extension_sequence.md>
+   Shared-memory extension <../protocol/extension_shared_memory.md>
+   Model configuration extension <../protocol/extension_model_configuration.md>
+   Model repository extension <../protocol/extension_model_repository.md>
+   Statistics extension <../protocol/extension_statistics.md>
+   Trace extension <../protocol/extension_trace.md>
+   Logging extension <../protocol/extension_logging.md>
+   Parameters extension <../protocol/extension_parameters.md>
diff --git a/docs/client_guide/openai_readme.md b/docs/client_guide/openai_readme.md
@@ -0,0 +1 @@
+../../python/openai/README.md
diff --git a/docs/client_guide/python.rst b/docs/client_guide/python.rst
@@ -0,0 +1,12 @@
+######
+Python
+######
+
+.. include:: python_readme.rst
+
+.. toctree::
+   :maxdepth: 1
+   :hidden:
+
+   Kafka I/O <../tutorials/Triton_Inference_Server_Python_API/examples/kafka-io/README.md>
+   Rayserve <../tutorials/Triton_Inference_Server_Python_API/examples/rayserve/README.md>