diff --git a/.pipelines/pypl-publishing.yml b/.pipelines/pypl-publishing.yml index 29ce0c007..502e6db3e 100644 --- a/.pipelines/pypl-publishing.yml +++ b/.pipelines/pypl-publishing.yml @@ -29,6 +29,11 @@ parameters: type: boolean default: true +- name: enable_linux_rocm + displayName: 'Whether Linux ROCm package is built.' + type: boolean + default: true + - name: ort_version displayName: 'OnnxRuntime version' type: string @@ -49,6 +54,11 @@ parameters: type: string default: '1.18.0-dev-20240426-0116-b842effa29' +- name: ort_rocm_version + displayName: 'OnnxRuntime ROCm version' + type: string + default: '1.19.0-dev-20240602-1103-217b66f' + - name: cuda_versions displayName: 'CUDA versions' type: string @@ -77,6 +87,7 @@ stages: parameters: enable_linux_cpu: ${{ parameters.enable_linux_cpu }} enable_linux_cuda: ${{ parameters.enable_linux_cuda }} + enable_linux_rocm: ${{ parameters.enable_linux_rocm }} enable_win_cpu: ${{ parameters.enable_win_cpu }} enable_win_cuda: ${{ parameters.enable_win_cuda }} enable_win_dml: ${{ parameters.enable_win_dml }} @@ -84,6 +95,7 @@ stages: ort_version: ${{ parameters.ort_version }} ort_cuda_118_version: ${{ parameters.ort_cuda_118_version }} ort_cuda_122_version: ${{ parameters.ort_cuda_122_version }} + ort_rocm_version: ${{ parameters.ort_rocm_version }} ort_dml_version: ${{ parameters.ort_dml_version }} cuda_versions: ${{ parameters.cuda_versions }} build_config: ${{ parameters.build_config }} diff --git a/.pipelines/stages/jobs/nuget-packaging-job.yml b/.pipelines/stages/jobs/nuget-packaging-job.yml index ada915890..d8e0ebc70 100644 --- a/.pipelines/stages/jobs/nuget-packaging-job.yml +++ b/.pipelines/stages/jobs/nuget-packaging-job.yml @@ -73,6 +73,8 @@ jobs: value: 'Microsoft.ML.OnnxRuntime.Gpu.Linux' ${{ elseif eq(parameters.ep, 'directml')}}: value: 'Microsoft.ML.OnnxRuntime.DirectML' + ${{ elseif eq(parameters.ep, 'rocm')}}: + value: 'Microsoft.ML.OnnxRuntime.ROCm' ${{ else }}: value: 'Microsoft.ML.OnnxRuntime' diff --git a/.pipelines/stages/jobs/py-packaging-job.yml b/.pipelines/stages/jobs/py-packaging-job.yml index 103835472..400394b72 100644 --- a/.pipelines/stages/jobs/py-packaging-job.yml +++ b/.pipelines/stages/jobs/py-packaging-job.yml @@ -103,6 +103,8 @@ jobs: value: 'Microsoft.ML.OnnxRuntime.Gpu.Linux' ${{ elseif eq(parameters.ep, 'directml')}}: value: 'Microsoft.ML.OnnxRuntime.DirectML' + ${{ elseif eq(parameters.ep, 'rocm')}}: + value: 'Microsoft.ML.OnnxRuntime.ROCm' ${{ else }}: value: 'Microsoft.ML.OnnxRuntime' diff --git a/.pipelines/stages/py-packaging-stage.yml b/.pipelines/stages/py-packaging-stage.yml index b4c77e97c..cb382d7c0 100644 --- a/.pipelines/stages/py-packaging-stage.yml +++ b/.pipelines/stages/py-packaging-stage.yml @@ -11,12 +11,16 @@ parameters: type: boolean - name: enable_linux_cuda type: boolean +- name: enable_linux_rocm + type: boolean - name: ort_version type: string - name: ort_cuda_118_version type: string - name: ort_cuda_122_version type: string +- name: ort_rocm_version + type: string - name: ort_dml_version type: string - name: cuda_versions @@ -109,7 +113,14 @@ stages: os: 'linux' build_config: ${{ parameters.build_config }} - + - ${{ if eq(parameters.enable_linux_rocm, true) }}: + - template: jobs/py-packaging-job.yml + parameters: + arch: 'x64' + ep: 'rocm' + ort_version: ${{ parameters.ort_rocm_version }} + os: 'linux' + build_config: ${{ parameters.build_config }} diff --git a/CMakeLists.txt b/CMakeLists.txt index 10275e702..73e6b0537 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,6 +25,8 @@ include(cmake/external/onnxruntime_external_deps.cmake) include(cmake/global_variables.cmake) # Checking if CUDA is supported include(cmake/check_cuda.cmake) +# Checking if ROCm is supported +include(cmake/check_rocm.cmake) # Checking if DML is supported include(cmake/check_dml.cmake) diff --git a/build.py b/build.py index 055203751..cb98cf336 100644 --- a/build.py +++ b/build.py @@ -101,6 +101,8 @@ def _parse_args(): "Used when --use_cuda is specified.", ) + parser.add_argument("--use_rocm", action="store_true", help="Whether to use ROCm. Default is to not use rocm.") + parser.add_argument("--use_dml", action="store_true", help="Whether to use DML. Default is to not use DML.") # The following options are mutually exclusive (cross compiling options such as android, ios, etc.) @@ -443,6 +445,7 @@ def update(args: argparse.Namespace, env: dict[str, str]): str(args.build_dir), "-DCMAKE_POSITION_INDEPENDENT_CODE=ON", f"-DUSE_CUDA={'ON' if args.use_cuda else 'OFF'}", + f"-DUSE_ROCM={'ON' if args.use_rocm else 'OFF'}", f"-DUSE_DML={'ON' if args.use_dml else 'OFF'}", f"-DENABLE_JAVA={'ON' if args.build_java else 'OFF'}", f"-DBUILD_WHEEL={build_wheel}", @@ -562,4 +565,4 @@ def clean(args: argparse.Namespace, env: dict[str, str]): build(arguments, environment) if arguments.test and not arguments.skip_tests: - test(arguments, environment) \ No newline at end of file + test(arguments, environment) diff --git a/cmake/check_rocm.cmake b/cmake/check_rocm.cmake new file mode 100644 index 000000000..9526449b1 --- /dev/null +++ b/cmake/check_rocm.cmake @@ -0,0 +1,8 @@ +if(USE_ROCM AND NOT EXISTS "${ORT_LIB_DIR}/${ONNXRUNTIME_PROVIDERS_ROCM_LIB}") + message(FATAL_ERROR "Expected the ONNX Runtime providers ROCm library to be found at ${ORT_LIB_DIR}/${ONNXRUNTIME_PROVIDERS_ROCM_LIB}. Actual: Not found.") +endif() + +if(USE_ROCM) + list(APPEND onnxruntime_libs "${ORT_LIB_DIR}/${ONNXRUNTIME_PROVIDERS_ROCM_LIB}") + add_compile_definitions(USE_ROCM=1) +endif() \ No newline at end of file diff --git a/cmake/global_variables.cmake b/cmake/global_variables.cmake index 2fbf298bb..22dd8a6ad 100644 --- a/cmake/global_variables.cmake +++ b/cmake/global_variables.cmake @@ -27,16 +27,19 @@ endif() if(WIN32) set(ONNXRUNTIME_LIB "onnxruntime.dll") set(ONNXRUNTIME_PROVIDERS_CUDA_LIB "onnxruntime_providers_cuda.dll") + set(ONNXRUNTIME_PROVIDERS_ROCM_LIB "onnxruntime_providers_rocm.dll") set(ONNXRUNTIME_ALL_SHARED_LIBS "onnxruntime*.dll") set(ONNXRUNTIME_EXTENSIONS_LIB "tfmtok_c.lib") set(ONNXRUNTIME_EXTENSIONS_FILES "tfmtok_c.dll") elseif(APPLE) set(ONNXRUNTIME_LIB "libonnxruntime.dylib") set(ONNXRUNTIME_PROVIDERS_CUDA_LIB "libonnxruntime_providers_cuda.dylib") + set(ONNXRUNTIME_PROVIDERS_ROCM_LIB "libonnxruntime_providers_rocm.dylib") set(ONNXRUNTIME_ALL_SHARED_LIBS "libonnxruntime*.dylib") else() set(ONNXRUNTIME_LIB "libonnxruntime.so") set(ONNXRUNTIME_PROVIDERS_CUDA_LIB "libonnxruntime_providers_cuda.so") + set(ONNXRUNTIME_PROVIDERS_ROCM_LIB "libonnxruntime_providers_rocm.so") set(ONNXRUNTIME_ALL_SHARED_LIBS "libonnxruntime*.so*") set(ONNXRUNTIME_EXTENSIONS_LIB "tfmtok_c.so") endif() diff --git a/cmake/options.cmake b/cmake/options.cmake index c1aeaa1a5..2aec9d3b2 100644 --- a/cmake/options.cmake +++ b/cmake/options.cmake @@ -2,6 +2,7 @@ include(CMakeDependentOption) # features option(USE_CUDA "Build with CUDA support" ON) +option(USE_ROCM "Build with ROCm support" ON) option(USE_DML "Build with DML support" OFF) # bindings diff --git a/cmake/package.cmake b/cmake/package.cmake index 49b6e69a1..87ad79345 100644 --- a/cmake/package.cmake +++ b/cmake/package.cmake @@ -38,6 +38,8 @@ elseif (LINUX) if (CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "x64") if (USE_CUDA) set(CPACK_PACKAGE_FILE_NAME "onnxruntime-genai-${VERSION_INFO}-linux-x64-cuda") + elseif (USE_ROCM) + set(CPACK_PACKAGE_FILE_NAME "onnxruntime-genai-${VERSION_INFO}-linux-x64-rocm") else () set(CPACK_PACKAGE_FILE_NAME "onnxruntime-genai-${VERSION_INFO}-linux-x64") endif () diff --git a/cmake/presets/CMakeLinuxBuildPresets.json b/cmake/presets/CMakeLinuxBuildPresets.json index 8155e2285..47a0ec704 100644 --- a/cmake/presets/CMakeLinuxBuildPresets.json +++ b/cmake/presets/CMakeLinuxBuildPresets.json @@ -100,6 +100,38 @@ { "name": "linux_gcc_cuda_minsizerel", "configurePreset": "linux_gcc_cuda_minsizerel" + }, + { + "name": "linux_gcc_rocm_release_asan", + "configurePreset": "linux_gcc_rocm_release_asan" + }, + { + "name": "linux_gcc_rocm_debug_asan", + "configurePreset": "linux_gcc_rocm_debug_asan" + }, + { + "name": "linux_gcc_rocm_relwithdebinfo_asan", + "configurePreset": "linux_gcc_rocm_relwithdebinfo_asan" + }, + { + "name": "linux_gcc_rocm_minsizerel_asan", + "configurePreset": "linux_gcc_rocm_minsizerel_asan" + }, + { + "name": "linux_gcc_rocm_release", + "configurePreset": "linux_gcc_rocm_release" + }, + { + "name": "linux_gcc_rocm_debug", + "configurePreset": "linux_gcc_rocm_debug" + }, + { + "name": "linux_gcc_rocm_relwithdebinfo", + "configurePreset": "linux_gcc_rocm_relwithdebinfo" + }, + { + "name": "linux_gcc_rocm_minsizerel", + "configurePreset": "linux_gcc_rocm_minsizerel" } ] } \ No newline at end of file diff --git a/cmake/presets/CMakeLinuxDefaultConfigPresets.json b/cmake/presets/CMakeLinuxDefaultConfigPresets.json index 559d1dae0..c2816213f 100644 --- a/cmake/presets/CMakeLinuxDefaultConfigPresets.json +++ b/cmake/presets/CMakeLinuxDefaultConfigPresets.json @@ -9,7 +9,8 @@ "CMAKE_EXE_LINKER_FLAGS_INIT": "-Wl,-z,now", "CMAKE_MODULE_LINKER_FLAGS_INIT": "-Wl,-z,now", "CMAKE_SHARED_LINKER_FLAGS_INIT": "-Wl,-z,now", - "USE_CUDA": "OFF" + "USE_CUDA": "OFF", + "USE_ROCM": "OFF" }, "environment": { "CC": "gcc", @@ -29,6 +30,13 @@ "CMAKE_CUDA_ARCHITECTURES": "60;61;70;75;80;86" } }, + { + "name": "linux_gcc_rocm_default", + "inherits": "linux_gcc_default", + "cacheVariables": { + "USE_ROCM": "ON" + } + }, { "name": "linux_clang_default", "inherits": "linux_gcc_default", diff --git a/cmake/presets/CMakeLinuxGccConfigPresets.json b/cmake/presets/CMakeLinuxGccConfigPresets.json index d5518f9ad..bdd58f6c1 100644 --- a/cmake/presets/CMakeLinuxGccConfigPresets.json +++ b/cmake/presets/CMakeLinuxGccConfigPresets.json @@ -155,6 +155,82 @@ "linux_minsizerel_default" ], "binaryDir": "${sourceDir}/build/cuda" + }, + { + "name": "linux_gcc_rocm_release_asan", + "displayName": "linux gcc rocm release asan", + "inherits": [ + "linux_gcc_asan_default", + "linux_gcc_rocm_default", + "linux_release_default" + ], + "binaryDir": "${sourceDir}/build/rocm" + }, + { + "name": "linux_gcc_rocm_debug_asan", + "displayName": "linux gcc rocm debug asan", + "inherits": [ + "linux_gcc_asan_default", + "linux_gcc_rocm_default", + "linux_debug_default" + ], + "binaryDir": "${sourceDir}/build/rocm" + }, + { + "name": "linux_gcc_rocm_relwithdebinfo_asan", + "displayName": "linux gcc rocm relwithdebinfo asan", + "inherits": [ + "linux_gcc_asan_default", + "linux_gcc_rocm_default", + "linux_relwithdebinfo_default" + ], + "binaryDir": "${sourceDir}/build/rocm" + }, + { + "name": "linux_gcc_rocm_minsizerel_asan", + "displayName": "linux gcc rocm minsizerel asan", + "inherits": [ + "linux_gcc_asan_default", + "linux_gcc_rocm_default", + "linux_minsizerel_default" + ], + "binaryDir": "${sourceDir}/build/rocm" + }, + { + "name": "linux_gcc_rocm_release", + "displayName": "linux gcc rocm release", + "inherits": [ + "linux_gcc_rocm_default", + "linux_release_default" + ], + "binaryDir": "${sourceDir}/build/rocm" + }, + { + "name": "linux_gcc_rocm_debug", + "displayName": "linux gcc rocm debug", + "inherits": [ + "linux_gcc_rocm_default", + "linux_debug_default" + ], + "binaryDir": "${sourceDir}/build/rocm" + }, + { + "name": "linux_gcc_rocm_relwithdebinfo", + "displayName": "linux gcc rocm relwithdebinfo", + "inherits": [ + "linux_gcc_rocm_default", + "linux_relwithdebinfo_default" + ], + "binaryDir": "${sourceDir}/build/rocm" + }, + { + "name": "linux_gcc_rocm_minsizerel", + "displayName": "linux gcc rocm minsizerel", + "inherits": [ + "linux_gcc_rocm_default", + "linux_minsizerel_default" + ], + "binaryDir": "${sourceDir}/build/rocm" } ] } \ No newline at end of file diff --git a/cmake/presets/CMakeMacOSConfigPresets.json b/cmake/presets/CMakeMacOSConfigPresets.json index 1ea6d85c8..a425dc595 100644 --- a/cmake/presets/CMakeMacOSConfigPresets.json +++ b/cmake/presets/CMakeMacOSConfigPresets.json @@ -11,7 +11,8 @@ "cacheVariables": { "CMAKE_POSITION_INDEPENDENT_CODE": "ON", "CMAKE_OSX_ARCHITECTURES": "arm64", - "USE_CUDA": "OFF" + "USE_CUDA": "OFF", + "USE_ROCM": "OFF" }, "environment": { "CC": "clang", diff --git a/cmake/presets/CMakeWinConfigPresets.json b/cmake/presets/CMakeWinConfigPresets.json index ddbfbacfd..f75549694 100644 --- a/cmake/presets/CMakeWinConfigPresets.json +++ b/cmake/presets/CMakeWinConfigPresets.json @@ -11,7 +11,8 @@ "CMAKE_EXE_LINKER_FLAGS_INIT": "/profile /DYNAMICBASE", "CMAKE_MODULE_LINKER_FLAGS_INIT": "/profile /DYNAMICBASE", "CMAKE_SHARED_LINKER_FLAGS_INIT": "/profile /DYNAMICBASE", - "USE_CUDA": "OFF" + "USE_CUDA": "OFF", + "USE_ROCM": "OFF" }, "condition": { "type": "equals", diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt index 383298f77..9bc5ea1ab 100644 --- a/src/python/CMakeLists.txt +++ b/src/python/CMakeLists.txt @@ -33,6 +33,8 @@ if(BUILD_WHEEL) message("Setting up wheel files in : ${WHEEL_FILES_DIR}") if(USE_CUDA) set(TARGET_NAME "onnxruntime-genai-cuda") + elseif(USE_ROCM) + set(TARGET_NAME "onnxruntime-genai-rocm") elseif(USE_DML) set(TARGET_NAME "onnxruntime-genai-directml") else() @@ -73,6 +75,19 @@ if(BUILD_WHEEL) "libnvonnxparser.so.8" "libnvonnxparser.so.10" + "libamdhip64.so.5" + "libamdhip64.so.6" + "libhipblas.so.0" + "libhipblas.so.2" + "libhipfft.so" + "libhipfft.so.0" + "libhiprtc.so.5" + "libhsa-runtime64.so.1" + "librccl.so.1" + "librocblas.so.3" + "librocfft.so.0" + "libroctracer64.so.4" + "libMIOpen.so.1" ) set(modified_exclude_list) foreach(item IN LISTS auditwheel_exclude_list) diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py index 8da1e8046..a6366ebaa 100644 --- a/src/python/py/models/builder.py +++ b/src/python/py/models/builder.py @@ -58,6 +58,10 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): "cuda": { "enable_cuda_graph": enable_cuda_graph, # "1" if the the model is able to enable cuda graph, "0" otherwise }, + "rocm":{ + "tunable_op_enable": "1", + "tunable_op_tuning_enable": "1" + }, "dml": {}, "web": {}, } @@ -1960,7 +1964,7 @@ def make_attention_mask_reformatting_for_gqa(self): self.mask_attrs["total_seq_len"] = cast_2_name def make_attention_mask_reformatting_for_sparse_attn(self): - # Make nodes for the attention mask subgraph that calculates + # Make nodes for the attention mask subgraph that calculates # attributes about the 2D attention mask to use in SparseAttention # # attention_mask @@ -2178,7 +2182,7 @@ def calculate_block_mask(self): crows_name = "block_row_indices" self.make_external_tensor(crows.detach().numpy().astype(np.int32), crows_name) self.mask_attrs["block_row_indices"] = crows_name - + cols_name = "block_col_indices" self.make_external_tensor(cols.detach().numpy().astype(np.int32), cols_name) self.mask_attrs["block_col_indices"] = cols_name @@ -2248,7 +2252,7 @@ def make_mlp_proj(self, layer_id, mlp, root_input): # DownProjMatMul # | # DownProjAdd - + # Make input MatMul and Add nodes up_matmul_name = f"/model/layers.{layer_id}/mlp/up_proj/MatMul" self.make_matmul(mlp.up_proj.weight.detach().numpy(), up_matmul_name, root_input) diff --git a/src/python/python.cpp b/src/python/python.cpp index c22c31582..a2ea0bad8 100644 --- a/src/python/python.cpp +++ b/src/python/python.cpp @@ -495,6 +495,14 @@ PYBIND11_MODULE(onnxruntime_genai, m) { #endif }); + m.def("is_rocm_available", []() { +#if USE_ROCM + return true; +#else + return false; +#endif + }); + m.def("set_current_gpu_device_id", [](int device_id) { Ort::SetCurrentGpuDeviceId(device_id); }); m.def("get_current_gpu_device_id", []() { return Ort::GetCurrentGpuDeviceId(); }); } diff --git a/test/python/test_onnxruntime_genai_api.py b/test/python/test_onnxruntime_genai_api.py index 7780bb24c..0cbf52e08 100644 --- a/test/python/test_onnxruntime_genai_api.py +++ b/test/python/test_onnxruntime_genai_api.py @@ -18,6 +18,9 @@ if og.is_dml_available(): devices.append("dml") +if og.is_rocm_available(): + devices.append("rocm") + @pytest.mark.parametrize( "relative_model_path", ( diff --git a/tools/ci_build/github/linux/docker/manylinux/Dockerfile.manylinux2_28_rocm b/tools/ci_build/github/linux/docker/manylinux/Dockerfile.manylinux2_28_rocm new file mode 100644 index 000000000..6af5a8a21 --- /dev/null +++ b/tools/ci_build/github/linux/docker/manylinux/Dockerfile.manylinux2_28_rocm @@ -0,0 +1,11 @@ +FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_ubi8_gcc12:20240530.3 + +ADD scripts /tmp/scripts +RUN cd /tmp/scripts && /tmp/scripts/install_centos_gcc12.sh && /tmp/scripts/install_deps.sh && rm -rf /tmp/scripts + +ARG BUILD_UID=1001 +ARG BUILD_USER=onnxruntimedev +RUN adduser --uid $BUILD_UID $BUILD_USER +WORKDIR /home/$BUILD_USER +USER $BUILD_USER +