From a5cf548388e0e923189baa41ad56e31febf1d8a5 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Tue, 17 Dec 2024 12:43:35 -0500 Subject: [PATCH] [NFC] GPU ukernels cleanups (#19503) 1. Rename `UKernelSpec` to `UKernelConfig`. I was grappling for the right word, but now that it's part of `LoweringConfig`, it's clearer. 2. Drop unused `KernelConfig` case for ukernel ops. The lowering to ukernel ops happens after `KernelConfig`. 3. To stringify types, instead of using a stringstream, we can actually just use `llvm::formatv`. 4. Reorganize LLVMGPUSelectUKernels.cpp to make it easier to add logic for other ukernels. Signed-off-by: Benoit Jacob --- .../test/config_ukernel_argmax_gfx942.mlir | 14 +-- .../Codegen/Common/GPU/GPULowerToUKernels.cpp | 2 +- .../GPU/test/gpu_lower_to_ukernels.mlir | 2 +- .../Dialect/GPU/IR/GPULoweringConfigUtils.cpp | 4 +- .../Dialect/GPU/IR/GPULoweringConfigUtils.h | 3 +- .../Codegen/Dialect/GPU/IR/IREEGPUAttrs.td | 8 +- .../compiler/Codegen/LLVMGPU/KernelConfig.cpp | 9 +- .../LLVMGPU/Utils/LLVMGPUSelectUKernels.cpp | 112 +++++++++++------- .../LLVMGPU/Utils/LLVMGPUSelectUKernels.h | 2 +- 9 files changed, 90 insertions(+), 66 deletions(-) diff --git a/compiler/plugins/target/ROCM/test/config_ukernel_argmax_gfx942.mlir b/compiler/plugins/target/ROCM/test/config_ukernel_argmax_gfx942.mlir index 4a7da4befadd..9a537875c6ab 100644 --- a/compiler/plugins/target/ROCM/test/config_ukernel_argmax_gfx942.mlir +++ b/compiler/plugins/target/ROCM/test/config_ukernel_argmax_gfx942.mlir @@ -25,7 +25,7 @@ func.func @argmax_2d_f32i64(%arg0 : tensor<1x?xf32>) -> tensor<1xi64> attributes // CHECK: linalg.generic // CHECK-SAME: hal.executable.objects = [ // CEHCK-SAME: #hal.executable.object<{path = "iree_uk_amdgpu_argmax_f32i64.gfx942.bc", data = dense_resource : vector<{{[0-9]+}}xi8>}>] -// CHECK-SAME: #iree_gpu.lowering_config<{{.*}}ukernel = #iree_gpu.ukernel_spec +// CHECK-SAME: #iree_gpu.lowering_config<{{.*}}ukernel = #iree_gpu.ukernel_config // ----- @@ -54,7 +54,7 @@ func.func @argmax_4d_unit_parallel_f32i64(%arg0 : tensor<1x1x1x?xf32>) -> tensor // CHECK: linalg.generic // CHECK-SAME: hal.executable.objects = [ // CEHCK-SAME: #hal.executable.object<{path = "iree_uk_amdgpu_argmax_f32i64.gfx942.bc", data = dense_resource : vector<{{[0-9]+}}xi8>}>] -// CHECK-SAME: #iree_gpu.lowering_config<{{.*}}ukernel = #iree_gpu.ukernel_spec +// CHECK-SAME: #iree_gpu.lowering_config<{{.*}}ukernel = #iree_gpu.ukernel_config // ----- @@ -82,7 +82,7 @@ func.func @argmax_none_ukernel_enabled(%arg0 : tensor<1x?xf32>) -> tensor<1xi64> // CHECK-LABEL: func @argmax_none_ukernel_enabled( // CHECK: linalg.generic // CHECK-NOT: hal.executable.objects -// CHECK-NOT: iree_gpu.ukernel_spec +// CHECK-NOT: iree_gpu.ukernel_config // ----- @@ -111,7 +111,7 @@ func.func @argmax_only_argmax_ukernel_enabled(%arg0 : tensor<1x?xf32>) -> tensor // CHECK: linalg.generic // CHECK-SAME: hal.executable.objects = [ // CHECK-SAME: #hal.executable.object<{path = "iree_uk_amdgpu_argmax_f32i64.gfx942.bc", data = dense_resource : vector<{{[0-9]+}}xi8>}>] -// CHECK-SAME: #iree_gpu.lowering_config<{{.*}}ukernel = #iree_gpu.ukernel_spec +// CHECK-SAME: #iree_gpu.lowering_config<{{.*}}ukernel = #iree_gpu.ukernel_config // ----- @@ -140,7 +140,7 @@ func.func @argmax_only_foo_argmax_bar_ukernel_enabled(%arg0 : tensor<1x?xf32>) - // CHECK: linalg.generic // CHECK-SAME: hal.executable.objects = [ // CHECK-SAME: #hal.executable.object<{path = "iree_uk_amdgpu_argmax_f32i64.gfx942.bc", data = dense_resource : vector<{{[0-9]+}}xi8>}>] -// CHECK-SAME: #iree_gpu.lowering_config<{{.*}}ukernel = #iree_gpu.ukernel_spec +// CHECK-SAME: #iree_gpu.lowering_config<{{.*}}ukernel = #iree_gpu.ukernel_config // ----- @@ -168,7 +168,7 @@ func.func @argmax_only_foo_ukernel_enabled(%arg0 : tensor<1x?xf32>) -> tensor<1x // CHECK-LABEL: func @argmax_only_foo_ukernel_enabled( // CHECK: linalg.generic // CHECK-NOT: hal.executable.objects -// CHECK-NOT: iree_gpu.ukernel_spec +// CHECK-NOT: iree_gpu.ukernel_config // ----- @@ -239,4 +239,4 @@ func.func @argmax_2d_f32i64_custom_bitcode(%arg0 : tensor<1x?xf32>) -> tensor<1x // CHECK-SAME: data = dense<[66, 67, -64, -34, 1, 35, 69, 103, -119, -85, -51, -17]> : tensor<12xi8> // CHECK-SAME: }> // CHECK-SAME: ] -// CHECK-SAME: #iree_gpu.lowering_config<{{.*}}ukernel = #iree_gpu.ukernel_spec +// CHECK-SAME: #iree_gpu.lowering_config<{{.*}}ukernel = #iree_gpu.ukernel_config diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPULowerToUKernels.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPULowerToUKernels.cpp index 796138d55e3f..fd58c29d2654 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPULowerToUKernels.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPULowerToUKernels.cpp @@ -43,7 +43,7 @@ matchArgmaxDAGForUKernel(RewriterBase &rewriter, linalg::GenericOp op) { if (!loweringConfig) { return rewriter.notifyMatchFailure(op, "no lowering_config on this op"); } - IREE::GPU::UKernelSpecAttr ukernelAttr = + IREE::GPU::UKernelConfigAttr ukernelAttr = IREE::GPU::getUkernelSpec(loweringConfig); if (!ukernelAttr) { return rewriter.notifyMatchFailure(op, "no ukernel selected for this op"); diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_lower_to_ukernels.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_lower_to_ukernels.mlir index 6a13468a1d29..7acab19f945a 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_lower_to_ukernels.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_lower_to_ukernels.mlir @@ -1,6 +1,6 @@ // RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-lower-to-ukernels,cse,canonicalize))" %s | FileCheck %s -#config = #iree_gpu.lowering_config<{ukernel = #iree_gpu.ukernel_spec}> +#config = #iree_gpu.lowering_config<{ukernel = #iree_gpu.ukernel_config}> func.func @argmax_f32i64_with_selected_ukernel(%arg0 : tensor<1x?xf32>) -> tensor<1xi64> attributes { hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "all"}> } { diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/GPULoweringConfigUtils.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/GPULoweringConfigUtils.cpp index 8ebfba912442..df85e48a7379 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/GPULoweringConfigUtils.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/GPULoweringConfigUtils.cpp @@ -145,9 +145,9 @@ std::optional> getPaddingList(LoweringConfigAttr config) { return getIntegerVector(array); } -IREE::GPU::UKernelSpecAttr +IREE::GPU::UKernelConfigAttr getUkernelSpec(IREE::GPU::LoweringConfigAttr config) { - return config.getAttributes().getAs("ukernel"); + return config.getAttributes().getAs("ukernel"); } } // namespace mlir::iree_compiler::IREE::GPU diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/GPULoweringConfigUtils.h b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/GPULoweringConfigUtils.h index 5bebb64a1b05..b6afde5d4dd4 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/GPULoweringConfigUtils.h +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/GPULoweringConfigUtils.h @@ -59,7 +59,8 @@ void setPromotedOperandList(MLIRContext *context, /// Helper to retrieve list of operand to pad. std::optional> getPaddingList(LoweringConfigAttr config); -IREE::GPU::UKernelSpecAttr getUkernelSpec(IREE::GPU::LoweringConfigAttr config); +IREE::GPU::UKernelConfigAttr +getUkernelSpec(IREE::GPU::LoweringConfigAttr config); } // namespace mlir::iree_compiler::IREE::GPU diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td index 0b1e32fdc362..e4b66bffbd89 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td @@ -521,12 +521,12 @@ def IREEGPU_LaneIdAttr : AttrDef { - let mnemonic = "ukernel_spec"; +def IREEGPU_UKernelConfigAttr : + AttrDef { + let mnemonic = "ukernel_config"; let summary = "An attribute specifying a ukernel that an op can lower to."; let description = [{ An attribute that can be applied to any operation to specify that it has diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp index 1f44bf693a55..fbc0f37a129b 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp @@ -2103,14 +2103,11 @@ static LogicalResult setArgmaxUkernelConfig(IREE::GPU::TargetAttr target, mlir::FunctionOpInterface entryPoint, linalg::GenericOp op) { - // Checks if UKernels are enabled. - IREE::GPU::UKernelSpecAttr ukernelSpec = selectUKernelForArgmax(op); - if (!ukernelSpec) { + IREE::GPU::UKernelConfigAttr ukernelConfig = selectUKernel(op); + if (!ukernelConfig) { return failure(); } - if (failed(isArgmaxOp(op))) - return failure(); SmallVector parallelDims; SmallVector reductionDims; op.getParallelDims(parallelDims); @@ -2161,7 +2158,7 @@ setArgmaxUkernelConfig(IREE::GPU::TargetAttr target, b.getI64ArrayAttr(workgroupTileSizes)); attrs.emplace_back(StringAttr::get(context, "reduction"), b.getI64ArrayAttr(reductionTileSizes)); - attrs.emplace_back(StringAttr::get(context, "ukernel"), ukernelSpec); + attrs.emplace_back(StringAttr::get(context, "ukernel"), ukernelConfig); IREE::GPU::setPromotedOperandList(context, attrs, {0, 1}); auto configDict = DictionaryAttr::get(context, attrs); auto loweringConfig = IREE::GPU::LoweringConfigAttr::get(context, configDict); diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUSelectUKernels.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUSelectUKernels.cpp index 1940e8f0b102..2f2861f926cc 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUSelectUKernels.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUSelectUKernels.cpp @@ -18,7 +18,49 @@ namespace mlir::iree_compiler { namespace { -constexpr StringLiteral executableObjectsAttrName = "hal.executable.objects"; +// Returns ukernel name and suffix for argmax. Empty name = no ukernel. +static std::tuple +getUKernelNameAndSuffixForArgmax(linalg::GenericOp op) { + Value input = op.getDpsInputOperand(0)->get(); + auto inputType = cast(input.getType()); + Value index = op.getDpsInitOperand(1)->get(); + auto indexType = cast(index.getType()); + return {"argmax", llvm::formatv("{}{}", inputType.getElementType(), + indexType.getElementType())}; +} + +// Returns ukernel name and suffix for any op. Empty name = no ukernel. +static std::tuple +getUKernelNameAndSuffix(Operation *op) { + if (auto genericOp = dyn_cast(op)) { + if (succeeded(isArgmaxOp(genericOp))) { + return getUKernelNameAndSuffixForArgmax(genericOp); + } + } + return {}; +} + +// Returns the UKernelConfigAttr for any op. Returns {} if no ukernel. +static IREE::GPU::UKernelConfigAttr getUKernelConfig(Operation *op) { + MLIRContext *context = op->getContext(); + auto [name, suffix] = getUKernelNameAndSuffix(op); + if (name.empty() || suffix.empty()) { + return {}; + } + auto target = IREE::HAL::ExecutableTargetAttr::lookup(op); + if (!hasUkernel(target, name)) { + return {}; + } + if (isROCMBackend(target)) { + auto nameAttr = StringAttr::get( + context, llvm::formatv("iree_uk_amdgpu_{}_{}", name, suffix)); + auto defsAttr = DictionaryAttr::get( + context, {{StringAttr::get(context, "vm.import.module"), + StringAttr::get(context, "rocm")}}); + return IREE::GPU::UKernelConfigAttr::get(context, nameAttr, defsAttr); + } + return {}; +} // Returns a ExecutableObjectAttr carrying the bitcode for the given ukernel. // @@ -77,7 +119,8 @@ getUKernelBitcode(MLIRContext *context, // array attribute. If the parent hal.executable.variant is reached, its objects // attribute is returned. // Adapted from ExecutableTargetAttr::lookup. -static ArrayAttr lookUpExecutableObjects(Operation *op) { +static ArrayAttr lookUpExecutableObjects(Operation *op, + StringRef executableObjectsAttrName) { MLIRContext *context = op->getContext(); auto attrId = StringAttr::get(context, executableObjectsAttrName); while (op) { @@ -97,56 +140,39 @@ static ArrayAttr lookUpExecutableObjects(Operation *op) { return {}; } -/// Returns the function name and attributes to use for a ukernel with given -/// `name` and `suffix` on the target described by `targetAttr`. -static IREE::GPU::UKernelSpecAttr -getUKernelSpec(StringRef name, StringRef suffix, MLIRContext *context, - IREE::HAL::ExecutableTargetAttr targetAttr) { - if (isROCMBackend(targetAttr)) { - auto nameAttr = StringAttr::get( - context, llvm::formatv("iree_uk_amdgpu_{}_{}", name, suffix)); - auto defsAttr = DictionaryAttr::get( - context, {{StringAttr::get(context, "vm.import.module"), - StringAttr::get(context, "rocm")}}); - return IREE::GPU::UKernelSpecAttr::get(context, nameAttr, defsAttr); +// Ensures that the op has ukernel bitcode as a hal.executable.object, stored +// as a hal.executable.objects attribute on the op itself, ready to be hoisted +// by the HoistExecutableObjects pass. +// Returns failure if no bitcode was found for the configured ukernel. +static LogicalResult +ensureUKernelBitcode(Operation *op, + IREE::GPU::UKernelConfigAttr ukernelConfig) { + constexpr StringLiteral executableObjectsAttrName = "hal.executable.objects"; + auto target = IREE::HAL::ExecutableTargetAttr::lookup(op); + ArrayAttr sourceExecutableObjects = + lookUpExecutableObjects(op, executableObjectsAttrName); + MLIRContext *context = op->getContext(); + IREE::HAL::ExecutableObjectAttr bitcodeObject = getUKernelBitcode( + context, target, sourceExecutableObjects, ukernelConfig.getName()); + if (!bitcodeObject) { + return failure(); } - return {}; + op->setAttr(executableObjectsAttrName, + ArrayAttr::get(context, bitcodeObject)); + return success(); } } // namespace -IREE::GPU::UKernelSpecAttr selectUKernelForArgmax(linalg::GenericOp op) { - if (failed(isArgmaxOp(op))) { - return {}; - } - auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(op); - const char ukernelName[] = "argmax"; - if (!hasUkernel(targetAttr, ukernelName)) { - return {}; - } - Value input = op.getDpsInputOperand(0)->get(); - auto inputType = cast(input.getType()); - Value index = op.getDpsInitOperand(1)->get(); - auto indexType = cast(index.getType()); - std::string suffix; - llvm::raw_string_ostream(suffix) - << inputType.getElementType() << indexType.getElementType(); - MLIRContext *context = op->getContext(); - IREE::GPU::UKernelSpecAttr ukernelSpec = - getUKernelSpec(ukernelName, suffix, context, targetAttr); - if (!ukernelSpec) { +IREE::GPU::UKernelConfigAttr selectUKernel(Operation *op) { + IREE::GPU::UKernelConfigAttr ukernelConfig = getUKernelConfig(op); + if (!ukernelConfig) { return {}; } - auto execTarget = IREE::HAL::ExecutableTargetAttr::lookup(op); - ArrayAttr sourceExecutableObjects = lookUpExecutableObjects(op); - IREE::HAL::ExecutableObjectAttr bitcodeObject = getUKernelBitcode( - context, execTarget, sourceExecutableObjects, ukernelSpec.getName()); - if (!bitcodeObject) { + if (failed(ensureUKernelBitcode(op, ukernelConfig))) { return {}; } - op->setAttr(executableObjectsAttrName, - ArrayAttr::get(context, bitcodeObject)); - return ukernelSpec; + return ukernelConfig; } } // namespace mlir::iree_compiler diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUSelectUKernels.h b/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUSelectUKernels.h index 4ed251b36070..cb7fa2abac61 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUSelectUKernels.h +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUSelectUKernels.h @@ -10,6 +10,6 @@ namespace mlir::iree_compiler { -IREE::GPU::UKernelSpecAttr selectUKernelForArgmax(linalg::GenericOp op); +IREE::GPU::UKernelConfigAttr selectUKernel(Operation *op); } // namespace mlir::iree_compiler