-
Notifications
You must be signed in to change notification settings - Fork 12.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[OpenMP] Replace nvvm.annotation usage with kernel calling conventions #122320
Conversation
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-clang Author: Alex MacLean (AlexMaclean) Changes: Specifying a kernel with the `ptx_kernel` or `amdgpu_kernel` calling convention is more idiomatic and compile-time performant than using the `nvvm.annotations` `!"kernel"` metadata. Transition OMPIRBuilder to use calling conventions for PTX kernels and no longer emit `nvvm.annotations`. Patch is 345.03 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/122320.diff 33 Files Affected:
diff --git a/clang/test/OpenMP/assumes_include_nvptx.cpp b/clang/test/OpenMP/assumes_include_nvptx.cpp
index 4577ea4c9c2b5e..c5040989a0e407 100644
--- a/clang/test/OpenMP/assumes_include_nvptx.cpp
+++ b/clang/test/OpenMP/assumes_include_nvptx.cpp
@@ -11,11 +11,11 @@
// TODO: Think about teaching the OMPIRBuilder about default attributes as well so the __kmpc* declarations are annotated.
-// CHECK: define weak_odr protected void @__omp_offloading_{{.*}}__Z17complex_reductionIfEvv_{{.*}}({{.*}}) [[attr0:#[0-9]]]
+// CHECK: define weak_odr protected ptx_kernel void @__omp_offloading_{{.*}}__Z17complex_reductionIfEvv_{{.*}}({{.*}}) [[attr0:#[0-9]]]
// CHECK: call i32 @__kmpc_target_init(
// CHECK: declare noundef float @_Z3sinf(float noundef) [[attr1:#[0-9]*]]
// CHECK: declare void @__kmpc_target_deinit(
-// CHECK: define weak_odr protected void @__omp_offloading_{{.*}}__Z17complex_reductionIdEvv_{{.*}}({{.*}}) [[attr0]]
+// CHECK: define weak_odr protected ptx_kernel void @__omp_offloading_{{.*}}__Z17complex_reductionIdEvv_{{.*}}({{.*}}) [[attr0]]
// CHECK: %call = call noundef double @_Z3sind(double noundef 0.000000e+00) [[attr2:#[0-9]]]
// CHECK: declare noundef double @_Z3sind(double noundef) [[attr1]]
diff --git a/clang/test/OpenMP/nvptx_target_firstprivate_codegen.cpp b/clang/test/OpenMP/nvptx_target_firstprivate_codegen.cpp
index d573f1cd193d64..94ace20826db4d 100644
--- a/clang/test/OpenMP/nvptx_target_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_firstprivate_codegen.cpp
@@ -90,7 +90,7 @@ int foo(int n, double *ptr) {
ptr[0]++;
}
- // TCHECK: define weak_odr protected void @__omp_offloading_{{.+}}(ptr {{[^,]+}}, ptr noundef [[PTR_IN:%.+]])
+ // TCHECK: define weak_odr protected ptx_kernel void @__omp_offloading_{{.+}}(ptr {{[^,]+}}, ptr noundef [[PTR_IN:%.+]])
// TCHECK: [[DYN_PTR_ADDR:%.+]] = alloca ptr,
// TCHECK: [[PTR_ADDR:%.+]] = alloca ptr,
// TCHECK-NOT: alloca ptr,
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 8dbf2aa7e0a243..487f886f9bdbfd 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -6404,6 +6404,8 @@ void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
OutlinedFn->setVisibility(GlobalValue::ProtectedVisibility);
if (T.isAMDGCN())
OutlinedFn->setCallingConv(CallingConv::AMDGPU_KERNEL);
+ else if (T.isNVPTX())
+ OutlinedFn->setCallingConv(CallingConv::PTX_Kernel);
}
}
@@ -9077,20 +9079,8 @@ void OpenMPIRBuilder::createOffloadEntry(Constant *ID, Constant *Addr,
if (!Fn)
return;
- Module &M = *(Fn->getParent());
- LLVMContext &Ctx = M.getContext();
-
- // Get "nvvm.annotations" metadata node.
- NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
-
- Metadata *MDVals[] = {
- ConstantAsMetadata::get(Fn), MDString::get(Ctx, "kernel"),
- ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Ctx), 1))};
- // Append metadata to nvvm.annotations.
- MD->addOperand(MDNode::get(Ctx, MDVals));
-
// Add a function attribute for the kernel.
- Fn->addFnAttr(Attribute::get(Ctx, "kernel"));
+ Fn->addFnAttr("kernel");
if (T.isAMDGCN())
Fn->addFnAttr("uniform-work-group-size", "true");
Fn->addFnAttr(Attribute::MustProgress);
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index 67585e9c80ef4e..f495840c254d59 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -19,6 +19,7 @@
#include "llvm/Transforms/IPO/OpenMPOpt.h"
+#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/EnumeratedArray.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SetVector.h"
@@ -36,6 +37,7 @@
#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
#include "llvm/IR/Assumptions.h"
#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
@@ -5909,34 +5911,52 @@ bool llvm::omp::isOpenMPKernel(Function &Fn) {
return Fn.hasFnAttribute("kernel");
}
+static bool isKernelCC(Function &F) {
+ switch (F.getCallingConv()) {
+ default:
+ return false;
+ case CallingConv::PTX_Kernel:
+ case CallingConv::AMDGPU_KERNEL:
+ case CallingConv::SPIR_KERNEL:
+ return true;
+ }
+}
+
KernelSet llvm::omp::getDeviceKernels(Module &M) {
// TODO: Create a more cross-platform way of determining device kernels.
- NamedMDNode *MD = M.getNamedMetadata("nvvm.annotations");
KernelSet Kernels;
- if (!MD)
- return Kernels;
-
- for (auto *Op : MD->operands()) {
- if (Op->getNumOperands() < 2)
- continue;
- MDString *KindID = dyn_cast<MDString>(Op->getOperand(1));
- if (!KindID || KindID->getString() != "kernel")
- continue;
-
- Function *KernelFn =
- mdconst::dyn_extract_or_null<Function>(Op->getOperand(0));
- if (!KernelFn)
- continue;
+ DenseSet<const Function *> SeenKernels;
+ auto ProcessKernel = [&](Function &KF) {
+ if (SeenKernels.contains(&KF))
+ return;
+ SeenKernels.insert(&KF);
// We are only interested in OpenMP target regions. Others, such as kernels
// generated by CUDA but linked together, are not interesting to this pass.
- if (isOpenMPKernel(*KernelFn)) {
+ if (isOpenMPKernel(KF)) {
++NumOpenMPTargetRegionKernels;
- Kernels.insert(KernelFn);
+ Kernels.insert(&KF);
} else
++NumNonOpenMPTargetRegionKernels;
- }
+ };
+
+ if (NamedMDNode *MD = M.getNamedMetadata("nvvm.annotations"))
+ for (auto *Op : MD->operands()) {
+ if (Op->getNumOperands() < 2)
+ continue;
+ MDString *KindID = dyn_cast<MDString>(Op->getOperand(1));
+ if (!KindID || KindID->getString() != "kernel")
+ continue;
+
+ if (auto *KernelFn =
+ mdconst::dyn_extract_or_null<Function>(Op->getOperand(0)))
+ ProcessKernel(*KernelFn);
+ }
+
+ for (Function &F : M)
+ if (isKernelCC(F))
+ ProcessKernel(F);
return Kernels;
}
diff --git a/llvm/test/Transforms/OpenMP/always_inline_device.ll b/llvm/test/Transforms/OpenMP/always_inline_device.ll
index 6028ff5278037b..9c5b19f7a6c88c 100644
--- a/llvm/test/Transforms/OpenMP/always_inline_device.ll
+++ b/llvm/test/Transforms/OpenMP/always_inline_device.ll
@@ -17,7 +17,7 @@
; CHECK: @G = external global i8
; CHECK: @kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
;.
-define weak void @__omp_offloading_fd02_c0934fc2_foo_l4(ptr %dyn) #0 {
+define weak ptx_kernel void @__omp_offloading_fd02_c0934fc2_foo_l4(ptr %dyn) #0 {
; CHECK: Function Attrs: norecurse nounwind
; CHECK-LABEL: @__omp_offloading_fd02_c0934fc2_foo_l4(
; CHECK-NEXT: entry:
@@ -79,12 +79,10 @@ attributes #1 = { convergent nounwind "frame-pointer"="all" "min-legal-vector-wi
attributes #2 = { convergent }
!omp_offload.info = !{!0}
-!nvvm.annotations = !{!1}
!llvm.module.flags = !{!2, !3, !4, !5, !6}
!llvm.ident = !{!7}
!0 = !{i32 0, i32 64770, i32 -1064087614, !"foo", i32 4, i32 0}
-!1 = !{ptr @__omp_offloading_fd02_c0934fc2_foo_l4, !"kernel", i32 1}
!2 = !{i32 1, !"wchar_size", i32 4}
!3 = !{i32 7, !"openmp", i32 50}
!4 = !{i32 7, !"openmp-device", i32 50}
@@ -97,11 +95,10 @@ attributes #2 = { convergent }
; CHECK: attributes #[[ATTR2:[0-9]+]] = { nounwind }
;.
; CHECK: [[META0:![0-9]+]] = !{i32 0, i32 64770, i32 -1064087614, !"foo", i32 4, i32 0}
-; CHECK: [[META1:![0-9]+]] = !{ptr @__omp_offloading_fd02_c0934fc2_foo_l4, !"kernel", i32 1}
-; CHECK: [[META2:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
-; CHECK: [[META3:![0-9]+]] = !{i32 7, !"openmp", i32 50}
-; CHECK: [[META4:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
-; CHECK: [[META5:![0-9]+]] = !{i32 8, !"PIC Level", i32 2}
-; CHECK: [[META6:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2}
-; CHECK: [[META7:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+; CHECK: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+; CHECK: [[META2:![0-9]+]] = !{i32 7, !"openmp", i32 50}
+; CHECK: [[META3:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
+; CHECK: [[META4:![0-9]+]] = !{i32 8, !"PIC Level", i32 2}
+; CHECK: [[META5:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2}
+; CHECK: [[META6:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
;.
diff --git a/llvm/test/Transforms/OpenMP/attributor_module_slice_reproducer.ll b/llvm/test/Transforms/OpenMP/attributor_module_slice_reproducer.ll
index 9c0416af359d4d..3f4790ee15ac8d 100644
--- a/llvm/test/Transforms/OpenMP/attributor_module_slice_reproducer.ll
+++ b/llvm/test/Transforms/OpenMP/attributor_module_slice_reproducer.ll
@@ -13,10 +13,6 @@ define linkonce_odr hidden i8 @_ZStplIdESt7complexIT_ERKS2_S4_() local_unnamed_a
ret i8 undef
}
-declare void @__omp_offloading_2b_4010cad__ZN11qmcplusplus7ompBLAS17gemv_batched_implIfEEiRiciiPKT_PKS5_iS7_iS5_PKPS3_ii_l148(i64, i64, i64, ptr, ptr, i64, ptr, ptr, ptr, i64) local_unnamed_addr
+declare ptx_kernel void @__omp_offloading_2b_4010cad__ZN11qmcplusplus7ompBLAS17gemv_batched_implIfEEiRiciiPKT_PKS5_iS7_iS5_PKPS3_ii_l148(i64, i64, i64, ptr, ptr, i64, ptr, ptr, ptr, i64) local_unnamed_addr
declare dso_local fastcc void @__kmpc_for_static_init_8u() unnamed_addr
-
-!nvvm.annotations = !{!0}
-
-!0 = !{ptr @__omp_offloading_2b_4010cad__ZN11qmcplusplus7ompBLAS17gemv_batched_implIfEEiRiciiPKT_PKS5_iS7_iS5_PKPS3_ii_l148, !"kernel", i32 1}
diff --git a/llvm/test/Transforms/OpenMP/barrier_removal.ll b/llvm/test/Transforms/OpenMP/barrier_removal.ll
index 47a5d5104aa8bd..5b7544b1a79616 100644
--- a/llvm/test/Transforms/OpenMP/barrier_removal.ll
+++ b/llvm/test/Transforms/OpenMP/barrier_removal.ll
@@ -28,7 +28,7 @@ declare void @llvm.assume(i1)
; CHECK: @G1 = global i32 42
; CHECK: @G2 = addrspace(1) global i32 0
;.
-define void @pos_empty_1(i1 %c) "kernel" {
+define amdgpu_kernel void @pos_empty_1(i1 %c) "kernel" {
; MODULE-LABEL: define {{[^@]+}}@pos_empty_1
; MODULE-SAME: (i1 [[C:%.*]]) #[[ATTR4:[0-9]+]] {
; MODULE-NEXT: ret void
@@ -45,7 +45,7 @@ define void @pos_empty_1(i1 %c) "kernel" {
call void @llvm.assume(i1 %c)
ret void
}
-define void @pos_empty_2() "kernel" {
+define amdgpu_kernel void @pos_empty_2() "kernel" {
; CHECK-LABEL: define {{[^@]+}}@pos_empty_2
; CHECK-SAME: () #[[ATTR4:[0-9]+]] {
; CHECK-NEXT: ret void
@@ -53,7 +53,7 @@ define void @pos_empty_2() "kernel" {
call void @aligned_barrier()
ret void
}
-define void @pos_empty_3() "kernel" {
+define amdgpu_kernel void @pos_empty_3() "kernel" {
; CHECK-LABEL: define {{[^@]+}}@pos_empty_3
; CHECK-SAME: () #[[ATTR4]] {
; CHECK-NEXT: ret void
@@ -61,7 +61,7 @@ define void @pos_empty_3() "kernel" {
call void @llvm.nvvm.barrier0()
ret void
}
-define void @pos_empty_4() "kernel" {
+define amdgpu_kernel void @pos_empty_4() "kernel" {
; CHECK-LABEL: define {{[^@]+}}@pos_empty_4
; CHECK-SAME: () #[[ATTR4]] {
; CHECK-NEXT: ret void
@@ -69,7 +69,7 @@ define void @pos_empty_4() "kernel" {
call i32 @llvm.nvvm.barrier0.and(i32 0)
ret void
}
-define void @pos_empty_5() "kernel" {
+define amdgpu_kernel void @pos_empty_5() "kernel" {
; CHECK-LABEL: define {{[^@]+}}@pos_empty_5
; CHECK-SAME: () #[[ATTR4]] {
; CHECK-NEXT: ret void
@@ -77,7 +77,7 @@ define void @pos_empty_5() "kernel" {
call i32 @llvm.nvvm.barrier0.or(i32 0)
ret void
}
-define void @pos_empty_6() "kernel" {
+define amdgpu_kernel void @pos_empty_6() "kernel" {
; CHECK-LABEL: define {{[^@]+}}@pos_empty_6
; CHECK-SAME: () #[[ATTR4]] {
; CHECK-NEXT: ret void
@@ -85,7 +85,7 @@ define void @pos_empty_6() "kernel" {
call i32 @llvm.nvvm.barrier0.popc(i32 0)
ret void
}
-define void @pos_empty_7a() "kernel" {
+define amdgpu_kernel void @pos_empty_7a() "kernel" {
; CHECK-LABEL: define {{[^@]+}}@pos_empty_7a
; CHECK-SAME: () #[[ATTR4]] {
; CHECK-NEXT: call void @unknown()
@@ -96,7 +96,7 @@ define void @pos_empty_7a() "kernel" {
ret void
}
; FIXME: We should remove the barrier.
-define void @pos_empty_7b() "kernel" {
+define amdgpu_kernel void @pos_empty_7b() "kernel" {
; CHECK-LABEL: define {{[^@]+}}@pos_empty_7b
; CHECK-SAME: () #[[ATTR4]] {
; CHECK-NEXT: call void @unknown() #[[ATTR5:[0-9]+]]
@@ -109,7 +109,7 @@ define void @pos_empty_7b() "kernel" {
call void @unknown()
ret void
}
-define void @pos_empty_8(i1 %c) "kernel" {
+define amdgpu_kernel void @pos_empty_8(i1 %c) "kernel" {
; CHECK-LABEL: define {{[^@]+}}@pos_empty_8
; CHECK-SAME: (i1 [[C:%.*]]) #[[ATTR4]] {
; CHECK-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]]
@@ -126,7 +126,7 @@ t:
f:
ret void
}
-define void @neg_empty_8() "kernel" {
+define amdgpu_kernel void @neg_empty_8() "kernel" {
; CHECK-LABEL: define {{[^@]+}}@neg_empty_8
; CHECK-SAME: () #[[ATTR4]] {
; CHECK-NEXT: call void @unknown()
@@ -137,7 +137,7 @@ define void @neg_empty_8() "kernel" {
call void @llvm.amdgcn.s.barrier()
ret void
}
-define void @neg_empty_9(i1 %c) "kernel" {
+define amdgpu_kernel void @neg_empty_9(i1 %c) "kernel" {
; CHECK-LABEL: define {{[^@]+}}@neg_empty_9
; CHECK-SAME: (i1 [[C:%.*]]) #[[ATTR4]] {
; CHECK-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]]
@@ -173,7 +173,7 @@ m:
ret void
}
; FIXME: We should remove the barrier
-define void @pos_empty_10() "kernel" {
+define amdgpu_kernel void @pos_empty_10() "kernel" {
; CHECK-LABEL: define {{[^@]+}}@pos_empty_10
; CHECK-SAME: () #[[ATTR4]] {
; CHECK-NEXT: br label [[M:%.*]]
@@ -186,7 +186,7 @@ m:
call void @llvm.amdgcn.s.barrier()
ret void
}
-define void @pos_empty_11() "kernel" {
+define amdgpu_kernel void @pos_empty_11() "kernel" {
; CHECK-LABEL: define {{[^@]+}}@pos_empty_11
; CHECK-SAME: () #[[ATTR4]] {
; CHECK-NEXT: br label [[M:%.*]]
@@ -206,7 +206,7 @@ define void @empty() {
ret void
}
; FIXME: We should remove the barrier in the end but not the first one.
-define void @neg_empty_12(i1 %c) "kernel" {
+define amdgpu_kernel void @neg_empty_12(i1 %c) "kernel" {
; MODULE-LABEL: define {{[^@]+}}@neg_empty_12
; MODULE-SAME: (i1 [[C:%.*]]) #[[ATTR4]] {
; MODULE-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]]
@@ -266,7 +266,7 @@ define void @neg_empty_2() "kernel" {
@GC1 = constant i32 42
@GC2 = addrspace(4) global i32 0
@GPtr4 = addrspace(4) global ptr addrspace(4) null
-define void @pos_constant_loads() "kernel" {
+define amdgpu_kernel void @pos_constant_loads() "kernel" {
; CHECK-LABEL: define {{[^@]+}}@pos_constant_loads
; CHECK-SAME: () #[[ATTR4]] {
; CHECK-NEXT: [[ARG:%.*]] = load ptr addrspace(4), ptr addrspace(4) @GPtr4, align 8
@@ -296,7 +296,7 @@ define void @pos_constant_loads() "kernel" {
@GS = addrspace(3) global i32 0
@GPtr = global ptr null
; TODO: We could remove some of the barriers due to the lack of write effects.
-define void @neg_loads() "kernel" {
+define amdgpu_kernel void @neg_loads() "kernel" {
; CHECK-LABEL: define {{[^@]+}}@neg_loads
; CHECK-SAME: () #[[ATTR4]] {
; CHECK-NEXT: [[ARG:%.*]] = load ptr, ptr @GPtr, align 8
@@ -327,7 +327,7 @@ define void @neg_loads() "kernel" {
@PG1 = thread_local global i32 42
@PG2 = addrspace(5) global i32 0
@GPtr5 = global ptr addrspace(5) null
-define void @pos_priv_mem() "kernel" {
+define amdgpu_kernel void @pos_priv_mem() "kernel" {
; CHECK-LABEL: define {{[^@]+}}@pos_priv_mem
; CHECK-SAME: () #[[ATTR4]] {
; CHECK-NEXT: [[ARG:%.*]] = load ptr addrspace(5), ptr @GPtr5, align 4
@@ -358,7 +358,7 @@ define void @pos_priv_mem() "kernel" {
}
@G1 = global i32 42
@G2 = addrspace(1) global i32 0
-define void @neg_mem() "kernel" {
+define amdgpu_kernel void @neg_mem() "kernel" {
; CHECK-LABEL: define {{[^@]+}}@neg_mem
; CHECK-SAME: () #[[ATTR4]] {
; CHECK-NEXT: [[ARG:%.*]] = load ptr, ptr @GPtr, align 8
@@ -388,7 +388,7 @@ define void @neg_mem() "kernel" {
ret void
}
-define void @pos_multiple() "kernel" {
+define amdgpu_kernel void @pos_multiple() "kernel" {
; CHECK-LABEL: define {{[^@]+}}@pos_multiple
; CHECK-SAME: () #[[ATTR4]] {
; CHECK-NEXT: ret void
@@ -404,7 +404,7 @@ define void @pos_multiple() "kernel" {
ret void
}
-define void @multiple_blocks_kernel_1(i1 %c0, i1 %c1) "kernel" {
+define amdgpu_kernel void @multiple_blocks_kernel_1(i1 %c0, i1 %c1) "kernel" {
; CHECK-LABEL: define {{[^@]+}}@multiple_blocks_kernel_1
; CHECK-SAME: (i1 [[C0:%.*]], i1 [[C1:%.*]]) #[[ATTR4]] {
; CHECK-NEXT: br i1 [[C0]], label [[T0:%.*]], label [[F0:%.*]]
@@ -461,7 +461,7 @@ m:
ret void
}
-define void @multiple_blocks_kernel_2(i1 %c0, i1 %c1, ptr %p) "kernel" {
+define amdgpu_kernel void @multiple_blocks_kernel_2(i1 %c0, i1 %c1, ptr %p) "kernel" {
; CHECK-LABEL: define {{[^@]+}}@multiple_blocks_kernel_2
; CHECK-SAME: (i1 [[C0:%.*]], i1 [[C1:%.*]], ptr [[P:%.*]]) #[[ATTR4]] {
; CHECK-NEXT: store i32 4, ptr [[P]], align 4
@@ -727,7 +727,7 @@ define internal void @barrier_then_write_then_barrier0(ptr %p) {
call void @aligned_barrier()
ret void
}
-define void @multiple_blocks_functions_kernel_effects_0(i1 %c0, i1 %c1, ptr %p) "kernel" {
+define amdgpu_kernel void @multiple_blocks_functions_kernel_effects_0(i1 %c0, i1 %c1, ptr %p) "kernel" {
; MODULE-LABEL: define {{[^@]+}}@multiple_blocks_functions_kernel_effects_0
; MODULE-SAME: (i1 [[C0:%.*]], i1 [[C1:%.*]], ptr [[P:%.*]]) #[[ATTR4]] {
; MODULE-NEXT: call void @barrier_then_write_then_barrier0(ptr [[P]])
@@ -1040,7 +1040,7 @@ define internal void @callee_barrier() {
call void @aligned_barrier()
ret void
}
-define void @caller_barrier1() "kernel" {
+define amdgpu_kernel void @caller_barrier1() "kernel" {
; CHECK-LABEL: define {{[^@]+}}@caller_barrier1
; CHECK-SAME: () #[[ATTR4]] {
; CHECK-NEXT: call void @callee_barrier()
@@ -1051,7 +1051,7 @@ define void @caller_barrier1() "kernel" {
call void @aligned_barrier()
ret void
}
-define void @caller_barrier2() "kernel" {
+define amdgpu_kernel void @caller_barrier2() "kernel" {
; CHECK-LABEL: define {{[^@]+}}@caller_barrier2
; CHECK-SAME: () #[[ATTR4]] {
; CHECK-NEXT: call void @unknown()
@@ -1065,7 +1065,7 @@ define void @caller_barrier2() "kernel" {
ret void
}
-define void @loop_barrier() "kernel" {
+define amdgpu_kernel void @loop_barrier() "kernel" {
; CHECK-LABEL: define {{[^@]+}}@loop_barrier
; CHECK-SAME: () #[[ATTR4]] {
; CHECK-NEXT: entry:
@@ -1095,7 +1095,7 @@ exit:
ret void
}
-define void @loop_barrier_end_barriers() "kernel" {
+define amdgpu_kernel void @loop_barrier_end_barriers() "kernel" {
; CHECK-LABEL: define {{[^@]+}}@loop_barrier_end_barriers
; CHECK-SAME: () #[[ATTR4]] {
; CHECK-NEXT: entry:
@@ -1129,7 +1129,7 @@ exit:
ret void
}
-define void @loop_barrier_end_barriers_unknown() "kernel" {
+define amdgpu_kernel void @loop_barrier_end_barriers_unknown() "kernel" {
; CHECK-LABEL: define {{[^@]+}}@loop_barrier_end_barriers_unknown
; CHECK-SAME: () #[[ATTR4]] {
; CHECK-NEXT: entry:
@@ -1165,7 +1165,7 @@ exit:
ret void
}
-define void @loop_barrier_store() "kernel" {
+define amdgpu_kernel void @loop_barrier_store() "kernel" {
; CHECK-LABEL: define {{[^@]+}}@loop_barrier_store
; CHECK-SAME: () #[[ATTR4]] {
; CHECK-NEXT: entry:
@@ -1195,7 +1195,7 @@ exit:
ret void
}
-define void @loop_barrier_end_barriers_store() "kernel" {
+define amdgpu_kernel void @loop_barrier_end_barriers_store() "kernel" {
; CHECK-LABEL: define {{[^@]+}}@loop_barrier_end_barriers_store
; CHECK-SAME: () #[[ATTR4]] {
; CHECK-NEXT: entry:
@@ -1232,37 +1232,7 @@ exit:
}
!llvm.module.flags = !{!16,!15}
-!nvvm.annotations = !{!0,!1,!2,!3,!4,!5,!6,!7,!8,!9,!10,!11,!12,!13,!14,!17,!18,!19,!20,!21,!22,!23,!24,!25,!26,!27,!28,!29,!30}
-!0 = !{ptr @pos_empty_1, !"kernel", i32 1}
-!1 = !{ptr @pos_empty_2, !"kernel", i32 1}
-!2 = !{ptr @pos_empty_3, !"kernel", i32 1}
-!3 = !{ptr @pos_empty_4, !"kernel", i32 1}
-!4 = !{ptr @pos_empty_5, !"kernel", i32 1}
-!5 = !{ptr @pos_empty_6, !"kernel", i32 1}
-!17 = !{ptr @pos_empty_7a, !"kernel", i32 1}
-!18 = !{ptr @pos_empty_7b, !"kernel", i32 1}
-!23 = !{ptr @pos_empty_8, !"kernel", i32 1}
-!24 = !{ptr @caller_barrier1, !"kernel", i32 1}
-!25 = !{ptr @caller_barrier2, !"kernel", i32 1}
-!26 = !{ptr @loop_barrier, !"kernel", i32 1}
-!27 = !{ptr @loop_barrier_end_barriers, !"kernel", i32 1}
-!28 = !{ptr @loop_barrier_end_barriers_unkno...
[truncated]
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Happy to see we are moving away from annotation. :-)
if (SeenKernels.contains(&KF))
  return;
SeenKernels.insert(&KF);
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Double map lookup, do the insert and see if it succeeded
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nice, fixed. Once #119261 lands we'll be able to simply iterate over the functions and this won't even be needed.
@jdoerfert / @arsenm ping for review when you have a moment |
0b035b1
to
9f0b1ae
Compare
This reverts commit bb9d5c2. This will facilitate merging main due to 07ed818 (PR llvm#122320), which changes llvm::omp::getDeviceKernels. Will rewrite and reapply after merging main.
Specifying a kernel with the `ptx_kernel` or `amdgpu_kernel` calling convention is more idiomatic and compile-time performant than using the `nvvm.annotations` `!"kernel"` metadata.
Transition OMPIRBuilder to use calling conventions for PTX kernels and no longer emit `nvvm.annotations`. Update OpenMPOpt to work with kernels specified via calling convention as well as metadata. Update OpenMP tests to use the calling conventions.
This change is a prerequisite for #119261