[CIR][CUDA] Generate attribute for kernel name of device stubs

llvm · Feb 7, 2025 · 1ce7b1f · 1ce7b1f
1 parent a07dbdf
commit 1ce7b1f
Show file tree

Hide file tree

Showing 5 changed files with 51 additions and 12 deletions.
diff --git a/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td b/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td
@@ -1323,6 +1323,23 @@ def GlobalAnnotationValuesAttr : CIR_Attr<"GlobalAnnotationValues",
   let genVerifyDecl = 1;
 }
 
+//===----------------------------------------------------------------------===//
+// CUDAKernelNameAttr
+//===----------------------------------------------------------------------===//
+
+def CUDAKernelNameAttr : CIR_Attr<"CUDAKernelName",
+                                  "cuda_kernel_name"> {
+  let summary = "Device-side function name for this stub.";
+  let description = [{
+    This attribute is attached to function definitions and records the mangled name of the kernel function used on the device.
+
+    In CUDA, global functions (kernels) are processed differently for host and device. On host, Clang generates device stubs; on device, they are treated as normal functions. As they probably have different mangled names, we must record the corresponding device-side name for a stub.
+  }];
+
+  let parameters = (ins "std::string":$kernel_name);
+  let assemblyFormat = "`<` $kernel_name `>`";
+}
+
 def CIR_TBAAAttr : CIR_Attr<"TBAA", "tbaa", []> {
 }
 

diff --git a/clang/lib/CIR/CodeGen/CIRGenCall.cpp b/clang/lib/CIR/CodeGen/CIRGenCall.cpp
@@ -460,6 +460,16 @@ void CIRGenModule::constructAttributeList(
         getLangOpts().OffloadUniformBlock)
       assert(!cir::MissingFeatures::CUDA());
 
+    if (langOpts.CUDA && !langOpts.CUDAIsDevice &&
+        TargetDecl->hasAttr<CUDAGlobalAttr>()) {
+      GlobalDecl kernel(CalleeInfo.getCalleeDecl());
+      llvm::StringRef kernelName = getMangledName(
+          kernel.getWithKernelReferenceKind(KernelReferenceKind::Kernel));
+      auto attr =
+          cir::CUDAKernelNameAttr::get(&getMLIRContext(), kernelName.str());
+      funcAttrs.set(attr.getMnemonic(), attr);
+    }
+
     if (TargetDecl->hasAttr<ArmLocallyStreamingAttr>())
       ;
   }

diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
@@ -9,7 +9,6 @@
 // This is the internal per-translation-unit state used for CIR translation.
 //
 //===----------------------------------------------------------------------===//
-#include "CIRGenModule.h"
 #include "CIRGenCXXABI.h"
 #include "CIRGenCstEmitter.h"
 #include "CIRGenFunction.h"
@@ -527,10 +526,9 @@ void CIRGenModule::emitGlobal(GlobalDecl GD) {
       if (langOpts.HIPStdPar)
         llvm_unreachable("NYI");
 
-      if (Global->hasAttr<CUDAGlobalAttr>())
-        llvm_unreachable("NYI");
-
-      if (!Global->hasAttr<CUDADeviceAttr>())
+      // Global functions reside on device, so it shouldn't be skipped.
+      if (!Global->hasAttr<CUDAGlobalAttr>() &&
+          !Global->hasAttr<CUDADeviceAttr>())
         return;
     } else {
       // We must skip __device__ functions when compiling for host.
@@ -2351,10 +2349,10 @@ cir::FuncOp CIRGenModule::GetAddrOfFunction(clang::GlobalDecl GD, mlir::Type Ty,
   auto F = GetOrCreateCIRFunction(MangledName, Ty, GD, ForVTable, DontDefer,
                                   /*IsThunk=*/false, IsForDefinition);
 
-  // As __global__ functions always reside on device,
-  // we need special care when accessing them from host;
-  // otherwise, CUDA functions behave as normal functions
-  if (langOpts.CUDA && !langOpts.CUDAIsDevice &&
+  // As __global__ functions (kernels) always reside on device,
+  // when we access them from host, we must refer to the kernel handle.
+  // For CUDA, it's just the device stub. For HIP, it's something different.
+  if (langOpts.CUDA && !langOpts.CUDAIsDevice && langOpts.HIP &&
       cast<FunctionDecl>(GD.getDecl())->hasAttr<CUDAGlobalAttr>()) {
     llvm_unreachable("NYI");
   }
@@ -2397,7 +2395,7 @@ static std::string getMangledNameImpl(CIRGenModule &CGM, GlobalDecl GD,
       assert(0 && "NYI");
     } else if (FD && FD->hasAttr<CUDAGlobalAttr>() &&
                GD.getKernelReferenceKind() == KernelReferenceKind::Stub) {
-      assert(0 && "NYI");
+      Out << "__device_stub__";
     } else {
       Out << II->getName();
     }

diff --git a/clang/test/CIR/CodeGen/CUDA/simple-device.cu b/clang/test/CIR/CodeGen/CUDA/simple-device.cu
@@ -12,3 +12,7 @@ __host__ void host_fn(int *a, int *b, int *c) {}
 __device__ void device_fn(int* a, double b, float c) {}
 
 // CIR: cir.func @_Z9device_fnPidf
+
+// Global functions are also like normal functions.
+__global__ void global_fn(int *a, int *b, int *c) {}
+// CIR: cir.func @_Z9global_fnPiS_S_
diff --git a/clang/test/CIR/CodeGen/CUDA/simple.cu b/clang/test/CIR/CodeGen/CUDA/simple.cu
@@ -1,9 +1,11 @@
 #include "../Inputs/cuda.h"
 
-// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fclangir \
-// RUN:            -emit-cir %s -o %t.cir
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir \
+// RUN:            -x cuda -emit-cir %s -o %t.cir
 // RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
 
+// Attribute for global_fn
+// CIR: #cir.cuda_kernel_name<_Z9global_fnv>
 
 // This should emit as a normal C++ function.
 __host__ void host_fn(int *a, int *b, int *c) {}
@@ -14,3 +16,11 @@ __host__ void host_fn(int *a, int *b, int *c) {}
 __device__ void device_fn(int* a, double b, float c) {}
 
 // CHECK-NOT: cir.func @_Z9device_fnPidf
+
+// This should emit a device stub,
+// with an attribute showing the mangled name on device.
+__global__ void global_fn();
+// CIR: @_Z24__device_stub__global_fnv
+
+// Make sure `global_fn` indeed gets emitted
+void x() { auto v = global_fn; }