From 94999c00cb98d7dd0e06daf2101ade8c6f910cdc Mon Sep 17 00:00:00 2001 From: Andrey Shukshov <36711069+B1tway@users.noreply.github.com> Date: Wed, 16 Nov 2022 15:22:20 +0300 Subject: [PATCH] [ROCm] Introducing dump support for AMDGCN (#25) * add `amdgcn` target for tools/aot.py * clang-format fix * [ROCm] added AMDGPU kernel call conversion * [fix] Fixing AMDGPU calling convention --- CMakeLists.txt | 1 + .../triton/Target/AMDGCN/AMDGCNTranslation.h | 19 ++++ .../TritonGPUToLLVM/TritonGPUToLLVM.cpp | 3 +- lib/Target/AMDGCN/AMDGCNTranslation.cpp | 99 +++++++++++++++++++ lib/Target/AMDGCN/CMakeLists.txt | 12 +++ lib/Target/CMakeLists.txt | 1 + python/src/triton.cc | 18 ++++ python/triton/compiler.py | 2 + python/triton/tools/aot.py | 25 +++-- 9 files changed, 168 insertions(+), 12 deletions(-) create mode 100644 include/triton/Target/AMDGCN/AMDGCNTranslation.h create mode 100644 lib/Target/AMDGCN/AMDGCNTranslation.cpp create mode 100644 lib/Target/AMDGCN/CMakeLists.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index fa4bdbf8f81f..5d89657d4e93 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -213,6 +213,7 @@ target_link_libraries(triton TritonGPUTransforms TritonLLVMIR TritonPTX + TritonAMDGCN ${dialect_libs} ${conversion_libs} # optimizations diff --git a/include/triton/Target/AMDGCN/AMDGCNTranslation.h b/include/triton/Target/AMDGCN/AMDGCNTranslation.h new file mode 100644 index 000000000000..2e05810ef918 --- /dev/null +++ b/include/triton/Target/AMDGCN/AMDGCNTranslation.h @@ -0,0 +1,19 @@ +#ifndef TRITON_TARGET_AMDGCNTRANSLATION_H +#define TRITON_TARGET_AMDGCNTRANSLATION_H + +#include <memory> +#include <string> + +namespace llvm { +class Module; +} // namespace llvm + +namespace triton { + +// Translate LLVM IR to AMDGCN code. 
+std::string translateLLVMIRToAMDGCN(llvm::Module &module, + const std::string &_proc); + +} // namespace triton + +#endif diff --git a/lib/Conversion/TritonGPUToLLVM/TritonGPUToLLVM.cpp b/lib/Conversion/TritonGPUToLLVM/TritonGPUToLLVM.cpp index 6f0d8d531f64..76f256dc1f04 100644 --- a/lib/Conversion/TritonGPUToLLVM/TritonGPUToLLVM.cpp +++ b/lib/Conversion/TritonGPUToLLVM/TritonGPUToLLVM.cpp @@ -255,11 +255,10 @@ struct FuncOpConversion : public FuncOpConversionBase { auto ctx = funcOp->getContext(); -#ifndef USE_ROCM // Set an attribute to indicate this function is a kernel entry. newFuncOp->setAttr(NVVMMetadataField::Kernel, rewriter.getIntegerAttr(type::u1Ty(ctx), 1)); - +#ifdef USE_ROCM // Set an attribute for maxntidx, it could be used in latter LLVM codegen // for `nvvm.annotation` metadata. newFuncOp->setAttr(NVVMMetadataField::MaxNTid, diff --git a/lib/Target/AMDGCN/AMDGCNTranslation.cpp b/lib/Target/AMDGCN/AMDGCNTranslation.cpp new file mode 100644 index 000000000000..f643a9ef1266 --- /dev/null +++ b/lib/Target/AMDGCN/AMDGCNTranslation.cpp @@ -0,0 +1,99 @@ +#include "triton/Target/AMDGCN/AMDGCNTranslation.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/ExecutionEngine/ExecutionEngine.h" +#include "mlir/ExecutionEngine/OptUtils.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/Dialect.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Pass/PassManager.h" +#include "mlir/Support/LogicalResult.h" +#include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h" +#include "mlir/Target/LLVMIR/Export.h" +#include "mlir/Target/LLVMIR/LLVMTranslationInterface.h" +#include "triton/Target/LLVMIR/LLVMIRTranslation.h" + +#include "llvm/ExecutionEngine/ExecutionEngine.h" +#include "llvm/ExecutionEngine/SectionMemoryManager.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IRPrintingPasses.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Verifier.h" +#include "llvm/MC/TargetRegistry.h" +#include 
"llvm/Support/CodeGen.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/TargetSelect.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Cloning.h" + +namespace triton { + +static void init_llvm() { + LLVMInitializeAMDGPUTargetInfo(); + LLVMInitializeAMDGPUTarget(); + LLVMInitializeAMDGPUTargetMC(); + LLVMInitializeAMDGPUAsmPrinter(); +} + +static std::string llir_to_amdgcn(llvm::Module *module, + const std::string &_proc) { + init_llvm(); + + llvm::SmallVector buffer; + std::string triple = "amdgcn-amd-amdhsa"; + std::string layout = ""; + std::string features = "+sramecc,-xnack"; + // verify and store llvm + llvm::legacy::PassManager pm; + pm.add(llvm::createVerifierPass()); + pm.run(*module); + // create machine + module->setTargetTriple(triple); + std::string error; + auto target = + llvm::TargetRegistry::lookupTarget(module->getTargetTriple(), error); + llvm::TargetOptions opt; + + opt.AllowFPOpFusion = llvm::FPOpFusion::Fast; + opt.UnsafeFPMath = false; + opt.NoInfsFPMath = false; + opt.NoNaNsFPMath = true; + + llvm::TargetMachine *machine = target->createTargetMachine( + module->getTargetTriple(), _proc, features, opt, llvm::Reloc::PIC_, + llvm::None, llvm::CodeGenOpt::None); + + // set data layout + if (layout.empty()) + module->setDataLayout(machine->createDataLayout()); + else + module->setDataLayout(layout); + // emit machine code + for (llvm::Function &f : module->functions()) { + f.addFnAttr(llvm::Attribute::AlwaysInline); + } + + llvm::legacy::PassManager pass; + llvm::raw_svector_ostream stream(buffer); + + // emit + machine->addPassesToEmitFile(pass, stream, nullptr, + llvm::CodeGenFileType::CGFT_AssemblyFile); + pass.run(*module); + + std::string amdgcn(buffer.begin(), buffer.end()); + + return amdgcn; +} + +std::string translateLLVMIRToAMDGCN(llvm::Module 
&module, + const std::string &_proc) { + auto gcnCode = llir_to_amdgcn(&module, _proc); + return gcnCode; +} + +} // namespace triton \ No newline at end of file diff --git a/lib/Target/AMDGCN/CMakeLists.txt b/lib/Target/AMDGCN/CMakeLists.txt new file mode 100644 index 000000000000..3c2076e75bcb --- /dev/null +++ b/lib/Target/AMDGCN/CMakeLists.txt @@ -0,0 +1,12 @@ +add_mlir_translation_library(TritonAMDGCN + AMDGCNTranslation.cpp + + LINK_COMPONENTS + Core + + LINK_LIBS PUBLIC + MLIRIR + MLIRLLVMIR + MLIRSupport + MLIRTargetLLVMIRExport + ) diff --git a/lib/Target/CMakeLists.txt b/lib/Target/CMakeLists.txt index 9b24f0ff225b..c893c378d46b 100644 --- a/lib/Target/CMakeLists.txt +++ b/lib/Target/CMakeLists.txt @@ -1,2 +1,3 @@ +add_subdirectory(AMDGCN) add_subdirectory(LLVMIR) add_subdirectory(PTX) diff --git a/python/src/triton.cc b/python/src/triton.cc index 7d57ddab4c8c..dc015e1ca0c6 100644 --- a/python/src/triton.cc +++ b/python/src/triton.cc @@ -18,6 +18,7 @@ #include "triton/Dialect/Triton/IR/Types.h" #include "triton/Dialect/Triton/Transforms/Passes.h" #include "triton/Dialect/TritonGPU/Transforms/Passes.h" +#include "triton/Target/AMDGCN/AMDGCNTranslation.h" #include "triton/Target/LLVMIR/LLVMIRTranslation.h" #include "triton/Target/PTX/PTXTranslation.h" #include "triton/tools/sys/getenv.hpp" @@ -1273,6 +1274,23 @@ void init_triton_translation(py::module &m) { }, ret::take_ownership); + m.def( + "translate_llvmir_to_amdgcn", + [](const std::string llvmIR, int gfx_number) -> std::string { + // create LLVM module from C++ + llvm::LLVMContext context; + std::unique_ptr<llvm::MemoryBuffer> buffer = + llvm::MemoryBuffer::getMemBuffer(llvmIR.c_str()); + llvm::SMDiagnostic error; + std::unique_ptr<llvm::Module> module = + llvm::parseIR(buffer->getMemBufferRef(), error, context); + // translate module to AMDGCN + std::string target = "gfx" + std::to_string(gfx_number); + auto gcnCode = triton::translateLLVMIRToAMDGCN(*module, target); + return gcnCode; + }, + ret::take_ownership); + 
m.def("compile_ptx_to_cubin", [](const std::string &ptxCode, const std::string &ptxasPath, int capability) -> py::object { diff --git a/python/triton/compiler.py b/python/triton/compiler.py index 1118f77765f6..3f76ac42698e 100644 --- a/python/triton/compiler.py +++ b/python/triton/compiler.py @@ -890,6 +890,8 @@ def optimize_tritongpu_ir(mod, num_stages): def make_llvm_ir(mod): return _triton.translate_triton_gpu_to_llvmir(mod) +def make_amdgcn(mod: Any, gfx_number: int): + return _triton.translate_llvmir_to_amdgcn(mod, gfx_number) def make_ptx(mod: Any, compute_capability: int, ptx_version: int) -> Tuple[str, int]: ''' diff --git a/python/triton/tools/aot.py b/python/triton/tools/aot.py index 72df49d4c299..05bda9f59d41 100644 --- a/python/triton/tools/aot.py +++ b/python/triton/tools/aot.py @@ -6,7 +6,7 @@ if __name__ == '__main__': # valid source and target formats - VALID_FORMATS = ['triton-ir', 'triton-gpu-ir', 'llvm-ir', 'ptx'] + VALID_FORMATS = ['triton-ir', 'triton-gpu-ir', 'llvm-ir', 'ptx', 'amdgcn'] # set up the argument parser # TODO: conditional requirements @@ -16,7 +16,7 @@ help="Target format, one of: " + ', '.join(VALID_FORMATS)) parser.add_argument('--sm', type=int, help="Compute capability to compile for") parser.add_argument('--ptx-version', type=int, help="PTX version to compile for") - + parser.add_argument('--gfx', type=int, help="AMDGPU target to compile for") # parse the args args = parser.parse_args() @@ -50,12 +50,17 @@ print(module) exit(0) - if not args.sm: - raise argparse.ArgumentError(None, "Must specify --sm for PTX compilation") - if not args.ptx_version: - raise argparse.ArgumentError(None, "Must specify --ptx-version for PTX compilation") - - # llvm-ir -> ptx - module = triton.compiler.make_ptx(module, compute_capability=args.sm, ptx_version=args.ptx_version) - assert args.target == 'ptx' + if args.target == 'ptx': + if not args.sm: + raise argparse.ArgumentError(None, "Must specify --sm for PTX compilation") + if not 
args.ptx_version: + raise argparse.ArgumentError(None, "Must specify --ptx-version for PTX compilation") + # llvm-ir -> ptx + module = triton.compiler.make_ptx(module, compute_capability=args.sm, ptx_version=args.ptx_version) + + if args.target == 'amdgcn': + if not args.gfx: + raise argparse.ArgumentError(None, "Must specify --gfx for AMDGCN compilation") + # llvm-ir -> amdgcn + module = triton.compiler.make_amdgcn(module, args.gfx) print(module)