intel · sarnex · Jul 15, 2024 · Jul 15, 2024 · Sep 18, 2024 · Sep 18, 2024
@@ -1,5 +1,6 @@
 set(LLVM_LINK_COMPONENTS
   ${LLVM_TARGETS_TO_BUILD}
+  BitReader
   BitWriter
   Core
   BinaryFormat

@@ -17,9 +17,9 @@
 
 #if defined(__SPIR__) || defined(__SPIRV__) || defined(__NVPTX__)
 #ifdef __SYCL_DEVICE_ONLY__
-#define DEVICE_EXTERNAL SYCL_EXTERNAL __attribute__((weak))
+#define DEVICE_EXTERNAL SYCL_EXTERNAL
 #else // __SYCL_DEVICE_ONLY__
-#define DEVICE_EXTERNAL __attribute__((weak))
+#define DEVICE_EXTERNAL
 #endif // __SYCL_DEVICE_ONLY__
 
 #define DEVICE_EXTERN_C DEVICE_EXTERNAL EXTERN_C

@@ -114,9 +114,4 @@ DEVICE_EXTERN_C void __devicelib_assert_fail(const char *expr, const char *file,
   __assertfail(expr, file, line, func, 1);
 }
 
-DEVICE_EXTERN_C void _wassert(const char *_Message, const char *_File,
-                              unsigned _Line) {
-  __assertfail(_Message, _File, _Line, 0, 1);
-}
-
 #endif
@@ -321,6 +321,7 @@ splitSYCLModule(std::unique_ptr<Module> M, ModuleSplitterSettings Settings);
 
 bool isESIMDFunction(const Function &F);
 bool canBeImportedFunction(const Function &F);
+bool isEntryPoint(const Function &F, bool EmitOnlyKernelsAsEntryPoints);
 
 } // namespace module_split
 

@@ -0,0 +1,22 @@
+//===-- SYCLLinkedModuleProcessor.h - finalize a fully linked module ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// The file contains a number of functions to create a pass that can be called
+// by the LTO backend that will finalize a fully-linked module.
+//===----------------------------------------------------------------------===//
+#pragma once
+#include "SpecConstants.h"
+namespace llvm {
+
+class PassRegistry;
+class ModulePass;
+// Creates the pass that finalizes a fully linked SYCL module. The argument is
+// the SpecConstantsPass handling mode the created pass will use.
+ModulePass *
+    createSYCLLinkedModuleProcessorPass(llvm::SpecConstantsPass::HandlingMode);
+// Initialization hook for the SYCLLinkedModuleProcessor pass; takes the
+// PassRegistry the pass should be initialized against.
+void initializeSYCLLinkedModuleProcessorPass(PassRegistry &);
+
+} // namespace llvm
@@ -1077,8 +1077,8 @@ Error LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
     for (const std::string &Name : Conf.ThinLTOModulesToCompile) {
       if (BM.getModuleIdentifier().contains(Name)) {
         ThinLTO.ModulesToCompile->insert({BM.getModuleIdentifier(), BM});
-        llvm::errs() << "[ThinLTO] Selecting " << BM.getModuleIdentifier()
-                     << " to compile\n";
+        LLVM_DEBUG(dbgs() << "[ThinLTO] Selecting " << BM.getModuleIdentifier()
+                          << " to compile\n");
       }
     }
   }

@@ -65,6 +65,7 @@ add_llvm_component_library(LLVMSYCLLowerIR
   SYCLDeviceRequirements.cpp
   SYCLKernelParamOptInfo.cpp
   SYCLJointMatrixTransform.cpp
+  SYCLLinkedModuleProcessor.cpp
   SYCLPropagateAspectsUsage.cpp
   SYCLPropagateJointMatrixUsage.cpp
   SYCLVirtualFunctionsAnalysis.cpp

@@ -117,32 +117,6 @@ bool isKernel(const Function &F) {
          F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
 }
 
-bool isEntryPoint(const Function &F, bool EmitOnlyKernelsAsEntryPoints) {
-  // Skip declarations, if any: they should not be included into a vector of
-  // entry points groups or otherwise we will end up with incorrectly generated
-  // list of symbols.
-  if (F.isDeclaration())
-    return false;
-
-  // Kernels are always considered to be entry points
-  if (isKernel(F))
-    return true;
-
-  if (!EmitOnlyKernelsAsEntryPoints) {
-    // If not disabled, SYCL_EXTERNAL functions with sycl-module-id attribute
-    // are also considered as entry points (except __spirv_* and __sycl_*
-    // functions)
-    return llvm::sycl::utils::isSYCLExternalFunction(&F) &&
-           !isSpirvSyclBuiltin(F.getName()) && !isESIMDBuiltin(F.getName()) &&
-           !isGenericBuiltin(F.getName());
-  }
-
-  // Even if we are emitting only kernels as entry points, virtual functions
-  // should still be treated as entry points, because they are going to be
-  // outlined into separate device images and linked in later.
-  return F.hasFnAttribute("indirectly-callable");
-}
-
 // Represents "dependency" or "use" graph of global objects (functions and
 // global variables) in a module. It is used during device code split to
 // understand which global variables and functions (other than entry points)
@@ -445,6 +419,32 @@ class ModuleSplitter : public ModuleSplitterBase {
 namespace llvm {
 namespace module_split {
 
+/// Decides whether \p F should be treated as an entry point of a device
+/// image.
+///
+/// \param F function to classify.
+/// \param EmitOnlyKernelsAsEntryPoints when true, SYCL_EXTERNAL functions are
+/// not considered; only kernels and "indirectly-callable" functions qualify.
+/// \returns true if \p F is an entry point, false otherwise.
+bool isEntryPoint(const Function &F, bool EmitOnlyKernelsAsEntryPoints) {
+  // A declaration has no body to emit; listing it among the entry points
+  // would produce an incorrect list of symbols.
+  if (F.isDeclaration())
+    return false;
+
+  // Every kernel is an entry point, regardless of the mode.
+  if (isKernel(F))
+    return true;
+
+  // Kernels-only mode: virtual functions must still count as entry points,
+  // because they are going to be outlined into separate device images and
+  // linked in later.
+  if (EmitOnlyKernelsAsEntryPoints)
+    return F.hasFnAttribute("indirectly-callable");
+
+  // Otherwise SYCL_EXTERNAL functions with the sycl-module-id attribute are
+  // entry points as well ...
+  if (!llvm::sycl::utils::isSYCLExternalFunction(&F))
+    return false;
+
+  // ... unless the name belongs to one of the builtin families
+  // (SPIR-V/SYCL, ESIMD or generic builtins).
+  const bool IsBuiltin = isSpirvSyclBuiltin(F.getName()) ||
+                         isESIMDBuiltin(F.getName()) ||
+                         isGenericBuiltin(F.getName());
+  return !IsBuiltin;
+}
+
+
 std::optional<IRSplitMode> convertStringToSplitMode(StringRef S) {
   static const StringMap<IRSplitMode> Values = {{"kernel", SPLIT_PER_KERNEL},
                                                 {"source", SPLIT_PER_TU},

@@ -0,0 +1,45 @@
+//===-- SYCLLinkedModuleProcessor.cpp - finalize a fully linked module ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// See comments in the header.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/SYCLLowerIR/SYCLLinkedModuleProcessor.h"
+
+#include "llvm/Pass.h"
+
+#define DEBUG_TYPE "sycl-linked-module-processor"
+using namespace llvm;
+
+namespace {
+/// ModulePass that finalizes a fully linked SYCL module. At present the only
+/// work it performs is running SpecConstantsPass in the handling mode supplied
+/// at construction.
+class SYCLLinkedModuleProcessor : public ModulePass {
+public:
+  static char ID;
+
+  /// \param Mode handling mode forwarded to SpecConstantsPass on every run.
+  ///
+  /// The constructor is explicit so that a bare HandlingMode value can never
+  /// be converted to a pass object implicitly.
+  explicit SYCLLinkedModuleProcessor(SpecConstantsPass::HandlingMode Mode)
+      : ModulePass(ID), Mode(Mode) {
+    initializeSYCLLinkedModuleProcessorPass(*PassRegistry::getPassRegistry());
+  }
+
+  /// Runs SpecConstantsPass over \p M.
+  /// \returns true when the pass reports that not all analyses are preserved,
+  /// which is used here as the "module was changed" answer.
+  bool runOnModule(Module &M) override {
+    // TODO: determine if we need to run other passes
+    // SpecConstantsPass::run expects a ModuleAnalysisManager; a local,
+    // default-constructed one is handed to it.
+    ModuleAnalysisManager MAM;
+    SpecConstantsPass SCP(Mode);
+    auto PA = SCP.run(M, MAM);
+    return !PA.areAllPreserved();
+  }
+
+private:
+  /// Spec constant handling mode given to SpecConstantsPass.
+  SpecConstantsPass::HandlingMode Mode;
+};
+} // namespace
+// Storage for the pass identifier that the constructor hands to ModulePass.
+char SYCLLinkedModuleProcessor::ID = 0;
+INITIALIZE_PASS(SYCLLinkedModuleProcessor, "SYCLLinkedModuleProcessor",
+                "Finalize a fully linked SYCL module", false, false)
+// Returns a newly allocated SYCLLinkedModuleProcessor configured with Mode.
+ModulePass *llvm::createSYCLLinkedModuleProcessorPass(
+    SpecConstantsPass::HandlingMode Mode) {
+  return new SYCLLinkedModuleProcessor(Mode);
+}
@@ -550,6 +550,9 @@ unit)
 - `off` - disables device code split. If `-fno-sycl-rdc` is specified, the behavior is
    the same as `per_source`
 
+If ThinLTO is enabled, device code splitting is run during the compilation stage.
+See [here](ThinLTO.md) for more information.
+
 ##### Symbol table generation
 
 TBD

@@ -0,0 +1,147 @@
+# ThinLTO for SYCL
+
+This document describes the purpose and design of ThinLTO for SYCL.
+
+**NOTE**: This is not the final version. The document is still in progress.
+
+## Background
+
+With traditional SYCL device code linking, all user code is linked together 
+along with device libraries into a single huge module and then split and 
+processed by `sycl-post-link`. This requires sequential processing, has a large 
+memory footprint, and differs from the linking flow for AMD and NVIDIA devices.
+
+## Summary
+SYCL ThinLTO will hook into the existing community mechanism to run LTO as part 
+of device linking inside `clang-linker-wrapper`. We split the device images 
+early at compilation time, and at link time we use ThinLTO's function importing 
+feature
+to bring in the definitions for referenced functions. Only the new offload model
+is supported.
+
+## Device code compilation time changes
+Most of the changes for ThinLTO occur during device link time, however there is 
+one major change during compilation (-c) time: we now run device code split 
+during compilation instead of linking.
+The main reason for doing this is increased parallelization. Many compilation 
+jobs can be run at the same time, but linking happens once total for the 
+application. Device code split is currently a common source of performance 
+issues.
+
+Splitting early means that the resulting IR after splitting is not complete: it 
+still may contain calls to functions (user code and/or the SYCL device 
+libraries) from other object files.
+
+We rely on the assumption that all function definitions matching a declaration 
+will be the same and we can let ThinLTO pull in any one.
+
+For example, let's start with user device code that defines a `SYCL_EXTERNAL` 
+function `foo` in translation unit `tu_foo`. There is also another translation 
+unit `tu_bar` that references `foo`.
+During the early device code splitting run of `tu_foo`, we may find that more 
+than one of the resultant device images contain a definition for `foo`.
+
+We assert that any function definition for `foo` that is deemed a match by the 
+ThinLTO infrastructure during the processing of `tu_bar` is valid.
+
+As a result of running early device code split, the fat object file generated 
+as part of device compilation may contain multiple device code images.
+
+## Device code link time changes
+
+Before we go into the link time changes for SYCL, let's understand the device 
+linking flow for community devices (AMD/NVIDIA):
+
+![Community linking flow](images/ThinLTOCommunityFlow.svg)
+
+SYCL has two differentiating requirements:
+1) The SPIR-V backend is not production ready and the SPIR-V translator is used.
+2) The SYCL runtime requires metadata (module properties and module symbol 
+table) computed from device images that will be stored along the device images 
+in the fat executable.
+
+The effect of requirement 1) is that instead of letting ThinLTO call the SPIR-V 
+backend, we add a callback that runs right before codegen would run.
+In that callback, we call the SPIR-V translator and store the resultant file 
+path for use later, and we instruct the ThinLTO framework to not
+perform codegen.
+
+An interesting additional fact about requirement 2) is that we actually need to 
+process a fully linked module to accurately compute the module properties. One 
+example where we need the full module is to [compute the required devicelib mask](https://github.com/intel/llvm/blob/sycl/llvm/lib/SYCLLowerIR/SYCLDeviceLibReqMask.cpp).
+If we only process the device code that was included in the 
+original fat object input to `clang-linker-wrapper`, we will miss devicelib 
+calls in referenced `SYCL_EXTERNAL` functions.
+
+The effect of requirement 2) is that we store the fully linked device image for 
+metadata computation in the SYCL-specific handling code after the ThinLTO 
+framework has completed. Another option would be to try to compute the metadata 
+inside the ThinLTO framework callbacks, but this would require SYCL-specific 
+arguments to many caller functions in the stack and pollute community code.
+
+Here is the current ThinLTO flow for SYCL:
+
+![SYCL linking flow](images/ThinLTOSYCLFlow.svg)
+
+We add a `PreCodeGenModuleHook` function to the `LTOConfig` object so that we 
+can process the fully linked module without running the backend.
+
+However, the flow is not ideal for many reasons:
+1) We are relying on the external `llvm-spirv` tool instead of the SPIR-V 
+backend. We could slightly improve this issue by using a library call to the 
+SPIR-V translator instead of the tool, however the library API requires setting 
+up an object to represent the arguments while we only have strings, and it's 
+non-trivial to parse the strings to figure out how to create the argument 
+object. Since we plan to use the SPIR-V backend in the long term, this does not 
+seem to be worth the effort.
+
+2) We manually run passes inside `PreCodeGenModuleHook`. This is because we 
+don't run codegen, so we can't take advantage of the `PreCodeGenPassesHook` 
+field of `LTOConfig` to run some custom passes, as those passes are only run 
+when we actually are going to run codegen.
+
+3) We have to store the fully linked module. This is needed because we need a 
+fully linked module to accurately compute metadata, see the above explanation 
+of SYCL requirement 2). We could get around storing the module by computing the 
+metadata inside the LTO framework and storing it for later use by the SYCL 
+bundling code, but doing this would require even more SYCL-only customizations including 
+even more new function arguments and modifications of the `OffloadFile` class. 
+There are also complications because the LTO framework is multithreaded, and not all 
+LLVM data structures are thread safe.
+
+The proposed long-term SYCL ThinLTO flow is as follows:
+
+![SYCL SPIR-V backend linking flow](images/ThinLTOSYCLSPIRVBackendFlow.svg)
+
+The biggest difference here is that we are running codegen using the SPIR-V 
+backend.
+
+Also, instead of using a lambda function in the `PreCodeGenModuleHook` 
+callback to run SYCL finalization passes, we can take advantage of the `PreCodeGenPassesHook` field to add 
+passes to the pass manager that the LTO framework will run.
+
+It is possible that the number of device images in the fat executable
+and which device image contains which kernel are different with ThinLTO
+enabled, but we do not expect this to have any impact on correctness or
+performance, nor do we expect users to care.
+
+
+## Current limitations
+
+`-O0`: Compiling with `-O0` prevents clang from generating ThinLTO metadata 
+during the compilation phase. In the current implementation, this is an error. 
+In the final version, we could either silently fall back to full LTO or 
+generate ThinLTO metadata even for `-O0`.
+
+SYCL libdevice: Currently all `libdevice` functions are explicitly marked to be 
+weak symbols. The ThinLTO framework does not consider a definition of a function 
+with weak linkage as it cannot be sure that this definition is the correct one. 
+Ideally we could remove the weak symbol annotation.
+
+No binary linkage: The SPIR-V target does not currently have a production 
+quality binary linker. This means that we must generate a fully linked image as 
+part of device linkage. At least for AMD devices, this is not a requirement as 
+`lld` is used for the final link which can resolve any unresolved symbols. 
+`-fno-gpu-rdc` is default for AMD, so in that case it can call `lld` during 
+compile, but if `-fgpu-rdc` is passed, the `lld` call happens as part of 
+`clang-linker-wrapper` to resolve any symbols not resolved by ThinLTO.