Skip to content

Commit

Permalink
[UKernel] Add ukernel to be compiled through peano (#1097)
Browse files Browse the repository at this point in the history
Adds a path to compile ukernels through peano. Compilation is much
faster this way, possibly at some cost in runtime performance, although
ukernels could potentially be written in a way that performs well when
compiled with peano too.
  • Loading branch information
jtuyls authored Feb 12, 2025
1 parent a61912b commit 08e7777
Show file tree
Hide file tree
Showing 11 changed files with 295 additions and 64 deletions.
16 changes: 15 additions & 1 deletion build_tools/ci/cpu_comparison/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ def __init__(
lower_to_aie_pipeline="objectFifo",
name_suffix="",
use_chess=False,
use_chess_for_ukernel=True,
use_ukernel=False,
run_benchmark=False,
n_repeats=1,
Expand All @@ -59,6 +60,7 @@ def __init__(
self.lower_to_aie_pipeline = lower_to_aie_pipeline
self.name_suffix = name_suffix
self.use_chess = use_chess
self.use_chess_for_ukernel = use_chess_for_ukernel
self.use_ukernel = use_ukernel
self.run_benchmark = run_benchmark
self.n_repeats = n_repeats
Expand Down Expand Up @@ -104,6 +106,7 @@ def __init__(
tile_pipeline = test_params.tile_pipeline
lower_to_aie_pipeline = test_params.lower_to_aie_pipeline
use_chess = test_params.use_chess
use_chess_for_ukernel = test_params.use_chess_for_ukernel
use_ukernel = test_params.use_ukernel
run_benchmark = test_params.run_benchmark
n_repeats = test_params.n_repeats
Expand All @@ -128,8 +131,17 @@ def __init__(
self.labels.append("Peano")

if use_ukernel:
self.name += "_ukernel"
self.labels.append("UKernel")
if use_chess_for_ukernel:
self.name += "_ukernel_chess"
self.add_aie_compilation_flags(
[f"--iree-amd-aie-enable-chess-for-ukernel=1"]
)
else:
self.name += "_ukernel_peano"
self.add_aie_compilation_flags(
[f"--iree-amd-aie-enable-chess-for-ukernel=0"]
)

if run_benchmark:
self.name += "_benchmark"
Expand Down Expand Up @@ -1707,7 +1719,9 @@ def __init__(self):
test_params=TestParams(
use_ukernel=True,
use_chess=False,
use_chess_for_ukernel=False,
run_on_target=["npu4"],
tile_pipeline="pack-peel-4-level-tiling",
aie_compilation_flags=[
"--iree-amdaie-num-rows=4",
"--iree-amdaie-num-cols=8",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -488,6 +488,7 @@ LogicalResult AIETargetBackend::serializeExecutable(
/*timing=*/options.aie2xclbinTiming,
/*tempDir=*/entryPointWorkDir.str().str(),
/*useChess=*/options.useChess,
/*useChessForUKernel=*/options.useChessForUKernel,
/*verbose=*/options.showInvokedCommands,
/*vitisDir=*/options.vitisInstallDir.empty()
? std::nullopt
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@ struct AMDAIEOptions {
// Use the chess compiler. The default is to use peano.
bool useChess{false};

// Use the chess compiler for ukernels. Defaults to true (chess); set to false to compile ukernels with peano instead.
bool useChessForUKernel{true};

// Additional flags to run peano's opt with (if peano is the backend compiler
// selected). These are mostly appended on the end of the default flags, but
// some flags may replace existing flags if they conflict.
Expand Down Expand Up @@ -127,6 +130,11 @@ struct AMDAIEOptions {
llvm::cl::cat(category),
llvm::cl::desc("Use the legacy chess compiler"));

binder.opt<bool>(
"iree-amd-aie-enable-chess-for-ukernel", useChessForUKernel,
llvm::cl::cat(category),
llvm::cl::desc("Use the chess compiler for compiling ukernels"));

binder.opt<std::string>(
"iree-amdaie-enable-ukernels", enableAMDAIEUkernels,
llvm::cl::cat(category),
Expand Down
156 changes: 105 additions & 51 deletions compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,23 @@ using Path = std::filesystem::path;
namespace mlir::iree_compiler::AMDAIE {
namespace detail {

// Splits a quoted flag string of the form "-flag1 -flag2 ..." into its
// individual whitespace-separated flags.
//
// An empty `flags` string yields an empty vector. Otherwise `flags` must start
// and end with a double quote character; the quotes are stripped before the
// remainder is split on whitespace. If the quotes are missing, a diagnostic is
// printed to llvm::errs() and failure is returned.
FailureOr<std::vector<std::string>> flagStringToVector(
    const std::string &flags) {
  if (flags.empty()) return std::vector<std::string>{};
  // The size check rejects a lone `"`, for which front() and back() are the
  // same character.
  if (flags.size() < 2 || flags.front() != '"' || flags.back() != '"') {
    // Terminate the diagnostic with a newline so that any message the caller
    // prints afterwards starts on its own line.
    llvm::errs()
        << "additional peano opt flags must be of the form "
           "\"-flag1 -flag2 ...\". Specifically it must start and end with \".\n";
    return failure();
  }
  // Drop the surrounding quotes, then split what remains on whitespace.
  std::istringstream iss(flags.substr(1, flags.size() - 2));
  return std::vector<std::string>{std::istream_iterator<std::string>{iss},
                                  std::istream_iterator<std::string>{}};
}

// Peano's `opt` program optimizes llvm-ir (.ll files). We run it with a system
// call. This functions constructs the flags to pass to `opt`. There are some
// default flags, most of which are copied from llvm-aie. See
Expand All @@ -70,11 +87,10 @@ namespace detail {
// clang-format on
//
// There are also additional flags which have been passed down from the user,
// `additionalPeanoOptFlags`. This function appends these user specific flags,
// `additionalFlags`. This function appends these user specific flags,
// and checks that they are valid. If they are not, it returns failure.
FailureOr<std::vector<std::string>> makePeanoOptArgs(
const std::string &filenameIrIn, const std::string &filenameIrOut,
const std::string &additionalPeanoOptFlags) {
const std::vector<std::string> &additionalFlags) {
std::vector<std::string> args{
// peano has no proper vectorization cost model for AIE
"-vectorize-loops=false",
Expand All @@ -98,33 +114,9 @@ FailureOr<std::vector<std::string>> makePeanoOptArgs(
"--inline-threshold=10",
// missing from libc
"--disable-builtin=memset",
// Source file, IR to optimize
"-S", filenameIrIn,
// Output file, optimized IR
"-o", filenameIrOut};

if (additionalPeanoOptFlags.empty()) return args;

// Check that additionalPeanoOptFlags is of the form "-flag1 -flag2".
// i.e. that it starts and ends with ".
if (additionalPeanoOptFlags.size() < 2 ||
additionalPeanoOptFlags.front() != '"' ||
additionalPeanoOptFlags.back() != '"') {
llvm::errs()
<< "additional peano opt flags must be of the form "
"\"-flag1 -flag2 ...\". Specifically it must start and end with \".";
return failure();
}
};

// TODO(newling) use string_view, shouldn't need to copy the string here.
std::string stripped =
additionalPeanoOptFlags.substr(1, additionalPeanoOptFlags.size() - 2);

// Split the additional flags on whitespace, and then add to the default args.
std::istringstream iss(stripped);
std::vector<std::string> additionalFlags{
std::istream_iterator<std::string>{iss},
std::istream_iterator<std::string>{}};
if (additionalFlags.empty()) return args;

// Return true if `flag` is an optimization level flag, like -O2.
auto isOptLevelFlag = [](const std::string &flag) {
Expand Down Expand Up @@ -202,10 +194,14 @@ static const std::string _CHESS_INTRINSIC_WRAPPER_CPP{
static const std::string _MM_NPU1_CC{
#include "mm_npu1.cc"
};
// This is a string that contains a mm kernel for npu4.
// This is a string that contains npu4 kernels for compilation by chess.
static const std::string _MM_NPU4_CC{
#include "mm_npu4.cc"
};
// This is a string that contains npu4 kernels for compilation by peano.
static const std::string _MM_NPU4_PEANO_CC{
#include "mm_npu4_peano.cc"
};

FailureOr<std::string> getTargetDir(const std::string &npuVersion) {
if (npuVersion == "npu1") return std::string{"target_aie_ml"};
Expand Down Expand Up @@ -509,6 +505,27 @@ LogicalResult runTool(
return success();
}

// Compiles `inputFile` into the object file `outputFile` by running peano's
// clang, located at `peanoDir`/bin/clang.
//
// The command line is `extraArgs` followed by `-O1 -c <inputFile> -o
// <outputFile>`, plus `-v` when `verbose` is set. `_tempDir` and `npuVersion`
// are not used in this function; presumably they exist so the signature lines
// up with assembleFileUsingChess when passed to assembleStringUsing.
// Returns failure, after printing a diagnostic, if the tool invocation fails.
static LogicalResult assembleFileUsingPeano(
    const std::string &inputFile, const std::string &outputFile,
    const std::vector<std::string> &extraArgs, Path &_tempDir, Path &peanoDir,
    const std::string &npuVersion, bool verbose) {
  std::vector<std::string> args;
  // Room for the caller's flags plus the (at most) six fixed arguments below.
  args.reserve(extraArgs.size() + 6);
  args.insert(args.end(), extraArgs.begin(), extraArgs.end());
  // TODO(jornt): O0 fails with peano, so we use O1 for now.
  args.emplace_back("-O1");
  args.emplace_back("-c");
  args.emplace_back(inputFile);
  args.emplace_back("-o");
  args.emplace_back(outputFile);
  if (verbose) args.emplace_back("-v");
  if (failed(runTool((peanoDir / "bin" / "clang").string(), args, verbose))) {
    // `outputFile` is already the full name handed to `-o`, so report it
    // verbatim rather than appending another ".o".
    llvm::errs() << "Failed to assemble " << outputFile << " with peano\n";
    return failure();
  }
  return success();
}

LogicalResult assembleFileUsingChess(const std::string &inputFile,
const std::string &outputFile,
const std::vector<std::string> &extraArgs,
Expand Down Expand Up @@ -560,10 +577,15 @@ static auto assembleStringUsingChess =
std::bind(assembleStringUsing, assembleFileUsingChess, _1, _2, _3, _4, _5,
_6, _7, _8, _9);

// Callable that invokes assembleStringUsing with assembleFileUsingPeano bound
// as its first argument; the nine placeholder arguments (_1 through _9) are
// forwarded unchanged.
static auto assembleStringUsingPeano =
std::bind(assembleStringUsing, assembleFileUsingPeano, _1, _2, _3, _4, _5,
_6, _7, _8, _9);

// Generate the elf files for the core
LogicalResult generateCoreElfFiles(AIE::DeviceOp deviceOp,
const std::string &objFile, Path &tempDir,
bool useChess, std::optional<Path> vitisDir,
bool useChess, bool useChessForUKernel,
std::optional<Path> vitisDir,
const std::string &targetArch, bool verbose,
Path peanoDir, const std::string &npuVersion,
const std::optional<std::string> &ukernel) {
Expand All @@ -578,7 +600,7 @@ LogicalResult generateCoreElfFiles(AIE::DeviceOp deviceOp,
ukernelFileName = "mm_npu1.cc";
ukernelObjectName = "mm_npu1.o";
} else if (npuVersion == "npu4") {
ukernelFileContent = _MM_NPU4_CC;
ukernelFileContent = useChessForUKernel ? _MM_NPU4_CC : _MM_NPU4_PEANO_CC;
ukernelFileName = "mm_npu4.cc";
ukernelObjectName = "mm_npu4.o";
} else {
Expand Down Expand Up @@ -613,15 +635,30 @@ LogicalResult generateCoreElfFiles(AIE::DeviceOp deviceOp,
return failure();
}
if (!std::filesystem::exists(cwd / ukernelObjectName)) {
mmObjectFilePath = assembleStringUsingChess(
/*inputFileStr=*/ukernelFileContent,
/*inputFileName=*/ukernelFileName,
/*outputFileName=*/ukernelObjectName,
/*outputDir=*/cwd,
/*extraArgs*/ std::vector<std::string>{},
/*workDir=*/tempDir,
/*vitisDir=*/*maybeVitisDir,
/*npuVersion*/ npuVersion, verbose);
if (useChessForUKernel) {
mmObjectFilePath = assembleStringUsingChess(
/*inputFileStr=*/ukernelFileContent,
/*inputFileName=*/ukernelFileName,
/*outputFileName=*/ukernelObjectName,
/*outputDir=*/cwd,
/*extraArgs=*/std::vector<std::string>{},
/*workDir=*/tempDir,
/*vitisDir=*/*maybeVitisDir,
/*npuVersion*/ npuVersion, verbose);
} else {
std::string targetLower = StringRef(targetArch).lower();
std::vector<std::string> extraArgs{"--target=" + targetLower +
"-none-unknown-elf"};
mmObjectFilePath = assembleStringUsingPeano(
/*inputFileStr=*/ukernelFileContent,
/*inputFileName=*/ukernelFileName,
/*outputFileName=*/ukernelObjectName,
/*outputDir=*/cwd,
/*extraArgs=*/extraArgs,
/*workDir=*/tempDir,
/*vitisDir=*/peanoDir,
/*npuVersion*/ npuVersion, verbose);
}
if (failed(mmObjectFilePath)) return failure();
} else {
mmObjectFilePath = cwd / ukernelObjectName;
Expand Down Expand Up @@ -1113,15 +1150,30 @@ LogicalResult generateUnifiedObject(

std::string OptLLVMIRFile = (tempDir / "input.opt.ll").string();

FailureOr<std::vector<std::string>> peanoArgs =
FailureOr<std::vector<std::string>> maybeAdditionalPeanoArgs =
mlir::iree_compiler::AMDAIE::detail::flagStringToVector(
additionalPeanoOptFlags);
if (failed(maybeAdditionalPeanoArgs)) {
llvm::errs() << "Failed to parse additional peano args\n";
return failure();
}

FailureOr<std::vector<std::string>> maybePeanoArgs =
mlir::iree_compiler::AMDAIE::detail::makePeanoOptArgs(
LLVMIRFile, OptLLVMIRFile, additionalPeanoOptFlags);
if (failed(peanoArgs)) {
maybeAdditionalPeanoArgs.value());
if (failed(maybePeanoArgs)) {
llvm::errs() << "Failed to make peano opt args\n";
return failure();
}

if (failed(runTool(peanoOptBin.string(), peanoArgs.value(), verbose))) {
std::vector<std::string> peanoArgs = maybePeanoArgs.value();
// Source file, IR to optimize
peanoArgs.emplace_back("-S");
peanoArgs.emplace_back(LLVMIRFile);
// Output file, optimized IR
peanoArgs.emplace_back("-o");
peanoArgs.emplace_back(OptLLVMIRFile);

if (failed(runTool(peanoOptBin.string(), peanoArgs, verbose))) {
llvm::errs() << "Failed to optimize ll with peano\n";
llvm::errs() << "Using peano at provided path: '" << peanoDir.string()
<< "'\n";
Expand Down Expand Up @@ -1216,9 +1268,10 @@ LogicalResult aie2xclbin(
const std::optional<std::string> &outputNPU, bool emitCtrlPkt,
const std::string &artifactPath, bool printIRBeforeAll,
bool printIRAfterAll, bool printIRModuleScope, bool timing,
const std::string &tempDir, bool useChess, bool verbose,
const std::optional<std::string> &vitisDir, const std::string &targetArch,
const std::string &npuVersion, const std::string &peanoDir,
const std::string &tempDir, bool useChess, bool useChessForUKernel,
bool verbose, const std::optional<std::string> &vitisDir,
const std::string &targetArch, const std::string &npuVersion,
const std::string &peanoDir,
const mlir::iree_compiler::AMDAIE::AMDAIEOptions::DeviceHAL deviceHal,
const std::string &xclBinKernelID, const std::string &xclBinKernelName,
const std::string &xclBinInstanceName, const std::string &amdAIEInstallDir,
Expand Down Expand Up @@ -1248,8 +1301,9 @@ LogicalResult aie2xclbin(
}

if (failed(generateCoreElfFiles(deviceOp, unifiedObj.string(), tempDirPath,
useChess, vitisDirPath, targetArch, verbose,
peanoDir, npuVersion, ukernel))) {
useChess, useChessForUKernel, vitisDirPath,
targetArch, verbose, peanoDir, npuVersion,
ukernel))) {
llvm::errs() << "Failed to generate core ELF file(s)\n";
return failure();
}
Expand Down
14 changes: 8 additions & 6 deletions compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,10 @@ mlir::LogicalResult aie2xclbin(
const std::optional<std::string> &outputNPU, bool emitCtrlPkt,
const std::string &artifactPath, bool printIRBeforeAll,
bool printIRAfterAll, bool printIRModuleScope, bool timing,
const std::string &tempDir, bool useChess, bool verbose,
const std::optional<std::string> &vitisDir, const std::string &targetArch,
const std::string &npuVersion, const std::string &peanoDir,
const std::string &tempDir, bool useChess, bool useChessForUKernel,
bool verbose, const std::optional<std::string> &vitisDir,
const std::string &targetArch, const std::string &npuVersion,
const std::string &peanoDir,
const mlir::iree_compiler::AMDAIE::AMDAIEOptions::DeviceHAL deviceHal,
const std::string &xclBinKernelID, const std::string &xclBinKernelName,
const std::string &xclBinInstanceName, const std::string &amdAIEInstallDir,
Expand All @@ -31,8 +32,9 @@ mlir::LogicalResult emitNpuInstructions(xilinx::AIE::DeviceOp deviceOp,
const std::string &outputNPU);

namespace detail {
FailureOr<std::vector<std::string>> flagStringToVector(
const std::string &flags);
FailureOr<std::vector<std::string>> makePeanoOptArgs(
const std::string &filenameIrIn, const std::string &filenameIrOut,
const std::string &additionalPeanoOptFlags);
}
const std::vector<std::string> &additionalPeanoOptFlags);
} // namespace detail
} // namespace mlir::iree_compiler::AMDAIE
Loading

0 comments on commit 08e7777

Please sign in to comment.