Skip to content

Commit

Permalink
Automerge: [AMDGPU][True16][CodeGen] true16 codegen pattern for fma (…
Browse files Browse the repository at this point in the history
…#127240)

The previous PR llvm/llvm-project#122950 was
reverted because it hit a buildbot failure. Another patch was merged
while that PR was under review, which left one test out of date.

Reopen this PR with the issue fixed.
  • Loading branch information
broxigarchen authored and github-actions[bot] committed Feb 19, 2025
2 parents 9a005b8 + 210036a commit 80cca23
Show file tree
Hide file tree
Showing 11 changed files with 913 additions and 256 deletions.
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,8 @@ static unsigned macToMad(unsigned Opc) {
return AMDGPU::V_FMA_F32_e64;
case AMDGPU::V_FMAC_F16_e64:
return AMDGPU::V_FMA_F16_gfx9_e64;
case AMDGPU::V_FMAC_F16_t16_e64:
return AMDGPU::V_FMA_F16_gfx9_t16_e64;
case AMDGPU::V_FMAC_F16_fake16_e64:
return AMDGPU::V_FMA_F16_gfx9_fake16_e64;
case AMDGPU::V_FMAC_LEGACY_F32_e64:
Expand Down
125 changes: 89 additions & 36 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3461,6 +3461,62 @@ std::optional<int64_t> SIInstrInfo::extractSubregFromImm(int64_t Imm,
llvm_unreachable("covered subregister switch");
}

/// Map a MAC/MAD/FMAC/FMA opcode to the MADAK/FMAAK-family opcode that
/// replaces it.
///
/// \param ST  Subtarget, queried only for the f16 FMA family to choose between
///            the plain, t16 and fake16 flavours of V_FMAAK_F16.
/// \param Opc Opcode of the instruction being rewritten.
/// \returns the replacement opcode. Any opcode not listed below is treated as
///          a programming error and reaches llvm_unreachable.
static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc) {
  switch (Opc) {
  // f16 MAC/MAD.
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_MAD_F16_e64:
    return AMDGPU::V_MADAK_F16;

  // f32 MAC/MAD.
  case AMDGPU::V_MAC_F32_e32:
  case AMDGPU::V_MAC_F32_e64:
  case AMDGPU::V_MAD_F32_e64:
    return AMDGPU::V_MADAK_F32;

  // f32 FMAC/FMA.
  case AMDGPU::V_FMAC_F32_e32:
  case AMDGPU::V_FMAC_F32_e64:
  case AMDGPU::V_FMA_F32_e64:
    return AMDGPU::V_FMAAK_F32;

  // f16 FMAC/FMA: the flavour depends on the subtarget's true16 support.
  case AMDGPU::V_FMAC_F16_e32:
  case AMDGPU::V_FMAC_F16_e64:
  case AMDGPU::V_FMAC_F16_t16_e64:
  case AMDGPU::V_FMAC_F16_fake16_e64:
  case AMDGPU::V_FMA_F16_e64: {
    // Without true16 instructions, use the plain opcode.
    if (!ST.hasTrue16BitInsts())
      return AMDGPU::V_FMAAK_F16;
    // With them, pick t16 when real true16 instructions are in use and
    // fake16 otherwise.
    if (ST.useRealTrue16Insts())
      return AMDGPU::V_FMAAK_F16_t16;
    return AMDGPU::V_FMAAK_F16_fake16;
  }

  default:
    llvm_unreachable("invalid instruction");
  }
}

/// Map a MAC/MAD/FMAC/FMA opcode to the MADMK/FMAMK-family opcode that
/// replaces it.
///
/// \param ST  Subtarget, queried only for the f16 FMA family to choose between
///            the plain, t16 and fake16 flavours of V_FMAMK_F16.
/// \param Opc Opcode of the instruction being rewritten.
/// \returns the replacement opcode. Any opcode not listed below is treated as
///          a programming error and reaches llvm_unreachable.
static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc) {
  switch (Opc) {
  // f16 MAC/MAD.
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_MAD_F16_e64:
    return AMDGPU::V_MADMK_F16;

  // f32 MAC/MAD.
  case AMDGPU::V_MAC_F32_e32:
  case AMDGPU::V_MAC_F32_e64:
  case AMDGPU::V_MAD_F32_e64:
    return AMDGPU::V_MADMK_F32;

  // f32 FMAC/FMA.
  case AMDGPU::V_FMAC_F32_e32:
  case AMDGPU::V_FMAC_F32_e64:
  case AMDGPU::V_FMA_F32_e64:
    return AMDGPU::V_FMAMK_F32;

  // f16 FMAC/FMA: the flavour depends on the subtarget's true16 support.
  case AMDGPU::V_FMAC_F16_e32:
  case AMDGPU::V_FMAC_F16_e64:
  case AMDGPU::V_FMAC_F16_t16_e64:
  case AMDGPU::V_FMAC_F16_fake16_e64:
  case AMDGPU::V_FMA_F16_e64: {
    // Without true16 instructions, use the plain opcode.
    if (!ST.hasTrue16BitInsts())
      return AMDGPU::V_FMAMK_F16;
    // With them, pick t16 when real true16 instructions are in use and
    // fake16 otherwise.
    if (ST.useRealTrue16Insts())
      return AMDGPU::V_FMAMK_F16_t16;
    return AMDGPU::V_FMAMK_F16_fake16;
  }

  default:
    llvm_unreachable("invalid instruction");
  }
}

bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
Register Reg, MachineRegisterInfo *MRI) const {
if (!MRI->hasOneNonDBGUse(Reg))
Expand Down Expand Up @@ -3533,6 +3589,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
Opc == AMDGPU::V_FMAC_F16_fake16_e64) {
// Don't fold if we are using source or output modifiers. The new VOP2
// instructions don't have them.
Expand All @@ -3555,6 +3612,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
bool IsFMA =
Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
Opc == AMDGPU::V_FMAC_F16_fake16_e64;
MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
Expand Down Expand Up @@ -3586,18 +3644,15 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
!isInlineConstant(Def->getOperand(1)))
return false;

unsigned NewOpc =
IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32
: ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
: AMDGPU::V_FMAMK_F16)
: (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16);
unsigned NewOpc = getNewFMAMKInst(ST, Opc);
if (pseudoToMCOpcode(NewOpc) == -1)
return false;

// V_FMAMK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite
// would also require restricting their register classes. For now
// just bail out.
if (NewOpc == AMDGPU::V_FMAMK_F16_fake16)
// V_FMAMK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAMK_F16_fake16
// takes VGPR_32_Lo128 operands, so the rewrite would also require
// restricting their register classes. For now just bail out.
if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
NewOpc == AMDGPU::V_FMAMK_F16_fake16)
return false;

const std::optional<int64_t> SubRegImm = extractSubregFromImm(
Expand All @@ -3613,7 +3668,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
Src0->setIsKill(RegSrc->isKill());

if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
Opc == AMDGPU::V_FMAC_F32_e64 ||
Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
UseMI.untieRegOperand(
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
Expand Down Expand Up @@ -3666,25 +3721,22 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
}
}

unsigned NewOpc =
IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32
: ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
: AMDGPU::V_FMAAK_F16)
: (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16);
unsigned NewOpc = getNewFMAAKInst(ST, Opc);
if (pseudoToMCOpcode(NewOpc) == -1)
return false;

// V_FMAAK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite
// would also require restricting their register classes. For now
// just bail out.
if (NewOpc == AMDGPU::V_FMAAK_F16_fake16)
// V_FMAAK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAAK_F16_fake16
// takes VGPR_32_Lo128 operands, so the rewrite would also require
// restricting their register classes. For now just bail out.
if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
NewOpc == AMDGPU::V_FMAAK_F16_fake16)
return false;

// FIXME: This would be a lot easier if we could return a new instruction
// instead of having to modify in place.

if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
Opc == AMDGPU::V_FMAC_F32_e64 ||
Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
UseMI.untieRegOperand(
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
Expand Down Expand Up @@ -3874,8 +3926,11 @@ static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
return AMDGPU::V_FMA_LEGACY_F32_e64;
case AMDGPU::V_FMAC_F16_e32:
case AMDGPU::V_FMAC_F16_e64:
case AMDGPU::V_FMAC_F16_t16_e64:
case AMDGPU::V_FMAC_F16_fake16_e64:
return ST.hasTrue16BitInsts() ? AMDGPU::V_FMA_F16_gfx9_fake16_e64
return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
? AMDGPU::V_FMA_F16_gfx9_t16_e64
: AMDGPU::V_FMA_F16_gfx9_fake16_e64
: AMDGPU::V_FMA_F16_gfx9_e64;
case AMDGPU::V_FMAC_F32_e32:
case AMDGPU::V_FMAC_F32_e64:
Expand Down Expand Up @@ -3941,19 +3996,21 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
return MIB;
}

assert(
Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
"V_FMAC_F16_fake16_e32 is not supported and not expected to be present "
"pre-RA");
assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
"V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be "
"present pre-RA");

// Handle MAC/FMAC.
bool IsF16 = Opc == AMDGPU::V_MAC_F16_e32 || Opc == AMDGPU::V_MAC_F16_e64 ||
Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
Opc == AMDGPU::V_FMAC_F16_fake16_e64;
bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
Opc == AMDGPU::V_FMAC_LEGACY_F32_e64 ||
Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
Expand All @@ -3968,6 +4025,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
return nullptr;
case AMDGPU::V_MAC_F16_e64:
case AMDGPU::V_FMAC_F16_e64:
case AMDGPU::V_FMAC_F16_t16_e64:
case AMDGPU::V_FMAC_F16_fake16_e64:
case AMDGPU::V_MAC_F32_e64:
case AMDGPU::V_MAC_LEGACY_F32_e64:
Expand Down Expand Up @@ -4052,11 +4110,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,

int64_t Imm;
if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
unsigned NewOpc =
IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
: AMDGPU::V_FMAAK_F16)
: AMDGPU::V_FMAAK_F32)
: (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32);
unsigned NewOpc = getNewFMAAKInst(ST, Opc);
if (pseudoToMCOpcode(NewOpc) != -1) {
MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
.add(*Dst)
Expand All @@ -4071,11 +4125,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
return MIB;
}
}
unsigned NewOpc =
IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
: AMDGPU::V_FMAMK_F16)
: AMDGPU::V_FMAMK_F32)
: (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
unsigned NewOpc = getNewFMAMKInst(ST, Opc);
if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
if (pseudoToMCOpcode(NewOpc) != -1) {
MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
Expand Down Expand Up @@ -4513,6 +4563,7 @@ bool SIInstrInfo::canShrink(const MachineInstr &MI,
case AMDGPU::V_MAC_F32_e64:
case AMDGPU::V_MAC_LEGACY_F32_e64:
case AMDGPU::V_FMAC_F16_e64:
case AMDGPU::V_FMAC_F16_t16_e64:
case AMDGPU::V_FMAC_F16_fake16_e64:
case AMDGPU::V_FMAC_F32_e64:
case AMDGPU::V_FMAC_F64_e64:
Expand Down Expand Up @@ -5569,7 +5620,9 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64;
case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
case AMDGPU::S_FMAC_F16: return AMDGPU::V_FMAC_F16_fake16_e64;
case AMDGPU::S_FMAC_F16:
return ST.useRealTrue16Insts() ? AMDGPU::V_FMAC_F16_t16_e64
: AMDGPU::V_FMAC_F16_fake16_e64;
case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
Expand Down
8 changes: 8 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -3287,6 +3287,14 @@ def : GCNPat <
(V_FMAC_F16_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
SRCMODS.NONE, $src2)
>;
// Guarded by the UseRealTrue16Insts predicate: an f16 fma whose three operands
// all match VOP3NoMods is selected to V_FMAC_F16_t16_e64, with SRCMODS.NONE
// passed for every source-modifier operand.
let True16Predicate = UseRealTrue16Insts in
def : GCNPat <
(fma (f16 (VOP3NoMods f16:$src0)),
(f16 (VOP3NoMods f16:$src1)),
(f16 (VOP3NoMods f16:$src2))),
(V_FMAC_F16_t16_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
SRCMODS.NONE, $src2)
>;
let True16Predicate = UseFakeTrue16Insts in
def : GCNPat <
(fma (f16 (VOP3NoMods f16:$src0)),
Expand Down
17 changes: 13 additions & 4 deletions llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -455,9 +455,13 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
break;
case AMDGPU::V_FMA_F16_e64:
case AMDGPU::V_FMA_F16_gfx9_e64:
NewOpcode = AMDGPU::V_FMAAK_F16;
break;
case AMDGPU::V_FMA_F16_gfx9_t16_e64:
NewOpcode = AMDGPU::V_FMAAK_F16_t16;
break;
case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
: AMDGPU::V_FMAAK_F16;
NewOpcode = AMDGPU::V_FMAAK_F16_fake16;
break;
}
}
Expand Down Expand Up @@ -485,9 +489,13 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
break;
case AMDGPU::V_FMA_F16_e64:
case AMDGPU::V_FMA_F16_gfx9_e64:
NewOpcode = AMDGPU::V_FMAMK_F16;
break;
case AMDGPU::V_FMA_F16_gfx9_t16_e64:
NewOpcode = AMDGPU::V_FMAMK_F16_t16;
break;
case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
: AMDGPU::V_FMAMK_F16;
NewOpcode = AMDGPU::V_FMAMK_F16_fake16;
break;
}
}
Expand Down Expand Up @@ -959,6 +967,7 @@ bool SIShrinkInstructions::run(MachineFunction &MF) {
MI.getOpcode() == AMDGPU::V_MAD_F16_e64 ||
MI.getOpcode() == AMDGPU::V_FMA_F16_e64 ||
MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_e64 ||
MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_t16_e64 ||
MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_fake16_e64) {
shrinkMadFma(MI);
continue;
Expand Down
Loading

0 comments on commit 80cca23

Please sign in to comment.