Skip to content

Commit

Permalink
JIT: improve throughput of the RLCSE greedy heuristic
Browse files Browse the repository at this point in the history
Profiling showed that `GetFeatures` was a major factor in throughput. For the
most part the features of CSE candidates don't change as we perform CSEs, so
build in some logic to avoid recomputing the feature set unless there is some
evidence features have changed.

To avoid having to remove already performed candidates from the candidate vector
we now tag them as `m_performed`; these get ignored during subsequent processing,
and discarded if we ever recompute features.

This should cut the throughput (TP) impact roughly in half; the remaining part seems to
largely be from doing more CSEs (which we hope will show some perf benefit).

Contributes to dotnet#92915.
  • Loading branch information
AndyAyersMS committed Feb 25, 2024
1 parent 79dd9ba commit 4fbe96d
Show file tree
Hide file tree
Showing 4 changed files with 100 additions and 32 deletions.
1 change: 1 addition & 0 deletions src/coreclr/jit/compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -7057,6 +7057,7 @@ class Compiler
unsigned optCSEstart; // The first local variable number that is a CSE
unsigned optCSEattempt; // The number of CSEs attempted so far.
unsigned optCSEcount; // The total count of CSEs introduced.
unsigned optCSEunmarks; // Number of CSE trees unmarked
weight_t optCSEweight; // The weight of the current block when we are doing PerformCSE
CSE_HeuristicCommon* optCSEheuristic; // CSE Heuristic to use for this method

Expand Down
124 changes: 94 additions & 30 deletions src/coreclr/jit/optcse.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,11 @@ bool Compiler::optUnmarkCSE(GenTree* tree)
// 2. Unmark the CSE information in the node

tree->gtCSEnum = NO_CSE;

// 3. Leave breadcrumbs so we know some dsc was altered

optCSEunmarks++;

return true;
}
else
Expand Down Expand Up @@ -2436,10 +2441,12 @@ void CSE_HeuristicParameterized::GreedyPolicy()
//
const int numCandidates = m_pCompiler->optCSECandidateCount;
ArrayStack<Choice> choices(m_pCompiler->getAllocator(CMK_CSE), numCandidates + 1);
unsigned numUnmarked = m_pCompiler->optCSEunmarks;
bool recomputeFeatures = true;

while (true)
{
Choice& choice = ChooseGreedy(choices);
Choice& choice = ChooseGreedy(choices, recomputeFeatures);
CSEdsc* const dsc = choice.m_dsc;

#ifdef DEBUG
Expand Down Expand Up @@ -2472,7 +2479,16 @@ void CSE_HeuristicParameterized::GreedyPolicy()
JITDUMP("\n");

PerformCSE(&candidate);
madeChanges = true;
madeChanges = true;
choice.m_performed = true;

// If performing this CSE impacted other CSEs, we need to
// recompute all cse features.
//
unsigned newNumUnmarked = m_pCompiler->optCSEunmarks;
assert(newNumUnmarked >= numUnmarked);
recomputeFeatures = (numUnmarked != newNumUnmarked);
numUnmarked = newNumUnmarked;
}

return;
Expand Down Expand Up @@ -2575,7 +2591,7 @@ void CSE_HeuristicParameterized::GetFeatures(CSEdsc* cse, double* features)
unsigned maxPostorderNum = 0;
BasicBlock* minPostorderBlock = nullptr;
BasicBlock* maxPostorderBlock = nullptr;
for (treeStmtLst* treeList = cse->csdTreeList; treeList != nullptr && !isMakeCse; treeList = treeList->tslNext)
for (treeStmtLst* treeList = cse->csdTreeList; treeList != nullptr; treeList = treeList->tslNext)
{
BasicBlock* const treeBlock = treeList->tslBlock;
unsigned postorderNum = treeBlock->bbPostorderNum;
Expand Down Expand Up @@ -2616,7 +2632,6 @@ void CSE_HeuristicParameterized::GetFeatures(CSEdsc* cse, double* features)
// LSRA "is live across call"
//
bool isLiveAcrossCallLSRA = isLiveAcrossCall;

if (!isLiveAcrossCallLSRA)
{
unsigned count = 0;
Expand All @@ -2630,7 +2645,6 @@ void CSE_HeuristicParameterized::GetFeatures(CSEdsc* cse, double* features)
}
}
}

features[23] = booleanScale * isLiveAcrossCallLSRA;
}

Expand Down Expand Up @@ -2748,26 +2762,51 @@ double CSE_HeuristicParameterized::StoppingPreference()
// ChooseGreedy: examine candidates and choose the next CSE to perform
// via greedy policy
//
// Arguments:
// choices -- array of choices, possibly already filled in
// recompute -- if true, rebuild the choice array from scratch
//
// Returns:
// Choice of CSE to perform
//
// Notes:
// Picks the most-preferred candidate.
// If there is a tie, picks stop, or the lowest cse index.
//
CSE_HeuristicParameterized::Choice& CSE_HeuristicParameterized::ChooseGreedy(ArrayStack<Choice>& choices)
CSE_HeuristicParameterized::Choice& CSE_HeuristicParameterized::ChooseGreedy(ArrayStack<Choice>& choices,
bool recompute)
{
choices.Reset();
BuildChoices(choices);
if (recompute)
{
choices.Reset();
BuildChoices(choices);
}
else
{
// Always recompute the stopping preference as this
// reflects ambient state after each CSE.
//
// By convention, this is at TopRef(0).
//
Choice& stopping = choices.TopRef(0);
assert(stopping.m_dsc == nullptr);
stopping.m_preference = StoppingPreference();
}

// Find the maximally preferred case.
//
int choiceNum = 0;

for (int i = 1; i < choices.Height(); i++)
{
Choice& choice = choices.TopRef(i);
Choice& bestChoice = choices.TopRef(choiceNum);
const Choice& choice = choices.TopRef(i);

if (choice.m_performed == true)
{
continue;
}

const Choice& bestChoice = choices.TopRef(choiceNum);

const double delta = choice.m_preference - bestChoice.m_preference;

Expand Down Expand Up @@ -2811,6 +2850,8 @@ CSE_HeuristicParameterized::Choice& CSE_HeuristicParameterized::ChooseGreedy(Arr
//
void CSE_HeuristicParameterized::BuildChoices(ArrayStack<Choice>& choices)
{
JITDUMP("Building choice array...\n");

for (unsigned i = 0; i < m_pCompiler->optCSECandidateCount; i++)
{
CSEdsc* const dsc = sortTab[i];
Expand Down Expand Up @@ -2893,9 +2934,15 @@ void CSE_HeuristicParameterized::DumpChoices(ArrayStack<Choice>& choices, int hi
{
for (int i = 0; i < choices.Height(); i++)
{
Choice& choice = choices.TopRef(i);
CSEdsc* const cse = choice.m_dsc;
const char* msg = i == highlight ? "=>" : " ";
const Choice& choice = choices.TopRef(i);

if (choice.m_performed == true)
{
continue;
}

CSEdsc* const cse = choice.m_dsc;
const char* msg = (i == highlight) ? "=>" : " ";
if (cse != nullptr)
{
printf("%s%2d: " FMT_CSE " preference %10.7f likelihood %10.7f\n", msg, i, cse->csdIndex,
Expand All @@ -2920,9 +2967,15 @@ void CSE_HeuristicParameterized::DumpChoices(ArrayStack<Choice>& choices, CSEdsc
{
for (int i = 0; i < choices.Height(); i++)
{
Choice& choice = choices.TopRef(i);
CSEdsc* const cse = choice.m_dsc;
const char* msg = cse == highlight ? "=>" : " ";
const Choice& choice = choices.TopRef(i);

if (choice.m_performed == true)
{
continue;
}

CSEdsc* const cse = choice.m_dsc;
const char* msg = (cse == highlight) ? "=>" : " ";
if (cse != nullptr)
{
printf("%s%2d: " FMT_CSE " preference %10.7f likelihood %10.7f\n", msg, i, cse->csdIndex,
Expand Down Expand Up @@ -4422,50 +4475,62 @@ bool CSE_HeuristicCommon::IsCompatibleType(var_types cseLclVarTyp, var_types exp
return false;
}

// PerformCSE() takes a successful candidate and performs the appropriate replacements:
//------------------------------------------------------------------------
// PerformCSE: takes a successful candidate and performs the appropriate replacements
//
// Arguments:
// successfulCandidate - cse candidate to perform
//
// It will replace all of the CSE defs with assignments to a new "cse0" LclVar
// and will replace all of the CSE uses with reads of the "cse0" LclVar
//
// It will also put cse0 into SSA if there is just one def.
//
void CSE_HeuristicCommon::PerformCSE(CSE_Candidate* successfulCandidate)
{
AdjustHeuristic(successfulCandidate);
CSEdsc* const dsc = successfulCandidate->CseDsc();

#ifdef DEBUG
// Setup the message arg for lvaGrabTemp()
//
const char* grabTempMessage = "CSE - unknown";
const char* heuristicTempMessage = "";

if (successfulCandidate->IsAggressive())
{
grabTempMessage = "CSE - aggressive";
heuristicTempMessage = ": aggressive";
}
else if (successfulCandidate->IsModerate())
{
grabTempMessage = "CSE - moderate";
heuristicTempMessage = ": moderate";
}
else if (successfulCandidate->IsConservative())
{
grabTempMessage = "CSE - conservative";
heuristicTempMessage = ": conservative";
}
else if (successfulCandidate->IsStressCSE())
{
grabTempMessage = "CSE - stress mode";
heuristicTempMessage = ": stress";
}
else if (successfulCandidate->IsRandom())
{
grabTempMessage = "CSE - random";
heuristicTempMessage = ": random";
}
#endif // DEBUG

/* Introduce a new temp for the CSE */
const char* const grabTempMessage = m_pCompiler->printfAlloc(FMT_CSE "%s", dsc->csdIndex, heuristicTempMessage);

// Add this candidate to the CSE sequence
//
m_sequence->push_back(dsc->csdIndex);

#endif // DEBUG

// we will create a long lifetime temp for the new CSE LclVar
// Allocate a CSE temp
//
unsigned cseLclVarNum = m_pCompiler->lvaGrabTemp(false DEBUGARG(grabTempMessage));
var_types cseLclVarTyp = genActualType(successfulCandidate->Expr()->TypeGet());

LclVarDsc* lclDsc = m_pCompiler->lvaGetDesc(cseLclVarNum);
LclVarDsc* const lclDsc = m_pCompiler->lvaGetDesc(cseLclVarNum);
if (cseLclVarTyp == TYP_STRUCT)
{
m_pCompiler->lvaSetStruct(cseLclVarNum, successfulCandidate->Expr()->GetLayout(m_pCompiler), false);
Expand All @@ -4474,6 +4539,7 @@ void CSE_HeuristicCommon::PerformCSE(CSE_Candidate* successfulCandidate)
lclDsc->lvIsCSE = true;

// Record that we created a new LclVar for use as a CSE temp
//
m_addCSEcount++;
m_pCompiler->optCSEcount++;
m_pCompiler->Metrics.CseCount++;
Expand All @@ -4484,11 +4550,9 @@ void CSE_HeuristicCommon::PerformCSE(CSE_Candidate* successfulCandidate)
//
// Later we will unmark any nested CSE's for the CSE uses.
//
CSEdsc* dsc = successfulCandidate->CseDsc();
INDEBUG(m_sequence->push_back(dsc->csdIndex));

// If there's just a single def for the CSE, we'll put this
// CSE into SSA form on the fly. We won't need any PHIs.
//
unsigned cseSsaNum = SsaConfig::RESERVED_SSA_NUM;
LclSsaVarDsc* ssaVarDsc = nullptr;

Expand Down
6 changes: 4 additions & 2 deletions src/coreclr/jit/optcse.h
Original file line number Diff line number Diff line change
Expand Up @@ -151,12 +151,14 @@ class CSE_HeuristicParameterized : public CSE_HeuristicCommon
protected:
struct Choice
{
Choice(CSEdsc* dsc, double preference) : m_dsc(dsc), m_preference(preference), m_softmax(0)
Choice(CSEdsc* dsc, double preference) : m_dsc(dsc), m_preference(preference), m_softmax(0), m_performed(false)
{
}

CSEdsc* m_dsc;
double m_preference;
double m_softmax;
bool m_performed;
};

enum
Expand Down Expand Up @@ -185,7 +187,7 @@ class CSE_HeuristicParameterized : public CSE_HeuristicCommon
double StoppingPreference();
void BuildChoices(ArrayStack<Choice>& choices);

Choice& ChooseGreedy(ArrayStack<Choice>& choices);
Choice& ChooseGreedy(ArrayStack<Choice>& choices, bool recompute);

virtual const char* Name() const
{
Expand Down
1 change: 1 addition & 0 deletions src/coreclr/jit/optimizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ void Compiler::optInit()
optCSECandidateCount = 0;
optCSEattempt = 0;
optCSEheuristic = nullptr;
optCSEunmarks = 0;
}

DataFlow::DataFlow(Compiler* pCompiler) : m_pCompiler(pCompiler)
Expand Down

0 comments on commit 4fbe96d

Please sign in to comment.