Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Precompute into select arms #6212

Merged
merged 66 commits into from
Jan 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
66 commits
Select commit Hold shift + click to select a range
8b5f4d4
work
kripken Dec 21, 2023
859751f
sad
kripken Dec 21, 2023
6f4449f
work
kripken Dec 21, 2023
e9c87b7
test
kripken Dec 21, 2023
efde43a
format
kripken Dec 21, 2023
5106fe8
fix
kripken Dec 21, 2023
e12d243
test
kripken Dec 22, 2023
55f9c7f
fix
kripken Dec 22, 2023
63a843f
fix
kripken Jan 2, 2024
959ad86
comment
kripken Jan 2, 2024
c69aeb0
test
kripken Jan 2, 2024
b0e861a
test
kripken Jan 2, 2024
c369c8e
work
kripken Jan 2, 2024
35f366e
comments
kripken Jan 2, 2024
d88db54
comment
kripken Jan 2, 2024
025532a
work
kripken Jan 2, 2024
a61980a
ideas
kripken Jan 4, 2024
7efd928
wild
kripken Jan 4, 2024
8c63b82
more
kripken Jan 4, 2024
6a04aed
more
kripken Jan 4, 2024
9f4a9b7
format
kripken Jan 4, 2024
7654883
work
kripken Jan 4, 2024
2b28e97
fix
kripken Jan 4, 2024
3529d6b
clean
kripken Jan 4, 2024
a3379ab
work
kripken Jan 4, 2024
4e6b174
work
kripken Jan 4, 2024
0ff3f96
work
kripken Jan 4, 2024
c84e0d6
work
kripken Jan 4, 2024
dde94ae
almost
kripken Jan 4, 2024
e825d06
work
kripken Jan 4, 2024
df8d86f
test
kripken Jan 4, 2024
01a1fad
test
kripken Jan 4, 2024
a37cc0c
Merge remote-tracking branch 'origin/main' into precompute.partial
kripken Jan 4, 2024
35984d7
fix
kripken Jan 4, 2024
2731967
badd
kripken Jan 5, 2024
5b782a5
fix
kripken Jan 5, 2024
634db03
work
kripken Jan 5, 2024
135b0ff
work
kripken Jan 5, 2024
a9f0abc
work
kripken Jan 5, 2024
5c2fe38
work
kripken Jan 5, 2024
7460234
work
kripken Jan 5, 2024
fda1434
work
kripken Jan 5, 2024
ae0e419
clean
kripken Jan 5, 2024
26f64c3
work
kripken Jan 5, 2024
d56d28b
mitigate cost
kripken Jan 5, 2024
0cf5b7e
Merge remote-tracking branch 'origin/main' into precompute.partial
kripken Jan 5, 2024
89e50ad
fix
kripken Jan 5, 2024
1689d4b
comment
kripken Jan 5, 2024
56f49a9
commentses
kripken Jan 5, 2024
ae59dd9
less
kripken Jan 5, 2024
0792c10
less
kripken Jan 5, 2024
8bd5652
less
kripken Jan 5, 2024
c8f6e0e
work
kripken Jan 5, 2024
b48cefc
work
kripken Jan 5, 2024
65f55b9
work
kripken Jan 5, 2024
51cb246
work
kripken Jan 5, 2024
c1f04e9
work
kripken Jan 5, 2024
d813d65
work
kripken Jan 5, 2024
a7953b9
Merge remote-tracking branch 'origin/main' into precompute.partial
kripken Jan 10, 2024
66c43c7
comment
kripken Jan 10, 2024
ceec9d1
comment
kripken Jan 10, 2024
e6467ba
simpler loop
kripken Jan 10, 2024
5afd4db
add suggested comment
kripken Jan 10, 2024
c561a39
add suggested testcases
kripken Jan 10, 2024
6ecc7b0
simplify test
kripken Jan 10, 2024
6f4d81e
format
kripken Jan 10, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
330 changes: 318 additions & 12 deletions src/passes/Precompute.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,16 +27,19 @@
// looked at.
//

#include <ir/literal-utils.h>
#include <ir/local-graph.h>
#include <ir/manipulation.h>
#include <ir/properties.h>
#include <ir/utils.h>
#include <pass.h>
#include <support/unique_deferring_queue.h>
#include <wasm-builder.h>
#include <wasm-interpreter.h>
#include <wasm.h>
#include "ir/effects.h"
#include "ir/iteration.h"
#include "ir/literal-utils.h"
#include "ir/local-graph.h"
#include "ir/manipulation.h"
#include "ir/properties.h"
#include "ir/utils.h"
#include "pass.h"
#include "support/insert_ordered.h"
#include "support/unique_deferring_queue.h"
#include "wasm-builder.h"
#include "wasm-interpreter.h"
#include "wasm.h"

namespace wasm {

Expand Down Expand Up @@ -210,9 +213,16 @@ struct Precompute
GetValues getValues;
HeapValues heapValues;

// Whether partial precomputing (precomputing into select arms) is enabled.
// This is assigned at the start of doWalkFunction from the pass options
// (optimizeLevel >= 2), and is checked before noting candidate selects and
// before running the partial precomputing phase.
bool canPartiallyPrecompute;

void doWalkFunction(Function* func) {
// Perform partial precomputing only when the optimization level is non-
// trivial, as it is slower and less likely to help.
canPartiallyPrecompute = getPassOptions().optimizeLevel >= 2;

// Walk the function and precompute things.
super::doWalkFunction(func);
partiallyPrecompute(func);
if (!propagate) {
return;
}
Expand All @@ -226,11 +236,13 @@ struct Precompute
// another walk to apply them and perhaps other optimizations that are
// unlocked.
super::doWalkFunction(func);
// We could also try to partially precompute again, but that is a somewhat
// heavy operation, so we only do it the first time, and leave such things
// for later runs of this pass and for --converge.
}
// Note that in principle even more cycles could find further work here, in
// very rare cases. To avoid constructing a LocalGraph again just for that
// unlikely chance, we leave such things for later runs of this pass and for
// --converge.
// unlikely chance, we leave such things for later.
}

template<typename T> void reuseConstantNode(T* curr, Flow flow) {
Expand Down Expand Up @@ -281,6 +293,9 @@ struct Precompute
}
if (flow.breaking()) {
if (flow.breakTo == NONCONSTANT_FLOW) {
// This cannot be turned into a constant, but perhaps we can partially
// precompute it.
considerPartiallyPrecomputing(curr);
return;
}
if (flow.breakTo == RETURN_FLOW) {
Expand Down Expand Up @@ -319,6 +334,273 @@ struct Precompute
}
}

// If we failed to precompute a constant, perhaps we can still precompute part
// of an expression. Specifically, consider this case:
//
//  (A
//    (select
//      (B)
//      (C)
//      (condition)
//    )
//  )
//
// Perhaps we can compute A(B) and A(C). If so, we can emit a better select:
//
//  (select
//    (constant result of A(B))
//    (constant result of A(C))
//    (condition)
//  )
//
// Note that in general for code size we want to move operations *out* of
// selects and ifs (OptimizeInstructions does that), but here we are
// computing two constants which replace three expressions, so it is
// worthwhile.
//
// To do such partial precomputing, in the main pass we note selects that look
// promising. If we find any then we do a second pass later just for that (as
// doing so requires walking up the stack in a manner that we want to avoid in
// the main pass for overhead reasons; see below).
//
// Note that selects are all we really need here: Other passes would turn an
// if into a select if the arms are simple enough, and only in those cases
// (simple arms) do we have a chance at partially precomputing. (For example,
// if an arm is a constant then we can, but if it is a call then we can't.)
// However, there are cases like an if with arms with side effects that end in
// precomputable things, which are missed at the moment (TODO).
//
// This set holds the promising selects noted during the main walk.
std::unordered_set<Select*> partiallyPrecomputable;

// Notes |curr| as a candidate for partial precomputing if it is a select that
// looks promising. Nothing is optimized here; candidates are only recorded in
// partiallyPrecomputable.
void considerPartiallyPrecomputing(Expression* curr) {
  // Nothing is recorded when partial precomputing is turned off.
  if (!canPartiallyPrecompute) {
    return;
  }

  // Only selects are of interest.
  auto* candidate = curr->dynCast<Select>();
  if (!candidate) {
    return;
  }

  // There is only a reasonable hope of success when both arms are things like
  // constants or global gets. As a first approximation we accept whatever is
  // allowed in a constant initializer (more could probably be allowed here,
  // TODO).
  auto& wasmModule = *getModule();
  if (!Properties::isValidConstantExpression(wasmModule, candidate->ifTrue)) {
    return;
  }
  if (!Properties::isValidConstantExpression(wasmModule,
                                             candidate->ifFalse)) {
    return;
  }

  // A select that is the entire function body has no parent, so there is
  // nothing that could be optimized into its arms.
  if (getFunction()->body == candidate) {
    return;
  }

  partiallyPrecomputable.insert(candidate);
}

// To partially precompute selects we walk up the stack from them, like this:
//
// (A
// (B
// (select
// (C)
// (D)
// (condition)
// )
// )
// )
//
// First we try to apply B to C and D. If that works, we arrive at this:
//
// (A
// (select
// (constant result of B(C))
// (constant result of B(D))
// (condition)
// )
// )
//
// We can then proceed to perhaps apply A. However, even if we failed to apply
// B then we can try to apply A and B together, because that combination may
// succeed where incremental work fails, for example:
//
// (global $C
// (struct.new ;; outer
// (struct.new ;; inner
// (i32.const 10)
// )
// )
// )
//
// (struct.get ;; outer
// (struct.get ;; inner
// (select
// (global.get $C)
// (global.get $D)
// (condition)
// )
// )
// )
//
// Applying the inner struct.get to $C leads us to the inner struct.new, but
// that is an interior pointer in the global - it is not something we can
// refer to using a global.get, so precomputing it fails. However, when we
// apply both struct.gets at once we arrive at the outer struct.new, which is
// in fact the global $C, and we succeed.
//
// |func| is the function being optimized: it is walked to collect the parent
// stacks of the selects noted in partiallyPrecomputable, and its body pointer
// is what gets rewritten when the expression being replaced is the function
// body itself.
void partiallyPrecompute(Function* func) {
if (!canPartiallyPrecompute || partiallyPrecomputable.empty()) {
// Nothing to do.
return;
}

// Walk the function to find the parent stacks of the promising selects. We
// copy the stacks and process them later. We do it like this because if we
// wanted to process stacks as we reached them then we'd trip over
// ourselves: when we optimize we replace a parent, but that parent is an
// expression we'll reach later in the walk, so modifying it is unsafe.
struct StackFinder : public ExpressionStackWalker<StackFinder> {
Precompute& parent;

StackFinder(Precompute& parent) : parent(parent) {}

// We will later iterate on this in the order of insertion, which keeps
// things deterministic, and also usually lets us do consecutive work
// like a select nested in another select's condition, simply because we
// will traverse the selects in postorder (however, because we cannot
// always succeed in an incremental manner - see the comment on this
// function - it is possible in theory that some work can happen only in a
// later execution of the pass).
InsertOrderedMap<Select*, ExpressionStack> stackMap;

void visitSelect(Select* curr) {
if (parent.partiallyPrecomputable.count(curr)) {
stackMap[curr] = expressionStack;
}
}
} stackFinder(*this);
stackFinder.walkFunction(func);

// Note which expressions we've modified as we go, as it is invalid to
// modify more than once. This could happen in theory in a situation like
// this:
//
// (ternary.f32.max ;; fictional instruction for explanatory purposes
// (select ..)
// (select ..)
// (f32.infinity)
// )
//
// When we consider the first select we can see that the computation result
// is always infinity, so we can optimize here and replace the ternary. Then
// the same thing happens with the second select, causing the ternary to be
// replaced again, which is unsafe because it no longer exists after we
// precomputed it the first time. (Note that in this example the result is
// the same either way, but at least in theory an instruction could exist
// for whom there was a difference.) In practice it does not seem that wasm
// has instructions capable of this atm but this code is still useful to
// guard against future problems, and as a minor speedup (quickly skip code
// if it was already modified).
std::unordered_set<Expression*> modified;

for (auto& [select, stack] : stackFinder.stackMap) {
// Each stack ends in the select itself, and contains more than the select
// itself (otherwise we'd have ignored the select), i.e., the select has a
// parent that we can try to optimize into the arms.
assert(stack.back() == select);
assert(stack.size() >= 2);
// selectIndex tracks the select's current position in the stack; it moves
// up each time we successfully precompute a parent into the arms.
Index selectIndex = stack.size() - 1;
assert(selectIndex >= 1);

if (modified.count(select)) {
// This select was modified; go to the next one.
continue;
}

// Go up through the parents, until we can't do any more work. At each
// parent we'll try to execute it and all intermediate parents into the
// select arms.
//
// Decrementing parentIndex past 0 yields Index(-1), which ends the loop
// once the outermost expression on the stack has been tried.
for (Index parentIndex = selectIndex - 1; parentIndex != Index(-1);
parentIndex--) {
auto* parent = stack[parentIndex];
if (modified.count(parent)) {
// This parent was modified; exit the loop on parents as no upper
// parent is valid to try either.
break;
}

// If the parent lacks a concrete type then we can't move it into the
// select: the select needs a concrete (and non-tuple) type. For example
// if the parent is a drop or is unreachable, those are things we don't
// want to handle, and we stop here (once we see one such parent we
// can't expect to make any more progress).
if (!parent->type.isConcrete() || parent->type.isTuple()) {
break;
}

// We are precomputing the select arms, but leaving the condition as-is.
// If the condition breaks to the parent, then we can't move the parent
// into the select arms:
//
// (block $name ;; this must stay outside of the select
// (select
// (B)
// (C)
// (block ;; condition
// (br_if $target
//
// Ignore all control flow for simplicity, as they aren't interesting
// for us, and other passes should have removed them anyhow.
if (Properties::isControlFlowStructure(parent)) {
break;
}

// This looks promising, so try to precompute here. What we do is
// precompute twice, once with the select replaced with the left arm,
// and once with the right. If both succeed then we can create a new
// select (with the same condition as before) whose arms are the
// precomputed values.
auto isValidPrecomputation = [&](const Flow& flow) {
// For now we handle simple concrete values. We could also handle
// breaks in principle TODO
return canEmitConstantFor(flow.values) && !flow.breaking() &&
flow.values.isConcrete();
};

// Find the pointer to the select in its immediate parent so that we can
// replace it first with one arm and then the other.
auto** pointerToSelect =
getChildPointerInImmediateParent(stack, selectIndex, func);
// Temporarily splice the ifTrue arm in place of the select, so that
// precomputing |parent| evaluates the whole chain applied to that arm.
*pointerToSelect = select->ifTrue;
auto ifTrue = precomputeExpression(parent);
if (isValidPrecomputation(ifTrue)) {
// Likewise for the ifFalse arm; we only bother when ifTrue succeeded.
*pointerToSelect = select->ifFalse;
auto ifFalse = precomputeExpression(parent);
if (isValidPrecomputation(ifFalse)) {
// Wonderful, we can precompute here! The select can now contain the
// computed values in its arms.
select->ifTrue = ifTrue.getConstExpression(*getModule());
Comment on lines +573 to +576
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we essentially have to do twice as much work because we separately evaluate isValidPrecomputation and getConstExpression? Could we reduce overhead by combining them?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure I see what you mean. isValidPrecomputation is a quick check that a precomputed result is valid for our purposes here (the actual precomputing happened earlier in precomputeExpression). And getConstExpression generates an Expression from a precomputed result. I'm not sure how we could combine those?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, I misunderstood what was happening. Disregard!

select->ifFalse = ifFalse.getConstExpression(*getModule());
select->finalize();

// The parent of the select is now replaced by the select.
auto** pointerToParent =
getChildPointerInImmediateParent(stack, parentIndex, func);
*pointerToParent = select;

// Update state for further iterations: Mark everything modified and
// move the select to the parent's location.
for (Index i = parentIndex; i <= selectIndex; i++) {
modified.insert(stack[i]);
}
selectIndex = parentIndex;
stack[selectIndex] = select;
// Truncate the stack so that the select is once more its last element.
stack.resize(selectIndex + 1);
}
}

// Whether we succeeded to precompute here or not, restore the parent's
// pointer to its original state (if we precomputed, the parent is no
// longer in use, but there is no harm in modifying it).
*pointerToSelect = select;
}
}
}

void visitFunction(Function* curr) {
// removing breaks can alter types
ReFinalize().walkFunctionInModule(curr, getModule());
Expand Down Expand Up @@ -531,6 +813,30 @@ struct Precompute

return true;
}

// Helpers for partial precomputing.

// Returns the location that holds the expression at stack[index]: the child
// slot inside the expression directly above it on the stack, or the
// function's body slot when nothing is above it. Writing through the returned
// pointer replaces that expression in place.
Expression** getChildPointerInImmediateParent(const ExpressionStack& stack,
                                              Index index,
                                              Function* func) {
  if (index != 0) {
    // Scan the children of the expression just above us on the stack for the
    // slot that currently points at the expression we were asked about.
    auto* wanted = stack[index];
    ChildIterator parentChildren(stack[index - 1]);
    for (auto** slot : parentChildren.children) {
      if (*slot == wanted) {
        return slot;
      }
    }

    WASM_UNREACHABLE("child not found in parent");
  }

  // The bottom of the stack has no parent expression, so the pointer that
  // refers to it is the function's body.
  return &func->body;
}
};

// Factory for the Precompute pass; returns a newly allocated instance.
// NOTE(review): the `false` argument is presumably the `propagate` flag that
// doWalkFunction reads - confirm against the Precompute constructor.
Pass* createPrecomputePass() {
  return new Precompute(false);
}
Expand Down
Loading
Loading