[RISCV] Allow non-loop invariant steps in RISCVGatherScatterLowering #122244
Conversation
@llvm/pr-subscribers-backend-risc-v

Author: Luke Lau (lukel97)

Changes

The motivation for this is to allow us to match strided accesses that are emitted from the loop vectorizer with EVL tail folding (see #122232). In these loops the step isn't loop invariant and is based off of @llvm.experimental.get.vector.length. We can relax this as long as we make sure to construct the updates after the step's definition inside the loop, instead of in the preheader.

I presume the restriction was previously added so that the step would dominate the insertion point in the preheader. I can't think of a reason why it wouldn't be safe to calculate it in the loop otherwise.

Full diff: https://github.com/llvm/llvm-project/pull/122244.diff

3 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp
index f1e974f973cbe7..872dc1556999a0 100644
--- a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp
@@ -212,10 +212,6 @@ bool RISCVGatherScatterLowering::matchStridedRecurrence(Value *Index, Loop *L,
assert(Phi->getIncomingValue(IncrementingBlock) == Inc &&
"Expected one operand of phi to be Inc");
- // Only proceed if the step is loop invariant.
- if (!L->isLoopInvariant(Step))
- return false;
-
// Step should be a splat.
Step = getSplatValue(Step);
if (!Step)
@@ -311,18 +307,31 @@ bool RISCVGatherScatterLowering::matchStridedRecurrence(Value *Index, Loop *L,
}
case Instruction::Mul: {
Start = Builder.CreateMul(Start, SplatOp, "start");
- Step = Builder.CreateMul(Step, SplatOp, "step");
Stride = Builder.CreateMul(Stride, SplatOp, "stride");
break;
}
case Instruction::Shl: {
Start = Builder.CreateShl(Start, SplatOp, "start");
- Step = Builder.CreateShl(Step, SplatOp, "step");
Stride = Builder.CreateShl(Stride, SplatOp, "stride");
break;
}
}
+ // Adjust the step value after its definition if it's an instruction.
+ if (auto *StepI = dyn_cast<Instruction>(Step))
+ Builder.SetInsertPoint(*StepI->getInsertionPointAfterDef());
+
+ switch (BO->getOpcode()) {
+ default:
+ break;
+ case Instruction::Mul:
+ Step = Builder.CreateMul(Step, SplatOp, "step");
+ break;
+ case Instruction::Shl:
+ Step = Builder.CreateShl(Step, SplatOp, "step");
+ break;
+ }
+
Inc->setOperand(StepIndex, Step);
BasePtr->setIncomingValue(StartBlock, Start);
return true;
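
As an aside on the hunk above: the core of the change is to emit the scaled step right after the step's own definition whenever the step is an instruction inside the loop (rather than in the preheader), so the new arithmetic still dominates the increment. A minimal standalone sketch of that pattern, using the same IRBuilder calls as the patch but with a simplified, illustrative helper (not the actual pass code):

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Scale the recurrence step by SplatOp, mirroring the adjustment applied to
// Start and Stride. If Step is defined by an instruction (e.g. a zext of
// @llvm.experimental.get.vector.length inside the loop), place the new
// arithmetic immediately after that definition instead of in the preheader.
static Value *emitScaledStep(IRBuilder<> &Builder, Value *Step, Value *SplatOp,
                             Instruction::BinaryOps Opc) {
  if (auto *StepI = dyn_cast<Instruction>(Step))
    Builder.SetInsertPoint(*StepI->getInsertionPointAfterDef());

  switch (Opc) {
  case Instruction::Mul:
    return Builder.CreateMul(Step, SplatOp, "step");
  case Instruction::Shl:
    return Builder.CreateShl(Step, SplatOp, "step");
  default:
    return Step; // Add/Sub/Or don't change the step.
  }
}

In the actual pass this logic lives inline in matchStridedRecurrence; the helper here is only to show the insertion-point handling in isolation.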
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store.ll
index 2cbbfc019ab4df..f26dfda3759898 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store.ll
@@ -320,8 +320,8 @@ for.cond.cleanup: ; preds = %vector.body
define void @gather_unknown_pow2(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, i64 %shift) {
; CHECK-LABEL: @gather_unknown_pow2(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[STEP:%.*]] = shl i64 8, [[SHIFT:%.*]]
-; CHECK-NEXT: [[STRIDE:%.*]] = shl i64 1, [[SHIFT]]
+; CHECK-NEXT: [[STRIDE:%.*]] = shl i64 1, [[SHIFT:%.*]]
+; CHECK-NEXT: [[STEP:%.*]] = shl i64 8, [[SHIFT]]
; CHECK-NEXT: [[TMP0:%.*]] = mul i64 [[STRIDE]], 4
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
diff --git a/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll b/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll
index b1ece9fa8272db..87c8aa392062a6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll
@@ -398,3 +398,60 @@ define <vscale x 1 x i64> @vector_base_vector_offset(ptr %p, <vscale x 1 x i64>
declare i64 @llvm.vscale.i64()
declare void @llvm.masked.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64>, <vscale x 1 x ptr>, i32, <vscale x 1 x i1>)
declare <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr>, i32, <vscale x 1 x i1>, <vscale x 1 x i64>)
+
+define <vscale x 1 x i64> @gather_loop_variant_step(ptr %a, i32 %len) {
+; CHECK-LABEL: @gather_loop_variant_step(
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[LEN:%.*]] to i64
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND_SCALAR1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT_SCALAR1:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ACCUM:%.*]] = phi <vscale x 1 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ELEMS:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[VEC_IND_SCALAR]]
+; CHECK-NEXT: [[EVL:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[ELEMS]], i32 1, i1 true)
+; CHECK-NEXT: [[EVL_ZEXT:%.*]] = zext i32 [[EVL]] to i64
+; CHECK-NEXT: [[STEP:%.*]] = shl i64 [[EVL_ZEXT]], 4
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr [[STRUCT_FOO:%.*]], ptr [[A:%.*]], i64 [[VEC_IND_SCALAR1]], i32 3
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 1 x i64> @llvm.experimental.vp.strided.load.nxv1i64.p0.i64(ptr [[TMP0]], i64 256, <vscale x 1 x i1> splat (i1 true), i32 [[TMP1]])
+; CHECK-NEXT: [[GATHER:%.*]] = call <vscale x 1 x i64> @llvm.vp.select.nxv1i64(<vscale x 1 x i1> splat (i1 true), <vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> undef, i32 [[TMP1]])
+; CHECK-NEXT: [[ACCUM_NEXT]] = add <vscale x 1 x i64> [[ACCUM]], [[GATHER]]
+; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add nuw i64 [[VEC_IND_SCALAR]], [[EVL_ZEXT]]
+; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR1]] = add i64 [[VEC_IND_SCALAR1]], [[STEP]]
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i64 [[VEC_IND_NEXT_SCALAR]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret <vscale x 1 x i64> [[ACCUM_NEXT]]
+;
+vector.ph:
+ %wide.trip.count = zext i32 %len to i64
+ %1 = tail call <vscale x 1 x i64> @llvm.stepvector.nxv1i64()
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+ %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+ %vec.ind = phi <vscale x 1 x i64> [ %1, %vector.ph ], [ %vec.ind.next, %vector.body ]
+ %accum = phi <vscale x 1 x i64> [ zeroinitializer, %vector.ph ], [ %accum.next, %vector.body ]
+
+ %elems = sub i64 %wide.trip.count, %index
+ %evl = call i32 @llvm.experimental.get.vector.length(i64 %elems, i32 1, i1 true)
+ %evl.zext = zext i32 %evl to i64
+
+ %offset = shl <vscale x 1 x i64> %vec.ind, splat (i64 4)
+ %2 = getelementptr inbounds %struct.foo, ptr %a, <vscale x 1 x i64> %offset, i32 3
+ %gather = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr> %2, i32 8, <vscale x 1 x i1> splat (i1 true), <vscale x 1 x i64> undef)
+ %accum.next = add <vscale x 1 x i64> %accum, %gather
+
+ %index.next = add nuw i64 %index, %evl.zext
+
+ %.splatinsert = insertelement <vscale x 1 x i64> poison, i64 %evl.zext, i64 0
+ %.splat = shufflevector <vscale x 1 x i64> %.splatinsert, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+ %vec.ind.next = add <vscale x 1 x i64> %vec.ind, %.splat
+
+ %3 = icmp ne i64 %index.next, %wide.trip.count
+ br i1 %3, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret <vscale x 1 x i64> %accum.next
+}
You can test this locally with the following command:

git diff -U0 --pickaxe-regex -S '([^a-zA-Z0-9#_-]undef[^a-zA-Z0-9_-]|UndefValue::get)' 02403f4e450b86d93197dd34045ff40a34b21494 ec358834c99f0222262d35c5fed3fd24e787cd0d llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store.ll llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll

The following files introduce new uses of undef:

llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll

Undef is now deprecated and should only be used in the rare cases where no replacement is possible. For example, a load of uninitialized memory yields undef. In tests, avoid using undef. For example, this is considered a bad practice:

define void @fn() {
  ...
  br i1 undef, ...
}

Please use the following instead:

define void @fn(i1 %cond) {
  ...
  br i1 %cond, ...
}

Please refer to the Undefined Behavior Manual for more information.
I think the reasoning is okay here because having a strided start + splat(A) + splat(B) + splat(C) is still a strided pattern. I'm a bit concerned about the logic for unwinding through non-recurrence adjustments, but I think this is correct. I'd like to see a bit more targeted testing (in particular, remove EVL!) to convince myself this is generally correct.
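
A small aside to make that concrete (my own illustration, not from the review): adding a splat each iteration, even a different splat every time, shifts all lanes by the same amount, so the distance between adjacent lanes, and hence the stride, never changes.

#include <array>
#include <cassert>
#include <cstdint>

int main() {
  constexpr int Lanes = 4;
  const int64_t Start = 100, Stride = 8;

  // Vector induction: lane i starts at Start + i * Stride.
  std::array<int64_t, Lanes> VecInd;
  for (int i = 0; i < Lanes; ++i)
    VecInd[i] = Start + i * Stride;

  // Loop-varying splat steps (e.g. values derived from
  // @llvm.experimental.get.vector.length each iteration).
  const int64_t Steps[] = {3, 7, 2, 5};

  for (int64_t Step : Steps) {
    // Adding a splat shifts every lane by the same amount...
    for (int64_t &Lane : VecInd)
      Lane += Step;
    // ...so adjacent lanes still differ by exactly Stride: still strided.
    for (int i = 0; i + 1 < Lanes; ++i)
      assert(VecInd[i + 1] - VecInd[i] == Stride);
  }
  return 0;
}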
@@ -311,18 +307,31 @@ bool RISCVGatherScatterLowering::matchStridedRecurrence(Value *Index, Loop *L,
  }
  case Instruction::Mul: {
    Start = Builder.CreateMul(Start, SplatOp, "start");
    Step = Builder.CreateMul(Step, SplatOp, "step");
This block of code now entirely duplicates the end of matchStridedStart; maybe we can extract a helper and use it in both places? This is an idea, not a requirement.
Good catch. It looks like matchStridedStart doesn't set the IR names though, and I don't want to introduce more potential diff, so I've left a TODO to revisit this.
%1 = tail call <vscale x 1 x i64> @llvm.stepvector.nxv1i64()
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
I think you need some additional tests here. In particular, please add a test or two that don't use EVL as the loop-varying stride. Something like an unrelated load would make the point that this isn't EVL specific.
You should probably also have a test where the step immediately precedes the inc, to make the point about the insertion point.
I've reworked the tests so that the EVL loops are now separate and at the bottom, and added new tests that check what happens when the vector indices are scaled with shl (checking that the scalar step is scaled at the new insertion point).
EDIT: To clarify, I agree it's better to test without the get.vector.length intrinsic. I think it's still useful though to keep a small test with what we expect the loop vectorizer to emit as a "functional test".
%offset = shl <vscale x 1 x i64> %vec.ind, splat (i64 4)
%2 = getelementptr inbounds %struct.foo, ptr %a, <vscale x 1 x i64> %offset, i32 3
%gather = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr> %2, i32 8, <vscale x 1 x i1> splat (i1 true), <vscale x 1 x i64> undef)
Why are we using masked.gather here instead of vp.gather?
We should have a test for both. This patch is independent of vp.gather vs masked.gather.
This patch is independent of vp.gather vs masked.gather.
I know, but the tested loop is a bit odd with masked.gather because the pointer advances by get.vector.length while the gather reads VLMAX elements.
It was written before #122232 landed; I'll switch it over to VP intrinsics.
See my other comment in the tests: EVL isn't relevant to these tests at all.
Yeah, the stride is always the same, but as for the non-recurrence adjustments I'm reasoning about it in terms of the first pointer of the vector phi and the resulting scalar phi being equivalent. If for example we have:

%vec.ind = phi <vscale x 1 x ptr> [(start + id * stride), entry], [%vec.ind.next, loop]
%add = add %vec.ind, %x
%mul = mul %add, %y
%v = gather %mul
...
; variant step each iteration
%vec.ind.next = add %vec.ind, %{a,b,c,...}

we can work out the value of the first pointer each iteration: (start + x) * y, then (start + a + x) * y, then (start + a + b + x) * y, and so on.

When we transform it to a scalar phi:

%vec.ind.scalar = phi ptr [(start + x) * y, entry], [%vec.ind.scalar.next, loop]
%v = strided.load %vec.ind.scalar
...
; variant step each iteration
%step = mul %{a,b,c,...}, %y
%vec.ind.scalar.next = add %vec.ind.scalar, %step

the address passed to the strided load each iteration is (start + x) * y, then (start + x) * y + a * y = (start + a + x) * y, and so on, which at each iteration is equivalent to the first pointer of the vector indices in the original loop.
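
A quick way to sanity-check the equivalence above (my own sketch; start, x, y, and the step values are arbitrary assumptions): lane 0 of the adjusted vector induction and the transformed scalar phi stay equal every iteration, even with a different step each time.

#include <cassert>
#include <cstdint>

int main() {
  const int64_t start = 100, x = 3, y = 5;
  const int64_t steps[] = {4, 9, 2, 7}; // loop-variant steps a, b, c, ...

  int64_t vecIndLane0 = start;         // lane 0 of %vec.ind
  int64_t scalarPhi = (start + x) * y; // %vec.ind.scalar

  for (int64_t step : steps) {
    // Address fed to the gather / strided load this iteration.
    int64_t vectorAddr = (vecIndLane0 + x) * y; // lane 0 of (%vec.ind + x) * y
    assert(vectorAddr == scalarPhi);

    // Back-edge updates: splat step on the vector side, step scaled by y on
    // the scalar side (%step = mul step, y).
    vecIndLane0 += step;
    scalarPhi += step * y;
  }
  return 0;
}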
LGTM
LGTM