
[RISCV] Allow non-loop invariant steps in RISCVGatherScatterLowering #122244

Merged 4 commits into llvm:main on Jan 17, 2025

Conversation

@lukel97 (Contributor) commented Jan 9, 2025

The motivation for this is to allow us to match strided accesses that are emitted from the loop vectorizer with EVL tail folding (see #122232).

In these loops the step isn't loop invariant and is based off of @llvm.experimental.get.vector.length.

We can relax this as long as we make sure to construct the updates after the definition inside the loop, instead of the preheader.

I presume the restriction was previously added so that the step would dominate the insertion point in the preheader. I can't think of why it wouldn't be safe to calculate it in the loop otherwise.
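
Concretely, the change moves the IRBuilder to just after the step's definition before emitting the scaled step, instead of emitting it up front in the preheader. A simplified sketch of that logic follows (it mirrors the diff below; Step, SplatOp, BO and Builder are the pass's existing variables):

// If the step is defined by an instruction (e.g. the zext of
// @llvm.experimental.get.vector.length in an EVL tail-folded loop), emit
// the scaled step right after that definition so the step dominates the
// new instruction; for steps defined outside the loop this still lands
// outside the loop. Constants and arguments need no insertion-point move.
if (auto *StepI = dyn_cast<Instruction>(Step))
  Builder.SetInsertPoint(*StepI->getInsertionPointAfterDef());

// Scale the step by the splat operand, matching what is done for the
// stride in the preheader.
switch (BO->getOpcode()) {
case Instruction::Mul:
  Step = Builder.CreateMul(Step, SplatOp, "step");
  break;
case Instruction::Shl:
  Step = Builder.CreateShl(Step, SplatOp, "step");
  break;
default:
  break;
}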

@llvmbot (Member) commented Jan 9, 2025

@llvm/pr-subscribers-backend-risc-v

Author: Luke Lau (lukel97)

Full diff: https://github.com/llvm/llvm-project/pull/122244.diff

3 Files Affected:

  • (modified) llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp (+15-6)
  • (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store.ll (+2-2)
  • (modified) llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll (+57)
diff --git a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp
index f1e974f973cbe7..872dc1556999a0 100644
--- a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp
@@ -212,10 +212,6 @@ bool RISCVGatherScatterLowering::matchStridedRecurrence(Value *Index, Loop *L,
     assert(Phi->getIncomingValue(IncrementingBlock) == Inc &&
            "Expected one operand of phi to be Inc");
 
-    // Only proceed if the step is loop invariant.
-    if (!L->isLoopInvariant(Step))
-      return false;
-
     // Step should be a splat.
     Step = getSplatValue(Step);
     if (!Step)
@@ -311,18 +307,31 @@ bool RISCVGatherScatterLowering::matchStridedRecurrence(Value *Index, Loop *L,
   }
   case Instruction::Mul: {
     Start = Builder.CreateMul(Start, SplatOp, "start");
-    Step = Builder.CreateMul(Step, SplatOp, "step");
     Stride = Builder.CreateMul(Stride, SplatOp, "stride");
     break;
   }
   case Instruction::Shl: {
     Start = Builder.CreateShl(Start, SplatOp, "start");
-    Step = Builder.CreateShl(Step, SplatOp, "step");
     Stride = Builder.CreateShl(Stride, SplatOp, "stride");
     break;
   }
   }
 
+  // Adjust the step value after its definition if it's an instruction.
+  if (auto *StepI = dyn_cast<Instruction>(Step))
+    Builder.SetInsertPoint(*StepI->getInsertionPointAfterDef());
+
+  switch (BO->getOpcode()) {
+  default:
+    break;
+  case Instruction::Mul:
+    Step = Builder.CreateMul(Step, SplatOp, "step");
+    break;
+  case Instruction::Shl:
+    Step = Builder.CreateShl(Step, SplatOp, "step");
+    break;
+  }
+
   Inc->setOperand(StepIndex, Step);
   BasePtr->setIncomingValue(StartBlock, Start);
   return true;
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store.ll
index 2cbbfc019ab4df..f26dfda3759898 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store.ll
@@ -320,8 +320,8 @@ for.cond.cleanup:                                 ; preds = %vector.body
 define void @gather_unknown_pow2(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, i64 %shift) {
 ; CHECK-LABEL: @gather_unknown_pow2(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[STEP:%.*]] = shl i64 8, [[SHIFT:%.*]]
-; CHECK-NEXT:    [[STRIDE:%.*]] = shl i64 1, [[SHIFT]]
+; CHECK-NEXT:    [[STRIDE:%.*]] = shl i64 1, [[SHIFT:%.*]]
+; CHECK-NEXT:    [[STEP:%.*]] = shl i64 8, [[SHIFT]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = mul i64 [[STRIDE]], 4
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
diff --git a/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll b/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll
index b1ece9fa8272db..87c8aa392062a6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll
@@ -398,3 +398,60 @@ define <vscale x 1 x i64> @vector_base_vector_offset(ptr %p, <vscale x 1 x i64>
 declare i64 @llvm.vscale.i64()
 declare void @llvm.masked.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64>, <vscale x 1 x ptr>, i32, <vscale x 1 x i1>)
 declare <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr>, i32, <vscale x 1 x i1>, <vscale x 1 x i64>)
+
+define <vscale x 1 x i64> @gather_loop_variant_step(ptr %a, i32 %len) {
+; CHECK-LABEL: @gather_loop_variant_step(
+; CHECK-NEXT:  vector.ph:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[LEN:%.*]] to i64
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND_SCALAR1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT_SCALAR1:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ACCUM:%.*]] = phi <vscale x 1 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ELEMS:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[VEC_IND_SCALAR]]
+; CHECK-NEXT:    [[EVL:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[ELEMS]], i32 1, i1 true)
+; CHECK-NEXT:    [[EVL_ZEXT:%.*]] = zext i32 [[EVL]] to i64
+; CHECK-NEXT:    [[STEP:%.*]] = shl i64 [[EVL_ZEXT]], 4
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr [[STRUCT_FOO:%.*]], ptr [[A:%.*]], i64 [[VEC_IND_SCALAR1]], i32 3
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 1 x i64> @llvm.experimental.vp.strided.load.nxv1i64.p0.i64(ptr [[TMP0]], i64 256, <vscale x 1 x i1> splat (i1 true), i32 [[TMP1]])
+; CHECK-NEXT:    [[GATHER:%.*]] = call <vscale x 1 x i64> @llvm.vp.select.nxv1i64(<vscale x 1 x i1> splat (i1 true), <vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> undef, i32 [[TMP1]])
+; CHECK-NEXT:    [[ACCUM_NEXT]] = add <vscale x 1 x i64> [[ACCUM]], [[GATHER]]
+; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR]] = add nuw i64 [[VEC_IND_SCALAR]], [[EVL_ZEXT]]
+; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR1]] = add i64 [[VEC_IND_SCALAR1]], [[STEP]]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[VEC_IND_NEXT_SCALAR]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[TMP3]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret <vscale x 1 x i64> [[ACCUM_NEXT]]
+;
+vector.ph:
+  %wide.trip.count = zext i32 %len to i64
+  %1 = tail call <vscale x 1 x i64> @llvm.stepvector.nxv1i64()
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.ind = phi <vscale x 1 x i64> [ %1, %vector.ph ], [ %vec.ind.next, %vector.body ]
+  %accum = phi <vscale x 1 x i64> [ zeroinitializer, %vector.ph ], [ %accum.next, %vector.body ]
+
+  %elems = sub i64 %wide.trip.count, %index
+  %evl = call i32 @llvm.experimental.get.vector.length(i64 %elems, i32 1, i1 true)
+  %evl.zext = zext i32 %evl to i64
+
+  %offset = shl <vscale x 1 x i64> %vec.ind, splat (i64 4)
+  %2 = getelementptr inbounds %struct.foo, ptr %a, <vscale x 1 x i64> %offset, i32 3
+  %gather = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr> %2, i32 8, <vscale x 1 x i1> splat (i1 true), <vscale x 1 x i64> undef)
+  %accum.next = add <vscale x 1 x i64> %accum, %gather
+
+  %index.next = add nuw i64 %index, %evl.zext
+
+  %.splatinsert = insertelement <vscale x 1 x i64> poison, i64 %evl.zext, i64 0
+  %.splat = shufflevector <vscale x 1 x i64> %.splatinsert, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+  %vec.ind.next = add <vscale x 1 x i64> %vec.ind, %.splat
+
+  %3 = icmp ne i64 %index.next, %wide.trip.count
+  br i1 %3, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:                                 ; preds = %vector.body
+  ret <vscale x 1 x i64> %accum.next
+}


github-actions bot commented Jan 9, 2025

⚠️ undef deprecator found issues in your code. ⚠️

You can test this locally with the following command:
git diff -U0 --pickaxe-regex -S '([^a-zA-Z0-9#_-]undef[^a-zA-Z0-9_-]|UndefValue::get)' 02403f4e450b86d93197dd34045ff40a34b21494 ec358834c99f0222262d35c5fed3fd24e787cd0d llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store.ll llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll

The following files introduce new uses of undef:

  • llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll

Undef is now deprecated and should only be used in the rare cases where no replacement is possible. For example, a load of uninitialized memory yields undef. You should use poison values for placeholders instead.

In tests, avoid using undef and having tests that trigger undefined behavior. If you need an operand with some unimportant value, you can add a new argument to the function and use that instead.

For example, this is considered a bad practice:

define void @fn() {
  ...
  br i1 undef, ...
}

Please use the following instead:

define void @fn(i1 %cond) {
  ...
  br i1 %cond, ...
}

Please refer to the Undefined Behavior Manual for more information.

@preames (Collaborator) left a comment


I think the reasoning is okay here because having a strided start + splat(A) + splat(B) + splat(C) is still a strided pattern. I'm a bit concerned about the logic for unwinding through non-recurrence adjustments, but I think this is correct. I'd like to see a bit more targeted testing (in particular, remove EVL!) to convince myself this is generally correct.

@@ -311,18 +307,31 @@ bool RISCVGatherScatterLowering::matchStridedRecurrence(Value *Index, Loop *L,
}
case Instruction::Mul: {
Start = Builder.CreateMul(Start, SplatOp, "start");
Step = Builder.CreateMul(Step, SplatOp, "step");
Collaborator

This block of code now entirely duplicates the end of matchStridedStart; maybe we can extract a helper and use it in both places? This is an idea, not a requirement.

Contributor Author

Good catch. It looks like matchStridedStart doesn't set the IR names though, and I don't want to introduce more potential diff, so I've left a TODO to revisit this.
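
For illustration, a shared helper along those lines might look roughly like this (a hypothetical sketch, not part of the patch; the function name and the Name parameter are invented, and the real code may need to handle more opcodes):

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Hypothetical helper: apply the binary op's splat operand to one
// component of a strided recurrence (start, stride or step), naming the
// result. Opcodes the callers don't look through return the value
// unchanged.
static Value *applySplatOperand(IRBuilderBase &Builder, unsigned Opcode,
                                Value *V, Value *SplatOp,
                                const Twine &Name) {
  switch (Opcode) {
  case Instruction::Mul:
    return Builder.CreateMul(V, SplatOp, Name);
  case Instruction::Shl:
    return Builder.CreateShl(V, SplatOp, Name);
  default:
    return V;
  }
}

A call site could then read, e.g., Stride = applySplatOperand(Builder, BO->getOpcode(), Stride, SplatOp, "stride");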

%1 = tail call <vscale x 1 x i64> @llvm.stepvector.nxv1i64()
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
Collaborator

I think you need some additional tests here. In particular, please add a test or two that don't use EVL as the loop-varying stride. Something like an unrelated load makes the point that this isn't EVL specific.

You should probably also have a test where the step immediately precedes the inc to make the point about the insertion point.

@lukel97 (Contributor Author), Jan 15, 2025

I've reworked the tests so that the EVL loops are now separate and at the bottom, and added new tests that check what happens when the vector indices are scaled with shl (checking that the scalar step is scaled in the new insertion point)

EDIT: To clarify, I agree it's better to test without the get.vector.length intrinsic. I think it's still useful though to keep a small test with what we expect the loop vectorizer to emit as a "functional test"


%offset = shl <vscale x 1 x i64> %vec.ind, splat (i64 4)
%2 = getelementptr inbounds %struct.foo, ptr %a, <vscale x 1 x i64> %offset, i32 3
%gather = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr> %2, i32 8, <vscale x 1 x i1> splat (i1 true), <vscale x 1 x i64> undef)
Collaborator

Why are we using masked.gather here instead of vp.gather?

Collaborator

We should have a test for both. This patch is independent of vp.gather vs masked.gather.

Collaborator

This patch is independent of vp.gather vs masked.gather.

I know but the tested loop is a bit odd with masked.gather because the pointer advances by get.vector.length but the gather reads VLMAX elements.

Contributor Author

It was written before #122232 landed; I'll switch it over to VP intrinsics.

Collaborator

See my other comment in the tests about how EVL isn't relevant to these tests at all.

lukel97 added a commit that referenced this pull request Jan 15, 2025
None of the changes in #122232 or the upcoming #122244 are specific to
the EVL, so split out the EVL tail-folded loops into separate
"integration tests" that reflect the output of the loop vectorizer.
@lukel97 force-pushed the gather-scatter-loop-variant-step branch from ed3f488 to 8b0affb on January 15, 2025 at 10:31
@lukel97 force-pushed the gather-scatter-loop-variant-step branch from 8b0affb to 2803322 on January 15, 2025 at 10:36
@lukel97 (Contributor Author) commented Jan 15, 2025

I think the reasoning is okay here because having a strided start + splat(A) + splat(B) + splat(C) is still a strided pattern. I'm a bit concerned about the logic for unwinding through non-recurrence adjustments, but I think this is correct.

Yeah, the stride is always the same. As for the non-recurrence adjustments, I'm reasoning about it in terms of the first pointer of the vector phi and the resulting scalar phi being equivalent.

If for example we have

%vec.ind = phi <vscale x 1 x ptr> [(start + id * stride), entry], [%vec.ind.next, loop]
%add = add %vec.ind, %x
%mul = mul %add, %y
%v = gather %mul
...
; variant step each iteration
%vec.ind.next = add %vec.ind, %{a,b,c,...}

We can work out the value of the first pointer at each iteration:

1. %mul[0] = (start + x) * y
2. %mul[0] = ((start + a) + x) * y
3. %mul[0] = (((start + b) + a) + x) * y
...

When we transform it to a scalar phi:

%vec.ind.scalar = phi ptr [(start + x) * y, entry], [%vec.ind.scalar.next, loop]
%v = strided.load %vec.ind.scalar
...
; variant step each iteration
%step = mul %{a,b,c,...}, %y
%vec.ind.scalar.next = add %vec.ind.scalar, %step

We can work out the address of the strided load at each iteration:

1: addr = (start + x) * y
2: addr = (a * y) + ((start + x) * y)
3: addr = (b * y) + (a * y) + ((start + x) * y)
...

So at each iteration the address is equivalent to what the first pointer in the vector of indices would have been.
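
Written out for a general iteration n (a sketch of the same argument, not from the PR; s_i denotes the loop-varying splat step added on iteration i, and x, y are the add/mul adjustments from the example, with y assumed loop invariant as before):

\text{vector version, lane 0, iteration } n: \quad \Bigl(\mathrm{start} + \textstyle\sum_{i<n} s_i + x\Bigr)\, y

\text{scalar version, iteration } n: \quad (\mathrm{start} + x)\, y + \textstyle\sum_{i<n} s_i\, y

The two agree by distributivity, and the lane-to-lane stride is unchanged because each s_i is splatted across all lanes.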

@preames (Collaborator) left a comment

LGTM

@topperc (Collaborator) left a comment

LGTM

@lukel97 lukel97 merged commit a761e26 into llvm:main Jan 17, 2025
7 of 8 checks passed