From 345057b9143cba27f5f95668d845d3b050f0a6ea Mon Sep 17 00:00:00 2001
From: Mogball <jeffniu22@gmail.com>
Date: Wed, 19 Feb 2025 17:50:10 -0800
Subject: [PATCH 1/2] [Analysis] Use `verify-diagnostics` for print-based tests
 (NFC)

This is more robust because each expected diagnostic is attached to
the location of the op whose output is being checked.
---
 test/Analysis/test-alias.mlir      | 113 +++---
 test/Analysis/test-alignment.mlir  | 595 ++++++++++++++---------------
 test/lib/Analysis/TestAlias.cpp    |  21 +-
 test/lib/Analysis/TestAxisInfo.cpp |  16 +-
 4 files changed, 349 insertions(+), 396 deletions(-)
diff --git a/test/Analysis/test-alias.mlir b/test/Analysis/test-alias.mlir
index 6d885359ee8a..bbc688cc6ad9 100644
--- a/test/Analysis/test-alias.mlir
+++ b/test/Analysis/test-alias.mlir
@@ -1,4 +1,4 @@
-// RUN: triton-opt %s --mlir-disable-threading -test-print-alias -split-input-file 2>&1 | FileCheck %s
+// RUN: triton-opt %s -test-print-alias -verify-diagnostics -o /dev/null
 
 #AL = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
 #BL = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
@@ -11,8 +11,6 @@
 
 module attributes {"ttg.num-warps" = 4 : i32, "ttg.target" = "cuda:80"} {
 
-// CHECK-LABEL: matmul_loop
-// CHECK-NOT: ->
 // There shouldn't be any aliasing with the dot op encoding.
 tt.func @matmul_loop(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.ptr<f16>) {
   %a_ptr_init = tt.splat %A : !tt.ptr<f16> -> tensor<128x32x!tt.ptr<f16>, #AL>
@@ -38,47 +36,42 @@ tt.func @matmul_loop(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>,
   tt.return
 }
 
-// CHECK-LABEL: alloc
 tt.func @alloc(%A : !tt.ptr<f16>) {
-  // CHECK: %0 -> %0
+  // expected-remark @below {{%0 -> %0}}
   %cst2 = ttg.local_alloc : () -> !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
   tt.return
 }
 
-// CHECK-LABEL: alloc_init
 tt.func @alloc_init(%A : !tt.ptr<f16>) {
   %cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #AL>
-  // CHECK: %0 -> %0
+  // expected-remark @below {{%0 -> %0}}
   %cst1 = ttg.local_alloc %cst0 : (tensor<16x16xf16, #AL>) -> !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory>
   tt.return
 }
 
-// CHECK-LABEL: trans
 tt.func @trans(%A : !tt.ptr<f16>) {
-  // CHECK: %0 -> %0
+  // expected-remark @below {{%0 -> %0}}
   %tensor = ttg.local_alloc : () -> !ttg.memdesc<16x32xf16, #A_SHARED, #ttg.shared_memory, mutable>
-  // CHECK: %1 -> %0
+  // expected-remark @below {{%1 -> %0}}
   %b = ttg.memdesc_trans %tensor {order=array<i32: 1,0>} : !ttg.memdesc<16x32xf16, #A_SHARED, #ttg.shared_memory, mutable> -> !ttg.memdesc<32x16xf16, #A_SHARED_T, #ttg.shared_memory, mutable>
   tt.return
 }
 
-// CHECK-LABEL: subview
 tt.func @subview(%A : !ttg.memdesc<1x16x16xf16, #A_SHARED, #ttg.shared_memory>) {
   %index = arith.constant 0 : i32
-  // CHECK: %0 -> %0
+  // expected-remark @below {{%0 -> %0}}
   %a = ttg.local_alloc : () -> !ttg.memdesc<1x16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
-  // CHECK-NEXT: %1 -> %0
+  // expected-remark @below {{%1 -> %0}}
   %cst1 = ttg.memdesc_subview %a[%index, %index, %index] : !ttg.memdesc<1x16x16xf16, #A_SHARED, #ttg.shared_memory, mutable> -> !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
   tt.return
 }
 
-// CHECK-LABEL: if_alias
 tt.func @if_alias(%i1 : i1) {
-  // CHECK: %0 -> %0
+  // expected-remark @below {{%0 -> %0}}
   %a = ttg.local_alloc : () -> !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
-  // CHECK: %1 -> %1
+  // expected-remark @below {{%1 -> %1}}
   %b = ttg.local_alloc : () -> !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
-  // CHECK-NEXT: %2 -> %0,%1
+  // expected-remark @below {{%2 -> %0,%1}}
   %cst2 = scf.if %i1 -> !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory, mutable> {
     scf.yield %a : !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
   } else {
@@ -87,20 +80,19 @@ tt.func @if_alias(%i1 : i1) {
   tt.return
 }
 
-// CHECK-LABEL: for
 tt.func @for(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.ptr<f16>) {
-  // CHECK: %0 -> %0
+  // expected-remark @below {{%0 -> %0}}
   %a = ttg.local_alloc : () -> !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
-  // CHECK: %1 -> %1
+  // expected-remark @below {{%1 -> %1}}
   %b = ttg.local_alloc : () -> !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
-  // CHECK: %2 -> %2
+  // expected-remark @below {{%2 -> %2}}
   %c = ttg.local_alloc : () -> !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
-  // CHECK-NEXT: %arg6 -> %0
-  // CHECK-NEXT: %arg7 -> %1
-  // CHECK-NEXT: %arg8 -> %2
-  // CHECK-NEXT: %3#0 -> %0,%1
-  // CHECK-NEXT: %3#1 -> %0,%1
-  // CHECK-NEXT: %3#2 -> %0,%1,%2
+  // expected-remark @below {{%arg6 -> %0}}
+  // expected-remark @below {{%arg7 -> %1}}
+  // expected-remark @below {{%arg8 -> %2}}
+  // expected-remark @below {{%3#0 -> %0,%1}}
+  // expected-remark @below {{%3#1 -> %0,%1}}
+  // expected-remark @below {{%3#2 -> %0,%1,%2}}
   %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a, %b_shared = %b, %c_shared = %c) ->
   (!ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>, !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>, !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>) {
     scf.yield %b_shared, %a_shared, %a_shared : !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>, !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>, !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
@@ -108,25 +100,24 @@ tt.func @for(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !t
   tt.return
 }
 
-// CHECK-LABEL: for_if
 tt.func @for_if(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.ptr<f16>, %i1 : i1) {
-  // CHECK: %0 -> %0
+  // expected-remark @below {{%0 -> %0}}
   %a_shared_init = ttg.local_alloc : () -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>
-  // CHECK-NEXT: %1 -> %1
+  // expected-remark @below {{%1 -> %1}}
   %b_shared_init = ttg.local_alloc : () -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>
-  // CHECK-NEXT: %2 -> %2
+  // expected-remark @below {{%2 -> %2}}
   %c_shared_init = ttg.local_alloc : () -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>
-  // CHECK-NEXT: %arg7 -> %0
-  // CHECK-NEXT: %arg8 -> %1
-  // CHECK-NEXT: %arg9 -> %2
-  // CHECK-NEXT: %3#0 -> %0,%1
-  // CHECK-NEXT: %3#1 -> %0,%1
-  // CHECK-NEXT: %3#2 -> %0,%1,%2
+  // expected-remark @below {{%arg7 -> %0}}
+  // expected-remark @below {{%arg8 -> %1}}
+  // expected-remark @below {{%arg9 -> %2}}
+  // expected-remark @below {{%3#0 -> %0,%1}}
+  // expected-remark @below {{%3#1 -> %0,%1}}
+  // expected-remark @below {{%3#2 -> %0,%1,%2}}
   %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) ->
   (!ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>, !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>, !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>) {
     scf.if %i1 {
       %index = arith.constant 8 : i32
-      // CHECK-NEXT: %4 -> %0,%1
+      // expected-remark @below {{%4 -> %0,%1}}
       %cst0 = ttg.memdesc_subview %a_shared[%index, %index] : !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable> -> !ttg.memdesc<32xf16, #A_SHARED, #ttg.shared_memory, mutable>
       scf.yield
     }
@@ -135,32 +126,31 @@ tt.func @for_if(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B :
   tt.return
 }
 
-// CHECK-LABEL: for_for_if
 tt.func @for_for_if(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.ptr<f16>, %i1 : i1) {
-  // CHECK: %0 -> %0
+  // expected-remark @below {{%0 -> %0}}
   %a_shared_init = ttg.local_alloc : () -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>
-  // CHECK-NEXT: %1 -> %1
+  // expected-remark @below {{%1 -> %1}}
   %b_shared_init = ttg.local_alloc : () -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>
-  // CHECK-NEXT: %2 -> %2
+  // expected-remark @below {{%2 -> %2}}
   %c_shared_init = ttg.local_alloc : () -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>
-  // CHECK-NEXT: %arg7 -> %0
-  // CHECK-NEXT: %arg8 -> %1
-  // CHECK-NEXT: %arg9 -> %2
-  // CHECK-NEXT: %3#0 -> %0
-  // CHECK-NEXT: %3#1 -> %1
-  // CHECK-NEXT: %3#2 -> %2,%6,%6
+  // expected-remark @below {{%arg7 -> %0}}
+  // expected-remark @below {{%arg8 -> %1}}
+  // expected-remark @below {{%arg9 -> %2}}
+  // expected-remark @below {{%3#0 -> %0}}
+  // expected-remark @below {{%3#1 -> %1}}
+  // expected-remark @below {{%3#2 -> %2,%6,%6}}
   %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) ->
   (!ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>, !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>, !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>) {
-    // CHECK-NEXT: %arg11 -> %2,%6,%6
-    // CHECK-NEXT: %4 -> %2,%6,%6
+    // expected-remark @below {{%arg11 -> %2,%6,%6}}
+    // expected-remark @below {{%4 -> %2,%6,%6}}
     %c_shared_next = scf.for %jv = %lb to %ub step %step iter_args(%c_shared_next = %c_shared) -> (!ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>) {
-      // CHECK-NEXT: %5 -> %6,%6
+      // expected-remark @below {{%5 -> %6,%6}}
       %c_shared_next_next = scf.if %i1 -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable> {
-        // CHECK-NEXT: %6 -> %6
+        // expected-remark @below {{%6 -> %6}}
         %cst0 = ttg.local_alloc : () -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>
         scf.yield %cst0 : !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>
       } else {
-        // CHECK-NEXT: %6 -> %6
+        // expected-remark @below {{%6 -> %6}}
         %cst0 = ttg.local_alloc : () -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>
         scf.yield %cst0 : !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>
       }
@@ -171,24 +161,23 @@ tt.func @for_for_if(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>,
   tt.return
 }
 
-// CHECK-LABEL: cf_for
 tt.func @cf_for(%arg0: index, %arg1: index, %arg2: index, %arg3: !tt.ptr<f16>, %arg4: !tt.ptr<f16>) {
   %idx = arith.constant 0 : i32
-  // CHECK: %0 -> %0
+  // expected-remark @below {{%0 -> %0}}
   %cst = ttg.local_alloc : () -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>
-  // CHECK-NEXT: %1 -> %1
+  // expected-remark @below {{%1 -> %1}}
   %cst_0 = ttg.local_alloc : () -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>
-  // CHECK-NEXT: %2 -> %0
+  // expected-remark @below {{%2 -> %0}}
   %0 = ttg.memdesc_subview %cst[%idx, %idx] : !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable> -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>
   gpu.barrier
-  // CHECK-NEXT: %3 -> %3
+  // expected-remark @below {{%3 -> %3}}
   %cst_1 = ttg.local_alloc : () -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>
-  // CHECK-NEXT: %5 -> %0,%1,%3
-  // CHECK-NEXT: %6 -> %0,%1,%3
-  // CHECK-NEXT: %7 -> %0,%1,%3
   cf.br ^bb1(%arg0, %cst, %cst_0, %cst_1 : index, !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>, !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>, !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>)
 ^bb1(%1: index, %2: !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>, %3: !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>, %4: !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>):  // 2 preds: ^bb0, ^bb2
   %5 = arith.cmpi slt, %1, %arg1 : index
+  // expected-remark @below {{%5 -> %0,%1,%3}}
+  // expected-remark @below {{%6 -> %0,%1,%3}}
+  // expected-remark @below {{%7 -> %0,%1,%3}}
   cf.cond_br %5, ^bb2, ^bb3
 ^bb2:  // pred: ^bb1
   gpu.barrier
@@ -196,7 +185,7 @@ tt.func @cf_for(%arg0: index, %arg1: index, %arg2: index, %arg3: !tt.ptr<f16>, %
   cf.br ^bb1(%8, %4, %2, %3 : index, !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>, !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>, !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>)
 ^bb3:  // pred: ^bb1
   gpu.barrier
-  // CHECK-NEXT: %10 -> %0
+  // expected-remark @below {{%10 -> %0}}
   %9 = ttg.memdesc_subview %0[%idx, %idx] : !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable> -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>
   tt.return
 }
diff --git a/test/Analysis/test-alignment.mlir b/test/Analysis/test-alignment.mlir
index 206239ff0910..5f8616667699 100644
--- a/test/Analysis/test-alignment.mlir
+++ b/test/Analysis/test-alignment.mlir
@@ -1,163 +1,157 @@
-// RUN: triton-opt %s -test-print-alignment -split-input-file -o %t 2>&1 | FileCheck %s
+// RUN: triton-opt %s -test-print-alignment -split-input-file -o /dev/null
 
-// CHECK-LABEL: @cast
 tt.func @cast() {
-  // CHECK: contiguity = [1], divisibility = [1], constancy = [1], constant_value = 1
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [1], constant_value = 1}}
   %cst = arith.constant 1 : i32
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [1], constant_value = 1
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [1], constant_value = 1}}
   %0 = arith.extsi %cst : i32 to i64
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [128], constant_value = 1
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [128], constant_value = 1}}
   %cst_tensor = arith.constant dense<1> : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [128], constant_value = 1
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [128], constant_value = 1}}
   %1 = tt.bitcast %cst_tensor : tensor<128xi32> -> tensor<128xi64>
   tt.return
 }
 
 // -----
 
-// CHECK-LABEL: @add
 tt.func @add() {
-  // CHECK: contiguity = [128], divisibility = [1073741824], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [128], divisibility = [1073741824], constancy = [1], constant_value = <none>}}
   %0 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [128], constant_value = 1
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [128], constant_value = 1}}
   %1 = arith.constant dense<1> : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [128], divisibility = [1], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [128], divisibility = [1], constancy = [1], constant_value = <none>}}
   %2 = arith.addi %0, %1 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [128], constant_value = 127
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [128], constant_value = 127}}
   %3 = arith.constant dense<127> : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [128], constancy = [128], constant_value = 128
+  // expected-remark @below {{contiguity = [1], divisibility = [128], constancy = [128], constant_value = 128}}
   %4 = arith.addi %1, %3 : tensor<128xi32>
   tt.return
 }
 
 // -----
 
-// CHECK-LABEL: @addptr
 tt.func @addptr(%arg0: !tt.ptr<i1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i8> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<i16> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<i32> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<i64> {tt.divisibility = 16 : i32}) {
-  // CHECK: contiguity = [1], divisibility = [1], constancy = [1], constant_value = 1
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [1], constant_value = 1}}
   %cst1 = arith.constant 1 : i32
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>}}
   %0 = tt.addptr %arg0, %cst1 : !tt.ptr<i1>, i32
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>}}
   %1 = tt.addptr %arg1, %cst1 : !tt.ptr<i8>, i32
-  // CHECK-NEXT: contiguity = [1], divisibility = [2], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [2], constancy = [1], constant_value = <none>}}
   %2 = tt.addptr %arg2, %cst1 : !tt.ptr<i16>, i32
-  // CHECK-NEXT: contiguity = [1], divisibility = [4], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [4], constancy = [1], constant_value = <none>}}
   %3 = tt.addptr %arg3, %cst1 : !tt.ptr<i32>, i32
-  // CHECK-NEXT: contiguity = [1], divisibility = [8], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [8], constancy = [1], constant_value = <none>}}
   %4 = tt.addptr %arg4, %cst1 : !tt.ptr<i64>, i32
-  // CHECK-NEXT: contiguity = [1], divisibility = [4], constancy = [1], constant_value = 4
+  // expected-remark @below {{contiguity = [1], divisibility = [4], constancy = [1], constant_value = 4}}
   %cst4 = arith.constant 4 : i32
-  // CHECK-NEXT: contiguity = [1], divisibility = [4], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [4], constancy = [1], constant_value = <none>}}
   %5 = tt.addptr %arg0, %cst4 : !tt.ptr<i1>, i32
-  // CHECK-NEXT: contiguity = [1], divisibility = [4], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [4], constancy = [1], constant_value = <none>}}
   %6 = tt.addptr %arg1, %cst4 : !tt.ptr<i8>, i32
-  // CHECK-NEXT: contiguity = [1], divisibility = [8], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [8], constancy = [1], constant_value = <none>}}
   %7 = tt.addptr %arg2, %cst4 : !tt.ptr<i16>, i32
-  // CHECK-NEXT: contiguity = [1], divisibility = [16], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [16], constancy = [1], constant_value = <none>}}
   %8 = tt.addptr %arg3, %cst4 : !tt.ptr<i32>, i32
-  // CHECK-NEXT: contiguity = [1], divisibility = [16], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [16], constancy = [1], constant_value = <none>}}
   %9 = tt.addptr %arg4, %cst4 : !tt.ptr<i64>, i32
-  // CHECK-NEXT: contiguity = [128], divisibility = [1073741824], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [128], divisibility = [1073741824], constancy = [1], constant_value = <none>}}
   %10 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1, 128], divisibility = [1, 1073741824], constancy = [1, 1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1, 128], divisibility = [1, 1073741824], constancy = [1, 1], constant_value = <none>}}
   %11 = tt.expand_dims %10 {axis = 0: i32} : tensor<128xi32> -> tensor<1x128xi32>
-  // CHECK-NEXT: contiguity = [1, 128], divisibility = [1, 1073741824], constancy = [128, 1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1, 128], divisibility = [1, 1073741824], constancy = [128, 1], constant_value = <none>}}
   %12 = tt.broadcast %11 : tensor<1x128xi32> -> tensor<128x128xi32>
-  // CHECK-NEXT: contiguity = [1, 1], divisibility = [16, 16], constancy = [128, 128], constant_value = <none>
+  // expected-remark @below {{contiguity = [1, 1], divisibility = [16, 16], constancy = [128, 128], constant_value = <none>}}
   %13 = tt.splat %arg0 : !tt.ptr<i1> -> tensor<128x128x!tt.ptr<i1>>
-  // CHECK-NEXT: contiguity = [1, 1], divisibility = [16, 16], constancy = [128, 128], constant_value = <none>
+  // expected-remark @below {{contiguity = [1, 1], divisibility = [16, 16], constancy = [128, 128], constant_value = <none>}}
   %14 = tt.splat %arg1 : !tt.ptr<i8> -> tensor<128x128x!tt.ptr<i8>>
-  // CHECK-NEXT: contiguity = [1, 1], divisibility = [16, 16], constancy = [128, 128], constant_value = <none>
+  // expected-remark @below {{contiguity = [1, 1], divisibility = [16, 16], constancy = [128, 128], constant_value = <none>}}
   %15 = tt.splat %arg2 : !tt.ptr<i16> -> tensor<128x128x!tt.ptr<i16>>
-  // CHECK-NEXT: contiguity = [1, 1], divisibility = [16, 16], constancy = [128, 128], constant_value = <none>
+  // expected-remark @below {{contiguity = [1, 1], divisibility = [16, 16], constancy = [128, 128], constant_value = <none>}}
   %16 = tt.splat %arg3 : !tt.ptr<i32> -> tensor<128x128x!tt.ptr<i32>>
-  // CHECK-NEXT: contiguity = [1, 1], divisibility = [16, 16], constancy = [128, 128], constant_value = <none>
+  // expected-remark @below {{contiguity = [1, 1], divisibility = [16, 16], constancy = [128, 128], constant_value = <none>}}
   %17 = tt.splat %arg4 : !tt.ptr<i64> -> tensor<128x128x!tt.ptr<i64>>
-  // CHECK-NEXT: contiguity = [1, 128], divisibility = [1, 16], constancy = [128, 1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1, 128], divisibility = [1, 16], constancy = [128, 1], constant_value = <none>}}
   %18 = tt.addptr %13, %12 : tensor<128x128x!tt.ptr<i1>>, tensor<128x128xi32>
-  // CHECK-NEXT: contiguity = [1, 128], divisibility = [1, 16], constancy = [128, 1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1, 128], divisibility = [1, 16], constancy = [128, 1], constant_value = <none>}}
   %19 = tt.addptr %14, %12 : tensor<128x128x!tt.ptr<i8>>, tensor<128x128xi32>
-  // CHECK-NEXT: contiguity = [1, 128], divisibility = [2, 16], constancy = [128, 1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1, 128], divisibility = [2, 16], constancy = [128, 1], constant_value = <none>}}
   %20 = tt.addptr %15, %12 : tensor<128x128x!tt.ptr<i16>>, tensor<128x128xi32>
-  // CHECK-NEXT: contiguity = [1, 128], divisibility = [4, 16], constancy = [128, 1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1, 128], divisibility = [4, 16], constancy = [128, 1], constant_value = <none>}}
   %21 = tt.addptr %16, %12 : tensor<128x128x!tt.ptr<i32>>, tensor<128x128xi32>
-  // CHECK-NEXT: contiguity = [1, 128], divisibility = [8, 16], constancy = [128, 1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1, 128], divisibility = [8, 16], constancy = [128, 1], constant_value = <none>}}
   %22 = tt.addptr %17, %12 : tensor<128x128x!tt.ptr<i64>>, tensor<128x128xi32>
   tt.return
 }
 
 // -----
 
-// CHECK-LABEL: @sub
 tt.func @sub() {
-  // CHECK: contiguity = [128], divisibility = [1073741824], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [128], divisibility = [1073741824], constancy = [1], constant_value = <none>}}
   %0 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [128], constant_value = 1
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [128], constant_value = 1}}
   %1 = arith.constant dense<1> : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [128], divisibility = [1], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [128], divisibility = [1], constancy = [1], constant_value = <none>}}
   %2 = arith.subi %0, %1 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>}}
   %3 = arith.subi %1, %0 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [128], constant_value = 129
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [128], constant_value = 129}}
   %4 = arith.constant dense<129> : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [128], constancy = [128], constant_value = 128
+  // expected-remark @below {{contiguity = [1], divisibility = [128], constancy = [128], constant_value = 128}}
   %5 = arith.subi %4, %1 : tensor<128xi32>
   tt.return
 }
 
 // -----
 
-// CHECK-LABEL: @mul
 tt.func @mul(%arg0: i64 {tt.divisibility = 16 : i32}) {
-  // CHECK: contiguity = [128], divisibility = [1073741824], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [128], divisibility = [1073741824], constancy = [1], constant_value = <none>}}
   %0 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [128], constant_value = 1
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [128], constant_value = 1}}
   %1 = arith.constant dense<1> : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [128], divisibility = [1073741824], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [128], divisibility = [1073741824], constancy = [1], constant_value = <none>}}
   %2 = arith.muli %0, %1 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [128], constancy = [128], constant_value = 128
+  // expected-remark @below {{contiguity = [1], divisibility = [128], constancy = [128], constant_value = 128}}
   %3 = arith.constant dense<128> : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [128], constancy = [128], constant_value = 128
+  // expected-remark @below {{contiguity = [1], divisibility = [128], constancy = [128], constant_value = 128}}
   %4 = arith.muli %3, %1 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [2], constancy = [128], constant_value = 2
+  // expected-remark @below {{contiguity = [1], divisibility = [2], constancy = [128], constant_value = 2}}
   %5 = arith.constant dense<2> : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [256], constancy = [128], constant_value = 256
+  // expected-remark @below {{contiguity = [1], divisibility = [256], constancy = [128], constant_value = 256}}
   %6 = arith.muli %4, %5 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [4611686018427387904], constancy = [1], constant_value = 4611686018427387904
+  // expected-remark @below {{contiguity = [1], divisibility = [4611686018427387904], constancy = [1], constant_value = 4611686018427387904}}
   %7 = arith.constant 4611686018427387904: i64
-  // CHECK-NEXT: contiguity = [1], divisibility = [4611686018427387904], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [4611686018427387904], constancy = [1], constant_value = <none>}}
   %8 = arith.muli %arg0, %7 : i64
   tt.return
 }
 
 // -----
 
-// CHECK-LABEL: @div
 tt.func @div() {
-  // CHECK: contiguity = [128], divisibility = [1073741824], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [128], divisibility = [1073741824], constancy = [1], constant_value = <none>}}
   %0 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [128], constant_value = 1
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [128], constant_value = 1}}
   %1 = arith.constant dense<1> : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [128], divisibility = [1073741824], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [128], divisibility = [1073741824], constancy = [1], constant_value = <none>}}
   %2 = arith.divsi %0, %1 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>}}
   %3 = arith.divui %1, %0 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [64], constancy = [128], constant_value = 64
+  // expected-remark @below {{contiguity = [1], divisibility = [64], constancy = [128], constant_value = 64}}
   %4 = arith.constant dense<64> : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [64], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [64], constant_value = <none>}}
   %5 = arith.divsi %0, %4 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>}}
   %6 = arith.divsi %4, %0 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [64], constancy = [128], constant_value = 64
+  // expected-remark @below {{contiguity = [1], divisibility = [64], constancy = [128], constant_value = 64}}
   %7 = arith.divsi %4, %1 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [2], constancy = [128], constant_value = 66
+  // expected-remark @below {{contiguity = [1], divisibility = [2], constancy = [128], constant_value = 66}}
   %8 = arith.constant dense<66> : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [2], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [2], constant_value = <none>}}
   %9 = arith.divui %0, %8 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [128], divisibility = [8192], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [128], divisibility = [8192], constancy = [1], constant_value = <none>}}
   %10 = tt.make_range {end = 8320 : i32, start = 8192 : i32} : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [64], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [64], constant_value = <none>}}
   %11 = arith.divsi %10, %4 : tensor<128xi32>
   tt.return
 }
@@ -165,236 +159,228 @@ tt.func @div() {
 
 // -----
 
-// CHECK-LABEL: @rem
 tt.func @rem() {
-  // CHECK: contiguity = [128], divisibility = [1073741824], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [128], divisibility = [1073741824], constancy = [1], constant_value = <none>}}
   %0 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [128], constant_value = 1
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [128], constant_value = 1}}
   %1 = arith.constant dense<1> : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [4611686018427387904], constancy = [128], constant_value = 0
+  // expected-remark @below {{contiguity = [1], divisibility = [4611686018427387904], constancy = [128], constant_value = 0}}
   %2 = arith.remsi %0, %1 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>}}
   %3 = arith.remui %1, %0 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [64], constancy = [128], constant_value = 64
+  // expected-remark @below {{contiguity = [1], divisibility = [64], constancy = [128], constant_value = 64}}
   %4 = arith.constant dense<64> : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [64], divisibility = [64], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [64], divisibility = [64], constancy = [1], constant_value = <none>}}
   %5 = arith.remsi %0, %4 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [64], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [64], constancy = [1], constant_value = <none>}}
   %6 = arith.remsi %4, %0 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [2], constancy = [128], constant_value = 66
+  // expected-remark @below {{contiguity = [1], divisibility = [2], constancy = [128], constant_value = 66}}
   %7 = arith.constant dense<66> : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [2], divisibility = [2], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [2], divisibility = [2], constancy = [1], constant_value = <none>}}
   %8 = arith.remui %0, %7 : tensor<128xi32>
   tt.return
 }
 
 // -----
 
-// CHECK-LABEL: @expanddims
 tt.func @expanddims() {
-  // CHECK: contiguity = [128], divisibility = [1073741824], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [128], divisibility = [1073741824], constancy = [1], constant_value = <none>}}
   %0 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [2], constancy = [128], constant_value = 2
+  // expected-remark @below {{contiguity = [1], divisibility = [2], constancy = [128], constant_value = 2}}
   %1 = arith.constant dense<2> : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [2], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [2], constancy = [1], constant_value = <none>}}
   %2 = arith.muli %0, %1 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1, 1], divisibility = [2, 2], constancy = [1, 1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1, 1], divisibility = [2, 2], constancy = [1, 1], constant_value = <none>}}
   %3 = tt.expand_dims %2 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32>
   tt.return
 }
 
 // -----
 
-// CHECK-LABEL: @broadcast
 tt.func @broadcast() {
-  // CHECK: contiguity = [1], divisibility = [64], constancy = [128], constant_value = 64
+  // expected-remark @below {{contiguity = [1], divisibility = [64], constancy = [128], constant_value = 64}}
   %0 = arith.constant dense<64> : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1, 1], divisibility = [64, 64], constancy = [128, 1], constant_value = 64
+  // expected-remark @below {{contiguity = [1, 1], divisibility = [64, 64], constancy = [128, 1], constant_value = 64}}
   %1 = tt.expand_dims %0 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32>
-  // CHECK-NEXT: contiguity = [1, 1], divisibility = [64, 64], constancy = [128, 128], constant_value = 64
+  // expected-remark @below {{contiguity = [1, 1], divisibility = [64, 64], constancy = [128, 128], constant_value = 64}}
   %2 = tt.broadcast %1 : tensor<128x1xi32> -> tensor<128x128xi32>
   tt.return
 }
 
 // -----
 
-// CHECK-LABEL: @splat
 tt.func @splat(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}) {
-  // CHECK: contiguity = [1, 1], divisibility = [16, 16], constancy = [128, 128], constant_value = <none>
+  // expected-remark @below {{contiguity = [1, 1], divisibility = [16, 16], constancy = [128, 128], constant_value = <none>}}
   %0 = tt.splat %arg0 : !tt.ptr<f32> -> tensor<128x128x!tt.ptr<f32>>
   tt.return
 }
 
 // -----
 
-// CHECK-LABEL: @cmp_all_contiguous
 tt.func @cmp_all_contiguous() {
-  // CHECK: contiguity = [128], divisibility = [1073741824], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [128], divisibility = [1073741824], constancy = [1], constant_value = <none>}}
   %0 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [4611686018427387904], constancy = [128], constant_value = 0
+  // expected-remark @below {{contiguity = [1], divisibility = [4611686018427387904], constancy = [128], constant_value = 0}}
   %1 = arith.constant dense<0> : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>}}
   %2 = arith.cmpi eq, %0, %1 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>}}
   %3 = arith.cmpi ne, %0, %1 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [128], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [128], constant_value = <none>}}
   %4 = arith.cmpi slt, %0, %1 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>}}
   %5 = arith.cmpi sle, %0, %1 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [128], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [128], constant_value = <none>}}
   %6 = arith.cmpi sge, %0, %1 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>}}
   %7 = arith.cmpi sgt, %0, %1 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>}}
   %8 = arith.cmpi eq, %1, %0 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>}}
   %9 = arith.cmpi ne, %1, %0 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>}}
   %10 = arith.cmpi slt, %1, %0 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [128], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [128], constant_value = <none>}}
   %11 = arith.cmpi sle, %1, %0 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>}}
   %12 = arith.cmpi sge, %1, %0 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [128], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [128], constant_value = <none>}}
   %13 = arith.cmpi sgt, %1, %0 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [8], constancy = [128], constant_value = 8
+  // expected-remark @below {{contiguity = [1], divisibility = [8], constancy = [128], constant_value = 8}}
   %14 = arith.constant dense<8> : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [8], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [8], constant_value = <none>}}
   %15 = arith.cmpi sgt, %14, %0 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [128], constant_value = 1
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [128], constant_value = 1}}
   %16 = arith.cmpi sgt, %14, %1 : tensor<128xi32>
   tt.return
 }
 
-// CHECK-LABEL: @cmp_partial_contiguous
 tt.func @cmp_partial_contiguous() {
-  // CHECK: contiguity = [128], divisibility = [1073741824], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [128], divisibility = [1073741824], constancy = [1], constant_value = <none>}}
   %0 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [8], constancy = [128], constant_value = 8
+  // expected-remark @below {{contiguity = [1], divisibility = [8], constancy = [128], constant_value = 8}}
   %1 = arith.constant dense<8> : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [32], constancy = [128], constant_value = 32
+  // expected-remark @below {{contiguity = [1], divisibility = [32], constancy = [128], constant_value = 32}}
   %3 = arith.constant dense<32> : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [32], divisibility = [32], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [32], divisibility = [32], constancy = [1], constant_value = <none>}}
   %4 = arith.remsi %0, %3 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>}}
   %5 = arith.cmpi eq, %4, %1 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>}}
   %6 = arith.cmpi ne, %4, %1 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [8], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [8], constant_value = <none>}}
   %7 = arith.cmpi slt, %4, %1 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>}}
   %8 = arith.cmpi sle, %4, %1 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [8], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [8], constant_value = <none>}}
   %9 = arith.cmpi sge, %4, %1 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>}}
   %10 = arith.cmpi sgt, %4, %1 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>}}
   %11 = arith.cmpi eq, %1, %4 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>}}
   %12 = arith.cmpi ne, %1, %4 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>}}
   %13 = arith.cmpi slt, %1, %4 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [8], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [8], constant_value = <none>}}
   %14 = arith.cmpi sle, %1, %4 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>}}
   %15 = arith.cmpi sge, %1, %4 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [8], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [8], constant_value = <none>}}
   %16 = arith.cmpi sgt, %1, %4 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [16], constancy = [128], constant_value = 48
+  // expected-remark @below {{contiguity = [1], divisibility = [16], constancy = [128], constant_value = 48}}
   %17 = arith.constant dense<48> : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [16], divisibility = [16], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [16], divisibility = [16], constancy = [1], constant_value = <none>}}
   %18 = arith.remsi %0, %17 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>}}
   %19 = arith.cmpi eq, %18, %3 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>}}
   %20 = arith.cmpi ne, %18, %3 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [16], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [16], constant_value = <none>}}
   %21 = arith.cmpi slt, %18, %3 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>}}
   %22 = arith.cmpi sle, %18, %3 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [16], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [16], constant_value = <none>}}
   %23 = arith.cmpi sge, %18, %3 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>}}
   %24 = arith.cmpi sgt, %18, %3 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>}}
   %25 = arith.cmpi eq, %3, %18 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>}}
   %26 = arith.cmpi ne, %3, %18 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>}}
   %27 = arith.cmpi slt, %3, %18 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [16], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [16], constant_value = <none>}}
   %28 = arith.cmpi sle, %3, %18 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>}}
   %29 = arith.cmpi sge, %3, %18 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [16], constant_value = <none
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [16], constant_value = <none>}}
   %30 = arith.cmpi sgt, %3, %18 : tensor<128xi32>
   tt.return
 }
 
 // -----
 
-// CHECK-LABEL: @logic
 tt.func @logic() {
-  // CHECK: contiguity = [128], divisibility = [1073741824], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [128], divisibility = [1073741824], constancy = [1], constant_value = <none>}}
   %0 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [64], constancy = [128], constant_value = 64
+  // expected-remark @below {{contiguity = [1], divisibility = [64], constancy = [128], constant_value = 64}}
   %1 = arith.constant dense<64> : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [64], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [64], constant_value = <none>}}
   %2 = arith.divsi %0, %1 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [8], constancy = [128], constant_value = 8
+  // expected-remark @below {{contiguity = [1], divisibility = [8], constancy = [128], constant_value = 8}}
   %3 = arith.constant dense<8> : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [8], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [8], constant_value = <none>}}
   %4 = arith.divsi %0, %3 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>}}
   %5 = arith.andi %0, %1 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>}}
   %6 = arith.ori %0, %1 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>}}
   %7 = arith.xori %0, %1 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [8], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [8], constant_value = <none>}}
   %8 = arith.andi %2, %4 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [8], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [8], constant_value = <none>}}
   %9 = arith.ori %2, %4 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [8], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [8], constant_value = <none>}}
   %10 = arith.xori %2, %4 : tensor<128xi32>
   tt.return
 }
 
 // -----
 
-// CHECK-LABEL: @select
 tt.func @select(%arg0 : i1, %arg1 : tensor<4xi1>) {
-  // CHECK: contiguity = [128], divisibility = [1073741824], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [128], divisibility = [1073741824], constancy = [1], constant_value = <none>}}
   %0 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [4611686018427387904], constancy = [128], constant_value = 0
+  // expected-remark @below {{contiguity = [1], divisibility = [4611686018427387904], constancy = [128], constant_value = 0}}
   %1 = arith.constant dense<0> : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>}}
   %2 = arith.cmpi eq, %0, %1 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [128], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [128], constant_value = <none>}}
   %3 = arith.cmpi slt, %0, %1 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [4611686018427387904], constancy = [1], constant_value = 0
+  // expected-remark @below {{contiguity = [1], divisibility = [4611686018427387904], constancy = [1], constant_value = 0}}
   %4 = arith.constant 0 : i1
-  // CHECK-NEXT: contiguity = [1], divisibility = [4611686018427387904], constancy = [128], constant_value = 0
+  // expected-remark @below {{contiguity = [1], divisibility = [4611686018427387904], constancy = [128], constant_value = 0}}
   %7 = tt.splat %4 : i1 -> tensor<128xi1>
-  // CHECK-NEXT: contiguity = [1], divisibility = [4611686018427387904], constancy = [128], constant_value = 0
+  // expected-remark @below {{contiguity = [1], divisibility = [4611686018427387904], constancy = [128], constant_value = 0}}
   %5 = arith.select %4, %3, %7 : tensor<128xi1>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>}}
   %8 = arith.select %7, %3, %2 : tensor<128xi1>, tensor<128xi1>
-  // CHECK-NEXT: contiguity = [1, 1], divisibility = [1, 1], constancy = [1, 1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1, 1], divisibility = [1, 1], constancy = [1, 1], constant_value = <none>}}
   %9 = tt.expand_dims %2 {axis = 1 : i32} : tensor<128xi1> -> tensor<128x1xi1>
-  // CHECK-NEXT: contiguity = [1, 1], divisibility = [1, 1], constancy = [128, 1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1, 1], divisibility = [1, 1], constancy = [128, 1], constant_value = <none>}}
   %10 = tt.expand_dims %3 {axis = 1 : i32} : tensor<128xi1> -> tensor<128x1xi1>
-  // CHECK-NEXT: contiguity = [1, 1], divisibility = [1, 1], constancy = [1, 1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1, 1], divisibility = [1, 1], constancy = [1, 1], constant_value = <none>}}
   %11 = arith.select %arg0, %9, %10 : tensor<128x1xi1>
-  // CHECK-NEXT: contiguity = [1], divisibility = [4], constancy = [4], constant_value = 4
+  // expected-remark @below {{contiguity = [1], divisibility = [4], constancy = [4], constant_value = 4}}
   %cst = arith.constant dense<4> : tensor<4xi32>
-  // CHECK-NEXT: contiguity = [4], divisibility = [1073741824], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [4], divisibility = [1073741824], constancy = [1], constant_value = <none>}}
   %12 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [4], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [4], constancy = [1], constant_value = <none>}}
   %13 = arith.muli %12, %cst : tensor<4xi32>
-  // CHECK-NEXT: contiguity = [4], divisibility = [16], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [4], divisibility = [16], constancy = [1], constant_value = <none>}}
   %14 = tt.make_range {end = 20 : i32, start = 16 : i32} : tensor<4xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>}}
   %15 = arith.select %arg1, %12, %13 : tensor<4xi1>, tensor<4xi32>
   tt.return
 }
@@ -402,23 +388,23 @@ tt.func @select(%arg0 : i1, %arg1 : tensor<4xi1>) {
 // -----
 
 tt.func @shift(%arg0: i32 {tt.divisibility = 4 : i32}) {
-  // CHECK: contiguity = [1], divisibility = [4], constancy = [128], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [4], constancy = [128], constant_value = <none>}}
   %s = tt.splat %arg0 : i32 -> tensor<128xi32>
-  // CHECK-NEXT: contiguity = [128], divisibility = [1073741824], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [128], divisibility = [1073741824], constancy = [1], constant_value = <none>}}
   %0 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [8], constancy = [128], constant_value = 8
+  // expected-remark @below {{contiguity = [1], divisibility = [8], constancy = [128], constant_value = 8}}
   %1 = arith.constant dense<8> : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [4], constancy = [128], constant_value = 4
+  // expected-remark @below {{contiguity = [1], divisibility = [4], constancy = [128], constant_value = 4}}
   %2 = arith.constant dense<4> : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [256], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [256], constancy = [1], constant_value = <none>}}
   %3 = arith.shli %0, %1 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>}}
   %4 = arith.shrsi %0, %2 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [128], constancy = [128], constant_value = 128
+  // expected-remark @below {{contiguity = [1], divisibility = [128], constancy = [128], constant_value = 128}}
   %5 = arith.shli %1, %2 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [8], constancy = [128], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [8], constancy = [128], constant_value = <none>}}
   %6 = arith.shli %1, %s : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>}}
   %7 = arith.shrsi %0, %s : tensor<128xi32>
   tt.return
 }
@@ -426,34 +412,33 @@ tt.func @shift(%arg0: i32 {tt.divisibility = 4 : i32}) {
 // -----
 
 tt.func @max_min() {
-  // CHECK: contiguity = [128], divisibility = [1073741824], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [128], divisibility = [1073741824], constancy = [1], constant_value = <none>}}
   %0 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [128], divisibility = [64], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [128], divisibility = [64], constancy = [1], constant_value = <none>}}
   %1 = tt.make_range {end = 192 : i32, start = 64 : i32} : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [128], divisibility = [64], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [128], divisibility = [64], constancy = [1], constant_value = <none>}}
   %2 = arith.maxsi %0, %1 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [128], divisibility = [64], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [128], divisibility = [64], constancy = [1], constant_value = <none>}}
   %3 = arith.minsi %0, %1 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [8], constancy = [128], constant_value = 8
+  // expected-remark @below {{contiguity = [1], divisibility = [8], constancy = [128], constant_value = 8}}
   %4 = arith.constant dense<8> : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [4], constancy = [128], constant_value = 4
+  // expected-remark @below {{contiguity = [1], divisibility = [4], constancy = [128], constant_value = 4}}
   %5 = arith.constant dense<4> : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [1], constant_value = 8
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [1], constant_value = 8}}
   %6 = arith.maxsi %4, %5 : tensor<128xi32>
   tt.return
 }
 
 // -----
 
-// CHECK-LABEL: @if
 tt.func @if(%i1 : i1) {
-  // CHECK: contiguity = [1, 1], divisibility = [64, 64], constancy = [128, 32], constant_value = 64
+  // expected-remark @below {{contiguity = [1, 1], divisibility = [64, 64], constancy = [128, 32], constant_value = 64}}
   %cst_64 = arith.constant dense<64> : tensor<128x32xi32>
-  // CHECK-NEXT: contiguity = [1, 1], divisibility = [1, 1], constancy = [128, 32], constant_value = 1
+  // expected-remark @below {{contiguity = [1, 1], divisibility = [1, 1], constancy = [128, 32], constant_value = 1}}
   %cst_1 = arith.constant dense<1> : tensor<128x32xi32>
-  // CHECK-NEXT: contiguity = [1, 1], divisibility = [64, 64], constancy = [128, 32], constant_value = 64
+  // expected-remark @below {{contiguity = [1, 1], divisibility = [64, 64], constancy = [128, 32], constant_value = 64}}
   %a = arith.muli %cst_64, %cst_1 : tensor<128x32xi32>
-  // CHECK: contiguity = [1, 1], divisibility = [1, 1], constancy = [128, 32], constant_value = <none>
+  // expected-remark @below {{contiguity = [1, 1], divisibility = [1, 1], constancy = [128, 32], constant_value = <none>}}
   %ret = scf.if %i1 -> tensor<128x32xi32> {
     scf.yield %a : tensor<128x32xi32>
   } else {
@@ -464,26 +449,25 @@ tt.func @if(%i1 : i1) {
 
 // -----
 
-// CHECK-LABEL: @for
 tt.func @for() {
-  // CHECK: contiguity = [1, 1], divisibility = [4611686018427387904, 4611686018427387904], constancy = [128, 32], constant_value = 0
+  // expected-remark @below {{contiguity = [1, 1], divisibility = [4611686018427387904, 4611686018427387904], constancy = [128, 32], constant_value = 0}}
   %a_init = arith.constant dense<0> : tensor<128x32xi32>
-  // CHECK-NEXT: contiguity = [1, 1], divisibility = [1, 1], constancy = [128, 32], constant_value = 1
+  // expected-remark @below {{contiguity = [1, 1], divisibility = [1, 1], constancy = [128, 32], constant_value = 1}}
   %b_init = arith.constant dense<1> : tensor<128x32xi32>
-  // CHECK-NEXT: contiguity = [1, 1], divisibility = [4, 4], constancy = [128, 32], constant_value = 4
+  // expected-remark @below {{contiguity = [1, 1], divisibility = [4, 4], constancy = [128, 32], constant_value = 4}}
   %c_init = arith.constant dense<4> : tensor<128x32xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [128], constancy = [1], constant_value = 128
+  // expected-remark @below {{contiguity = [1], divisibility = [128], constancy = [1], constant_value = 128}}
   %ub = arith.constant 128 : i32
-  // CHECK-NEXT: contiguity = [1], divisibility = [4611686018427387904], constancy = [1], constant_value = 0
+  // expected-remark @below {{contiguity = [1], divisibility = [4611686018427387904], constancy = [1], constant_value = 0}}
   %lb = arith.constant 0 : i32
-  // CHECK-NEXT: contiguity = [1], divisibility = [16], constancy = [1], constant_value = 16
+  // expected-remark @below {{contiguity = [1], divisibility = [16], constancy = [1], constant_value = 16}}
   %step = arith.constant 16 : i32
   %a, %b, %c = scf.for %iv = %lb to %ub step %step iter_args(%a = %a_init, %b = %b_init, %c = %c_init) -> (tensor<128x32xi32>, tensor<128x32xi32>, tensor<128x32xi32>) : i32 {
-    // CHECK-NEXT: contiguity = [1], divisibility = [16], constancy = [1], constant_value = <none>
+    // expected-remark @below {{contiguity = [1], divisibility = [16], constancy = [1], constant_value = <none>}}
     %t = arith.addi %iv, %lb : i32
-    // CHECK: contiguity = [1, 1], divisibility = [1, 1], constancy = [128, 32], constant_value = <none>
-    // CHECK: contiguity = [1, 1], divisibility = [1, 1], constancy = [128, 32], constant_value = <none>
-    // CHECK: contiguity = [1, 1], divisibility = [4, 4], constancy = [128, 32], constant_value = 4
+    // expected-remark @below {{contiguity = [1, 1], divisibility = [1, 1], constancy = [128, 32], constant_value = <none>}}
+    // expected-remark @below {{contiguity = [1, 1], divisibility = [1, 1], constancy = [128, 32], constant_value = <none>}}
+    // expected-remark @below {{contiguity = [1, 1], divisibility = [4, 4], constancy = [128, 32], constant_value = 4}}
     scf.yield %b, %a, %c : tensor<128x32xi32>, tensor<128x32xi32>, tensor<128x32xi32>
   }
   tt.return
@@ -491,12 +475,11 @@ tt.func @for() {
 
 // -----
 
-// CHECK-LABEL: @for_dynamic
 tt.func @for_dynamic(%lb: i32 {tt.divisibility = 16 : i32}, %step: i32 {tt.divisibility = 8 : i32}, %ub: i32) {
-  // CHECK-NEXT: contiguity = [1], divisibility = [4611686018427387904], constancy = [1], constant_value = 0
+  // expected-remark @below {{contiguity = [1], divisibility = [4611686018427387904], constancy = [1], constant_value = 0}}
   %c0 = arith.constant 0 : i32
   scf.for %iv = %lb to %ub step %step : i32 {
-    // CHECK-NEXT: contiguity = [1], divisibility = [8], constancy = [1], constant_value = <none>
+    // expected-remark @below {{contiguity = [1], divisibility = [8], constancy = [1], constant_value = <none>}}
     %t = arith.addi %iv, %c0 : i32
   }
   tt.return
@@ -504,31 +487,30 @@ tt.func @for_dynamic(%lb: i32 {tt.divisibility = 16 : i32}, %step: i32 {tt.divis
 
 // -----
 
-// CHECK-LABEL: @for_if
 tt.func @for_if(%i1: i1, %arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}) {
-  // CHECK: contiguity = [1], divisibility = [4611686018427387904], constancy = [1], constant_value = 0
+  // expected-remark @below {{contiguity = [1], divisibility = [4611686018427387904], constancy = [1], constant_value = 0}}
   %c0_i32 = arith.constant 0 : i32
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [1], constant_value = 1
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [1], constant_value = 1}}
   %c1_i32 = arith.constant 1 : i32
-  // CHECK-NEXT: contiguity = [1], divisibility = [2], constancy = [1], constant_value = 10
+  // expected-remark @below {{contiguity = [1], divisibility = [2], constancy = [1], constant_value = 10}}
   %c10_i32 = arith.constant 10 : i32
-  // CHECK-NEXT: contiguity = [1, 1], divisibility = [64, 64], constancy = [128, 64], constant_value = 64
+  // expected-remark @below {{contiguity = [1, 1], divisibility = [64, 64], constancy = [128, 64], constant_value = 64}}
   %cst = arith.constant dense<64> : tensor<128x64xi32>
-  // CHECK-NEXT: contiguity = [1, 1], divisibility = [16, 16], constancy = [128, 64], constant_value = <none>
+  // expected-remark @below {{contiguity = [1, 1], divisibility = [16, 16], constancy = [128, 64], constant_value = <none>}}
   %1 = tt.splat %arg0 : !tt.ptr<f16> -> tensor<128x64x!tt.ptr<f16>>
   %2 = scf.for %arg9 = %c0_i32 to %c10_i32 step %c1_i32 iter_args(%arg1 = %1) -> (tensor<128x64x!tt.ptr<f16>>): i32 {
-    // CHECK: scf.if
-    // CHECK: contiguity = [1, 1], divisibility = [16, 16], constancy = [128, 64], constant_value = <none>
+    // expected-remark @below {{scf.if}}
+    // expected-remark @below {{contiguity = [1, 1], divisibility = [16, 16], constancy = [128, 64], constant_value = <none>}}
     %3 = scf.if %i1 -> (tensor<128x64x!tt.ptr<f16>>) {
       scf.yield %arg1 : tensor<128x64x!tt.ptr<f16>>
     } else {
       scf.yield %arg1 : tensor<128x64x!tt.ptr<f16>>
     }
-    // CHECK: tt.addptr
-    // CHECK-SAME: contiguity = [1, 1], divisibility = [16, 16], constancy = [128, 64], constant_value = <none>
+    // expected-remark @below {{tt.addptr}}
+    // expected-remark @below {{contiguity = [1, 1], divisibility = [16, 16], constancy = [128, 64], constant_value = <none>}}
     %4 = tt.addptr %3, %cst : tensor<128x64x!tt.ptr<f16>>, tensor<128x64xi32>
-    // CHECK: scf.for
-    // CHECK: contiguity = [1, 1], divisibility = [16, 16], constancy = [128, 64], constant_value = <none>
+    // expected-remark @below {{scf.for}}
+    // expected-remark @below {{contiguity = [1, 1], divisibility = [16, 16], constancy = [128, 64], constant_value = <none>}}
     scf.yield %1 : tensor<128x64x!tt.ptr<f16>>
   }
   tt.return
@@ -536,28 +518,27 @@ tt.func @for_if(%i1: i1, %arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}) {
 
 // -----
 
-// CHECK-LABEL: @for_if_for
 tt.func @for_if_for(%i1: i1, %arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 8 : i32}) {
-  // CHECK: contiguity = [1], divisibility = [4611686018427387904], constancy = [1], constant_value = 0
+  // expected-remark @below {{contiguity = [1], divisibility = [4611686018427387904], constancy = [1], constant_value = 0}}
   %c0_i32 = arith.constant 0 : i32
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [1], constant_value = 1
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [1], constant_value = 1}}
   %c1_i32 = arith.constant 1 : i32
-  // CHECK-NEXT: contiguity = [1], divisibility = [2], constancy = [1], constant_value = 10
+  // expected-remark @below {{contiguity = [1], divisibility = [2], constancy = [1], constant_value = 10}}
   %c10_i32 = arith.constant 10 : i32
-  // CHECK-NEXT: contiguity = [1, 1], divisibility = [64, 64], constancy = [128, 64], constant_value = 64
+  // expected-remark @below {{contiguity = [1, 1], divisibility = [64, 64], constancy = [128, 64], constant_value = 64}}
   %cst = arith.constant dense<64> : tensor<128x64xi32>
-  // CHECK-NEXT: contiguity = [1, 1], divisibility = [16, 16], constancy = [128, 64], constant_value = <none>
+  // expected-remark @below {{contiguity = [1, 1], divisibility = [16, 16], constancy = [128, 64], constant_value = <none>}}
   %1 = tt.splat %arg0 : !tt.ptr<f16> -> tensor<128x64x!tt.ptr<f16>>
-  // CHECK-NEXT: contiguity = [1, 1], divisibility = [8, 8], constancy = [128, 64], constant_value = <none>
+  // expected-remark @below {{contiguity = [1, 1], divisibility = [8, 8], constancy = [128, 64], constant_value = <none>}}
   %2 = tt.splat %arg1 : !tt.ptr<f16> -> tensor<128x64x!tt.ptr<f16>>
-  // CHECK: scf.for
-  // CHECK: contiguity = [1, 1], divisibility = [8, 8], constancy = [128, 64], constant_value = <none>
-  // CHECK: scf.if
-  // CHECK: contiguity = [1, 1], divisibility = [8, 8], constancy = [128, 64], constant_value = <none>
-  // CHECK: tt.addptr
-  // CHECK-SAME: contiguity = [1, 1], divisibility = [8, 8], constancy = [128, 64], constant_value = <none>
-  // CHECK: scf.for
-  // CHECK: contiguity = [1, 1], divisibility = [16, 16], constancy = [128, 64], constant_value = <none>
+  // expected-remark @below {{scf.for}}
+  // expected-remark @below {{contiguity = [1, 1], divisibility = [8, 8], constancy = [128, 64], constant_value = <none>}}
+  // expected-remark @below {{scf.if}}
+  // expected-remark @below {{contiguity = [1, 1], divisibility = [8, 8], constancy = [128, 64], constant_value = <none>}}
+  // expected-remark @below {{tt.addptr}}
+  // expected-remark @below {{contiguity = [1, 1], divisibility = [8, 8], constancy = [128, 64], constant_value = <none>}}
+  // expected-remark @below {{scf.for}}
+  // expected-remark @below {{contiguity = [1, 1], divisibility = [16, 16], constancy = [128, 64], constant_value = <none>}}
   %3 = scf.for %arg9 = %c0_i32 to %c10_i32 step %c1_i32 iter_args(%arg2 = %1) -> (tensor<128x64x!tt.ptr<f16>>) : i32 {
     %4 = scf.if %i1 -> (tensor<128x64x!tt.ptr<f16>>) {
       %5 = scf.for %arg10 = %c0_i32 to %c10_i32 step %c1_i32 iter_args(%arg3 = %2) -> (tensor<128x64x!tt.ptr<f16>>) : i32 {
@@ -575,53 +556,52 @@ tt.func @for_if_for(%i1: i1, %arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %
 
 // -----
 
-// CHECK-LABEL: @permute_2d
 tt.func @permute_2d(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}) {
-  // CHECK: contiguity = [1, 1], divisibility = [1, 1], constancy = [128, 128], constant_value = 1
+  // expected-remark @below {{contiguity = [1, 1], divisibility = [1, 1], constancy = [128, 128], constant_value = 1}}
   %cst = arith.constant dense<true> : tensor<128x128xi1>
-  // CHECK-NEXT: contiguity = [1, 1], divisibility = [1, 1], constancy = [1, 1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1, 1], divisibility = [1, 1], constancy = [1, 1], constant_value = <none>}}
   %cst_0 = arith.constant dense<0.000000e+00> : tensor<128x128xf32>
-  // CHECK-NEXT: contiguity = [128], divisibility = [1073741824], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [128], divisibility = [1073741824], constancy = [1], constant_value = <none>}}
   %0 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [128], divisibility = [1073741824], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [128], divisibility = [1073741824], constancy = [1], constant_value = <none>}}
   %1 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [128, 1], divisibility = [1073741824, 1], constancy = [1, 1], constant_value = <none>
+  // expected-remark @below {{contiguity = [128, 1], divisibility = [1073741824, 1], constancy = [1, 1], constant_value = <none>}}
   %2 = tt.expand_dims %0 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32>
-  // CHECK-NEXT: contiguity = [1, 1], divisibility = [16, 16], constancy = [128, 1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1, 1], divisibility = [16, 16], constancy = [128, 1], constant_value = <none>}}
   %3 = tt.splat %arg1 : i32 -> tensor<128x1xi32>
-  // CHECK-NEXT: contiguity = [1, 1], divisibility = [16, 16], constancy = [1, 1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1, 1], divisibility = [16, 16], constancy = [1, 1], constant_value = <none>}}
   %4 = arith.muli %2, %3 : tensor<128x1xi32>
-  // CHECK-NEXT: contiguity = [1, 1], divisibility = [16, 16], constancy = [128, 1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1, 1], divisibility = [16, 16], constancy = [128, 1], constant_value = <none>}}
   %5 = tt.splat %arg0 : !tt.ptr<f32> -> tensor<128x1x!tt.ptr<f32>>
-  // CHECK-NEXT: contiguity = [1, 1], divisibility = [16, 16], constancy = [1, 1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1, 1], divisibility = [16, 16], constancy = [1, 1], constant_value = <none>}}
   %6 = tt.addptr %5, %4 : tensor<128x1x!tt.ptr<f32>>, tensor<128x1xi32>
-  // CHECK-NEXT: contiguity = [1, 128], divisibility = [1, 1073741824], constancy = [1, 1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1, 128], divisibility = [1, 1073741824], constancy = [1, 1], constant_value = <none>}}
   %7 = tt.expand_dims %1 {axis = 0 : i32}: tensor<128xi32> -> tensor<1x128xi32>
-  // CHECK-NEXT: contiguity = [1, 1], divisibility = [16, 16], constancy = [1, 128], constant_value = <none>
+  // expected-remark @below {{contiguity = [1, 1], divisibility = [16, 16], constancy = [1, 128], constant_value = <none>}}
   %8 = tt.broadcast %6 : tensor<128x1x!tt.ptr<f32>> -> tensor<128x128x!tt.ptr<f32>>
-  // CHECK-NEXT: contiguity = [1, 128], divisibility = [1, 1073741824], constancy = [128, 1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1, 128], divisibility = [1, 1073741824], constancy = [128, 1], constant_value = <none>}}
   %9 = tt.broadcast %7 : tensor<1x128xi32> -> tensor<128x128xi32>
-  // CHECK-NEXT: contiguity = [1, 128], divisibility = [4, 16], constancy = [1, 1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1, 128], divisibility = [4, 16], constancy = [1, 1], constant_value = <none>}}
   %10 = tt.addptr %8, %9 : tensor<128x128x!tt.ptr<f32>>, tensor<128x128xi32>
-  // CHECK-NEXT: contiguity = [128, 1], divisibility = [1073741824, 1], constancy = [1, 1], constant_value = <none>
+  // expected-remark @below {{contiguity = [128, 1], divisibility = [1073741824, 1], constancy = [1, 1], constant_value = <none>}}
   %11 = tt.expand_dims %0 {axis = 1 : i32}: tensor<128xi32> -> tensor<128x1xi32>
-  // CHECK-NEXT: contiguity = [1, 1], divisibility = [16, 16], constancy = [128, 1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1, 1], divisibility = [16, 16], constancy = [128, 1], constant_value = <none>}}
   %12 = tt.splat %arg2 : !tt.ptr<f32> -> tensor<128x1x!tt.ptr<f32>>
-  // CHECK-NEXT: contiguity = [128, 1], divisibility = [16, 4], constancy = [1, 1], constant_value = <none>
+  // expected-remark @below {{contiguity = [128, 1], divisibility = [16, 4], constancy = [1, 1], constant_value = <none>}}
   %13 = tt.addptr %12, %11 : tensor<128x1x!tt.ptr<f32>>, tensor<128x1xi32>
-  // CHECK-NEXT: contiguity = [1, 128], divisibility = [1, 1073741824], constancy = [1, 1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1, 128], divisibility = [1, 1073741824], constancy = [1, 1], constant_value = <none>}}
   %14 = tt.expand_dims %1 {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32>
-  // CHECK-NEXT: contiguity = [1, 1], divisibility = [16, 16], constancy = [1, 128], constant_value = <none>
+  // expected-remark @below {{contiguity = [1, 1], divisibility = [16, 16], constancy = [1, 128], constant_value = <none>}}
   %15 = tt.splat %arg3 : i32 -> tensor<1x128xi32>
-  // CHECK-NEXT: contiguity = [1, 1], divisibility = [16, 16], constancy = [1, 1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1, 1], divisibility = [16, 16], constancy = [1, 1], constant_value = <none>}}
   %16 = arith.muli %14, %15 : tensor<1x128xi32>
-  // CHECK-NEXT: contiguity = [128, 1], divisibility = [16, 4], constancy = [1, 128], constant_value = <none>
+  // expected-remark @below {{contiguity = [128, 1], divisibility = [16, 4], constancy = [1, 128], constant_value = <none>}}
   %17 = tt.broadcast %13 : tensor<128x1x!tt.ptr<f32>> -> tensor<128x128x!tt.ptr<f32>>
-  // CHECK-NEXT: contiguity = [1, 1], divisibility = [16, 16], constancy = [128, 1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1, 1], divisibility = [16, 16], constancy = [128, 1], constant_value = <none>}}
   %18 = tt.broadcast %16 : tensor<1x128xi32> -> tensor<128x128xi32>
-  // CHECK-NEXT: contiguity = [128, 1], divisibility = [16, 4], constancy = [1, 1], constant_value = <none>
+  // expected-remark @below {{contiguity = [128, 1], divisibility = [16, 4], constancy = [1, 1], constant_value = <none>}}
   %19 = tt.addptr %17, %18 : tensor<128x128x!tt.ptr<f32>>, tensor<128x128xi32>
-  // CHECK-NEXT: contiguity = [1, 1], divisibility = [1, 1], constancy = [1, 1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1, 1], divisibility = [1, 1], constancy = [1, 1], constant_value = <none>}}
   %20 = tt.load %10, %cst, %cst_0 : tensor<128x128x!tt.ptr<f32>>
   tt.store %19, %20, %cst : tensor<128x128x!tt.ptr<f32>>
   tt.return
@@ -629,29 +609,28 @@ tt.func @permute_2d(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: i32
 
 // -----
 
-// CHECK-LABEL: @load_constancy
 tt.func @load_constancy(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 1 : i32}) {
-  // CHECK: divisibility = [16]
+  // expected-remark @below {{divisibility = [16]}}
   %sixteen = arith.constant dense<16> : tensor<1024xi32>
-  // CHECK-NEXT: divisibility = [8]
+  // expected-remark @below {{divisibility = [8]}}
   %eight = arith.constant dense<8> : tensor<1024xi32>
-  // CHECK-NEXT: contiguity = [1024], divisibility = [1073741824], constancy = [1]
+  // expected-remark @below {{contiguity = [1024], divisibility = [1073741824], constancy = [1]}}
   %1 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32>
-  // CHECK-NEXT: constancy = [16]
+  // expected-remark @below {{constancy = [16]}}
   %2 = arith.divsi %1, %sixteen : tensor<1024xi32>
-  // CHECK-NEXT: constancy = [1024]
+  // expected-remark @below {{constancy = [1024]}}
   %3 = tt.splat %arg0 : !tt.ptr<f32> -> tensor<1024x!tt.ptr<f32>>
-  // CHECK-NEXT: constancy = [1024]
+  // expected-remark @below {{constancy = [1024]}}
   %4 = tt.splat %arg1 : i32 -> tensor<1024xi32>
-  // CHECK-NEXT: constancy = [8]
+  // expected-remark @below {{constancy = [8]}}
   %5 = arith.divsi %1, %eight : tensor<1024xi32>
-  // CHECK-NEXT: constancy = [8]
+  // expected-remark @below {{constancy = [8]}}
   %6 = arith.cmpi slt, %5, %4 : tensor<1024xi32>
-  // CHECK-NEXT: constancy = [16]
+  // expected-remark @below {{constancy = [16]}}
   %7 = tt.addptr %3, %2 : tensor<1024x!tt.ptr<f32>>, tensor<1024xi32>
-  // CHECK-NEXT: constancy = [16]
+  // expected-remark @below {{constancy = [16]}}
   %8 = tt.load %7 : tensor<1024x!tt.ptr<f32>>
-  // CHECK-NEXT: constancy = [8]
+  // expected-remark @below {{constancy = [8]}}
   %9 = tt.load %7, %6 : tensor<1024x!tt.ptr<f32>>
   tt.return
 }
@@ -659,29 +638,28 @@ tt.func @load_constancy(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1:
 // -----
 
 // This is a tiny test for verifying StoreOp-related alignment, It simply store a constant to a buffer.
-// CHECK-LABEL: @store_constant_align
 tt.func @store_constant_align(%addr: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %n: i32 {tt.divisibility = 16 : i32}) {
-  // CHECK: contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>}}
   %pid = tt.get_program_id x : i32
-  // CHECK-NEXT: contiguity = [1], divisibility = [128], constancy = [1], constant_value = 128
+  // expected-remark @below {{contiguity = [1], divisibility = [128], constancy = [1], constant_value = 128}}
   %c128_i32 = arith.constant 128 : i32
-  // CHECK-NEXT: contiguity = [1], divisibility = [128], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [128], constancy = [1], constant_value = <none>}}
   %1 = arith.muli %pid, %c128_i32 : i32
-  // CHECK-NEXT: contiguity = [128], divisibility = [1073741824], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [128], divisibility = [1073741824], constancy = [1], constant_value = <none>}}
   %2 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32>
- // CHECK-NEXT: contiguity = [1], divisibility = [128], constancy = [128], constant_value = <none>
+ // expected-remark @below {{contiguity = [1], divisibility = [128], constancy = [128], constant_value = <none>}}
   %3 = tt.splat %1 : i32 -> tensor<128xi32>
- // CHECK-NEXT: contiguity = [128], divisibility = [128], constancy = [1], constant_value = <none>
+ // expected-remark @below {{contiguity = [128], divisibility = [128], constancy = [1], constant_value = <none>}}
   %4 = arith.addi %3, %2 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [16], constancy = [128], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [16], constancy = [128], constant_value = <none>}}
   %5 = tt.splat %addr : !tt.ptr<f32> -> tensor<128x!tt.ptr<f32>>
-  // CHECK-NEXT: contiguity = [128], divisibility = [16], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [128], divisibility = [16], constancy = [1], constant_value = <none>}}
   %6 = tt.addptr %5, %4 : tensor<128x!tt.ptr<f32>>, tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [16], constancy = [128], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [16], constancy = [128], constant_value = <none>}}
   %9 = tt.splat %n : i32 -> tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [16], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [16], constant_value = <none>}}
   %mask = arith.cmpi slt, %4, %9 : tensor<128xi32>
-  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>}}
   %cst = arith.constant dense<0.0> : tensor<128xf32>
   tt.store %5, %cst, %mask : tensor<128x!tt.ptr<f32>>
   tt.return
@@ -691,7 +669,6 @@ tt.func @store_constant_align(%addr: !tt.ptr<f32> {tt.divisibility = 16 : i32},
 
 // This IR is dumped from vecadd test.
 // Note, the hint {tt.divisibility = 16 : i32} for %n_elements affects the alignment of mask.
-// CHECK-LABEL: @vecadd_mask_align_16
 tt.func @vecadd_mask_align_16(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %n_elements: i32 {tt.divisibility = 16 : i32}) {
   %c64_i32 = arith.constant 64 : i32
   %0 = tt.get_program_id x : i32
@@ -704,13 +681,13 @@ tt.func @vecadd_mask_align_16(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32},
   %7 = tt.splat %arg1 : !tt.ptr<f32> -> tensor<64x!tt.ptr<f32>>
   %8 = tt.addptr %7, %4 : tensor<64x!tt.ptr<f32>>, tensor<64xi32>
   %9 = tt.splat %n_elements : i32 -> tensor<64xi32>
-  // CHECK: arith.cmpi slt, %{{.*}} => contiguity = [1], divisibility = [1], constancy = [16], constant_value = <none>
+  // expected-remark @below {{arith.cmpi slt, %{{.*}} => contiguity = [1], divisibility = [1], constancy = [16], constant_value = <none>}}
   %mask = arith.cmpi slt, %4, %9 : tensor<64xi32>
   %11 = tt.load %6, %mask : tensor<64x!tt.ptr<f32>>
   %12 = tt.load %8, %mask : tensor<64x!tt.ptr<f32>>
   %13 = arith.addf %11, %12 : tensor<64xf32>
   %14 = tt.splat %arg2 : !tt.ptr<f32> -> tensor<64x!tt.ptr<f32>>
-  // CHECK: tt.addptr %{{.*}} => contiguity = [64], divisibility = [16], constancy = [1], constant_value = <none>
+  // expected-remark @below {{tt.addptr %{{.*}} => contiguity = [64], divisibility = [16], constancy = [1], constant_value = <none>}}
   %15 = tt.addptr %14, %4 : tensor<64x!tt.ptr<f32>>, tensor<64xi32>
   tt.store %15, %13, %mask : tensor<64x!tt.ptr<f32>>
   tt.return
@@ -720,7 +697,6 @@ tt.func @vecadd_mask_align_16(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32},
 
 // This IR is dumped from vecadd test.
 // Note, there is no divisibility hint for %n_elements, Triton should assume its divisibility to be 1 by default.
-// CHECK-LABEL: @vecadd_mask_align_1
 tt.func @vecadd_mask_align_1(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %n_elements: i32) {
   %c64_i32 = arith.constant 64 : i32
   %0 = tt.get_program_id x : i32
@@ -733,7 +709,7 @@ tt.func @vecadd_mask_align_1(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %
   %7 = tt.splat %arg1 : !tt.ptr<f32> -> tensor<64x!tt.ptr<f32>>
   %8 = tt.addptr %7, %4 : tensor<64x!tt.ptr<f32>>, tensor<64xi32>
   %9 = tt.splat %n_elements : i32 -> tensor<64xi32>
-  // CHECK: arith.cmpi slt, %{{.*}} => contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>
+  // expected-remark @below {{arith.cmpi slt, %{{.*}} => contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>}}
   %10 = arith.cmpi slt, %4, %9 : tensor<64xi32>
   %11 = tt.load %6, %10 : tensor<64x!tt.ptr<f32>>
   %12 = tt.load %8, %10 : tensor<64x!tt.ptr<f32>>
@@ -749,36 +725,32 @@ tt.func @vecadd_mask_align_1(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %
 module {
 
 // We don't use function cloning here, so the alignment info is the gcd of all call sites.
-// CHECK-LABEL: @addptr_hints
 tt.func @addptr_hints(%arg0: !tt.ptr<i32>) {
-  // CHECK: contiguity = [1], divisibility = [1], constancy = [1], constant_value = 1
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [1], constant_value = 1}}
   %cst1 = arith.constant 1 : i32
-  // CHECK-NEXT: contiguity = [1], divisibility = [4], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [4], constancy = [1], constant_value = <none>}}
   %1 = tt.addptr %arg0, %cst1 : !tt.ptr<i32>, i32
-  // CHECK-NEXT: contiguity = [1], divisibility = [4], constancy = [1], constant_value = 4
+  // expected-remark @below {{contiguity = [1], divisibility = [4], constancy = [1], constant_value = 4}}
   %cst4 = arith.constant 4 : i32
-  // CHECK-NEXT: contiguity = [1], divisibility = [4], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [4], constancy = [1], constant_value = <none>}}
   %2 = tt.addptr %arg0, %cst4 : !tt.ptr<i32>, i32
-  // CHECK-NEXT: contiguity = [1], divisibility = [16], constancy = [1], constant_value = 16
+  // expected-remark @below {{contiguity = [1], divisibility = [16], constancy = [1], constant_value = 16}}
   %cst16 = arith.constant 16 : i32
-  // CHECK-NEXT: contiguity = [1], divisibility = [4], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [4], constancy = [1], constant_value = <none>}}
   %3 = tt.addptr %arg0, %cst4 : !tt.ptr<i32>, i32
   tt.return
 }
 
-// CHECK-LABEL: @kernel_div16
 tt.func @kernel_div16(%arg0: !tt.ptr<i32> {tt.divisibility = 16 : i32}) {
   tt.call @addptr_hints(%arg0) : (!tt.ptr<i32>) -> ()
   tt.return
 }
 
-// CHECK-LABEL: @kernel_div8
 tt.func @kernel_div8(%arg0: !tt.ptr<i32> {tt.divisibility = 8 : i32}) {
   tt.call @addptr_hints(%arg0) : (!tt.ptr<i32>) -> ()
   tt.return
 }
 
-// CHECK-LABEL: @kernel_div4
 tt.func @kernel_div4(%arg0: !tt.ptr<i32> {tt.divisibility = 4 : i32}) {
   tt.call @addptr_hints(%arg0) : (!tt.ptr<i32>) -> ()
   tt.return
@@ -791,37 +763,33 @@ tt.func @kernel_div4(%arg0: !tt.ptr<i32> {tt.divisibility = 4 : i32}) {
 module {
 
 // We don't use function cloning here, so the alignment info is the gcd of all call sites.
-// CHECK-LABEL: @mul
 tt.func @mul(%arg0: i32) {
-  // CHECK: contiguity = [1], divisibility = [1], constancy = [1], constant_value = 1
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [1], constant_value = 1}}
   %cst1 = arith.constant 1 : i32
-  // CHECK-NEXT: contiguity = [1], divisibility = [4], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [4], constancy = [1], constant_value = <none>}}
   %1 = arith.muli %arg0, %cst1 : i32
   tt.return
 }
 
-// CHECK-LABEL: @bar
 tt.func @bar(%arg0: i32) {
   tt.call @mul(%arg0) : (i32) -> ()
   tt.return
 }
 
-// CHECK-LABEL: @foo
 tt.func @foo(%arg0: i32) {
   tt.call @mul(%arg0) : (i32) -> ()
   tt.return
 }
 
-// CHECK-LABEL: @call_graph
 tt.func @call_graph(%arg0: i32) {
-  // CHECK: contiguity = [1], divisibility = [4], constancy = [1], constant_value = 12
+  // expected-remark @below {{contiguity = [1], divisibility = [4], constancy = [1], constant_value = 12}}
   %cst12 = arith.constant 12 : i32
-  // CHECK: contiguity = [1], divisibility = [4], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [4], constancy = [1], constant_value = <none>}}
   %0 = arith.muli %arg0, %cst12 : i32
   tt.call @foo(%0) : (i32) -> ()
-  // CHECK: contiguity = [1], divisibility = [8], constancy = [1], constant_value = 8
+  // expected-remark @below {{contiguity = [1], divisibility = [8], constancy = [1], constant_value = 8}}
   %cst8 = arith.constant 8 : i32
-  // CHECK: contiguity = [1], divisibility = [8], constancy = [1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1], divisibility = [8], constancy = [1], constant_value = <none>}}
   %1 = arith.muli %arg0, %cst8 : i32
   tt.call @bar(%1) : (i32) -> ()
   tt.return
@@ -831,9 +799,8 @@ tt.func @call_graph(%arg0: i32) {
 
 // -----
 
-// CHECK-LABEL: @tensor_ptr
 tt.func @tensor_ptr(%arg0: !tt.ptr<tensor<64x16xi32>, 1>) {
-  // CHECK: contiguity = [1, 1], divisibility = [1, 1], constancy = [1, 1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1, 1], divisibility = [1, 1], constancy = [1, 1], constant_value = <none>}}
   %0 = tt.load %arg0 : !tt.ptr<tensor<64x16xi32>, 1>
   tt.return
 }
@@ -841,25 +808,24 @@ tt.func @tensor_ptr(%arg0: !tt.ptr<tensor<64x16xi32>, 1>) {
 
 // -----
 
-// CHECK-LABEL: @chained_for
 tt.func public @chained_for(%8: tensor<128x64x!tt.ptr<bf16>> {tt.divisibility = 16 : i32}) {
-  // CHECK: contiguity = [1, 1], divisibility = [1, 1], constancy = [1, 1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1, 1], divisibility = [1, 1], constancy = [1, 1], constant_value = <none>}}
   %cst = arith.constant dense<0.000000e+00> : tensor<128x64xbf16>
-  // CHECK: contiguity = [1], divisibility = [16], constancy = [1], constant_value = 16
+  // expected-remark @below {{contiguity = [1], divisibility = [16], constancy = [1], constant_value = 16}}
   %c16_i32 = arith.constant 16 : i32
-  // CHECK: contiguity = [1], divisibility = [1], constancy = [1], constant_value = 1
+  // expected-remark @below {{contiguity = [1], divisibility = [1], constancy = [1], constant_value = 1}}
   %c1_i32 = arith.constant 1 : i32
-  // CHECK: contiguity = [1], divisibility = [4611686018427387904], constancy = [1], constant_value = 0
+  // expected-remark @below {{contiguity = [1], divisibility = [4611686018427387904], constancy = [1], constant_value = 0}}
   %c0_i32 = arith.constant 0 : i32
-  // CHECK: contiguity = [1, 1], divisibility = [64, 64], constancy = [128, 64], constant_value = 64
+  // expected-remark @below {{contiguity = [1, 1], divisibility = [64, 64], constancy = [128, 64], constant_value = 64}}
   %cst_0 = arith.constant dense<64> : tensor<128x64xi32>
-  // CHECK: contiguity = [1, 1], divisibility = [16, 16], constancy = [1, 1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1, 1], divisibility = [16, 16], constancy = [1, 1], constant_value = <none>}}
   %9 = scf.for %arg7 = %c0_i32 to %c16_i32 step %c1_i32 iter_args(%arg8 = %8) -> (tensor<128x64x!tt.ptr<bf16>>)  : i32 {
     %11 = tt.addptr %arg8, %cst_0 : tensor<128x64x!tt.ptr<bf16>>, tensor<128x64xi32>
     scf.yield %11 : tensor<128x64x!tt.ptr<bf16>>
   }
-  // CHECK: contiguity = [1, 1], divisibility = [16, 16], constancy = [1, 1], constant_value = <none>
-  // CHECK: contiguity = [1, 1], divisibility = [16, 16], constancy = [1, 1], constant_value = <none>
+  // expected-remark @below {{contiguity = [1, 1], divisibility = [16, 16], constancy = [1, 1], constant_value = <none>}}
+  // expected-remark @below {{contiguity = [1, 1], divisibility = [16, 16], constancy = [1, 1], constant_value = <none>}}
   %10 = scf.for %arg7 = %c0_i32 to %c16_i32 step %c1_i32 iter_args(%arg8 = %9) -> (tensor<128x64x!tt.ptr<bf16>>)  : i32 {
     tt.store %arg8, %cst : tensor<128x64x!tt.ptr<bf16>>
     %11 = tt.addptr %arg8, %cst_0 : tensor<128x64x!tt.ptr<bf16>>, tensor<128x64xi32>
@@ -870,10 +836,9 @@ tt.func public @chained_for(%8: tensor<128x64x!tt.ptr<bf16>> {tt.divisibility =
 
 // -----
 
-// CHECK-LABEL: @int_min_does_not_underflow_in_analysis
 module {
   tt.func @int_min_does_not_underflow_in_analysis() -> i64 {
-    // CHECK: divisibility = [4611686018427387904]
+    // expected-remark @below {{divisibility = [4611686018427387904]}}
     %int_min = arith.constant -9223372036854775808 : i64
     tt.return %int_min : i64
   }
diff --git a/test/lib/Analysis/TestAlias.cpp b/test/lib/Analysis/TestAlias.cpp
index 0d879902f822..038467aacbdf 100644
--- a/test/lib/Analysis/TestAlias.cpp
+++ b/test/lib/Analysis/TestAlias.cpp
@@ -20,19 +20,19 @@ struct TestAliasPass
     return opName;
   }
 
-  static void print(StringRef name, SmallVector<std::string> &vals,
-                    raw_ostream &os) {
+  static void emit(Location loc, StringRef name,
+                   SmallVector<std::string> &vals) {
     if (vals.empty())
       return;
-    os << name << " -> ";
+    InFlightDiagnostic diag = mlir::emitRemark(loc);
+    diag << name << " -> ";
     size_t i = 0;
     for (auto val : vals) {
       if (i != 0)
-        os << ",";
-      os << val;
+        diag << ",";
+      diag << val;
       ++i;
     }
-    os << "\n";
   }
 
   StringRef getArgument() const final { return "test-print-alias"; }
@@ -42,9 +42,6 @@ struct TestAliasPass
 
   void runOnOperation() override {
     Operation *operation = getOperation();
-    auto &os = llvm::errs();
-    auto opName = SymbolTable::getSymbolName(operation).getValue().str();
-    os << opName << "\n";
 
     std::unique_ptr<DataFlowSolver> solver = createDataFlowSolver();
     SharedMemoryAliasAnalysis *analysis =
@@ -80,7 +77,7 @@ struct TestAliasPass
             auto operand = block->getArgument(arg.index());
             auto opNames = getLocalAllocOpNames(operand);
             auto argName = getValueOperandName(arg.value(), state);
-            print(argName, opNames, os);
+            emit(op->getLoc(), argName, opNames);
           }
         }
         return;
@@ -90,13 +87,13 @@ struct TestAliasPass
           auto operand = forOp.getTiedLoopInit(arg.value())->get();
           auto opNames = getLocalAllocOpNames(operand);
           auto argName = getValueOperandName(arg.value(), state);
-          print(argName, opNames, os);
+          emit(op->getLoc(), argName, opNames);
         }
       }
       for (auto result : llvm::enumerate(op->getResults())) {
         auto opNames = getLocalAllocOpNames(result.value());
         auto resultName = getValueOperandName(result.value(), state);
-        print(resultName, opNames, os);
+        emit(op->getLoc(), resultName, opNames);
       }
     });
   }
diff --git a/test/lib/Analysis/TestAxisInfo.cpp b/test/lib/Analysis/TestAxisInfo.cpp
index 6dba753e4fa1..54663c36bd00 100644
--- a/test/lib/Analysis/TestAxisInfo.cpp
+++ b/test/lib/Analysis/TestAxisInfo.cpp
@@ -1,3 +1,4 @@
+#include "mlir/IR/Diagnostics.h"
 #include "mlir/Pass/Pass.h"
 #include "triton/Analysis/AxisInfo.h"
 #include "triton/Analysis/Utility.h"
@@ -22,19 +23,20 @@ struct TestAxisInfoPass
     ModuleOp moduleOp = cast<ModuleOp>(operation);
     ModuleAxisInfoAnalysis moduleAxisInfoAnalysis(moduleOp);
     moduleOp.walk([&](FuncOp funcOp) {
-      auto &os = llvm::errs();
-      auto opName = SymbolTable::getSymbolName(funcOp).getValue().str();
-      os << "@" << opName << "\n";
       funcOp.walk([&](Operation *op) {
         if (op->getNumResults() < 1)
           return;
         for (Value result : op->getResults()) {
-          result.print(os);
-          os << " => ";
+          InFlightDiagnostic diag = mlir::emitRemark(op->getLoc());
+          diag << result;
+          diag << " => ";
           auto *axisInfo = moduleAxisInfoAnalysis.getAxisInfo(result);
-          if (axisInfo)
+          if (axisInfo) {
+            std::string str;
+            llvm::raw_string_ostream os(str);
             axisInfo->print(os);
-          os << "\n";
+            diag << str;
+          }
         }
       });
     });

From bfd5d5cefce32e0401d2510a1424a2c6338658bc Mon Sep 17 00:00:00 2001
From: Mogball <jeffniu22@gmail.com>
Date: Wed, 19 Feb 2025 18:23:19 -0800
Subject: [PATCH 2/2] try disabling threading

---
 test/Analysis/test-alias.mlir | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/Analysis/test-alias.mlir b/test/Analysis/test-alias.mlir
index bbc688cc6ad9..660b66c96c34 100644
--- a/test/Analysis/test-alias.mlir
+++ b/test/Analysis/test-alias.mlir
@@ -1,4 +1,4 @@
-// RUN: triton-opt %s -test-print-alias -verify-diagnostics -o /dev/null
+// RUN: triton-opt %s -mlir-disable-threading -test-print-alias -verify-diagnostics -o /dev/null
 
 #AL = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
 #BL = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}>