// RUN: mlir-opt --test-emulate-narrow-int="arith-compute-bitwidth=1 memref-load-bitwidth=8 disable-atomic-rmw=true" --cse --split-input-file %s | FileCheck %s
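// Note: with memref-load-bitwidth=8 the sub-byte i2 data is emulated on an i8-backed
// memref, and disable-atomic-rmw=true requests plain (non-atomic) load/select/store
// read-modify-write sequences for partial stores, as exercised by the CHECK patterns below.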

// TODO: remove memref.alloc() in the tests to eliminate noise.
// memref.alloc exists here because sub-byte vector data types such as i2
// are currently not supported as input arguments.

///----------------------------------------------------------------------------------------
/// vector.store
///----------------------------------------------------------------------------------------

func.func @vector_store_i2_const_index_two_partial_stores(%arg0: vector<3xi2>) {
  %0 = memref.alloc() : memref<3x3xi2>
  %c0 = arith.constant 0 : index
  %c2 = arith.constant 2 : index
  vector.store %arg0, %0[%c2, %c0] : memref<3x3xi2>, vector<3xi2>
  return
}

// Emit two non-atomic RMW partial stores. Store 6 bits from the input vector (bits [12:18))
// into bytes [1:2] of the 3-byte output memref. Because neither store is byte-aligned,
// both bytes are accessed partially through masking.
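// For reference, the index arithmetic behind the CHECK lines below (assuming the emulation
// packs 4 i2 elements per i8 byte): the linear element offset is 2 * 3 + 0 = 6, i.e.
// byte 6 / 4 = 1 with an intra-byte offset of 6 % 4 = 2. The first partial store therefore
// writes elements [2, 3] of byte 1 (mask [false, false, true, true]), and the second writes
// element 0 of byte 2 (mask [true, false, false, false]).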

// CHECK: func @vector_store_i2_const_index_two_partial_stores(
// CHECK-SAME: %[[ARG0:.+]]: vector<3xi2>)
// CHECK: %[[ALLOC:.+]] = memref.alloc() : memref<3xi8>
// CHECK: %[[C1:.+]] = arith.constant 1 : index

// Part 1 RMW sequence
// CHECK: %[[CST:.+]] = arith.constant dense<[false, false, true, true]>
// CHECK: %[[CST0:.+]] = arith.constant dense<0> : vector<4xi2>
// CHECK: %[[EXTRACT:.+]] = vector.extract_strided_slice %[[ARG0]]
// CHECK-SAME: {offsets = [0], sizes = [2], strides = [1]} : vector<3xi2> to vector<2xi2>
// CHECK: %[[INSERT:.+]] = vector.insert_strided_slice %[[EXTRACT]], %[[CST0]]
// CHECK-SAME: {offsets = [2], strides = [1]} : vector<2xi2> into vector<4xi2>
// CHECK: %[[LOAD:.+]] = vector.load
// CHECK: %[[UPCAST:.+]] = vector.bitcast %[[LOAD]] : vector<1xi8> to vector<4xi2>
// CHECK: %[[SELECT:.+]] = arith.select %[[CST]], %[[INSERT]], %[[UPCAST]]
// CHECK: %[[DOWNCAST:.+]] = vector.bitcast %[[SELECT]]
// CHECK: vector.store %[[DOWNCAST]], %[[ALLOC]][%[[C1]]]

// Part 2 RMW sequence
// CHECK: %[[OFFSET:.+]] = arith.addi %[[C1]], %[[C1]] : index
// CHECK: %[[EXTRACT2:.+]] = vector.extract_strided_slice %[[ARG0]]
// CHECK-SAME: {offsets = [2], sizes = [1], strides = [1]} : vector<3xi2> to vector<1xi2>
// CHECK: %[[INSERT2:.+]] = vector.insert_strided_slice %[[EXTRACT2]], %[[CST0]]
// CHECK-SAME: {offsets = [0], strides = [1]} : vector<1xi2> into vector<4xi2>
// CHECK: %[[CST1:.+]] = arith.constant dense<[true, false, false, false]> : vector<4xi1>
// CHECK: %[[LOAD2:.+]] = vector.load
// CHECK: %[[UPCAST2:.+]] = vector.bitcast %[[LOAD2]] : vector<1xi8> to vector<4xi2>
// CHECK: %[[SELECT2:.+]] = arith.select %[[CST1]], %[[INSERT2]], %[[UPCAST2]]
// CHECK: %[[DOWNCAST2:.+]] = vector.bitcast %[[SELECT2]]
// CHECK: vector.store %[[DOWNCAST2]], %[[ALLOC]][%[[OFFSET]]]

// -----

func.func @vector_store_i2_two_partial_one_full_stores(%arg0: vector<7xi2>) {
  %0 = memref.alloc() : memref<3x7xi2>
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  vector.store %arg0, %0[%c1, %c0] : memref<3x7xi2>, vector<7xi2>
  return
}

// In this example, two partial RMW stores and one full-width store are emitted.
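// For reference (assuming 4 i2 elements per i8 byte): the linear element offset is
// 1 * 7 + 0 = 7, i.e. byte 7 / 4 = 1 with an intra-byte offset of 7 % 4 = 3. The first
// RMW writes source element 0 into position 3 of byte 1 (mask [false, false, false, true]),
// the full-width store writes source elements [1:5) to byte 2, and the second RMW writes
// source elements [5:7) into positions [0, 1] of byte 3 (mask [true, true, false, false]).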

// CHECK: func @vector_store_i2_two_partial_one_full_stores(
// CHECK-SAME: %[[ARG0:.+]]:
// CHECK: %[[ALLOC:.+]] = memref.alloc() : memref<6xi8>
// CHECK: %[[C1:.+]] = arith.constant 1 : index
// CHECK: %[[CST:.+]] = arith.constant dense<[false, false, false, true]>
// CHECK: %[[CST0:.+]] = arith.constant dense<0> : vector<4xi2>
// CHECK: %[[EXTRACT:.+]] = vector.extract_strided_slice %[[ARG0]]
// CHECK-SAME: {offsets = [0], sizes = [1], strides = [1]}
// CHECK: %[[INSERT:.+]] = vector.insert_strided_slice %[[EXTRACT]], %[[CST0]]
// CHECK-SAME: {offsets = [3], strides = [1]}
// First sub-width RMW:
// CHECK: %[[LOAD:.+]] = vector.load %[[ALLOC]][%[[C1]]]
// CHECK: %[[UPCAST:.+]] = vector.bitcast %[[LOAD]] : vector<1xi8> to vector<4xi2>
// CHECK: %[[SELECT:.+]] = arith.select %[[CST]], %[[INSERT]], %[[UPCAST]]
// CHECK: %[[DOWNCAST:.+]] = vector.bitcast %[[SELECT]]
// CHECK: vector.store %[[DOWNCAST]], %[[ALLOC]][%[[C1]]]

// Full-width store:
// CHECK: %[[INDEX:.+]] = arith.addi %[[C1]], %[[C1]]
// CHECK: %[[EXTRACT1:.+]] = vector.extract_strided_slice %[[ARG0]]
// CHECK-SAME: {offsets = [1], sizes = [4], strides = [1]}
// CHECK: %[[BITCAST:.+]] = vector.bitcast %[[EXTRACT1]]
// CHECK: vector.store %[[BITCAST]], %[[ALLOC]][%[[INDEX]]]

// Second sub-width RMW:
// CHECK: %[[INDEX2:.+]] = arith.addi %[[INDEX]], %[[C1]]
// CHECK: %[[EXTRACT2:.+]] = vector.extract_strided_slice %[[ARG0]]
// CHECK-SAME: {offsets = [5], sizes = [2], strides = [1]}
// CHECK: %[[INSERT2:.+]] = vector.insert_strided_slice %[[EXTRACT2]]
// CHECK-SAME: {offsets = [0], strides = [1]}
// CHECK: %[[CST1:.+]] = arith.constant dense<[true, true, false, false]>
// CHECK: %[[LOAD2:.+]] = vector.load %[[ALLOC]][%[[INDEX2]]]
// CHECK: %[[UPCAST2:.+]] = vector.bitcast %[[LOAD2]]
// CHECK: %[[SELECT2:.+]] = arith.select %[[CST1]], %[[INSERT2]], %[[UPCAST2]]
// CHECK: %[[DOWNCAST2:.+]] = vector.bitcast %[[SELECT2]]
// CHECK: vector.store %[[DOWNCAST2]], %[[ALLOC]][%[[INDEX2]]]

// -----

func.func @vector_store_i2_const_index_one_partial_store(%arg0: vector<1xi2>) {
  %0 = memref.alloc() : memref<4x1xi2>
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  vector.store %arg0, %0[%c1, %c0] : memref<4x1xi2>, vector<1xi2>
  return
}

// In this test, only a single partial RMW store is emitted, because the store fits entirely within one byte.
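// For reference (assuming 4 i2 elements per i8 byte): the linear element offset is
// 1 * 1 + 0 = 1, i.e. byte 1 / 4 = 0 with an intra-byte offset of 1 % 4 = 1, so the
// single element is written to position 1 of byte 0 (mask [false, true, false, false]).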

// CHECK: func @vector_store_i2_const_index_one_partial_store(
// CHECK-SAME: %[[ARG0:.+]]: vector<1xi2>)
// CHECK: %[[ALLOC:.+]] = memref.alloc() : memref<1xi8>
// CHECK: %[[C0:.+]] = arith.constant 0 : index
// CHECK: %[[CST:.+]] = arith.constant dense<[false, true, false, false]>
// CHECK: %[[CST0:.+]] = arith.constant dense<0> : vector<4xi2>
// CHECK: %[[INSERT:.+]] = vector.insert_strided_slice %[[ARG0]], %[[CST0]]
// CHECK-SAME: {offsets = [1], strides = [1]} : vector<1xi2> into vector<4xi2>
// CHECK: %[[LOAD:.+]] = vector.load %[[ALLOC]][%[[C0]]] : memref<1xi8>, vector<1xi8>
// CHECK: %[[UPCAST:.+]] = vector.bitcast %[[LOAD]] : vector<1xi8> to vector<4xi2>
// CHECK: %[[SELECT:.+]] = arith.select %[[CST]], %[[INSERT]], %[[UPCAST]]
// CHECK: %[[DOWNCAST:.+]] = vector.bitcast %[[SELECT]]
// CHECK: vector.store %[[DOWNCAST]], %[[ALLOC]][%[[C0]]]