Issue 71469
Summary [mlir][linalg] Peeling can't determine when scalable-tiled loops shouldn't be peeled
Labels mlir:linalg, mlir
Assignees matthias-springer
Reporter dcaballe
    I think there is a gap in the analysis that determines if a loop needs peeling or not.

Let's say we have the following loop, tiled for scalable vectorization (see %0 and %1):

```
#map = affine_map<(d0)[s0, s1] -> (-d0 + s1, s0)>
module {
  func.func @dot_dispatch_0_matmul_DxDxD_f32(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> {
    %c32 = arith.constant 32 : index
    %c1 = arith.constant 1 : index
    %cst = arith.constant 0.000000e+00 : f32
    %c0 = arith.constant 0 : index
 %dim = tensor.dim %arg0, %c0 : tensor<?x?xf32>
    %dim_0 = tensor.dim %arg1, %c1 : tensor<?x?xf32>
    %dim_1 = tensor.dim %arg0, %c1 : tensor<?x?xf32>
    %0 = vector.vscale
    %1 = arith.muli %0, %c32 : index
    %2 = scf.for %arg3 = %c0 to %dim step %c1 iter_args(%arg4 = %arg0) -> (tensor<?x?xf32>) {
      %3 = scf.for %arg5 = %c0 to %dim_0 step %1 iter_args(%arg6 = %arg4) -> (tensor<?x?xf32>) {
        %4 = affine.min #map(%arg5)[%1, %dim_0]
        %extracted_slice = tensor.extract_slice %arg1[%arg3, 0] [1, %dim_1] [1, 1] : tensor<?x?xf32> to tensor<1x?xf32>
        %extracted_slice_2 = tensor.extract_slice %arg2[0, %arg5] [%dim_1, %4] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
 %extracted_slice_3 = tensor.extract_slice %arg6[%arg3, %arg5] [1, %4] [1, 1] : tensor<?x?xf32> to tensor<1x?xf32>
        %5 = linalg.fill ins(%cst : f32) outs(%extracted_slice_3 : tensor<1x?xf32>) -> tensor<1x?xf32>
 %6 = scf.for %arg7 = %c0 to %dim_1 step %c1 iter_args(%arg8 = %5) -> (tensor<1x?xf32>) {
          %extracted_slice_4 = tensor.extract_slice %extracted_slice[0, %arg7] [1, 1] [1, 1] : tensor<1x?xf32> to tensor<1x1xf32>
          %extracted_slice_5 = tensor.extract_slice %extracted_slice_2[%arg7, 0] [1, %4] [1, 1] : tensor<?x?xf32> to tensor<1x?xf32>
          %extracted_slice_6 = tensor.extract_slice %arg8[0, 0] [1, %4] [1, 1] : tensor<1x?xf32> to tensor<1x?xf32>
 %7 = linalg.matmul ins(%extracted_slice_4, %extracted_slice_5 : tensor<1x1xf32>, tensor<1x?xf32>) outs(%extracted_slice_6 : tensor<1x?xf32>) -> tensor<1x?xf32>
          %inserted_slice_7 = tensor.insert_slice %7 into %arg8[0, 0] [1, %4] [1, 1] : tensor<1x?xf32> into tensor<1x?xf32>
 scf.yield %inserted_slice_7 : tensor<1x?xf32>
        }
 %inserted_slice = tensor.insert_slice %6 into %arg6[%arg3, %arg5] [1, %4] [1, 1] : tensor<1x?xf32> into tensor<?x?xf32>
        scf.yield %inserted_slice : tensor<?x?xf32>
      }
      scf.yield %3 : tensor<?x?xf32>
    }
    return %2 : tensor<?x?xf32>
 }
}
```
We then apply peeling with `--scf-for-loop-peeling` and we get the following output, which looks good to me:

```
#map = affine_map<()[s0, s1] -> (s0 - s0 mod s1)>
#map1 = affine_map<(d0)[s0] -> (-d0 + s0)>
module {
  func.func @dot_dispatch_0_matmul_DxDxD_f32(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> {
    %c32 = arith.constant 32 : index
    %c1 = arith.constant 1 : index
    %cst = arith.constant 0.000000e+00 : f32
    %c0 = arith.constant 0 : index
 %dim = tensor.dim %arg0, %c0 : tensor<?x?xf32>
    %dim_0 = tensor.dim %arg1, %c1 : tensor<?x?xf32>
    %dim_1 = tensor.dim %arg0, %c1 : tensor<?x?xf32>
    %0 = vector.vscale
    %1 = arith.muli %0, %c32 : index
    %2 = scf.for %arg3 = %c0 to %dim step %c1 iter_args(%arg4 = %arg0) -> (tensor<?x?xf32>) {
      %3 = affine.apply #map()[%dim_0, %1]
      %4 = scf.for %arg5 = %c0 to %3 step %1 iter_args(%arg6 = %arg4) -> (tensor<?x?xf32>) {
        %extracted_slice = tensor.extract_slice %arg1[%arg3, 0] [1, %dim_1] [1, 1] : tensor<?x?xf32> to tensor<1x?xf32>
 %extracted_slice_2 = tensor.extract_slice %arg2[0, %arg5] [%dim_1, %1] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
        %extracted_slice_3 = tensor.extract_slice %arg6[%arg3, %arg5] [1, %1] [1, 1] : tensor<?x?xf32> to tensor<1x?xf32>
        %6 = linalg.fill ins(%cst : f32) outs(%extracted_slice_3 : tensor<1x?xf32>) -> tensor<1x?xf32>
        %7 = scf.for %arg7 = %c0 to %dim_1 step %c1 iter_args(%arg8 = %6) -> (tensor<1x?xf32>) {
          %extracted_slice_4 = tensor.extract_slice %extracted_slice[0, %arg7] [1, 1] [1, 1] : tensor<1x?xf32> to tensor<1x1xf32>
          %extracted_slice_5 = tensor.extract_slice %extracted_slice_2[%arg7, 0] [1, %1] [1, 1] : tensor<?x?xf32> to tensor<1x?xf32>
          %extracted_slice_6 = tensor.extract_slice %arg8[0, 0] [1, %1] [1, 1] : tensor<1x?xf32> to tensor<1x?xf32>
 %8 = linalg.matmul ins(%extracted_slice_4, %extracted_slice_5 : tensor<1x1xf32>, tensor<1x?xf32>) outs(%extracted_slice_6 : tensor<1x?xf32>) -> tensor<1x?xf32>
          %inserted_slice_7 = tensor.insert_slice %8 into %arg8[0, 0] [1, %1] [1, 1] : tensor<1x?xf32> into tensor<1x?xf32>
 scf.yield %inserted_slice_7 : tensor<1x?xf32>
        }
 %inserted_slice = tensor.insert_slice %7 into %arg6[%arg3, %arg5] [1, %1] [1, 1] : tensor<1x?xf32> into tensor<?x?xf32>
        scf.yield %inserted_slice : tensor<?x?xf32>
      }
      %5 = scf.for %arg5 = %3 to %dim_0 step %1 iter_args(%arg6 = %4) -> (tensor<?x?xf32>) {
        %6 = affine.apply #map1(%arg5)[%dim_0]
        %extracted_slice = tensor.extract_slice %arg1[%arg3, 0] [1, %dim_1] [1, 1] : tensor<?x?xf32> to tensor<1x?xf32>
        %extracted_slice_2 = tensor.extract_slice %arg2[0, %arg5] [%dim_1, %6] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
 %extracted_slice_3 = tensor.extract_slice %arg6[%arg3, %arg5] [1, %6] [1, 1] : tensor<?x?xf32> to tensor<1x?xf32>
        %7 = linalg.fill ins(%cst : f32) outs(%extracted_slice_3 : tensor<1x?xf32>) -> tensor<1x?xf32>
 %8 = scf.for %arg7 = %c0 to %dim_1 step %c1 iter_args(%arg8 = %7) -> (tensor<1x?xf32>) {
          %extracted_slice_4 = tensor.extract_slice %extracted_slice[0, %arg7] [1, 1] [1, 1] : tensor<1x?xf32> to tensor<1x1xf32>
          %extracted_slice_5 = tensor.extract_slice %extracted_slice_2[%arg7, 0] [1, %6] [1, 1] : tensor<?x?xf32> to tensor<1x?xf32>
          %extracted_slice_6 = tensor.extract_slice %arg8[0, 0] [1, %6] [1, 1] : tensor<1x?xf32> to tensor<1x?xf32>
 %9 = linalg.matmul ins(%extracted_slice_4, %extracted_slice_5 : tensor<1x1xf32>, tensor<1x?xf32>) outs(%extracted_slice_6 : tensor<1x?xf32>) -> tensor<1x?xf32>
          %inserted_slice_7 = tensor.insert_slice %9 into %arg8[0, 0] [1, %6] [1, 1] : tensor<1x?xf32> into tensor<1x?xf32>
 scf.yield %inserted_slice_7 : tensor<1x?xf32>
        }
 %inserted_slice = tensor.insert_slice %8 into %arg6[%arg3, %arg5] [1, %6] [1, 1] : tensor<1x?xf32> into tensor<?x?xf32>
        scf.yield %inserted_slice : tensor<?x?xf32>
      }
      scf.yield %5 : tensor<?x?xf32>
    }
    return %2 : tensor<?x?xf32>
 }
}
```

However, if we take the output after peeling above and remove the partial iteration:

```
#map = affine_map<()[s0, s1] -> (s0 - s0 mod s1)>
#map1 = affine_map<(d0)[s0] -> (-d0 + s0)>
module {
  func.func @dot_dispatch_0_matmul_DxDxD_f32(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> {
 %c32 = arith.constant 32 : index
    %c1 = arith.constant 1 : index
 %cst = arith.constant 0.000000e+00 : f32
    %c0 = arith.constant 0 : index
    %dim = tensor.dim %arg0, %c0 : tensor<?x?xf32>
    %dim_0 = tensor.dim %arg1, %c1 : tensor<?x?xf32>
    %dim_1 = tensor.dim %arg0, %c1 : tensor<?x?xf32>
    %0 = vector.vscale
    %1 = arith.muli %0, %c32 : index
    %2 = scf.for %arg3 = %c0 to %dim step %c1 iter_args(%arg4 = %arg0) -> (tensor<?x?xf32>) {
      %3 = affine.apply #map()[%dim_0, %1]
      %4 = scf.for %arg5 = %c0 to %3 step %1 iter_args(%arg6 = %arg4) -> (tensor<?x?xf32>) {
        %extracted_slice = tensor.extract_slice %arg1[%arg3, 0] [1, %dim_1] [1, 1] : tensor<?x?xf32> to tensor<1x?xf32>
 %extracted_slice_2 = tensor.extract_slice %arg2[0, %arg5] [%dim_1, %1] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
        %extracted_slice_3 = tensor.extract_slice %arg6[%arg3, %arg5] [1, %1] [1, 1] : tensor<?x?xf32> to tensor<1x?xf32>
        %6 = linalg.fill ins(%cst : f32) outs(%extracted_slice_3 : tensor<1x?xf32>) -> tensor<1x?xf32>
        %7 = scf.for %arg7 = %c0 to %dim_1 step %c1 iter_args(%arg8 = %6) -> (tensor<1x?xf32>) {
          %extracted_slice_4 = tensor.extract_slice %extracted_slice[0, %arg7] [1, 1] [1, 1] : tensor<1x?xf32> to tensor<1x1xf32>
          %extracted_slice_5 = tensor.extract_slice %extracted_slice_2[%arg7, 0] [1, %1] [1, 1] : tensor<?x?xf32> to tensor<1x?xf32>
          %extracted_slice_6 = tensor.extract_slice %arg8[0, 0] [1, %1] [1, 1] : tensor<1x?xf32> to tensor<1x?xf32>
 %8 = linalg.matmul ins(%extracted_slice_4, %extracted_slice_5 : tensor<1x1xf32>, tensor<1x?xf32>) outs(%extracted_slice_6 : tensor<1x?xf32>) -> tensor<1x?xf32>
          %inserted_slice_7 = tensor.insert_slice %8 into %arg8[0, 0] [1, %1] [1, 1] : tensor<1x?xf32> into tensor<1x?xf32>
 scf.yield %inserted_slice_7 : tensor<1x?xf32>
        }
 %inserted_slice = tensor.insert_slice %7 into %arg6[%arg3, %arg5] [1, %1] [1, 1] : tensor<1x?xf32> into tensor<?x?xf32>
        scf.yield %inserted_slice : tensor<?x?xf32>
      }
      scf.yield %4 : tensor<?x?xf32>
    }
    return %2 : tensor<?x?xf32>
 }
}
```

And apply peeling again, the loop is peeled a second time, which is wrong:

```
#map = affine_map<()[s0, s1] -> (s0 - s0 mod s1)>
#map1 = affine_map<()[s0, s1] -> (s1 - s1 mod s0 - (s1 - s1 mod s0) mod s0)>
module {
  func.func @dot_dispatch_0_matmul_DxDxD_f32(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> {
    %c32 = arith.constant 32 : index
    %c1 = arith.constant 1 : index
    %cst = arith.constant 0.000000e+00 : f32
 %c0 = arith.constant 0 : index
    %dim = tensor.dim %arg0, %c0 : tensor<?x?xf32>
    %dim_0 = tensor.dim %arg1, %c1 : tensor<?x?xf32>
 %dim_1 = tensor.dim %arg0, %c1 : tensor<?x?xf32>
    %0 = vector.vscale
    %1 = arith.muli %0, %c32 : index
    %2 = scf.for %arg3 = %c0 to %dim step %c1 iter_args(%arg4 = %arg0) -> (tensor<?x?xf32>) {
      %3 = affine.apply #map()[%dim_0, %1]
      %4 = affine.apply #map1()[%1, %dim_0]
      %5 = scf.for %arg5 = %c0 to %4 step %1 iter_args(%arg6 = %arg4) -> (tensor<?x?xf32>) {
        %extracted_slice = tensor.extract_slice %arg1[%arg3, 0] [1, %dim_1] [1, 1] : tensor<?x?xf32> to tensor<1x?xf32>
        %extracted_slice_2 = tensor.extract_slice %arg2[0, %arg5] [%dim_1, %1] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
 %extracted_slice_3 = tensor.extract_slice %arg6[%arg3, %arg5] [1, %1] [1, 1] : tensor<?x?xf32> to tensor<1x?xf32>
        %7 = linalg.fill ins(%cst : f32) outs(%extracted_slice_3 : tensor<1x?xf32>) -> tensor<1x?xf32>
 %8 = scf.for %arg7 = %c0 to %dim_1 step %c1 iter_args(%arg8 = %7) -> (tensor<1x?xf32>) {
          %extracted_slice_4 = tensor.extract_slice %extracted_slice[0, %arg7] [1, 1] [1, 1] : tensor<1x?xf32> to tensor<1x1xf32>
          %extracted_slice_5 = tensor.extract_slice %extracted_slice_2[%arg7, 0] [1, %1] [1, 1] : tensor<?x?xf32> to tensor<1x?xf32>
          %extracted_slice_6 = tensor.extract_slice %arg8[0, 0] [1, %1] [1, 1] : tensor<1x?xf32> to tensor<1x?xf32>
 %9 = linalg.matmul ins(%extracted_slice_4, %extracted_slice_5 : tensor<1x1xf32>, tensor<1x?xf32>) outs(%extracted_slice_6 : tensor<1x?xf32>) -> tensor<1x?xf32>
          %inserted_slice_7 = tensor.insert_slice %9 into %arg8[0, 0] [1, %1] [1, 1] : tensor<1x?xf32> into tensor<1x?xf32>
 scf.yield %inserted_slice_7 : tensor<1x?xf32>
        }
 %inserted_slice = tensor.insert_slice %8 into %arg6[%arg3, %arg5] [1, %1] [1, 1] : tensor<1x?xf32> into tensor<?x?xf32>
        scf.yield %inserted_slice : tensor<?x?xf32>
      }
      %6 = scf.for %arg5 = %4 to %3 step %1 iter_args(%arg6 = %5) -> (tensor<?x?xf32>) {
 %extracted_slice = tensor.extract_slice %arg1[%arg3, 0] [1, %dim_1] [1, 1] : tensor<?x?xf32> to tensor<1x?xf32>
        %extracted_slice_2 = tensor.extract_slice %arg2[0, %arg5] [%dim_1, %1] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
        %extracted_slice_3 = tensor.extract_slice %arg6[%arg3, %arg5] [1, %1] [1, 1] : tensor<?x?xf32> to tensor<1x?xf32>
 %7 = linalg.fill ins(%cst : f32) outs(%extracted_slice_3 : tensor<1x?xf32>) -> tensor<1x?xf32>
        %8 = scf.for %arg7 = %c0 to %dim_1 step %c1 iter_args(%arg8 = %7) -> (tensor<1x?xf32>) {
 %extracted_slice_4 = tensor.extract_slice %extracted_slice[0, %arg7] [1, 1] [1, 1] : tensor<1x?xf32> to tensor<1x1xf32>
          %extracted_slice_5 = tensor.extract_slice %extracted_slice_2[%arg7, 0] [1, %1] [1, 1] : tensor<?x?xf32> to tensor<1x?xf32>
          %extracted_slice_6 = tensor.extract_slice %arg8[0, 0] [1, %1] [1, 1] : tensor<1x?xf32> to tensor<1x?xf32>
          %9 = linalg.matmul ins(%extracted_slice_4, %extracted_slice_5 : tensor<1x1xf32>, tensor<1x?xf32>) outs(%extracted_slice_6 : tensor<1x?xf32>) -> tensor<1x?xf32>
 %inserted_slice_7 = tensor.insert_slice %9 into %arg8[0, 0] [1, %1] [1, 1] : tensor<1x?xf32> into tensor<1x?xf32>
          scf.yield %inserted_slice_7 : tensor<1x?xf32>
        }
        %inserted_slice = tensor.insert_slice %8 into %arg6[%arg3, %arg5] [1, %1] [1, 1] : tensor<1x?xf32> into tensor<?x?xf32>
        scf.yield %inserted_slice : tensor<?x?xf32>
      }
      scf.yield %6 : tensor<?x?xf32>
 }
    return %2 : tensor<?x?xf32>
  }
}
```

Peeling should be able to determine that the loop will execute a number of iterations that is a multiple of `%1`, which also happens to be the step, and should not peel the loop again:
```
      #map = affine_map<()[s0, s1] -> (s0 - s0 mod s1)>

      %3 = affine.apply #map()[%dim_0, %1]
      %4 = scf.for %arg5 = %c0 to %3 step %1 iter_args(%arg6 = %arg4) -> (tensor<?x?xf32>) {
```

This issue shows up in IREE, where we peel the same loop multiple times.
_______________________________________________
llvm-bugs mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs

Reply via email to