| Issue | 71469 |
| Summary | [mlir][linalg] Peeling can't determine when scalable-tiled loops shouldn't be peeled |
| Labels | mlir:linalg, mlir |
| Assignees | matthias-springer |
| Reporter | dcaballe |
I think there is a gap in the analysis that determines whether a loop needs peeling.
Let's say we have the following loop, tiled for scalable vectorization (see %0 and %1):
```
#map = affine_map<(d0)[s0, s1] -> (-d0 + s1, s0)>
module {
func.func @dot_dispatch_0_matmul_DxDxD_f32(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> {
%c32 = arith.constant 32 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%dim = tensor.dim %arg0, %c0 : tensor<?x?xf32>
%dim_0 = tensor.dim %arg1, %c1 : tensor<?x?xf32>
%dim_1 = tensor.dim %arg0, %c1 : tensor<?x?xf32>
%0 = vector.vscale
%1 = arith.muli %0, %c32 : index
%2 = scf.for %arg3 = %c0 to %dim step %c1 iter_args(%arg4 = %arg0) -> (tensor<?x?xf32>) {
%3 = scf.for %arg5 = %c0 to %dim_0 step %1 iter_args(%arg6 = %arg4) -> (tensor<?x?xf32>) {
%4 = affine.min #map(%arg5)[%1, %dim_0]
%extracted_slice = tensor.extract_slice %arg1[%arg3, 0] [1, %dim_1] [1, 1] : tensor<?x?xf32> to tensor<1x?xf32>
%extracted_slice_2 = tensor.extract_slice %arg2[0, %arg5] [%dim_1, %4] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
%extracted_slice_3 = tensor.extract_slice %arg6[%arg3, %arg5] [1, %4] [1, 1] : tensor<?x?xf32> to tensor<1x?xf32>
%5 = linalg.fill ins(%cst : f32) outs(%extracted_slice_3 : tensor<1x?xf32>) -> tensor<1x?xf32>
%6 = scf.for %arg7 = %c0 to %dim_1 step %c1 iter_args(%arg8 = %5) -> (tensor<1x?xf32>) {
%extracted_slice_4 = tensor.extract_slice %extracted_slice[0, %arg7] [1, 1] [1, 1] : tensor<1x?xf32> to tensor<1x1xf32>
%extracted_slice_5 = tensor.extract_slice %extracted_slice_2[%arg7, 0] [1, %4] [1, 1] : tensor<?x?xf32> to tensor<1x?xf32>
%extracted_slice_6 = tensor.extract_slice %arg8[0, 0] [1, %4] [1, 1] : tensor<1x?xf32> to tensor<1x?xf32>
%7 = linalg.matmul ins(%extracted_slice_4, %extracted_slice_5 : tensor<1x1xf32>, tensor<1x?xf32>) outs(%extracted_slice_6 : tensor<1x?xf32>) -> tensor<1x?xf32>
%inserted_slice_7 = tensor.insert_slice %7 into %arg8[0, 0] [1, %4] [1, 1] : tensor<1x?xf32> into tensor<1x?xf32>
scf.yield %inserted_slice_7 : tensor<1x?xf32>
}
%inserted_slice = tensor.insert_slice %6 into %arg6[%arg3, %arg5] [1, %4] [1, 1] : tensor<1x?xf32> into tensor<?x?xf32>
scf.yield %inserted_slice : tensor<?x?xf32>
}
scf.yield %3 : tensor<?x?xf32>
}
return %2 : tensor<?x?xf32>
}
}
```
We then apply peeling with `--scf-for-loop-peeling` and we get the following output, which looks good to me:
```
#map = affine_map<()[s0, s1] -> (s0 - s0 mod s1)>
#map1 = affine_map<(d0)[s0] -> (-d0 + s0)>
module {
func.func @dot_dispatch_0_matmul_DxDxD_f32(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> {
%c32 = arith.constant 32 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%dim = tensor.dim %arg0, %c0 : tensor<?x?xf32>
%dim_0 = tensor.dim %arg1, %c1 : tensor<?x?xf32>
%dim_1 = tensor.dim %arg0, %c1 : tensor<?x?xf32>
%0 = vector.vscale
%1 = arith.muli %0, %c32 : index
%2 = scf.for %arg3 = %c0 to %dim step %c1 iter_args(%arg4 = %arg0) -> (tensor<?x?xf32>) {
%3 = affine.apply #map()[%dim_0, %1]
%4 = scf.for %arg5 = %c0 to %3 step %1 iter_args(%arg6 = %arg4) -> (tensor<?x?xf32>) {
%extracted_slice = tensor.extract_slice %arg1[%arg3, 0] [1, %dim_1] [1, 1] : tensor<?x?xf32> to tensor<1x?xf32>
%extracted_slice_2 = tensor.extract_slice %arg2[0, %arg5] [%dim_1, %1] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
%extracted_slice_3 = tensor.extract_slice %arg6[%arg3, %arg5] [1, %1] [1, 1] : tensor<?x?xf32> to tensor<1x?xf32>
%6 = linalg.fill ins(%cst : f32) outs(%extracted_slice_3 : tensor<1x?xf32>) -> tensor<1x?xf32>
%7 = scf.for %arg7 = %c0 to %dim_1 step %c1 iter_args(%arg8 = %6) -> (tensor<1x?xf32>) {
%extracted_slice_4 = tensor.extract_slice %extracted_slice[0, %arg7] [1, 1] [1, 1] : tensor<1x?xf32> to tensor<1x1xf32>
%extracted_slice_5 = tensor.extract_slice %extracted_slice_2[%arg7, 0] [1, %1] [1, 1] : tensor<?x?xf32> to tensor<1x?xf32>
%extracted_slice_6 = tensor.extract_slice %arg8[0, 0] [1, %1] [1, 1] : tensor<1x?xf32> to tensor<1x?xf32>
%8 = linalg.matmul ins(%extracted_slice_4, %extracted_slice_5 : tensor<1x1xf32>, tensor<1x?xf32>) outs(%extracted_slice_6 : tensor<1x?xf32>) -> tensor<1x?xf32>
%inserted_slice_7 = tensor.insert_slice %8 into %arg8[0, 0] [1, %1] [1, 1] : tensor<1x?xf32> into tensor<1x?xf32>
scf.yield %inserted_slice_7 : tensor<1x?xf32>
}
%inserted_slice = tensor.insert_slice %7 into %arg6[%arg3, %arg5] [1, %1] [1, 1] : tensor<1x?xf32> into tensor<?x?xf32>
scf.yield %inserted_slice : tensor<?x?xf32>
}
%5 = scf.for %arg5 = %3 to %dim_0 step %1 iter_args(%arg6 = %4) -> (tensor<?x?xf32>) {
%6 = affine.apply #map1(%arg5)[%dim_0]
%extracted_slice = tensor.extract_slice %arg1[%arg3, 0] [1, %dim_1] [1, 1] : tensor<?x?xf32> to tensor<1x?xf32>
%extracted_slice_2 = tensor.extract_slice %arg2[0, %arg5] [%dim_1, %6] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
%extracted_slice_3 = tensor.extract_slice %arg6[%arg3, %arg5] [1, %6] [1, 1] : tensor<?x?xf32> to tensor<1x?xf32>
%7 = linalg.fill ins(%cst : f32) outs(%extracted_slice_3 : tensor<1x?xf32>) -> tensor<1x?xf32>
%8 = scf.for %arg7 = %c0 to %dim_1 step %c1 iter_args(%arg8 = %7) -> (tensor<1x?xf32>) {
%extracted_slice_4 = tensor.extract_slice %extracted_slice[0, %arg7] [1, 1] [1, 1] : tensor<1x?xf32> to tensor<1x1xf32>
%extracted_slice_5 = tensor.extract_slice %extracted_slice_2[%arg7, 0] [1, %6] [1, 1] : tensor<?x?xf32> to tensor<1x?xf32>
%extracted_slice_6 = tensor.extract_slice %arg8[0, 0] [1, %6] [1, 1] : tensor<1x?xf32> to tensor<1x?xf32>
%9 = linalg.matmul ins(%extracted_slice_4, %extracted_slice_5 : tensor<1x1xf32>, tensor<1x?xf32>) outs(%extracted_slice_6 : tensor<1x?xf32>) -> tensor<1x?xf32>
%inserted_slice_7 = tensor.insert_slice %9 into %arg8[0, 0] [1, %6] [1, 1] : tensor<1x?xf32> into tensor<1x?xf32>
scf.yield %inserted_slice_7 : tensor<1x?xf32>
}
%inserted_slice = tensor.insert_slice %8 into %arg6[%arg3, %arg5] [1, %6] [1, 1] : tensor<1x?xf32> into tensor<?x?xf32>
scf.yield %inserted_slice : tensor<?x?xf32>
}
scf.yield %5 : tensor<?x?xf32>
}
return %2 : tensor<?x?xf32>
}
}
```
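As a side note on the peeled output, `#map` computes the split point between the main loop and the partial iteration: the largest multiple of the step `%1` that does not exceed `%dim_0`. Writing `s_0` for `%dim_0` and `s_1` for `%1`, this is just a restatement of the map shown above:
```latex
% Split point produced by #map()[%dim_0, %1], with s_0 = %dim_0 and s_1 = %1.
\[
  \text{split} \;=\; s_0 - (s_0 \bmod s_1) \;=\; s_1 \left\lfloor \frac{s_0}{s_1} \right\rfloor
\]
```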
However, if we take the output after peeling above and remove the partial iteration:
```
#map = affine_map<()[s0, s1] -> (s0 - s0 mod s1)>
#map1 = affine_map<(d0)[s0] -> (-d0 + s0)>
module {
func.func @dot_dispatch_0_matmul_DxDxD_f32(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> {
%c32 = arith.constant 32 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%dim = tensor.dim %arg0, %c0 : tensor<?x?xf32>
%dim_0 = tensor.dim %arg1, %c1 : tensor<?x?xf32>
%dim_1 = tensor.dim %arg0, %c1 : tensor<?x?xf32>
%0 = vector.vscale
%1 = arith.muli %0, %c32 : index
%2 = scf.for %arg3 = %c0 to %dim step %c1 iter_args(%arg4 = %arg0) -> (tensor<?x?xf32>) {
%3 = affine.apply #map()[%dim_0, %1]
%4 = scf.for %arg5 = %c0 to %3 step %1 iter_args(%arg6 = %arg4) -> (tensor<?x?xf32>) {
%extracted_slice = tensor.extract_slice %arg1[%arg3, 0] [1, %dim_1] [1, 1] : tensor<?x?xf32> to tensor<1x?xf32>
%extracted_slice_2 = tensor.extract_slice %arg2[0, %arg5] [%dim_1, %1] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
%extracted_slice_3 = tensor.extract_slice %arg6[%arg3, %arg5] [1, %1] [1, 1] : tensor<?x?xf32> to tensor<1x?xf32>
%6 = linalg.fill ins(%cst : f32) outs(%extracted_slice_3 : tensor<1x?xf32>) -> tensor<1x?xf32>
%7 = scf.for %arg7 = %c0 to %dim_1 step %c1 iter_args(%arg8 = %6) -> (tensor<1x?xf32>) {
%extracted_slice_4 = tensor.extract_slice %extracted_slice[0, %arg7] [1, 1] [1, 1] : tensor<1x?xf32> to tensor<1x1xf32>
%extracted_slice_5 = tensor.extract_slice %extracted_slice_2[%arg7, 0] [1, %1] [1, 1] : tensor<?x?xf32> to tensor<1x?xf32>
%extracted_slice_6 = tensor.extract_slice %arg8[0, 0] [1, %1] [1, 1] : tensor<1x?xf32> to tensor<1x?xf32>
%8 = linalg.matmul ins(%extracted_slice_4, %extracted_slice_5 : tensor<1x1xf32>, tensor<1x?xf32>) outs(%extracted_slice_6 : tensor<1x?xf32>) -> tensor<1x?xf32>
%inserted_slice_7 = tensor.insert_slice %8 into %arg8[0, 0] [1, %1] [1, 1] : tensor<1x?xf32> into tensor<1x?xf32>
scf.yield %inserted_slice_7 : tensor<1x?xf32>
}
%inserted_slice = tensor.insert_slice %7 into %arg6[%arg3, %arg5] [1, %1] [1, 1] : tensor<1x?xf32> into tensor<?x?xf32>
scf.yield %inserted_slice : tensor<?x?xf32>
}
scf.yield %4 : tensor<?x?xf32>
}
return %2 : tensor<?x?xf32>
}
}
```
and then apply peeling again, the loop is peeled a second time, which is wrong:
```
#map = affine_map<()[s0, s1] -> (s0 - s0 mod s1)>
#map1 = affine_map<()[s0, s1] -> (s1 - s1 mod s0 - (s1 - s1 mod s0) mod s0)>
module {
func.func @dot_dispatch_0_matmul_DxDxD_f32(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> {
%c32 = arith.constant 32 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%dim = tensor.dim %arg0, %c0 : tensor<?x?xf32>
%dim_0 = tensor.dim %arg1, %c1 : tensor<?x?xf32>
%dim_1 = tensor.dim %arg0, %c1 : tensor<?x?xf32>
%0 = vector.vscale
%1 = arith.muli %0, %c32 : index
%2 = scf.for %arg3 = %c0 to %dim step %c1 iter_args(%arg4 = %arg0) -> (tensor<?x?xf32>) {
%3 = affine.apply #map()[%dim_0, %1]
%4 = affine.apply #map1()[%1, %dim_0]
%5 = scf.for %arg5 = %c0 to %4 step %1 iter_args(%arg6 = %arg4) -> (tensor<?x?xf32>) {
%extracted_slice = tensor.extract_slice %arg1[%arg3, 0] [1, %dim_1] [1, 1] : tensor<?x?xf32> to tensor<1x?xf32>
%extracted_slice_2 = tensor.extract_slice %arg2[0, %arg5] [%dim_1, %1] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
%extracted_slice_3 = tensor.extract_slice %arg6[%arg3, %arg5] [1, %1] [1, 1] : tensor<?x?xf32> to tensor<1x?xf32>
%7 = linalg.fill ins(%cst : f32) outs(%extracted_slice_3 : tensor<1x?xf32>) -> tensor<1x?xf32>
%8 = scf.for %arg7 = %c0 to %dim_1 step %c1 iter_args(%arg8 = %7) -> (tensor<1x?xf32>) {
%extracted_slice_4 = tensor.extract_slice %extracted_slice[0, %arg7] [1, 1] [1, 1] : tensor<1x?xf32> to tensor<1x1xf32>
%extracted_slice_5 = tensor.extract_slice %extracted_slice_2[%arg7, 0] [1, %1] [1, 1] : tensor<?x?xf32> to tensor<1x?xf32>
%extracted_slice_6 = tensor.extract_slice %arg8[0, 0] [1, %1] [1, 1] : tensor<1x?xf32> to tensor<1x?xf32>
%9 = linalg.matmul ins(%extracted_slice_4, %extracted_slice_5 : tensor<1x1xf32>, tensor<1x?xf32>) outs(%extracted_slice_6 : tensor<1x?xf32>) -> tensor<1x?xf32>
%inserted_slice_7 = tensor.insert_slice %9 into %arg8[0, 0] [1, %1] [1, 1] : tensor<1x?xf32> into tensor<1x?xf32>
scf.yield %inserted_slice_7 : tensor<1x?xf32>
}
%inserted_slice = tensor.insert_slice %8 into %arg6[%arg3, %arg5] [1, %1] [1, 1] : tensor<1x?xf32> into tensor<?x?xf32>
scf.yield %inserted_slice : tensor<?x?xf32>
}
%6 = scf.for %arg5 = %4 to %3 step %1 iter_args(%arg6 = %5) -> (tensor<?x?xf32>) {
%extracted_slice = tensor.extract_slice %arg1[%arg3, 0] [1, %dim_1] [1, 1] : tensor<?x?xf32> to tensor<1x?xf32>
%extracted_slice_2 = tensor.extract_slice %arg2[0, %arg5] [%dim_1, %1] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
%extracted_slice_3 = tensor.extract_slice %arg6[%arg3, %arg5] [1, %1] [1, 1] : tensor<?x?xf32> to tensor<1x?xf32>
%7 = linalg.fill ins(%cst : f32) outs(%extracted_slice_3 : tensor<1x?xf32>) -> tensor<1x?xf32>
%8 = scf.for %arg7 = %c0 to %dim_1 step %c1 iter_args(%arg8 = %7) -> (tensor<1x?xf32>) {
%extracted_slice_4 = tensor.extract_slice %extracted_slice[0, %arg7] [1, 1] [1, 1] : tensor<1x?xf32> to tensor<1x1xf32>
%extracted_slice_5 = tensor.extract_slice %extracted_slice_2[%arg7, 0] [1, %1] [1, 1] : tensor<?x?xf32> to tensor<1x?xf32>
%extracted_slice_6 = tensor.extract_slice %arg8[0, 0] [1, %1] [1, 1] : tensor<1x?xf32> to tensor<1x?xf32>
%9 = linalg.matmul ins(%extracted_slice_4, %extracted_slice_5 : tensor<1x1xf32>, tensor<1x?xf32>) outs(%extracted_slice_6 : tensor<1x?xf32>) -> tensor<1x?xf32>
%inserted_slice_7 = tensor.insert_slice %9 into %arg8[0, 0] [1, %1] [1, 1] : tensor<1x?xf32> into tensor<1x?xf32>
scf.yield %inserted_slice_7 : tensor<1x?xf32>
}
%inserted_slice = tensor.insert_slice %8 into %arg6[%arg3, %arg5] [1, %1] [1, 1] : tensor<1x?xf32> into tensor<?x?xf32>
scf.yield %inserted_slice : tensor<?x?xf32>
}
scf.yield %6 : tensor<?x?xf32>
}
return %2 : tensor<?x?xf32>
}
}
```
Peeling should be able to determine that the loop executes a number of iterations that is a multiple of `%1` (which is also the step) and should not peel the loop again:
```
#map = affine_map<()[s0, s1] -> (s0 - s0 mod s1)>
%3 = affine.apply #map()[%dim_0, %1]
%4 = scf.for %arg5 = %c0 to %3 step %1 iter_args(%arg6 = %arg4) -> (tensor<?x?xf32>) {
```
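To spell out the reasoning the analysis is missing (a short derivation, again writing `s_0` for `%dim_0` and `s_1` for `%1`): with lower bound 0, step `s_1`, and the already-peeled upper bound `s_0 - s_0 mod s_1`, the trip count is exact, so a second peel can only produce an empty partial loop:
```latex
% lb = 0, step = s_1, ub = s_0 - (s_0 mod s_1): the upper bound is already a
% multiple of the step, so the trip count is an integer and no remainder is left.
\[
\begin{aligned}
  \text{ub} \bmod \text{step}
    &= \bigl(s_0 - (s_0 \bmod s_1)\bigr) \bmod s_1 = 0,\\
  \text{trip count}
    &= \frac{\text{ub} - \text{lb}}{\text{step}}
     = \frac{s_0 - (s_0 \bmod s_1)}{s_1} \in \mathbb{Z}.
\end{aligned}
\]
```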
This issue shows up in IREE, where we peel the same loop multiple times.