| Issue |
175446
|
| Summary |
[SLP] SLP tends to create reduce_or(shl(zext(x))) sequences when a bitcast would do
|
| Labels |
llvm:SLPVectorizer,
missed-optimization,
llvm:vectorcombine
|
| Assignees |
|
| Reporter |
RKSimon
|
Pulled out of #128424 (but there's been plenty of other cases as well)
When we're packing sequential integers together into a single larger integer (often to return as {i64,i64} aggregates), we end up vectorising to some form of reduce_or(shl(zext(x))) sequence. The shifts position the zero-extended subintegers into place and the reduce_or is disjoint:
```ll
define { i64, i64 } @AverageRoundU8_1(i64 %a.coerce0, i64 %a.coerce1, i64 %b.coerce0, i64 %b.coerce1) local_unnamed_addr #0 {
entry:
%0 = trunc i64 %a.coerce0 to i16
%1 = lshr i16 %0, 8
%2 = trunc i64 %a.coerce1 to i16
%3 = lshr i16 %2, 8
%4 = trunc i64 %b.coerce0 to i16
%5 = lshr i16 %4, 8
%6 = trunc i64 %b.coerce1 to i16
%7 = lshr i16 %6, 8
%conv1 = and i64 %a.coerce0, 255
%conv4 = and i64 %b.coerce0, 255
%add = add nuw nsw i64 %conv1, 1
%add5 = add nuw nsw i64 %add, %conv4
%add.1 = add nuw nsw i16 %1, 1
%add5.1 = add nuw nsw i16 %add.1, %5
%conv1.8 = and i64 %a.coerce1, 255
%conv4.8 = and i64 %b.coerce1, 255
%add.8 = add nuw nsw i64 %conv1.8, 1
%add5.8 = add nuw nsw i64 %add.8, %conv4.8
%add.9 = add nuw nsw i16 %3, 1
%add5.9 = add nuw nsw i16 %add.9, %7
%8 = shl nuw i16 %add5.1, 7
%9 = and i16 %8, -256
%10 = insertelement <8 x i64> <i64 poison, i64 poison, i64 poison, i64 poison, i64 poison, i64 poison, i64 -1, i64 -1>, i64 %a.coerce0, i64 0
%11 = shufflevector <8 x i64> %10, <8 x i64> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 7>
%12 = lshr <8 x i64> %11, <i64 56, i64 48, i64 40, i64 32, i64 24, i64 16, i64 0, i64 0>
%13 = and <8 x i64> %12, <i64 -1, i64 255, i64 255, i64 255, i64 255, i64 255, i64 0, i64 0>
%retval.sroa.2.0.insert.shift.masked = zext i16 %9 to i64
%14 = insertelement <8 x i64> poison, i64 %b.coerce0, i64 0
%15 = insertelement <8 x i64> %14, i64 %add5, i64 6
%16 = insertelement <8 x i64> %15, i64 %retval.sroa.2.0.insert.shift.masked, i64 7
%17 = shufflevector <8 x i64> %16, <8 x i64> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 7>
%18 = lshr <8 x i64> %17, <i64 56, i64 48, i64 40, i64 32, i64 24, i64 16, i64 1, i64 0>
%19 = and <8 x i64> %18, <i64 -1, i64 255, i64 255, i64 255, i64 255, i64 255, i64 -1, i64 -1>
%20 = add nuw nsw <8 x i64> %13, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 0, i64 0>
%21 = add nuw nsw <8 x i64> %19, %20
%22 = shl nuw <8 x i64> %21, <i64 55, i64 47, i64 39, i64 31, i64 23, i64 15, i64 0, i64 0>
%23 = and <8 x i64> %22, <i64 -72057594037927936, i64 71776119061217280, i64 280375465082880, i64 1095216660480, i64 4278190080, i64 16711680, i64 -1, i64 -1>
%24 = tail call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> %23)
%.fca.0.insert = insertvalue { i64, i64 } poison, i64 %24, 0
%25 = shl nuw i16 %add5.9, 7
%26 = and i16 %25, -256
%27 = insertelement <8 x i64> <i64 poison, i64 poison, i64 poison, i64 poison, i64 poison, i64 poison, i64 -1, i64 -1>, i64 %a.coerce1, i64 0
%28 = shufflevector <8 x i64> %27, <8 x i64> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 7>
%29 = lshr <8 x i64> %28, <i64 56, i64 48, i64 40, i64 32, i64 24, i64 16, i64 0, i64 0>
%30 = and <8 x i64> %29, <i64 -1, i64 255, i64 255, i64 255, i64 255, i64 255, i64 0, i64 0>
%retval.sroa.11.8.insert.shift.masked = zext i16 %26 to i64
%31 = insertelement <8 x i64> poison, i64 %b.coerce1, i64 0
%32 = insertelement <8 x i64> %31, i64 %add5.8, i64 6
%33 = insertelement <8 x i64> %32, i64 %retval.sroa.11.8.insert.shift.masked, i64 7
%34 = shufflevector <8 x i64> %33, <8 x i64> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 7>
%35 = lshr <8 x i64> %34, <i64 56, i64 48, i64 40, i64 32, i64 24, i64 16, i64 1, i64 0>
%36 = and <8 x i64> %35, <i64 -1, i64 255, i64 255, i64 255, i64 255, i64 255, i64 -1, i64 -1>
%37 = add nuw nsw <8 x i64> %30, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 0, i64 0>
%38 = add nuw nsw <8 x i64> %36, %37
%39 = shl nuw <8 x i64> %38, <i64 55, i64 47, i64 39, i64 31, i64 23, i64 15, i64 0, i64 0>
%40 = and <8 x i64> %39, <i64 -72057594037927936, i64 71776119061217280, i64 280375465082880, i64 1095216660480, i64 4278190080, i64 16711680, i64 -1, i64 -1>
%41 = tail call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> %40)
%.fca.1.insert = insertvalue { i64, i64 } %.fca.0.insert, i64 %41, 1
ret { i64, i64 } %.fca.1.insert
}
```
https://godbolt.org/z/x774d7Kxb
This would be a lot better if we created bitcasts (possibly preceded by a shuffle, depending on the shift amounts):
```ll
----------------------------------------
define i64 @src_fwd(<8 x i8> noundef %a0) {
#0:
%x0 = zext <8 x i8> noundef %a0 to <8 x i64>
%s = shl nuw <8 x i64> %x0, { 0, 8, 16, 24, 32, 40, 48, 56 }
%res = reduce_or <8 x i64> %s ; MUST PROVE THIS IS DISJOINT
ret i64 %res
}
=>
define i64 @tgt_fwd(<8 x i8> noundef %a0) {
#0:
%x0 = bitcast <8 x i8> noundef %a0 to i64
ret i64 %x0
}
Transformation seems to be correct!
define i64 @src_rev(<8 x i8> noundef %a0) {
#0:
%x0 = zext <8 x i8> noundef %a0 to <8 x i64>
%s = shl nuw <8 x i64> %x0, { 56, 48, 40, 32, 24, 16, 8, 0 }
%res = reduce_or <8 x i64> %s ; MUST PROVE THIS IS DISJOINT
ret i64 %res
}
=>
define i64 @tgt_rev(<8 x i8> noundef %a0) {
#0:
%r0 = shufflevector <8 x i8> noundef %a0, <8 x i8> poison, 7, 6, 5, 4, 3, 2, 1, 0
%x0 = bitcast <8 x i8> %r0 to i64
ret i64 %x0
}
Transformation seems to be correct!
```
https://alive2.llvm.org/ce/z/cHHrb8
Ideally we'd perform this in SLP, but it might be necessary to handle this in VectorCombine instead.
CC @alexey-bataev
_______________________________________________
llvm-bugs mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs