Issue 175446
Summary [SLP] SLP tends to create reduce_or(shl(zext(x))) sequences when a bitcast would do
Labels llvm:SLPVectorizer, missed-optimization, llvm::vectorcombine
Assignees
Reporter RKSimon
    Pulled out of #128424 (but there's been plenty of other cases as well)

When we're packing sequential integers together into a single larger integer (often to return as {i64,i64} aggregates), we end up vectorising to some form of reduce_or(shl(zext(x))) sequence. The shifts position the zero-extended subintegers into place and the reduce_or is disjoint:

```ll
define { i64, i64 } @AverageRoundU8_1(i64 %a.coerce0, i64 %a.coerce1, i64 %b.coerce0, i64 %b.coerce1) local_unnamed_addr #0 {
entry:
  %0 = trunc i64 %a.coerce0 to i16
  %1 = lshr i16 %0, 8
  %2 = trunc i64 %a.coerce1 to i16
  %3 = lshr i16 %2, 8
  %4 = trunc i64 %b.coerce0 to i16
  %5 = lshr i16 %4, 8
  %6 = trunc i64 %b.coerce1 to i16
  %7 = lshr i16 %6, 8
  %conv1 = and i64 %a.coerce0, 255
  %conv4 = and i64 %b.coerce0, 255
  %add = add nuw nsw i64 %conv1, 1
  %add5 = add nuw nsw i64 %add, %conv4
  %add.1 = add nuw nsw i16 %1, 1
  %add5.1 = add nuw nsw i16 %add.1, %5
  %conv1.8 = and i64 %a.coerce1, 255
  %conv4.8 = and i64 %b.coerce1, 255
  %add.8 = add nuw nsw i64 %conv1.8, 1
  %add5.8 = add nuw nsw i64 %add.8, %conv4.8
  %add.9 = add nuw nsw i16 %3, 1
  %add5.9 = add nuw nsw i16 %add.9, %7
  %8 = shl nuw i16 %add5.1, 7
  %9 = and i16 %8, -256
  %10 = insertelement <8 x i64> <i64 poison, i64 poison, i64 poison, i64 poison, i64 poison, i64 poison, i64 -1, i64 -1>, i64 %a.coerce0, i64 0
  %11 = shufflevector <8 x i64> %10, <8 x i64> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 7>
  %12 = lshr <8 x i64> %11, <i64 56, i64 48, i64 40, i64 32, i64 24, i64 16, i64 0, i64 0>
  %13 = and <8 x i64> %12, <i64 -1, i64 255, i64 255, i64 255, i64 255, i64 255, i64 0, i64 0>
  %retval.sroa.2.0.insert.shift.masked = zext i16 %9 to i64
  %14 = insertelement <8 x i64> poison, i64 %b.coerce0, i64 0
  %15 = insertelement <8 x i64> %14, i64 %add5, i64 6
  %16 = insertelement <8 x i64> %15, i64 %retval.sroa.2.0.insert.shift.masked, i64 7
  %17 = shufflevector <8 x i64> %16, <8 x i64> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 7>
  %18 = lshr <8 x i64> %17, <i64 56, i64 48, i64 40, i64 32, i64 24, i64 16, i64 1, i64 0>
  %19 = and <8 x i64> %18, <i64 -1, i64 255, i64 255, i64 255, i64 255, i64 255, i64 -1, i64 -1>
  %20 = add nuw nsw <8 x i64> %13, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 0, i64 0>
  %21 = add nuw nsw <8 x i64> %19, %20
  %22 = shl nuw <8 x i64> %21, <i64 55, i64 47, i64 39, i64 31, i64 23, i64 15, i64 0, i64 0>
  %23 = and <8 x i64> %22, <i64 -72057594037927936, i64 71776119061217280, i64 280375465082880, i64 1095216660480, i64 4278190080, i64 16711680, i64 -1, i64 -1>
  %24 = tail call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> %23)
  %.fca.0.insert = insertvalue { i64, i64 } poison, i64 %24, 0
  %25 = shl nuw i16 %add5.9, 7
  %26 = and i16 %25, -256
  %27 = insertelement <8 x i64> <i64 poison, i64 poison, i64 poison, i64 poison, i64 poison, i64 poison, i64 -1, i64 -1>, i64 %a.coerce1, i64 0
  %28 = shufflevector <8 x i64> %27, <8 x i64> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 7>
  %29 = lshr <8 x i64> %28, <i64 56, i64 48, i64 40, i64 32, i64 24, i64 16, i64 0, i64 0>
  %30 = and <8 x i64> %29, <i64 -1, i64 255, i64 255, i64 255, i64 255, i64 255, i64 0, i64 0>
  %retval.sroa.11.8.insert.shift.masked = zext i16 %26 to i64
 %31 = insertelement <8 x i64> poison, i64 %b.coerce1, i64 0
  %32 = insertelement <8 x i64> %31, i64 %add5.8, i64 6
  %33 = insertelement <8 x i64> %32, i64 %retval.sroa.11.8.insert.shift.masked, i64 7
  %34 = shufflevector <8 x i64> %33, <8 x i64> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 7>
  %35 = lshr <8 x i64> %34, <i64 56, i64 48, i64 40, i64 32, i64 24, i64 16, i64 1, i64 0>
  %36 = and <8 x i64> %35, <i64 -1, i64 255, i64 255, i64 255, i64 255, i64 255, i64 -1, i64 -1>
 %37 = add nuw nsw <8 x i64> %30, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 0, i64 0>
  %38 = add nuw nsw <8 x i64> %36, %37
  %39 = shl nuw <8 x i64> %38, <i64 55, i64 47, i64 39, i64 31, i64 23, i64 15, i64 0, i64 0>
 %40 = and <8 x i64> %39, <i64 -72057594037927936, i64 71776119061217280, i64 280375465082880, i64 1095216660480, i64 4278190080, i64 16711680, i64 -1, i64 -1>
  %41 = tail call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> %40)
 %.fca.1.insert = insertvalue { i64, i64 } %.fca.0.insert, i64 %41, 1
  ret { i64, i64 } %.fca.1.insert
}
```
https://godbolt.org/z/x774d7Kxb

This would be a lot better if we created bitcasts (possibly with a shuffle before hand depending on the shift amounts):

```ll
----------------------------------------
define i64 @src_fwd(<8 x i8> noundef %a0) {
#0:
  %x0 = zext <8 x i8> noundef %a0 to <8 x i64>
  %s = shl nuw <8 x i64> %x0, { 0, 8, 16, 24, 32, 40, 48, 56 }
 %res = reduce_or <8 x i64> %s ; MUST PROVE THIS IS DISJOINT
  ret i64 %res
}
=>
define i64 @tgt_fwd(<8 x i8> noundef %a0) {
#0:
  %x0 = bitcast <8 x i8> noundef %a0 to i64
  ret i64 %x0
}
Transformation seems to be correct!

define i64 @src_rev(<8 x i8> noundef %a0) {
#0:
  %x0 = zext <8 x i8> noundef %a0 to <8 x i64>
  %s = shl nuw <8 x i64> %x0, { 56, 48, 40, 32, 24, 16, 8, 0 }
  %res = reduce_or <8 x i64> %s ; MUST PROVE THIS IS DISJOINT
  ret i64 %res
}
=>
define i64 @tgt_rev(<8 x i8> noundef %a0) {
#0:
  %r0 = shufflevector <8 x i8> noundef %a0, <8 x i8> poison, 7, 6, 5, 4, 3, 2, 1, 0
  %x0 = bitcast <8 x i8> %r0 to i64
  ret i64 %x0
}
Transformation seems to be correct!
```
https://alive2.llvm.org/ce/z/cHHrb8

Ideally we'd perform this in SLP, but it might be necessary to handle this in VectorCombine instead.

CC @alexey-bataev 
_______________________________________________
llvm-bugs mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs

Reply via email to