Issue 179143
Summary [WebAssembly] bitcast defeats dot product pattern
Labels new issue
Assignees
Reporter folkertdev
    The dot product pattern recognition in the WebAssembly backend is not very robust; see:

https://godbolt.org/z/roKzf5xdK

```llvm
; Failing case: the same arithmetic as @without_bitcast, except that the
; <8 x i16> operands are produced by bitcasting <4 x i32> arguments.
; Per the assembly quoted in this report, this form is not lowered to
; i32x4.dot_i16x8_s.
define dso_local <4 x i32> @with_bitcast(<4 x i32> %a, <4 x i32> %b) unnamed_addr {
start:
  ; Reinterpret each 128-bit argument as eight 16-bit lanes.
  %_4 = bitcast <4 x i32> %a to <8 x i16>
  %_5 = bitcast <4 x i32> %b to <8 x i16>
  ; Sign-extend every 16-bit lane to 32 bits, then multiply lane-wise.
  %0 = sext <8 x i16> %_4 to <8 x i32>
  %1 = sext <8 x i16> %_5 to <8 x i32>
  %2 = mul nsw <8 x i32> %1, %0
  ; Separate the eight products into even-indexed (0, 2, 4, 6) and
  ; odd-indexed (1, 3, 5, 7) lanes.
  %3 = shufflevector <8 x i32> %2, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %4 = shufflevector <8 x i32> %2, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  ; Add each even product to its odd neighbour: four pairwise sums.
  %5 = add <4 x i32> %3, %4
  ret <4 x i32> %5
}

; Control case: the operands arrive already typed as <8 x i16>, so no bitcast
; precedes the sign extension. Per the assembly quoted in this report, this
; form is lowered to a single i32x4.dot_i16x8_s.
define dso_local <4 x i32> @without_bitcast(<8 x i16> %a, <8 x i16> %b) unnamed_addr {
start:
  ; Sign-extend every 16-bit lane to 32 bits, then multiply lane-wise.
  %0 = sext <8 x i16> %a to <8 x i32>
  %1 = sext <8 x i16> %b to <8 x i32>
  %2 = mul nsw <8 x i32> %1, %0
  ; Separate the eight products into even-indexed (0, 2, 4, 6) and
  ; odd-indexed (1, 3, 5, 7) lanes.
  %3 = shufflevector <8 x i32> %2, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %4 = shufflevector <8 x i32> %2, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  ; Add each even product to its odd neighbour: four pairwise sums.
  %5 = add <4 x i32> %3, %4
  ret <4 x i32> %5
}
```

The additional bitcast causes the optimization to be missed:

```asm
# Compiler output for @with_bitcast: two widening multiplies, four byte
# shuffles and an add, where @without_bitcast needs one dot instruction.
# NOTE(review): presumably locals 0 and 1 hold the arguments %a and %b and
# local 2 is a compiler-allocated scratch local -- confirm against the IR.
with_bitcast:                           # @with_bitcast
        # Widening signed multiply of the low four 16-bit lanes of locals 1
        # and 0; the result is kept on the stack and also saved in local 2.
 local.get       1
        local.get       0
 i32x4.extmul_low_i16x8_s
        local.tee       2
        # Move bytes 8..15 of local 1 into the low half. Every shuffle index
        # is below 16, so only the first operand (local 1) is read.
        local.get 1
        local.get       0
        i8x16.shuffle   8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
        # Same byte shuffle applied to local 0.
        local.get       0
        local.get 0
        i8x16.shuffle   8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
        # Widening multiply of the shuffled values, i.e. of the original high
        # four 16-bit lanes; local 0 is overwritten with this product.
        i32x4.extmul_low_i16x8_s
        local.tee       0
        # Operands are the low product and the high product. Pick 32-bit lanes
        # 0 and 2 of each: the even-indexed products.
 i8x16.shuffle   0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
        # Reload both products and pick 32-bit lanes 1 and 3 of each: the
        # odd-indexed products.
 local.get       2
        local.get       0
        i8x16.shuffle   4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
        # Even products + odd products.
        i32x4.add
 end_function
# Compiler output for @without_bitcast: the whole sign-extend / multiply /
# pairwise-add sequence becomes a single dot-product instruction.
without_bitcast:                        # @without_bitcast
        # Push locals 1 and 0 as the two operands of the dot product.
        local.get       1
        local.get       0
 i32x4.dot_i16x8_s
        end_function
```

cc @badumbatish https://github.com/llvm/llvm-project/pull/151775

This is a problem for `rust-lang/stdarch`, because the input type there is usually an opaque `v128` (represented as `<4 x i32>`), so a bitcast is needed.
_______________________________________________
llvm-bugs mailing list
llvm-bugs@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs

Reply via email to