| Issue |
183355
|
| Summary |
[SLP] Improve bswap detection.
|
| Labels |
new issue
|
| Assignees |
|
| Reporter |
rj-jesus
|
This might be related to #175446.
Given code like this ([link](https://godbolt.org/z/a6Mrhzavc)):
```c
#include <stdint.h>
typedef uint32_t u32;
typedef uint64_t u64;
#define BSWAP32(x) (((u32)(x)[0] << 24) | \
((x)[1] << 16) | \
((x)[2] << 8) | \
(x)[3])
u32 bswap32(uint8_t *buf) {
return BSWAP32(buf);
}
u64 bswap32_zext64(uint8_t *buf) {
return BSWAP32(buf);
}
u64 bswap64(uint8_t *buf) {
return ((u64)BSWAP32(buf) << 32) +
BSWAP32(buf + 4);
}
```
SLP is able to combine the first function, `bswap32`, into a bswap, but not the other two patterns (`bswap32_zext64` and `bswap64`).
For example, for AArch64 we currently generate:
```gas
bswap32:
ldr w8, [x0]
rev w0, w8
ret
.LCPI1_0:
.word 24
.word 16
.word 8
.word 0
bswap32_zext64:
ldr s0, [x0]
adrp x8, .LCPI1_0
ldr q1, [x8, :lo12:.LCPI1_0]
ushll v0.8h, v0.8b, #0
ushll v0.4s, v0.4h, #0
ushl v0.4s, v0.4s, v1.4s
ext v1.16b, v0.16b, v0.16b, #8
orr v0.8b, v0.8b, v1.8b
fmov x8, d0
lsr x9, x8, #32
orr w0, w8, w9
ret
.LCPI2_0:
.word 24
.word 16
.word 8
.word 0
bswap64:
ldr s0, [x0]
adrp x8, .LCPI2_0
ldrb w10, [x0, #5]
ldr q1, [x8, :lo12:.LCPI2_0]
ldrb w8, [x0, #4]
ushll v0.8h, v0.8b, #0
lsl x8, x8, #24
ushll v0.4s, v0.4h, #0
orr x8, x8, x10, lsl #16
ldrb w10, [x0, #6]
ushl v0.4s, v0.4s, v1.4s
ext v1.16b, v0.16b, v0.16b, #8
orr v0.8b, v0.8b, v1.8b
fmov x9, d0
lsr x11, x9, #32
orr w9, w9, w11
orr x8, x8, x9, lsl #32
ldrb w9, [x0, #7]
orr x8, x8, x10, lsl #8
orr x0, x8, x9
ret
```
Instead of:
```gas
bswap32:
ldr w0, [x0]
rev w0, w0
ret
bswap32_zext64:
ldr w0, [x0]
rev w0, w0
ret
bswap64:
ldr x0, [x0]
rev x0, x0
ret
```
See also https://godbolt.org/z/cqf5ad1Ko for the pre-SLP IR. (The backend can recognize the input IR as a bswap, but not the forms produced after the SLP transformation.)
CC: @alexey-bataev
_______________________________________________
llvm-bugs mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs