| Issue |
97945
|
| Summary |
[arm/aarch64] Should LD4 be used to load multiple (vector) constants at once?
|
| Labels |
new issue
|
| Assignees |
|
| Reporter |
Validark
|
Consider the following code:
```zig
const std = @import("std");
export fn foo(indices: @Vector(16, u8)) @Vector(16, u8) {
const iota: [64]u8 = std.simd.iota(u8, 64); // counts from 0 to 63 inclusive
return tbl4(iota[0..16].*, iota[16..32].*, iota[32..48].*, iota[48..64].*, indices);
}
fn tbl4(table_part_1: @Vector(16, u8), table_part_2: @Vector(16, u8), table_part_3: @Vector(16, u8), table_part_4: @Vector(16, u8), indices: @Vector(16, u8)) @TypeOf(indices) {
return struct {
extern fn @"llvm.aarch64.neon.tbl4"(@TypeOf(table_part_1), @TypeOf(table_part_2), @TypeOf(table_part_3), @TypeOf(table_part_4), @TypeOf(indices)) @TypeOf(indices);
}.@"llvm.aarch64.neon.tbl4"(table_part_1, table_part_2, table_part_3, table_part_4, indices);
}
```
Here is the emitted assembly:
```asm
.LCPI0_0:
.byte 48
...
.byte 63
.LCPI0_1:
.byte 32
...
.byte 47
.LCPI0_2:
.byte 16
...
.byte 31
.LCPI0_3:
.byte 0
...
.byte 15
foo:
adrp x8, .LCPI0_0
ldr q4, [x8, :lo12:.LCPI0_0]
adrp x8, .LCPI0_1
ldr q3, [x8, :lo12:.LCPI0_1]
adrp x8, .LCPI0_2
ldr q2, [x8, :lo12:.LCPI0_2]
adrp x8, .LCPI0_3
ldr q1, [x8, :lo12:.LCPI0_3]
tbl v0.16b, { v1.16b, v2.16b, v3.16b, v4.16b }, v0.16b
ret
```
I am not sure if this is valid AArch64 assembly, but couldn't we have done this instead?
```asm
.LCPI0_0:
.byte 0
.byte 16
.byte 32
.byte 48
... ; interleaved constants (ld4 de-interleaves them on load)
foo:
adrp x8, .LCPI0_0
ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x8, :lo12:.LCPI0_0]
tbl v0.16b, { v1.16b, v2.16b, v3.16b, v4.16b }, v0.16b
ret
```
(The particular constants used in this code are just for demonstration purposes. Obviously we could do better in this case by zeroing out indices higher than 63 with a `cmhi` followed by a `bic`.)
_______________________________________________
llvm-bugs mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs