[llvm-bugs] [Bug 97945] [arm/aarch64] Should LD4 be used to load multiple (vector) constants at once?

LLVM Bugs via llvm-bugs Sun, 07 Jul 2024 03:43:59 -0700

Issue	97945
Summary	[arm/aarch64] Should LD4 be used to load multiple (vector) constants at once?
Labels	new issue
Assignees
Reporter	Validark

    Consider the following code:

```zig
const std = @import("std");


/// Exported reproduction entry point for this report.
///
/// Builds a constant 64-byte table holding 0..63, splits it into four
/// 16-byte vectors, and passes them together with `indices` to `tbl4`.
/// Because the table is a compile-time constant, the four table vectors are
/// what the compiler has to materialize as constants before the lookup.
export fn foo(indices: @Vector(16, u8)) @Vector(16, u8) {
    const iota: [64]u8 = std.simd.iota(u8, 64); // counts from 0 to 63 inclusive
    // Each `iota[a..b].*` dereferences a 16-element slice of the table into a
    // 16-byte value that is passed as one table part.
    return tbl4(iota[0..16].*, iota[16..32].*, iota[32..48].*, iota[48..64].*, indices);
}

/// Thin wrapper that forwards its five arguments, unchanged and in order, to
/// an external function named `llvm.aarch64.neon.tbl4`.
///
/// The external function is declared inside an anonymous struct and called
/// immediately; the return type is the type of `indices`.
/// NOTE(review): the assembly listing in this report shows the call lowering
/// to a single four-register `tbl` instruction; the exact lookup semantics
/// are defined by that intrinsic, not by anything in this wrapper.
fn tbl4(table_part_1: @Vector(16, u8), table_part_2: @Vector(16, u8), table_part_3: @Vector(16, u8), table_part_4: @Vector(16, u8), indices: @Vector(16, u8)) @TypeOf(indices) {
    return struct {
        // The extern symbol name contains dots, so it must be written with the
        // @"..." quoted-identifier syntax.
 extern fn @"llvm.aarch64.neon.tbl4"(@TypeOf(table_part_1), @TypeOf(table_part_2), @TypeOf(table_part_3), @TypeOf(table_part_4), @TypeOf(indices)) @TypeOf(indices);
 }.@"llvm.aarch64.neon.tbl4"(table_part_1, table_part_2, table_part_3, table_part_4, indices);
}
```

Here is the emitted assembly:

```asm
.LCPI0_0:
        .byte   48
        ...
 .byte   63
.LCPI0_1:
        .byte   32
        ...
 .byte   47
.LCPI0_2:
        .byte   16
        ...
        .byte 31
.LCPI0_3:
        .byte   0
        ...
        .byte 15
foo:
        adrp    x8, .LCPI0_0
        ldr     q4, [x8, :lo12:.LCPI0_0]
        adrp    x8, .LCPI0_1
        ldr     q3, [x8, :lo12:.LCPI0_1]
        adrp    x8, .LCPI0_2
        ldr     q2, [x8, :lo12:.LCPI0_2]
        adrp    x8, .LCPI0_3
        ldr     q1, [x8, :lo12:.LCPI0_3]
        tbl     v0.16b, { v1.16b, v2.16b, v3.16b, v4.16b }, v0.16b
        ret
```

I am not sure whether this is valid AArch64 assembly, but couldn't we have done this instead?

```asm
.LCPI0_0:
        .byte   0
        .byte 16
        .byte   32
        .byte   48
        ... ; interleaved constants (LD4 de-interleaves them on load)
foo:
        adrp    x8, .LCPI0_0
        ld4     { v1.16b, v2.16b, v3.16b, v4.16b }, [x8, :lo12:.LCPI0_0]
        tbl v0.16b, { v1.16b, v2.16b, v3.16b, v4.16b }, v0.16b
 ret
```

(The particular constants used in this code are just for demonstration purposes. Obviously we could do better in this case by zeroing out numbers higher than 63 with a `cmhi` followed by a `bic`.)

_______________________________________________
llvm-bugs mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs

[llvm-bugs] [Bug 97945] [arm/aarch64] Should LD4 be used to load multiple (vector) constants at once?

Reply via email to