Issue 76812
Summary Basic vectorized compares should fall back to SWAR emulation on non-vector architectures
Labels new issue
Assignees
Reporter Validark
    On non-vector architectures, we can efficiently emulate vector operations a lot of times.

First example is pretty similar to https://github.com/llvm/llvm-project/issues/66159. [Godbolt link](https://zig.godbolt.org/z/8WGG4hzfW). (`foo` should compile to the same code as `bar`)

```zig
const std = @import("std");

export fn foo(data: [*]align(64) const u8) bool {
    const vec: @Vector(64, u8) = data[0..64].*;
    return @as(u64, @bitCast(vec >= @as(@Vector(64, u8), @splat(0x80)))) == 0;
}

export fn bar(data: [*]align(64) const u8) bool {
    var vec_acc: u64 = 0;
    inline for (0..8) |i| {
 const vec: u64 = @bitCast(data[i*8..][0..8].*);
        vec_acc |= vec;
    }
    return (vec_acc & 0x8080808080808080) == 0;
}
```

Movmasks are pretty useful too.

```zig
export fn foo(data: [*]align(64) const u8) u64 {
    const vec: @Vector(64, u8) = data[0..64].*;
    const underscores: u64 = @bitCast(vec == @as(@Vector(64, u8), @splat('_')));
    const alpha_lo =
 @as(u64, @bitCast(vec >= @as(@Vector(64, u8), @splat('a')))) &
 @as(u64, @bitCast(vec <= @as(@Vector(64, u8), @splat('z'))));

 const alpha_hi =
        @as(u64, @bitCast(vec >= @as(@Vector(64, u8), @splat('A')))) &
        @as(u64, @bitCast(vec <= @as(@Vector(64, u8), @splat('Z'))));

    const digit =
        @as(u64, @bitCast(vec >= @as(@Vector(64, u8), @splat('0')))) &
        @as(u64, @bitCast(vec <= @as(@Vector(64, u8), @splat('9'))));
    return underscores | alpha_lo | alpha_hi | digit;
}
```

In order to do the same thing efficiently on a non-vector architecture, we might do:

```zig
fn bar(data: [*]align(64) const u8) u64 {
    var vec_acc: u64 = 0;
 inline for (0..8) |i| {
        const v: u64 = @bitCast(data[i * 8 ..][0..8].*);
        const ones: @TypeOf(v) = @bitCast(@as(@Vector(@divExact(@bitSizeOf(@TypeOf(v)), 8), u8), @splat(1)));
        const mask = comptime ones * 0x7F;
        const low_7_bits = v & mask;

        // Inspired by: http://0x80.pl/notesen/2023-03-06-swar-find-any.html
        const non_underscore = (low_7_bits ^ comptime ones * '_') +% mask;

 // alpha check:
        // Upper 3 bits must be 4 or 6. Lower 5 bits must be between 1 and 26
        const magic_mask = ((v & comptime ones * ~@as(u7, 0x20)) ^ comptime ones * 0x40);
        const non_0x40_or_0x60 = magic_mask +% mask;
        const non_alpha = magic_mask +% comptime ones * (0x80 - 27);

        // digit check:
        // Upper nibble must be 3. Lower nibble must be [0, 9]
        const flipped_0x30 = low_7_bits ^ comptime ones * 0x30;
        const non_digit = flipped_0x30 +% comptime ones * (0x80 - 0xA);
        const chunk = ~v & (non_0x40_or_0x60 ^ (non_digit & non_alpha & non_underscore));
        vec_acc |= swarMovMask(chunk) << (i * 8);
    }

    return vec_acc;
}

/// Creates a bitstring from the most significant bit of each byte in a given bitstring.
///
/// E.g. 1....... 0....... 0....... 1....... 0....... 1....... 1....... 1....... => 10010111
fn swarMovMask(v: anytype) @TypeOf(v) {
    comptime assert(@divExact(@bitSizeOf(@TypeOf(v)), 8) <= 8);
    const ones: @TypeOf(v) = @bitCast(@as(@Vector(@sizeOf(@TypeOf(v)), u8), @splat(1)));
 const msb_mask = 0x80 * ones;

    // We are exploiting a multiplication as a shifter and adder, and the derivation of this number is
    // shown here as a comptime loop.
    // This trick is often generalizable to other problems too: https://stackoverflow.com/a/14547307
 comptime var mult: @TypeOf(v) = 0;
    comptime for (0..@sizeOf(@TypeOf(v))) |i| {
        mult |= @as(@TypeOf(v), 1) << (7 * i);
    };

    // Example with 32 bit integers:
    // We want to concentrate the upper bits of each byte into a single nibble.
    // Doing the gradeschool multiplication algorithm, we can see that each 1 bit
    // in the bottom multiplicand shifts the upper multiplicand, and then we add all these
    // shifted bitstrings together. (Note `.` represents a 0)
    //   a.......b.......c.......d.......
    // * ..........1......1......1......1
    // -------------------------------------------------------------------------
 //   a.......b.......c.......d.......
    // .b.......c.......d..............
    // ..c.......d.....................
    // + ...d............................
    // -------------------------------------------------------------------------
 //   abcd....bcd.....cd......d.......

    // Then we simply shift to the right by `32 - 4` (bitstring size minus the number of relevant bits) to isolate the desired `abcd` bits in the least significant byte!

 // Inspired by: http://0x80.pl/articles/scalar-sse-movmask.html
    return (mult *% (v & msb_mask)) >> (@bitSizeOf(@TypeOf(v)) - @sizeOf(@TypeOf(v)));
}
```

This implementation has exactly the same semantics, but [on riscv64 sifive_u74, it reduces the instruction count from 1143 to 204](https://zig.godbolt.org/z/64drbKqfv) (change which function has `export` at the beginning). 
_______________________________________________
llvm-bugs mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs

Reply via email to