| Issue     | 76812 |
|-----------|-------|
| Summary   | Basic vectorized compares should fall back on SWAR emulation on non-vector architectures |
| Labels    | new issue |
| Assignees |  |
| Reporter  | Validark |
On non-vector architectures, many basic vector operations can be emulated efficiently with SWAR (SIMD within a register) techniques.
The first example is quite similar to https://github.com/llvm/llvm-project/issues/66159. [Godbolt link](https://zig.godbolt.org/z/8WGG4hzfW). (`foo` should compile to the same code as `bar`.)
```zig
const std = @import("std");

export fn foo(data: [*]align(64) const u8) bool {
    const vec: @Vector(64, u8) = data[0..64].*;
    return @as(u64, @bitCast(vec >= @as(@Vector(64, u8), @splat(0x80)))) == 0;
}

export fn bar(data: [*]align(64) const u8) bool {
    var vec_acc: u64 = 0;
    inline for (0..8) |i| {
        const vec: u64 = @bitCast(data[i * 8 ..][0..8].*);
        vec_acc |= vec;
    }
    return (vec_acc & 0x8080808080808080) == 0;
}
```
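As a quick sanity check, the two should agree on any input. A minimal test sketch, assuming `foo` and `bar` above live in the same file (which already imports `std`):
```zig
test "foo and bar agree" {
    var data: [64]u8 align(64) = [_]u8{0} ** 64;
    // All-ASCII input: both functions should return true.
    try std.testing.expect(foo(&data));
    try std.testing.expect(foo(&data) == bar(&data));

    // Set a high bit in one byte: both functions should return false.
    data[13] = 0x80;
    try std.testing.expect(!foo(&data));
    try std.testing.expect(foo(&data) == bar(&data));
}
```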
Movmasks are pretty useful too. Here the vector compares build a per-byte bitmask of identifier characters (`[A-Za-z0-9_]`):
```zig
export fn foo(data: [*]align(64) const u8) u64 {
    const vec: @Vector(64, u8) = data[0..64].*;
    const underscores: u64 = @bitCast(vec == @as(@Vector(64, u8), @splat('_')));
    const alpha_lo =
        @as(u64, @bitCast(vec >= @as(@Vector(64, u8), @splat('a')))) &
        @as(u64, @bitCast(vec <= @as(@Vector(64, u8), @splat('z'))));
    const alpha_hi =
        @as(u64, @bitCast(vec >= @as(@Vector(64, u8), @splat('A')))) &
        @as(u64, @bitCast(vec <= @as(@Vector(64, u8), @splat('Z'))));
    const digit =
        @as(u64, @bitCast(vec >= @as(@Vector(64, u8), @splat('0')))) &
        @as(u64, @bitCast(vec <= @as(@Vector(64, u8), @splat('9'))));
    return underscores | alpha_lo | alpha_hi | digit;
}
```
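The returned mask has one bit per input byte: bit `i` is set iff `data[i]` is an identifier character. A minimal spot-check sketch (the `std` import is repeated here only because this snippet, unlike the first one, does not pull it in; the expected value assumes the usual little-endian element-to-bit order, which the SWAR version below relies on as well):
```zig
const std = @import("std");

test "identifier movmask spot check" {
    var data: [64]u8 align(64) = [_]u8{'!'} ** 64; // '!' is not an identifier character
    data[0] = 'a';
    data[1] = 'Z';
    data[2] = '_';
    data[3] = '7';
    // Only bits 0..3 should be set.
    try std.testing.expectEqual(@as(u64, 0xF), foo(&data));
}
```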
To do the same classification efficiently on a non-vector architecture, we might do:
```zig
const std = @import("std");
const assert = std.debug.assert;

fn bar(data: [*]align(64) const u8) u64 {
    var vec_acc: u64 = 0;
    inline for (0..8) |i| {
        const v: u64 = @bitCast(data[i * 8 ..][0..8].*);
        const ones: @TypeOf(v) = @bitCast(@as(@Vector(@divExact(@bitSizeOf(@TypeOf(v)), 8), u8), @splat(1)));
        const mask = comptime ones * 0x7F;
        const low_7_bits = v & mask;

        // Underscore check, inspired by: http://0x80.pl/notesen/2023-03-06-swar-find-any.html
        const non_underscore = (low_7_bits ^ comptime ones * '_') +% mask;

        // Alpha check:
        // the upper 3 bits must select the 0x40 or 0x60 block, and the lower 5 bits must be between 1 and 26.
        const magic_mask = ((v & comptime ones * ~@as(u7, 0x20)) ^ comptime ones * 0x40);
        const non_0x40_or_0x60 = magic_mask +% mask;
        const non_alpha = magic_mask +% comptime ones * (0x80 - 27);

        // Digit check:
        // the upper nibble must be 3, and the lower nibble must be in [0, 9].
        const flipped_0x30 = low_7_bits ^ comptime ones * 0x30;
        const non_digit = flipped_0x30 +% comptime ones * (0x80 - 0xA);

        const chunk = ~v & (non_0x40_or_0x60 ^ (non_digit & non_alpha & non_underscore));
        vec_acc |= swarMovMask(chunk) << (i * 8);
    }
    return vec_acc;
}

/// Creates a bitstring from the most significant bit of each byte in a given bitstring.
///
/// E.g. 1....... 0....... 0....... 1....... 0....... 1....... 1....... 1....... => 10010111
fn swarMovMask(v: anytype) @TypeOf(v) {
    comptime assert(@divExact(@bitSizeOf(@TypeOf(v)), 8) <= 8);
    const ones: @TypeOf(v) = @bitCast(@as(@Vector(@sizeOf(@TypeOf(v)), u8), @splat(1)));
    const msb_mask = 0x80 * ones;

    // We are exploiting a multiplication as a shifter and adder, and the derivation of this
    // constant is shown here as a comptime loop.
    // This trick is often generalizable to other problems too: https://stackoverflow.com/a/14547307
    comptime var mult: @TypeOf(v) = 0;
    comptime for (0..@sizeOf(@TypeOf(v))) |i| {
        mult |= @as(@TypeOf(v), 1) << (7 * i);
    };

    // Example with 32-bit integers:
    // We want to concentrate the upper bits of each byte into a single nibble.
    // Doing the grade-school multiplication algorithm, we can see that each 1 bit
    // in the bottom multiplicand shifts the upper multiplicand, and then we add all of these
    // shifted bitstrings together. (Note: `.` represents a 0.)
    //
    //       a.......b.......c.......d.......
    //     * ..........1......1......1......1
    //     ----------------------------------
    //       a.......b.......c.......d.......
    //       .b.......c.......d..............
    //       ..c.......d.....................
    //     + ...d............................
    //     ----------------------------------
    //       abcd....bcd.....cd......d.......
    //
    // Then we simply shift right by `32 - 4` (the bitstring size minus the number of relevant bits)
    // to isolate the desired `abcd` bits in the least significant byte!
    // Inspired by: http://0x80.pl/articles/scalar-sse-movmask.html
    return (mult *% (v & msb_mask)) >> (@bitSizeOf(@TypeOf(v)) - @sizeOf(@TypeOf(v)));
}
```
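The doc comment's example can be spot-checked directly; the literal below simply encodes that bit pattern, most significant byte first (a minimal sketch reusing the `std` import from the snippet above):
```zig
test "swarMovMask matches the doc-comment example" {
    // Sign bits, from byte 7 down to byte 0: 1 0 0 1 0 1 1 1 => 0b10010111
    const input: u64 = 0x8000_0080_0080_8080;
    try std.testing.expectEqual(@as(u64, 0b10010111), swarMovMask(input));
}
```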
The SWAR implementation has exactly the same semantics as the vectorized version, but [on riscv64 sifive_u74 it reduces the instruction count from 1143 to 204](https://zig.godbolt.org/z/64drbKqfv) (change which function is marked `export` to compare the two).
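The equivalence claim is easy to check with a differential test. A minimal sketch, assuming the vectorized classifier `foo` from the movmask snippet and the SWAR `bar` above are compiled into the same file (the byte pattern is just an arbitrary mix of identifier, non-identifier, and high-bit bytes):
```zig
test "vectorized and SWAR classifiers agree" {
    var data: [64]u8 align(64) = undefined;
    for (&data, 0..) |*byte, i| byte.* = @truncate(i *% 37 +% 11);
    try std.testing.expectEqual(foo(&data), bar(&data));
}
```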