On 17.05.19 19:42, Richard Henderson wrote:
> On 5/17/19 9:47 AM, Richard Henderson wrote:
>>     first_equal = n;
>>     first_zero = n;
>>     for (i = n - 1; i >= 0; --i) {
>>         if (data1 == data2) {
>>             first_equal = i;
>>         }
>>         if (data1 == 0) {
>>             first_zero = i;
>>         }
>>     }
>>
>>     // As an aside, there are bit tricks for the above,
>>     // but let's stay simple(r) for now.
>
> What the hell, it's not /that/ tricky.
>
>
> /*
>  * Returns a bit set in the MSB of each element that is zero,
>  * as defined by the mask M.
>  */
> static inline uint64_t zero_search(uint64_t a, uint64_t m)
> {
>     return ~(((a & m) + m) | a | m);
> }
>
> /*
>  * Returns the byte offset for the first match, or 16 for no match.
>  */
> static inline int match_index(uint64_t c0, uint64_t c1)
> {
>     return (c0 ? clz64(c0) : clz64(c1) + 64) >> 3;
> }
>
> Use
>
>     dup_const(MO_8, 0x7f)
>     dup_const(MO_16, 0x7fff)
>     dup_const(MO_32, 0x7fffffff)
>
> for the M parameter for the different element sizes.
>
>     uint64_t a0, a1, b0, b1, e0, e1, z0, z1;
>
>     a0 = s390_vec_read_element64(v2, 0);
>     a1 = s390_vec_read_element64(v2, 1);
>     b0 = s390_vec_read_element64(v3, 0);
>     b1 = s390_vec_read_element64(v3, 1);
>     e0 = zero_search(a0 ^ b0, m);
>     e1 = zero_search(a1 ^ b1, m);
>     first_equal = match_index(e0, e1);
>
>     if (zs) {
>         z0 = zero_search(a0, m);
>         z1 = zero_search(a1, m);
>         first_zero = match_index(z0, z1);
>     ...
>
>
> r~
>
Crazy stuff, seems to work (not that I am surprised :D )

I now have:

+static int vfee(void *v1, const void *v2, const void *v3, bool zs, uint8_t es)
+{
+    const uint64_t mask = dup_const(es, -1ull >> (65 - (1 << es) * 8));
+    uint64_t a0, a1, b0, b1, e0, e1, z0, z1;
+    uint64_t first_zero = 16;
+    uint64_t first_equal;
+
+    a0 = s390_vec_read_element64(v2, 0);
+    a1 = s390_vec_read_element64(v2, 1);
+    b0 = s390_vec_read_element64(v3, 0);
+    b1 = s390_vec_read_element64(v3, 1);
+    e0 = zero_search(a0 ^ b0, mask);
+    e1 = zero_search(a1 ^ b1, mask);
+    first_equal = match_index(e0, e1);
+
+    if (zs) {
+        z0 = zero_search(a0, mask);
+        z1 = zero_search(a1, mask);
+        first_zero = match_index(z0, z1);
+    }
+
+    /* zero out the destination vector */
+    s390_vec_write_element64(v1, 0, 0);
+    s390_vec_write_element64(v1, 1, 0);
+
+    if (first_zero == 16 && first_equal == 16) {
+        s390_vec_write_element8(v1, 7, 16);
+        return 3; /* no match */
+    } else if (first_zero == 16) {
+        s390_vec_write_element8(v1, 7, first_equal);
+        return 1; /* matching elements, no match for zero */
+    } else if (first_equal < first_zero) {
+        s390_vec_write_element8(v1, 7, first_equal);
+        return 2; /* matching elements before match for zero */
+    }
+    s390_vec_write_element8(v1, 7, first_zero);
+    return 0; /* match for zero */
+}

--

Thanks,

David / dhildenb