On 5/23/19 3:50 AM, David Hildenbrand wrote:
> /*
>  * Returns the number of bits composing one element.
>  */
> static uint8_t get_element_bits(uint8_t es)
> {
>     return (1 << es) * BITS_PER_BYTE;
> }
>
> /*
>  * Returns the bitmask for a single element.
>  */
> static uint64_t get_single_element_mask(uint8_t es)
> {
>     return -1ull >> (64 - get_element_bits(es));
> }
>
> /*
>  * Returns the bitmask for a single element (excluding the MSB).
>  */
> static uint64_t get_single_element_lsbs_mask(uint8_t es)
> {
>     return -1ull >> (65 - get_element_bits(es));
> }
>
> /*
>  * Returns the bitmasks for multiple elements (excluding the MSBs).
>  */
> static uint64_t get_element_lsbs_mask(uint8_t es)
> {
>     return dup_const(es, get_single_element_lsbs_mask(es));
> }
>
> static int vfae(void *v1, const void *v2, const void *v3, bool in,
>                 bool rt, bool zs, uint8_t es)
> {
>     const uint64_t mask = get_element_lsbs_mask(es);
>     const int bits = get_element_bits(es);
>     uint64_t a0, a1, b0, b1, e0, e1, t0, t1, z0, z1;
>     uint64_t first_zero = 16;
>     uint64_t first_equal;
>     int i;
>
>     a0 = s390_vec_read_element64(v2, 0);
>     a1 = s390_vec_read_element64(v2, 1);
>     b0 = s390_vec_read_element64(v3, 0);
>     b1 = s390_vec_read_element64(v3, 1);
>     e0 = 0;
>     e1 = 0;
>     /* compare against equality with every other element */
>     for (i = 0; i < 64; i += bits) {
>         t0 = i ? rol64(b0, i) : b0;
>         t1 = i ? rol64(b1, i) : b1;
>         e0 |= zero_search(a0 ^ t0, mask);
>         e0 |= zero_search(a0 ^ t1, mask);
>         e1 |= zero_search(a1 ^ t0, mask);
>         e1 |= zero_search(a1 ^ t1, mask);
>     }
I don't see that this is doing what you want.  You're shifting one element
of B down, but not broadcasting it so that it is compared against every
element of A.

I'd expect something like

    t0 = dup_const(es, b0 >> i);
    t1 = dup_const(es, b1 >> i);

(I also don't see what rol is getting you that shift doesn't.)

>     /* invert the result if requested - invert only the MSBs */
>     if (in) {
>         e0 = ~e0 & ~mask;
>         e1 = ~e1 & ~mask;
>     }
>     first_equal = match_index(e0, e1);
>
>     if (zs) {
>         z0 = zero_search(a0, mask);
>         z1 = zero_search(a1, mask);
>         first_zero = match_index(z0, z1);
>     }
>
>     if (rt) {
>         e0 = (e0 >> (bits - 1)) * get_single_element_mask(es);
>         e1 = (e1 >> (bits - 1)) * get_single_element_mask(es);
>         s390_vec_write_element64(v1, 0, e0);
>         s390_vec_write_element64(v1, 1, e1);
>     } else {
>         s390_vec_write_element64(v1, 0, MIN(first_equal, first_zero));
>         s390_vec_write_element64(v1, 1, 0);
>     }
>
>     if (first_zero == 16 && first_equal == 16) {
>         return 3; /* no match */
>     } else if (first_zero == 16) {
>         return 1; /* matching elements, no match for zero */
>     } else if (first_equal < first_zero) {
>         return 2; /* matching elements before match for zero */
>     }
>     return 0; /* match for zero */
> }

The rest of this looks good.

r~