Re: [Qemu-devel] [PATCH v1 1/5] s390x/tcg: Implement VECTOR FIND ANY ELEMENT EQUAL
On 5/23/19 8:59 AM, David Hildenbrand wrote: > I guess I can simplify to > > t0 = rol64(b0, i); > t1 = rol64(b1, i); Yes. > My approach: Compare all elements of B at a time > Your approach: Compare a single element of B at a time Ah, I get it now. This makes total sense, and does make the broadcast unnecessary. r~
Re: [Qemu-devel] [PATCH v1 1/5] s390x/tcg: Implement VECTOR FIND ANY ELEMENT EQUAL
On 23.05.19 14:34, David Hildenbrand wrote: > On 23.05.19 14:27, Richard Henderson wrote: >> On 5/23/19 3:50 AM, David Hildenbrand wrote: >>> /* >>> * Returns the number of bits composing one element. >>> */ >>> static uint8_t get_element_bits(uint8_t es) >>> { >>> return (1 << es) * BITS_PER_BYTE; >>> } >>> >>> /* >>> * Returns the bitmask for a single element. >>> */ >>> static uint64_t get_single_element_mask(uint8_t es) >>> { >>> return -1ull >> (64 - get_element_bits(es)); >>> } >>> >>> /* >>> * Returns the bitmask for a single element (excluding the MSB). >>> */ >>> static uint64_t get_single_element_lsbs_mask(uint8_t es) >>> { >>> return -1ull >> (65 - get_element_bits(es)); >>> } >>> >>> /* >>> * Returns the bitmasks for multiple elements (excluding the MSBs). >>> */ >>> static uint64_t get_element_lsbs_mask(uint8_t es) >>> { >>> return dup_const(es, get_single_element_lsbs_mask(es)); >>> } >>> >>> static int vfae(void *v1, const void *v2, const void *v3, bool in, >>> bool rt, bool zs, uint8_t es) >>> { >>> const uint64_t mask = get_element_lsbs_mask(es); >>> const int bits = get_element_bits(es); >>> uint64_t a0, a1, b0, b1, e0, e1, t0, t1, z0, z1; >>> uint64_t first_zero = 16; >>> uint64_t first_equal; >>> int i; >>> >>> a0 = s390_vec_read_element64(v2, 0); >>> a1 = s390_vec_read_element64(v2, 1); >>> b0 = s390_vec_read_element64(v3, 0); >>> b1 = s390_vec_read_element64(v3, 1); >>> e0 = 0; >>> e1 = 0; >>> /* compare against equality with every other element */ >>> for (i = 0; i < 64; i += bits) { >>> t0 = i ? rol64(b0, i) : b0; >>> t1 = i ? rol64(b1, i) : b1; >>> e0 |= zero_search(a0 ^ t0, mask); >>> e0 |= zero_search(a0 ^ t1, mask); >>> e1 |= zero_search(a1 ^ t0, mask); >>> e1 |= zero_search(a1 ^ t1, mask); >>> } >> >> I don't see that this is doing what you want. You're shifting one element >> of B >> down, but not broadcasting it so that it is compared against every element >> of A. >> >> I'd expect something like >> >> t0 = dup_const(es, b0 >> i); >> t1 = dup_const(es, b1 >> i); >> >> (I also don't see what rol is getting you that shift doesn't.) > > Let's assume > > a0 = [0, 1, 2, 3] > a1 = [4, 5, 6, 7] > > b0 = [8, 8, 8, 4] > b1 = [8, 8, 8, 8] > > What I would check is > > First iteration > > a0 == [8, 8, 8, 4] -> no match > a0 == [8, 8, 8, 8] -> no match > a1 == [8, 8, 8, 4] -> no match > a1 == [8, 8, 8, 8] -> no match > > Second iteration > > a0 == [8, 8, 4, 8] -> no match > a0 == [8, 8, 8, 8] -> no match > a1 == [8, 8, 4, 8] > a1 == [8, 8, 8, 8] -> no match > > ... > > Last iteration > > a0 == [4, 8, 8, 8] -> no match > a0 == [8, 8, 8, 8] -> no match > a1 == [4, 8, 8, 8] -> match in first element > a1 == [8, 8, 8, 8] -> no match > > What am i missing? > > I guess I can simplify to t0 = rol64(b0, i); t1 = rol64(b1, i); My approach: Compare all elements of B at a time Your approach: Compare a single element of B at a time If I'm not wrong, it boils down to to whether rol64() or dup_const(es, b0 >> i) is faster ;) -- Thanks, David / dhildenb
Re: [Qemu-devel] [PATCH v1 1/5] s390x/tcg: Implement VECTOR FIND ANY ELEMENT EQUAL
On 23.05.19 14:27, Richard Henderson wrote: > On 5/23/19 3:50 AM, David Hildenbrand wrote: >> /* >> * Returns the number of bits composing one element. >> */ >> static uint8_t get_element_bits(uint8_t es) >> { >> return (1 << es) * BITS_PER_BYTE; >> } >> >> /* >> * Returns the bitmask for a single element. >> */ >> static uint64_t get_single_element_mask(uint8_t es) >> { >> return -1ull >> (64 - get_element_bits(es)); >> } >> >> /* >> * Returns the bitmask for a single element (excluding the MSB). >> */ >> static uint64_t get_single_element_lsbs_mask(uint8_t es) >> { >> return -1ull >> (65 - get_element_bits(es)); >> } >> >> /* >> * Returns the bitmasks for multiple elements (excluding the MSBs). >> */ >> static uint64_t get_element_lsbs_mask(uint8_t es) >> { >> return dup_const(es, get_single_element_lsbs_mask(es)); >> } >> >> static int vfae(void *v1, const void *v2, const void *v3, bool in, >> bool rt, bool zs, uint8_t es) >> { >> const uint64_t mask = get_element_lsbs_mask(es); >> const int bits = get_element_bits(es); >> uint64_t a0, a1, b0, b1, e0, e1, t0, t1, z0, z1; >> uint64_t first_zero = 16; >> uint64_t first_equal; >> int i; >> >> a0 = s390_vec_read_element64(v2, 0); >> a1 = s390_vec_read_element64(v2, 1); >> b0 = s390_vec_read_element64(v3, 0); >> b1 = s390_vec_read_element64(v3, 1); >> e0 = 0; >> e1 = 0; >> /* compare against equality with every other element */ >> for (i = 0; i < 64; i += bits) { >> t0 = i ? rol64(b0, i) : b0; >> t1 = i ? rol64(b1, i) : b1; >> e0 |= zero_search(a0 ^ t0, mask); >> e0 |= zero_search(a0 ^ t1, mask); >> e1 |= zero_search(a1 ^ t0, mask); >> e1 |= zero_search(a1 ^ t1, mask); >> } > > I don't see that this is doing what you want. You're shifting one element of > B > down, but not broadcasting it so that it is compared against every element of > A. > > I'd expect something like > > t0 = dup_const(es, b0 >> i); > t1 = dup_const(es, b1 >> i); > > (I also don't see what rol is getting you that shift doesn't.) Let's assume a0 = [0, 1, 2, 3] a1 = [4, 5, 6, 7] b0 = [8, 8, 8, 4] b1 = [8, 8, 8, 8] What I would check is First iteration a0 == [8, 8, 8, 4] -> no match a0 == [8, 8, 8, 8] -> no match a1 == [8, 8, 8, 4] -> no match a1 == [8, 8, 8, 8] -> no match Second iteration a0 == [8, 8, 4, 8] -> no match a0 == [8, 8, 8, 8] -> no match a1 == [8, 8, 4, 8] a1 == [8, 8, 8, 8] -> no match ... Last iteration a0 == [4, 8, 8, 8] -> no match a0 == [8, 8, 8, 8] -> no match a1 == [4, 8, 8, 8] -> match in first element a1 == [8, 8, 8, 8] -> no match What am i missing? -- Thanks, David / dhildenb
Re: [Qemu-devel] [PATCH v1 1/5] s390x/tcg: Implement VECTOR FIND ANY ELEMENT EQUAL
On 5/23/19 3:50 AM, David Hildenbrand wrote: > /* > * Returns the number of bits composing one element. > */ > static uint8_t get_element_bits(uint8_t es) > { > return (1 << es) * BITS_PER_BYTE; > } > > /* > * Returns the bitmask for a single element. > */ > static uint64_t get_single_element_mask(uint8_t es) > { > return -1ull >> (64 - get_element_bits(es)); > } > > /* > * Returns the bitmask for a single element (excluding the MSB). > */ > static uint64_t get_single_element_lsbs_mask(uint8_t es) > { > return -1ull >> (65 - get_element_bits(es)); > } > > /* > * Returns the bitmasks for multiple elements (excluding the MSBs). > */ > static uint64_t get_element_lsbs_mask(uint8_t es) > { > return dup_const(es, get_single_element_lsbs_mask(es)); > } > > static int vfae(void *v1, const void *v2, const void *v3, bool in, > bool rt, bool zs, uint8_t es) > { > const uint64_t mask = get_element_lsbs_mask(es); > const int bits = get_element_bits(es); > uint64_t a0, a1, b0, b1, e0, e1, t0, t1, z0, z1; > uint64_t first_zero = 16; > uint64_t first_equal; > int i; > > a0 = s390_vec_read_element64(v2, 0); > a1 = s390_vec_read_element64(v2, 1); > b0 = s390_vec_read_element64(v3, 0); > b1 = s390_vec_read_element64(v3, 1); > e0 = 0; > e1 = 0; > /* compare against equality with every other element */ > for (i = 0; i < 64; i += bits) { > t0 = i ? rol64(b0, i) : b0; > t1 = i ? rol64(b1, i) : b1; > e0 |= zero_search(a0 ^ t0, mask); > e0 |= zero_search(a0 ^ t1, mask); > e1 |= zero_search(a1 ^ t0, mask); > e1 |= zero_search(a1 ^ t1, mask); > } I don't see that this is doing what you want. You're shifting one element of B down, but not broadcasting it so that it is compared against every element of A. I'd expect something like t0 = dup_const(es, b0 >> i); t1 = dup_const(es, b1 >> i); (I also don't see what rol is getting you that shift doesn't.) > /* invert the result if requested - invert only the MSBs */ > if (in) { > e0 = ~e0 & ~mask; > e1 = ~e1 & ~mask; > } > first_equal = match_index(e0, e1); > > if (zs) { > z0 = zero_search(a0, mask); > z1 = zero_search(a1, mask); > first_zero = match_index(z0, z1); > } > > if (rt) { > e0 = (e0 >> (bits - 1)) * get_single_element_mask(es); > e1 = (e1 >> (bits - 1)) * get_single_element_mask(es); > s390_vec_write_element64(v1, 0, e0); > s390_vec_write_element64(v1, 1, e1); > } else { > s390_vec_write_element64(v1, 0, MIN(first_equal, first_zero)); > s390_vec_write_element64(v1, 1, 0); > } > > if (first_zero == 16 && first_equal == 16) { > return 3; /* no match */ > } else if (first_zero == 16) { > return 1; /* matching elements, no match for zero */ > } else if (first_equal < first_zero) { > return 2; /* matching elements before match for zero */ > } > return 0; /* match for zero */ > } The rest of this looks good. r~
Re: [Qemu-devel] [PATCH v1 1/5] s390x/tcg: Implement VECTOR FIND ANY ELEMENT EQUAL
David Hildenbrand writes: > On 22.05.19 13:09, Richard Henderson wrote: >> On 5/22/19 7:01 AM, David Hildenbrand wrote: >>> I also think that, if we create a bunch more of these wrappers: > +DEF_VFAE_HELPER(8) > +DEF_VFAE_HELPER(16) > +DEF_VFAE_HELPER(32) then RT and ZS can be passed in as constant parameters to the above, and then the compiler will fold away all of the stuff that's not needed for each different case. Which, I think, is significant. These are practically different instructions with the different modifiers. >>> >>> So, we have 4 flags, resulting in 16 variants. Times 3 element sizes ... >>> 48 helpers in total. Do we really want to go down that path? >> >> Maybe? > > Hope my fingers won't bleed from all the copy-pasting ;) An alternative is to generalise the code into a helper and then just use macros to instantiate a series of calls to it (c.f. softfloat). The idea is you can use flatten/inline to keep it efficient but you don't have a bunch of logic obscured by macro stuff. -- Alex Bennée
Re: [Qemu-devel] [PATCH v1 1/5] s390x/tcg: Implement VECTOR FIND ANY ELEMENT EQUAL
On 22.05.19 20:46, Richard Henderson wrote: > On 5/22/19 2:16 PM, David Hildenbrand wrote: >> On 22.05.19 17:59, Richard Henderson wrote: >>> On Wed, 22 May 2019 at 07:16, David Hildenbrand wrote: > Also plausible. I guess it would be good to know, anyway. I'll dump the parameters when booting Linux. My gut feeling is that the cc option is basically never used ... >>> >>> It looks like our intuition is wrong about that. >> >> Thanks for checking! >> >>> >>> rth@cloudburst:~/glibc/src/sysdeps/s390$ grep -r vfaezbs * | wc -l >>> 15 >>> >>> These set cc, use zs, and do not use rt. >>> >>> rth@cloudburst:~/glibc/src/sysdeps/s390$ grep -r 'vfaeb' * | wc -l >>> 3 >>> >>> These do not set cc, do not use zs, and do use rt. >>> >>> Those are the only two VFAE forms used by glibc (note that the same >>> variants as 'f' are used by the wide-character strings). >>> >> >> I guess "rt" and "cc" make the biggest difference. Maybe special case >> these two, result in 4 variants for each of the 3 element sizes? > > Sounds good. > So after all it might not be necessary, at least not for this helper :) Using your crazy helper functions, I have this right now: /* * Returns the number of bits composing one element. */ static uint8_t get_element_bits(uint8_t es) { return (1 << es) * BITS_PER_BYTE; } /* * Returns the bitmask for a single element. */ static uint64_t get_single_element_mask(uint8_t es) { return -1ull >> (64 - get_element_bits(es)); } /* * Returns the bitmask for a single element (excluding the MSB). */ static uint64_t get_single_element_lsbs_mask(uint8_t es) { return -1ull >> (65 - get_element_bits(es)); } /* * Returns the bitmasks for multiple elements (excluding the MSBs). */ static uint64_t get_element_lsbs_mask(uint8_t es) { return dup_const(es, get_single_element_lsbs_mask(es)); } static int vfae(void *v1, const void *v2, const void *v3, bool in, bool rt, bool zs, uint8_t es) { const uint64_t mask = get_element_lsbs_mask(es); const int bits = get_element_bits(es); uint64_t a0, a1, b0, b1, e0, e1, t0, t1, z0, z1; uint64_t first_zero = 16; uint64_t first_equal; int i; a0 = s390_vec_read_element64(v2, 0); a1 = s390_vec_read_element64(v2, 1); b0 = s390_vec_read_element64(v3, 0); b1 = s390_vec_read_element64(v3, 1); e0 = 0; e1 = 0; /* compare against equality with every other element */ for (i = 0; i < 64; i += bits) { t0 = i ? rol64(b0, i) : b0; t1 = i ? rol64(b1, i) : b1; e0 |= zero_search(a0 ^ t0, mask); e0 |= zero_search(a0 ^ t1, mask); e1 |= zero_search(a1 ^ t0, mask); e1 |= zero_search(a1 ^ t1, mask); } /* invert the result if requested - invert only the MSBs */ if (in) { e0 = ~e0 & ~mask; e1 = ~e1 & ~mask; } first_equal = match_index(e0, e1); if (zs) { z0 = zero_search(a0, mask); z1 = zero_search(a1, mask); first_zero = match_index(z0, z1); } if (rt) { e0 = (e0 >> (bits - 1)) * get_single_element_mask(es); e1 = (e1 >> (bits - 1)) * get_single_element_mask(es); s390_vec_write_element64(v1, 0, e0); s390_vec_write_element64(v1, 1, e1); } else { s390_vec_write_element64(v1, 0, MIN(first_equal, first_zero)); s390_vec_write_element64(v1, 1, 0); } if (first_zero == 16 && first_equal == 16) { return 3; /* no match */ } else if (first_zero == 16) { return 1; /* matching elements, no match for zero */ } else if (first_equal < first_zero) { return 2; /* matching elements before match for zero */ } return 0; /* match for zero */ } At least the kernel boots with it - am i missing something or does this indeed work? Cheers! -- Thanks, David / dhildenb
Re: [Qemu-devel] [PATCH v1 1/5] s390x/tcg: Implement VECTOR FIND ANY ELEMENT EQUAL
On 5/22/19 2:16 PM, David Hildenbrand wrote: > On 22.05.19 17:59, Richard Henderson wrote: >> On Wed, 22 May 2019 at 07:16, David Hildenbrand wrote: Also plausible. I guess it would be good to know, anyway. >>> >>> I'll dump the parameters when booting Linux. My gut feeling is that the >>> cc option is basically never used ... >> >> It looks like our intuition is wrong about that. > > Thanks for checking! > >> >> rth@cloudburst:~/glibc/src/sysdeps/s390$ grep -r vfaezbs * | wc -l >> 15 >> >> These set cc, use zs, and do not use rt. >> >> rth@cloudburst:~/glibc/src/sysdeps/s390$ grep -r 'vfaeb' * | wc -l >> 3 >> >> These do not set cc, do not use zs, and do use rt. >> >> Those are the only two VFAE forms used by glibc (note that the same >> variants as 'f' are used by the wide-character strings). >> > > I guess "rt" and "cc" make the biggest difference. Maybe special case > these two, result in 4 variants for each of the 3 element sizes? Sounds good. r~
Re: [Qemu-devel] [PATCH v1 1/5] s390x/tcg: Implement VECTOR FIND ANY ELEMENT EQUAL
On 22.05.19 17:59, Richard Henderson wrote: > On Wed, 22 May 2019 at 07:16, David Hildenbrand wrote: >>> Also plausible. I guess it would be good to know, anyway. >> >> I'll dump the parameters when booting Linux. My gut feeling is that the >> cc option is basically never used ... > > It looks like our intuition is wrong about that. Thanks for checking! > > rth@cloudburst:~/glibc/src/sysdeps/s390$ grep -r vfaezbs * | wc -l > 15 > > These set cc, use zs, and do not use rt. > > rth@cloudburst:~/glibc/src/sysdeps/s390$ grep -r 'vfaeb' * | wc -l > 3 > > These do not set cc, do not use zs, and do use rt. > > Those are the only two VFAE forms used by glibc (note that the same > variants as 'f' are used by the wide-character strings). > I guess "rt" and "cc" make the biggest difference. Maybe special case these two, result in 4 variants for each of the 3 element sizes? > > r~ > -- Thanks, David / dhildenb
Re: [Qemu-devel] [PATCH v1 1/5] s390x/tcg: Implement VECTOR FIND ANY ELEMENT EQUAL
On Wed, 22 May 2019 at 07:16, David Hildenbrand wrote: > > Also plausible. I guess it would be good to know, anyway. > > I'll dump the parameters when booting Linux. My gut feeling is that the > cc option is basically never used ... It looks like our intuition is wrong about that. rth@cloudburst:~/glibc/src/sysdeps/s390$ grep -r vfaezbs * | wc -l 15 These set cc, use zs, and do not use rt. rth@cloudburst:~/glibc/src/sysdeps/s390$ grep -r 'vfaeb' * | wc -l 3 These do not set cc, do not use zs, and do use rt. Those are the only two VFAE forms used by glibc (note that the same variants as 'f' are used by the wide-character strings). r~
Re: [Qemu-devel] [PATCH v1 1/5] s390x/tcg: Implement VECTOR FIND ANY ELEMENT EQUAL
On 22.05.19 13:09, Richard Henderson wrote: > On 5/22/19 7:01 AM, David Hildenbrand wrote: >> >>> I also think that, if we create a bunch more of these wrappers: >>> +DEF_VFAE_HELPER(8) +DEF_VFAE_HELPER(16) +DEF_VFAE_HELPER(32) >>> >>> then RT and ZS can be passed in as constant parameters to the above, and >>> then >>> the compiler will fold away all of the stuff that's not needed for each >>> different case. Which, I think, is significant. These are practically >>> different instructions with the different modifiers. >>> >> >> So, we have 4 flags, resulting in 16 variants. Times 3 element sizes ... >> 48 helpers in total. Do we really want to go down that path? > > Maybe? Hope my fingers won't bleed from all the copy-pasting ;) > >> I can also go ahead any try to identify the most frequent users (in >> Linux) and only specialize that one. > > Also plausible. I guess it would be good to know, anyway. I'll dump the parameters when booting Linux. My gut feeling is that the cc option is basically never used ... > > I think RT probably makes the largest difference to the layout of the > function, Yes. I think the RT and ZS make the biggest difference. IN - not really that heavy. Maybe use different variants for RT and ZS for the !CC casen only. > so maybe that's the one we pick. We could also leave our options open and > make > the 3 non-CC flags be parameters to the inline function, just extract them > from > the M4 parameter at the one higher level. That one, I have already done :) -- Thanks, David / dhildenb
Re: [Qemu-devel] [PATCH v1 1/5] s390x/tcg: Implement VECTOR FIND ANY ELEMENT EQUAL
On 5/22/19 7:01 AM, David Hildenbrand wrote: > >> I also think that, if we create a bunch more of these wrappers: >> >>> +DEF_VFAE_HELPER(8) >>> +DEF_VFAE_HELPER(16) >>> +DEF_VFAE_HELPER(32) >> >> then RT and ZS can be passed in as constant parameters to the above, and then >> the compiler will fold away all of the stuff that's not needed for each >> different case. Which, I think, is significant. These are practically >> different instructions with the different modifiers. >> > > So, we have 4 flags, resulting in 16 variants. Times 3 element sizes ... > 48 helpers in total. Do we really want to go down that path? Maybe? > I can also go ahead any try to identify the most frequent users (in > Linux) and only specialize that one. Also plausible. I guess it would be good to know, anyway. I think RT probably makes the largest difference to the layout of the function, so maybe that's the one we pick. We could also leave our options open and make the 3 non-CC flags be parameters to the inline function, just extract them from the M4 parameter at the one higher level. r~
Re: [Qemu-devel] [PATCH v1 1/5] s390x/tcg: Implement VECTOR FIND ANY ELEMENT EQUAL
> I also think that, if we create a bunch more of these wrappers: > >> +DEF_VFAE_HELPER(8) >> +DEF_VFAE_HELPER(16) >> +DEF_VFAE_HELPER(32) > > then RT and ZS can be passed in as constant parameters to the above, and then > the compiler will fold away all of the stuff that's not needed for each > different case. Which, I think, is significant. These are practically > different instructions with the different modifiers. > So, we have 4 flags, resulting in 16 variants. Times 3 element sizes ... 48 helpers in total. Do we really want to go down that path? I can also go ahead any try to identify the most frequent users (in Linux) and only specialize that one. -- Thanks, David / dhildenb
Re: [Qemu-devel] [PATCH v1 1/5] s390x/tcg: Implement VECTOR FIND ANY ELEMENT EQUAL
On 17.05.19 18:16, Richard Henderson wrote: > On 5/15/19 1:31 PM, David Hildenbrand wrote: >> +#define DEF_VFAE(BITS) >>\ >> +static int vfae##BITS(void *v1, const void *v2, const void *v3, uint8_t m5) >> > > > First, because this *is* complicated stuff, can we find a way to use inline > functions instead of an undebuggable macro for this? Perhaps a different set > of wrappers than s390_vec_read_element##BITS, which always return uint32_t, so > that they have a constant signature? For vfene I have for now +static inline uint64_t s390_vec_read_element(const S390Vector *v, uint8_t enr, + uint8_t es) +{ +switch (es) { +case MO_8: +return s390_vec_read_element8(v, enr); +case MO_16: +return s390_vec_read_element16(v, enr); +case MO_32: +return s390_vec_read_element32(v, enr); +case MO_64: +return s390_vec_read_element64(v, enr); +default: +g_assert_not_reached(); +} +} + Which we could reuse here. I'll try to look into using a inline function instead, passing in the element size and other flags, so the compiler can specialize. Thanks! > >> +if (zs && !data) { >> +if (cc == 3) { >> +first_byte = i * (BITS / 8); >> +cc = 0; /* match for zero */ >> +} else if (cc != 0) { >> +cc = 2; /* matching elements before match for zero */ >> +} >> +if (!rt) { >> +break; >> +} >> +} > > So here we are computing the second intermediate result. > >> +/* try to match with any other element from the other vector */ >> +for (j = 0; j < (128 / BITS); j++) { >> +if (data == s390_vec_read_element##BITS(v3, j)) { >> +any_equal = true; >> +break; >> +} >> +} > > And here the first intermediate result, > >> +/* invert the result if requested */ >> +any_equal = in ^ any_equal; > > ... inverted, if requested, > >> +if (cc == 3 && any_equal) { >> +first_byte = i * (BITS / 8); >> +cc = 1; /* matching elements, no match for zero */ >> +if (!zs && !rt) { >> +break; >> +} >> +} > >> +/* indicate bit vector if requested */ >> +if (rt && any_equal) { >> +s390_vec_write_element##BITS(&tmp, i, (uint##BITS##_t)-1ull); >> +} > > ... writing out (some of) the results of the first intermediate result. > >> +} >> +if (!rt) { >> +s390_vec_write_element8(&tmp, 7, first_byte); >> +} > > ... writing out the rest of the first intermediate result. > > I wonder if it wouldn't be clearer, within the loop, to do > > if (any_equal) { > if (cc == 3) { > first_byte = ...; > cc = 1; > } > if (rt) { > write element -1; > } else if (!zs) { > break; > } > } > > I also think that, if we create a bunch more of these wrappers: > >> +DEF_VFAE_HELPER(8) >> +DEF_VFAE_HELPER(16) >> +DEF_VFAE_HELPER(32) > > then RT and ZS can be passed in as constant parameters to the above, and then > the compiler will fold away all of the stuff that's not needed for each > different case. Which, I think, is significant. These are practically > different instructions with the different modifiers. > > > r~ > -- Thanks, David / dhildenb
Re: [Qemu-devel] [PATCH v1 1/5] s390x/tcg: Implement VECTOR FIND ANY ELEMENT EQUAL
On 5/15/19 1:31 PM, David Hildenbrand wrote: > +#define DEF_VFAE(BITS) > \ > +static int vfae##BITS(void *v1, const void *v2, const void *v3, uint8_t m5) > First, because this *is* complicated stuff, can we find a way to use inline functions instead of an undebuggable macro for this? Perhaps a different set of wrappers than s390_vec_read_element##BITS, which always return uint32_t, so that they have a constant signature? > +if (zs && !data) { > +if (cc == 3) { > +first_byte = i * (BITS / 8); > +cc = 0; /* match for zero */ > +} else if (cc != 0) { > +cc = 2; /* matching elements before match for zero */ > +} > +if (!rt) { > +break; > +} > +} So here we are computing the second intermediate result. > +/* try to match with any other element from the other vector */ > +for (j = 0; j < (128 / BITS); j++) { > +if (data == s390_vec_read_element##BITS(v3, j)) { > +any_equal = true; > +break; > +} > +} And here the first intermediate result, > +/* invert the result if requested */ > +any_equal = in ^ any_equal; ... inverted, if requested, > +if (cc == 3 && any_equal) { > +first_byte = i * (BITS / 8); > +cc = 1; /* matching elements, no match for zero */ > +if (!zs && !rt) { > +break; > +} > +} > +/* indicate bit vector if requested */ > +if (rt && any_equal) { > +s390_vec_write_element##BITS(&tmp, i, (uint##BITS##_t)-1ull); > +} ... writing out (some of) the results of the first intermediate result. > +} > +if (!rt) { > +s390_vec_write_element8(&tmp, 7, first_byte); > +} ... writing out the rest of the first intermediate result. I wonder if it wouldn't be clearer, within the loop, to do if (any_equal) { if (cc == 3) { first_byte = ...; cc = 1; } if (rt) { write element -1; } else if (!zs) { break; } } I also think that, if we create a bunch more of these wrappers: > +DEF_VFAE_HELPER(8) > +DEF_VFAE_HELPER(16) > +DEF_VFAE_HELPER(32) then RT and ZS can be passed in as constant parameters to the above, and then the compiler will fold away all of the stuff that's not needed for each different case. Which, I think, is significant. These are practically different instructions with the different modifiers. r~
[Qemu-devel] [PATCH v1 1/5] s390x/tcg: Implement VECTOR FIND ANY ELEMENT EQUAL
Complicated stuff. Provide two variants, one for the CC and one without the CC. The CC is returned via cpu_env. Signed-off-by: David Hildenbrand --- target/s390x/Makefile.objs | 2 +- target/s390x/helper.h| 8 +++ target/s390x/insn-data.def | 5 ++ target/s390x/translate_vx.inc.c | 31 ++ target/s390x/vec_string_helper.c | 97 5 files changed, 142 insertions(+), 1 deletion(-) create mode 100644 target/s390x/vec_string_helper.c diff --git a/target/s390x/Makefile.objs b/target/s390x/Makefile.objs index 993ac93ed6..0a38281a14 100644 --- a/target/s390x/Makefile.objs +++ b/target/s390x/Makefile.objs @@ -1,7 +1,7 @@ obj-y += cpu.o cpu_models.o cpu_features.o gdbstub.o interrupt.o helper.o obj-$(CONFIG_TCG) += translate.o cc_helper.o excp_helper.o fpu_helper.o obj-$(CONFIG_TCG) += int_helper.o mem_helper.o misc_helper.o crypto_helper.o -obj-$(CONFIG_TCG) += vec_helper.o vec_int_helper.o +obj-$(CONFIG_TCG) += vec_helper.o vec_int_helper.o vec_string_helper.o obj-$(CONFIG_SOFTMMU) += machine.o ioinst.o arch_dump.o mmu_helper.o diag.o obj-$(CONFIG_SOFTMMU) += sigp.o obj-$(CONFIG_KVM) += kvm.o diff --git a/target/s390x/helper.h b/target/s390x/helper.h index 7755a96c33..c45328cf73 100644 --- a/target/s390x/helper.h +++ b/target/s390x/helper.h @@ -211,6 +211,14 @@ DEF_HELPER_FLAGS_4(gvec_vscbi8, TCG_CALL_NO_RWG, void, ptr, cptr, cptr, i32) DEF_HELPER_FLAGS_4(gvec_vscbi16, TCG_CALL_NO_RWG, void, ptr, cptr, cptr, i32) DEF_HELPER_4(gvec_vtm, void, ptr, cptr, env, i32) +/* === Vector String Instructions === */ +DEF_HELPER_FLAGS_4(gvec_vfae8, TCG_CALL_NO_RWG, void, ptr, cptr, cptr, i32) +DEF_HELPER_FLAGS_4(gvec_vfae16, TCG_CALL_NO_RWG, void, ptr, cptr, cptr, i32) +DEF_HELPER_FLAGS_4(gvec_vfae32, TCG_CALL_NO_RWG, void, ptr, cptr, cptr, i32) +DEF_HELPER_5(gvec_vfae_cc8, void, ptr, cptr, cptr, env, i32) +DEF_HELPER_5(gvec_vfae_cc16, void, ptr, cptr, cptr, env, i32) +DEF_HELPER_5(gvec_vfae_cc32, void, ptr, cptr, cptr, env, i32) + #ifndef CONFIG_USER_ONLY DEF_HELPER_3(servc, i32, env, i64, i64) DEF_HELPER_4(diag, void, env, i32, i32, i32) diff --git a/target/s390x/insn-data.def b/target/s390x/insn-data.def index e61475bdc4..070ce2a471 100644 --- a/target/s390x/insn-data.def +++ b/target/s390x/insn-data.def @@ -1191,6 +1191,11 @@ /* VECTOR TEST UNDER MASK */ F(0xe7d8, VTM, VRR_a, V, 0, 0, 0, 0, vtm, 0, IF_VEC) +/* === Vector String Instructions === */ + +/* VECTOR FIND ANY ELEMENT EQUAL */ +F(0xe782, VFAE,VRR_b, V, 0, 0, 0, 0, vfae, 0, IF_VEC) + #ifndef CONFIG_USER_ONLY /* COMPARE AND SWAP AND PURGE */ E(0xb250, CSP, RRE, Z, r1_32u, ra2, r1_P, 0, csp, 0, MO_TEUL, IF_PRIV) diff --git a/target/s390x/translate_vx.inc.c b/target/s390x/translate_vx.inc.c index 7e0bfcb190..022990dda3 100644 --- a/target/s390x/translate_vx.inc.c +++ b/target/s390x/translate_vx.inc.c @@ -2353,3 +2353,34 @@ static DisasJumpType op_vtm(DisasContext *s, DisasOps *o) set_cc_static(s); return DISAS_NEXT; } + +static DisasJumpType op_vfae(DisasContext *s, DisasOps *o) +{ +const uint8_t es = get_field(s->fields, m4); +const uint8_t m5 = get_field(s->fields, m5); +static gen_helper_gvec_3_ptr * const cc[3] = { +gen_helper_gvec_vfae_cc8, +gen_helper_gvec_vfae_cc16, +gen_helper_gvec_vfae_cc32, +}; +static gen_helper_gvec_3 * const nocc[3] = { +gen_helper_gvec_vfae8, +gen_helper_gvec_vfae16, +gen_helper_gvec_vfae32, +}; + +if (es > ES_32) { +gen_program_exception(s, PGM_SPECIFICATION); +return DISAS_NORETURN; +} + +if (m5 & 1) { +gen_gvec_3_ptr(get_field(s->fields, v1), get_field(s->fields, v2), + get_field(s->fields, v3), cpu_env, m5, cc[es]); +set_cc_static(s); +} else { +gen_gvec_3_ool(get_field(s->fields, v1), get_field(s->fields, v2), + get_field(s->fields, v3), m5, nocc[es]); +} +return DISAS_NEXT; +} diff --git a/target/s390x/vec_string_helper.c b/target/s390x/vec_string_helper.c new file mode 100644 index 00..8a4e65b70f --- /dev/null +++ b/target/s390x/vec_string_helper.c @@ -0,0 +1,97 @@ +/* + * QEMU TCG support -- s390x vector string instruction support + * + * Copyright (C) 2019 Red Hat Inc + * + * Authors: + * David Hildenbrand + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ +#include "qemu/osdep.h" +#include "qemu-common.h" +#include "cpu.h" +#include "internal.h" +#include "vec.h" +#include "tcg/tcg-gvec-desc.h" +#include "exec/helper-proto.h" + +#define DEF_VFAE(BITS) \ +static int vfae##BITS(void *v1, const void *v2, const void *v3, uint8_t m5) \ +{ \ +const bool in = extract32(m5, 3, 1);