On Thu, Nov 29, 2018 at 05:26:41PM +0100, Uros Bizjak wrote:
> On Thu, Nov 29, 2018 at 3:36 PM Jakub Jelinek <ja...@redhat.com> wrote:
> > Like blend, movmsk also only cares about the most significant bit,
> > so a prior < 0 comparison or (as also happens in some cases on the
> > testcase below) an arithmetic shift right (by any value) isn't needed
> > before the movmsk.
> >
> > Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
>
> Same comment as with your lt+blend -> blend patch.  I think that a
> pre-reload define_insn_and_split that splits the combination to movmsk
> would be better here.  We already implement a similar approach to remove
> useless masking of shift operands (cf. various "..._mask" insns in
> i386.md).
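For context, here is a minimal sketch of the redundancy being discussed,
in intrinsics form (the function names below are hypothetical, not part
of the patch).  Both functions return the same mask, because movmskps
only reads the sign bit of each lane, and the < 0 comparison sets a
lane to all-ones exactly when its sign bit was already set:

#include <x86intrin.h>

/* Extract the per-lane sign bits of v, with a redundant compare.  */
static inline int
sign_bits_with_cmp (__m128i v)
{
  __m128i lt = _mm_cmplt_epi32 (v, _mm_setzero_si128 ()); /* redundant */
  return _mm_movemask_ps (_mm_castsi128_ps (lt));
}

/* Same result without the compare.  */
static inline int
sign_bits_direct (__m128i v)
{
  return _mm_movemask_ps (_mm_castsi128_ps (v));
}

With the patterns below, combine should be able to merge the compare
into the movmsk and emit a single movmskps for both variants.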
So like this?  Bootstrapped/regtested on x86_64-linux and i686-linux.
Ok for trunk?

2018-11-29  Jakub Jelinek  <ja...@redhat.com>

	PR target/88152
	* config/i386/sse.md (*<sse>_movmsk<ssemodesuffix><avxsizesuffix>_lt,
	*<sse>_movmsk<ssemodesuffix><avxsizesuffix>_zext_lt,
	*<sse>_movmsk<ssemodesuffix><avxsizesuffix>_shift,
	*<sse>_movmsk<ssemodesuffix><avxsizesuffix>_zext_shift,
	*<sse2_avx2>_pmovmskb_lt, *<sse2_avx2>_pmovmskb_zext_lt): New
	define_insn_and_split patterns.

	* g++.target/i386/pr88152.C: New test.

--- gcc/config/i386/sse.md.jj	2018-11-29 18:52:42.747904630 +0100
+++ gcc/config/i386/sse.md	2018-11-29 19:21:44.371143252 +0100
@@ -14653,6 +14653,78 @@ (define_insn "*<sse>_movmsk<ssemodesuffi
    (set_attr "prefix" "maybe_vex")
    (set_attr "mode" "<MODE>")])
 
+(define_insn_and_split "*<sse>_movmsk<ssemodesuffix><avxsizesuffix>_lt"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(unspec:SI
+	  [(lt:VF_128_256
+	     (match_operand:<sseintvecmode> 1 "register_operand" "x")
+	     (match_operand:<sseintvecmode> 2 "const0_operand" "C"))]
+	  UNSPEC_MOVMSK))]
+  "TARGET_SSE"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0)
+	(unspec:SI [(match_dup 1)] UNSPEC_MOVMSK))]
+  "operands[1] = gen_lowpart (<MODE>mode, operands[1]);"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn_and_split "*<sse>_movmsk<ssemodesuffix><avxsizesuffix>_zext_lt"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(zero_extend:DI
+	  (unspec:SI
+	    [(lt:VF_128_256
+	       (match_operand:<sseintvecmode> 1 "register_operand" "x")
+	       (match_operand:<sseintvecmode> 2 "const0_operand" "C"))]
+	    UNSPEC_MOVMSK)))]
+  "TARGET_64BIT && TARGET_SSE"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0)
+	(zero_extend:DI (unspec:SI [(match_dup 1)] UNSPEC_MOVMSK)))]
+  "operands[1] = gen_lowpart (<MODE>mode, operands[1]);"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn_and_split "*<sse>_movmsk<ssemodesuffix><avxsizesuffix>_shift"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(unspec:SI
+	  [(subreg:VF_128_256
+	     (ashiftrt:<sseintvecmode>
+	       (match_operand:<sseintvecmode> 1 "register_operand" "x")
+	       (match_operand:QI 2 "const_int_operand" "n")) 0)]
+	  UNSPEC_MOVMSK))]
+  "TARGET_SSE"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0)
+	(unspec:SI [(match_dup 1)] UNSPEC_MOVMSK))]
+  "operands[1] = gen_lowpart (<MODE>mode, operands[1]);"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn_and_split "*<sse>_movmsk<ssemodesuffix><avxsizesuffix>_zext_shift"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(zero_extend:DI
+	  (unspec:SI
+	    [(subreg:VF_128_256
+	       (ashiftrt:<sseintvecmode>
+		 (match_operand:<sseintvecmode> 1 "register_operand" "x")
+		 (match_operand:QI 2 "const_int_operand" "n")) 0)]
+	    UNSPEC_MOVMSK)))]
+  "TARGET_64BIT && TARGET_SSE"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0)
+	(zero_extend:DI (unspec:SI [(match_dup 1)] UNSPEC_MOVMSK)))]
+  "operands[1] = gen_lowpart (<MODE>mode, operands[1]);"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "<MODE>")])
+
 (define_insn "<sse2_avx2>_pmovmskb"
   [(set (match_operand:SI 0 "register_operand" "=r")
 	(unspec:SI
@@ -14680,6 +14752,49 @@ (define_insn "*<sse2_avx2>_pmovmskb_zext
   [(set_attr "type" "ssemov")
    (set (attr "prefix_data16")
      (if_then_else
+       (match_test "TARGET_AVX")
+       (const_string "*")
+       (const_string "1")))
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "SI")])
+
+(define_insn_and_split "*<sse2_avx2>_pmovmskb_lt"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(unspec:SI
+	  [(lt:VI1_AVX2 (match_operand:VI1_AVX2 1 "register_operand" "x")
+			(match_operand:VI1_AVX2 2 "const0_operand" "C"))]
+	  UNSPEC_MOVMSK))]
+  "TARGET_SSE2"
+  "#"
+  ""
+  [(set (match_dup 0)
+	(unspec:SI [(match_dup 1)] UNSPEC_MOVMSK))]
+  ""
+  [(set_attr "type" "ssemov")
+   (set (attr "prefix_data16")
+     (if_then_else
+       (match_test "TARGET_AVX")
+       (const_string "*")
+       (const_string "1")))
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "SI")])
+
+(define_insn_and_split "*<sse2_avx2>_pmovmskb_zext_lt"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(zero_extend:DI
+	  (unspec:SI
+	    [(lt:VI1_AVX2 (match_operand:VI1_AVX2 1 "register_operand" "x")
+			  (match_operand:VI1_AVX2 2 "const0_operand" "C"))]
+	    UNSPEC_MOVMSK)))]
+  "TARGET_64BIT && TARGET_SSE2"
+  "#"
+  ""
+  [(set (match_dup 0)
+	(zero_extend:DI (unspec:SI [(match_dup 1)] UNSPEC_MOVMSK)))]
+  ""
+  [(set_attr "type" "ssemov")
+   (set (attr "prefix_data16")
+     (if_then_else
        (match_test "TARGET_AVX")
        (const_string "*")
        (const_string "1")))
--- gcc/testsuite/g++.target/i386/pr88152.C.jj	2018-11-29 19:12:32.375248556 +0100
+++ gcc/testsuite/g++.target/i386/pr88152.C	2018-11-29 19:12:32.374248573 +0100
@@ -0,0 +1,44 @@
+// PR target/88152
+// { dg-do compile }
+// { dg-options "-O2 -mavx2 -std=c++11" }
+// { dg-final { scan-assembler-times "vpmovmskb\[^\n\r]*xmm" 6 } }
+// { dg-final { scan-assembler-times "vpmovmskb\[^\n\r]*ymm" 6 } }
+// { dg-final { scan-assembler-times "vmovmskps\[^\n\r]*xmm" 4 } }
+// { dg-final { scan-assembler-times "vmovmskps\[^\n\r]*ymm" 4 } }
+// { dg-final { scan-assembler-times "vmovmskpd\[^\n\r]*xmm" 4 } }
+// { dg-final { scan-assembler-times "vmovmskpd\[^\n\r]*ymm" 4 } }
+// { dg-final { scan-assembler-not "vpcmpgt|vpcmpeq|vpsra" } }
+
+#include <x86intrin.h>
+
+template <typename T, size_t N>
+using V [[gnu::vector_size(N)]] = T;
+
+int f0 (V<unsigned char, 16> a) { return _mm_movemask_epi8 (reinterpret_cast<__m128i> (a > 0x7f)); }
+long int f1 (V<unsigned char, 16> a) { return (unsigned) _mm_movemask_epi8 (reinterpret_cast<__m128i> (a >= 0x80)); }
+long int f2 (V<signed char, 16> a) { return (unsigned) _mm_movemask_epi8 (reinterpret_cast<__m128i> (a < 0)); }
+int f3 (V<signed char, 16> a) { return _mm_movemask_epi8 (reinterpret_cast<__m128i> (a <= -1)); }
+int f4 (V<char, 16> a) { return _mm_movemask_epi8 (reinterpret_cast<__m128i> (a < 0)); }
+long int f5 (V<char, 16> a) { return (unsigned) _mm_movemask_epi8 (reinterpret_cast<__m128i> (a <= -1)); }
+int f6 (V<unsigned int, 16> a) { return _mm_movemask_ps (reinterpret_cast<__m128> (a > __INT_MAX__)); }
+int f7 (V<unsigned int, 16> a) { return _mm_movemask_ps (reinterpret_cast<__m128> (a >= 1U + __INT_MAX__)); }
+int f8 (V<int, 16> a) { return _mm_movemask_ps (reinterpret_cast<__m128> (a < 0)); }
+int f9 (V<int, 16> a) { return _mm_movemask_ps (reinterpret_cast<__m128> (a <= -1)); }
+int f10 (V<unsigned long long, 16> a) { return _mm_movemask_pd (reinterpret_cast<__m128d> (a > __LONG_LONG_MAX__)); }
+int f11 (V<unsigned long long, 16> a) { return _mm_movemask_pd (reinterpret_cast<__m128d> (a >= 1ULL + __LONG_LONG_MAX__)); }
+long int f12 (V<long long, 16> a) { return (unsigned) _mm_movemask_pd (reinterpret_cast<__m128d> (a < 0)); }
+int f13 (V<long long, 16> a) { return _mm_movemask_pd (reinterpret_cast<__m128d> (a <= -1)); }
+int f14 (V<unsigned char, 32> a) { return _mm256_movemask_epi8 (reinterpret_cast<__m256i> (a > 0x7f)); }
+int f15 (V<unsigned char, 32> a) { return _mm256_movemask_epi8 (reinterpret_cast<__m256i> (a >= 0x80)); }
+long int f16 (V<signed char, 32> a) { return (unsigned) _mm256_movemask_epi8 (reinterpret_cast<__m256i> (a < 0)); }
+int f17 (V<signed char, 32> a) { return _mm256_movemask_epi8 (reinterpret_cast<__m256i> (a <= -1)); }
+int f18 (V<char, 32> a) { return _mm256_movemask_epi8 (reinterpret_cast<__m256i> (a < 0)); }
+int f19 (V<char, 32> a) { return _mm256_movemask_epi8 (reinterpret_cast<__m256i> (a <= -1)); }
+long int f20 (V<unsigned int, 32> a) { return (unsigned) _mm256_movemask_ps (reinterpret_cast<__m256> (a > __INT_MAX__)); }
+int f21 (V<unsigned int, 32> a) { return _mm256_movemask_ps (reinterpret_cast<__m256> (a >= 1U + __INT_MAX__)); }
+int f22 (V<int, 32> a) { return _mm256_movemask_ps (reinterpret_cast<__m256> (a < 0)); }
+int f23 (V<int, 32> a) { return _mm256_movemask_ps (reinterpret_cast<__m256> (a <= -1)); }
+int f24 (V<unsigned long long, 32> a) { return _mm256_movemask_pd (reinterpret_cast<__m256d> (a > __LONG_LONG_MAX__)); }
+int f25 (V<unsigned long long, 32> a) { return _mm256_movemask_pd (reinterpret_cast<__m256d> (a >= 1ULL + __LONG_LONG_MAX__)); }
+int f26 (V<long long, 32> a) { return _mm256_movemask_pd (reinterpret_cast<__m256d> (a < 0)); }
+long int f27 (V<long long, 32> a) { return (unsigned) _mm256_movemask_pd (reinterpret_cast<__m256d> (a <= -1)); }

	Jakub