Hi! As discussed in the PR, this patch — similarly to the recent changes in movmisalign expansion for TARGET_AVX — expands unaligned loads from misaligned_operand as the *mov<mode>_internal pattern, because that pattern emits vmovdqu/vmovup[sd] too, but doesn't contain UNSPECs and thus can also be merged into most other AVX insns that use the load target, provided those insns accept a memory operand.
Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk? 2013-12-18 Jakub Jelinek <ja...@redhat.com> PR target/59539 * config/i386/sse.md (<sse>_loadu<ssemodesuffix><avxsizesuffix><mask_name>, <sse2_avx_avx512f>_loaddqu<mode><mask_name>): New expanders, prefix existing define_insn names with *. * gcc.target/i386/pr59539-1.c: New test. * gcc.target/i386/pr59539-2.c: New test. --- gcc/config/i386/sse.md.jj 2013-12-10 12:43:21.000000000 +0100 +++ gcc/config/i386/sse.md 2013-12-18 11:10:36.428643400 +0100 @@ -912,7 +912,27 @@ (define_expand "movmisalign<mode>" DONE; }) -(define_insn "<sse>_loadu<ssemodesuffix><avxsizesuffix><mask_name>" +(define_expand "<sse>_loadu<ssemodesuffix><avxsizesuffix><mask_name>" + [(set (match_operand:VF 0 "register_operand") + (unspec:VF [(match_operand:VF 1 "nonimmediate_operand")] + UNSPEC_LOADU))] + "TARGET_SSE && <mask_mode512bit_condition>" +{ + /* For AVX, normal *mov<mode>_internal pattern will handle unaligned loads + just fine if misaligned_operand is true, and without the UNSPEC it can + be combined with arithmetic instructions. If misaligned_operand is + false, still emit UNSPEC_LOADU insn to honor user's request for + misaligned load. 
*/ + if (TARGET_AVX + && misaligned_operand (operands[1], <MODE>mode) + && !<mask_applied>) + { + emit_insn (gen_rtx_SET (VOIDmode, operands[0], operands[1])); + DONE; + } +}) + +(define_insn "*<sse>_loadu<ssemodesuffix><avxsizesuffix><mask_name>" [(set (match_operand:VF 0 "register_operand" "=v") (unspec:VF [(match_operand:VF 1 "nonimmediate_operand" "vm")] @@ -999,7 +1019,28 @@ (define_insn "avx512f_storeu<ssemodesuff (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) -(define_insn "<sse2_avx_avx512f>_loaddqu<mode><mask_name>" +(define_expand "<sse2_avx_avx512f>_loaddqu<mode><mask_name>" + [(set (match_operand:VI_UNALIGNED_LOADSTORE 0 "register_operand") + (unspec:VI_UNALIGNED_LOADSTORE + [(match_operand:VI_UNALIGNED_LOADSTORE 1 "nonimmediate_operand")] + UNSPEC_LOADU))] + "TARGET_SSE2 && <mask_mode512bit_condition>" +{ + /* For AVX, normal *mov<mode>_internal pattern will handle unaligned loads + just fine if misaligned_operand is true, and without the UNSPEC it can + be combined with arithmetic instructions. If misaligned_operand is + false, still emit UNSPEC_LOADU insn to honor user's request for + misaligned load. 
*/ + if (TARGET_AVX + && misaligned_operand (operands[1], <MODE>mode) + && !<mask_applied>) + { + emit_insn (gen_rtx_SET (VOIDmode, operands[0], operands[1])); + DONE; + } +}) + +(define_insn "*<sse2_avx_avx512f>_loaddqu<mode><mask_name>" [(set (match_operand:VI_UNALIGNED_LOADSTORE 0 "register_operand" "=v") (unspec:VI_UNALIGNED_LOADSTORE [(match_operand:VI_UNALIGNED_LOADSTORE 1 "nonimmediate_operand" "vm")] --- gcc/testsuite/gcc.target/i386/pr59539-1.c.jj 2013-12-18 08:46:26.023864371 +0100 +++ gcc/testsuite/gcc.target/i386/pr59539-1.c 2013-12-18 08:53:12.304743270 +0100 @@ -0,0 +1,16 @@ +/* PR target/59539 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx" } */ + +#include <immintrin.h> + +int +foo (void *p1, void *p2) +{ + __m128i d1 = _mm_loadu_si128 ((__m128i *) p1); + __m128i d2 = _mm_loadu_si128 ((__m128i *) p2); + __m128i result = _mm_cmpeq_epi16 (d1, d2); + return _mm_movemask_epi8 (result); +} + +/* { dg-final { scan-assembler-times "vmovdqu" 1 } } */ --- gcc/testsuite/gcc.target/i386/pr59539-2.c.jj 2013-12-18 08:46:33.130826198 +0100 +++ gcc/testsuite/gcc.target/i386/pr59539-2.c 2013-12-18 08:47:14.890608917 +0100 @@ -0,0 +1,16 @@ +/* PR target/59539 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx2" } */ + +#include <immintrin.h> + +int +foo (void *p1, void *p2) +{ + __m256i d1 = _mm256_loadu_si256 ((__m256i *) p1); + __m256i d2 = _mm256_loadu_si256 ((__m256i *) p2); + __m256i result = _mm256_cmpeq_epi16 (d1, d2); + return _mm256_movemask_epi8 (result); +} + +/* { dg-final { scan-assembler-times "vmovdqu" 1 } } */ Jakub