Hi!

Yesterday I noticed that for AVX, which allows unaligned operands in AVX
arithmetic instructions, we still don't combine unaligned loads with the
AVX arithmetic instructions.  So, say, for -O2 -mavx -ftree-vectorize

void
f1 (int *__restrict e, int *__restrict f)
{
  int i;
  for (i = 0; i < 1024; i++)
    e[i] = f[i] * 7;
}

void
f2 (int *__restrict e, int *__restrict f)
{
  int i;
  for (i = 0; i < 1024; i++)
    e[i] = f[i];
}

we have:
	vmovdqu	(%rsi,%rax), %xmm0
	vpmulld	%xmm1, %xmm0, %xmm0
	vmovups	%xmm0, (%rdi,%rax)
in the first loop.
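With the patch, combine should be able to merge the unaligned load
directly into the arithmetic instruction, so (modulo exact register
allocation) I'd expect something like:
	vpmulld	(%rsi,%rax), %xmm1, %xmm0
	vmovups	%xmm0, (%rdi,%rax)
instead, since the VEX-encoded arithmetic instructions don't require
aligned memory operands.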
Apparently all the MODE_VECTOR_INT and MODE_VECTOR_FLOAT
*mov<mode>_internal patterns (and various others) use misaligned_operand
to see whether they should emit vmovaps or vmovups (etc.).  So, as
suggested by Richard on IRC, it isn't necessary to either allow
UNSPEC_LOADU in the memory operands of all the various non-move AVX
instructions for TARGET_AVX, or to add extra patterns to help combine;
instead, this patch just uses *mov<mode>_internal in that case (assuming
an initially misaligned_operand doesn't become !misaligned_operand
through RTL optimizations).

Additionally, the patch attempts to avoid gen_lowpart on the non-MEM
lhs of the unaligned loads, which usually makes combine fail, by doing
the load into a temporary pseudo in that case and then doing a
pseudo-to-pseudo move with gen_lowpart on the rhs (which will be merged
soon after into the following instructions).
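In sketch form (this just restates what the patch below does, shown
here for the V16QImode case), instead of

  op0 = gen_lowpart (V16QImode, op0);
  emit_insn (gen_sse2_loaddquv16qi (op0, op1));

the expander now loads into a fresh pseudo when op0 isn't already in
the right mode, and emits a lowpart move afterwards:

  orig_op0 = op0;
  op0 = gen_reg_rtx (V16QImode);
  emit_insn (gen_sse2_loaddquv16qi (op0, op1));
  emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));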
I'll bootstrap/regtest this on x86_64-linux and i686-linux;
unfortunately, my bootstrap/regtest server isn't AVX capable.

2013-10-30  Jakub Jelinek  <ja...@redhat.com>

	* config/i386/i386.c (ix86_avx256_split_vector_move_misalign): If
	op1 is misaligned_operand, just use *mov<mode>_internal insn
	rather than UNSPEC_LOADU load.
	(ix86_expand_vector_move_misalign): Likewise (for TARGET_AVX only).
	Avoid gen_lowpart on op0 if it isn't MEM.

--- gcc/config/i386/i386.c.jj	2013-10-30 08:15:38.000000000 +0100
+++ gcc/config/i386/i386.c	2013-10-30 10:20:22.684708729 +0100
@@ -16560,6 +16560,12 @@ ix86_avx256_split_vector_move_misalign (
 	  r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
 	  emit_move_insn (op0, r);
 	}
+      /* Normal *mov<mode>_internal pattern will handle
+	 unaligned loads just fine if misaligned_operand
+	 is true, and without the UNSPEC it can be combined
+	 with arithmetic instructions.  */
+      else if (misaligned_operand (op1, GET_MODE (op1)))
+	emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
       else
 	emit_insn (load_unaligned (op0, op1));
     }
@@ -16634,7 +16640,7 @@ ix86_avx256_split_vector_move_misalign (
 void
 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
 {
-  rtx op0, op1, m;
+  rtx op0, op1, orig_op0 = NULL_RTX, m;
   rtx (*load_unaligned) (rtx, rtx);
   rtx (*store_unaligned) (rtx, rtx);
 
@@ -16647,7 +16653,16 @@ ix86_expand_vector_move_misalign (enum m
 	{
 	case MODE_VECTOR_INT:
 	case MODE_INT:
-	  op0 = gen_lowpart (V16SImode, op0);
+	  if (GET_MODE (op0) != V16SImode)
+	    {
+	      if (!MEM_P (op0))
+		{
+		  orig_op0 = op0;
+		  op0 = gen_reg_rtx (V16SImode);
+		}
+	      else
+		op0 = gen_lowpart (V16SImode, op0);
+	    }
 	  op1 = gen_lowpart (V16SImode, op1);
 	  /* FALLTHRU */
 
@@ -16676,6 +16691,8 @@ ix86_expand_vector_move_misalign (enum m
 	    emit_insn (store_unaligned (op0, op1));
 	  else
 	    gcc_unreachable ();
+	  if (orig_op0)
+	    emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
 	  break;
 
 	default:
@@ -16692,12 +16709,23 @@ ix86_expand_vector_move_misalign (enum m
 	{
 	case MODE_VECTOR_INT:
 	case MODE_INT:
-	  op0 = gen_lowpart (V32QImode, op0);
+	  if (GET_MODE (op0) != V32QImode)
+	    {
+	      if (!MEM_P (op0))
+		{
+		  orig_op0 = op0;
+		  op0 = gen_reg_rtx (V32QImode);
+		}
+	      else
+		op0 = gen_lowpart (V32QImode, op0);
+	    }
 	  op1 = gen_lowpart (V32QImode, op1);
 	  /* FALLTHRU */
 
 	case MODE_VECTOR_FLOAT:
 	  ix86_avx256_split_vector_move_misalign (op0, op1);
+	  if (orig_op0)
+	    emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
 	  break;
 
 	default:
@@ -16709,15 +16737,30 @@ ix86_expand_vector_move_misalign (enum m
 
   if (MEM_P (op1))
     {
+      /* Normal *mov<mode>_internal pattern will handle
+	 unaligned loads just fine if misaligned_operand
+	 is true, and without the UNSPEC it can be combined
+	 with arithmetic instructions.  */
+      if (TARGET_AVX
+	  && (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
+	      || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
+	  && misaligned_operand (op1, GET_MODE (op1)))
+	emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
       /* ??? If we have typed data, then it would appear that using
 	 movdqu is the only way to get unaligned data loaded with
 	 integer type.  */
-      if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+      else if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
 	{
-	  op0 = gen_lowpart (V16QImode, op0);
+	  if (GET_MODE (op0) != V16QImode)
+	    {
+	      orig_op0 = op0;
+	      op0 = gen_reg_rtx (V16QImode);
+	    }
 	  op1 = gen_lowpart (V16QImode, op1);
 	  /* We will eventually emit movups based on insn attributes.  */
 	  emit_insn (gen_sse2_loaddquv16qi (op0, op1));
+	  if (orig_op0)
+	    emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
 	}
       else if (TARGET_SSE2 && mode == V2DFmode)
 	{
@@ -16765,9 +16808,16 @@ ix86_expand_vector_move_misalign (enum m
 	  || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
 	  || optimize_insn_for_size_p ())
 	{
-	  op0 = gen_lowpart (V4SFmode, op0);
+	  if (GET_MODE (op0) != V4SFmode)
+	    {
+	      orig_op0 = op0;
+	      op0 = gen_reg_rtx (V4SFmode);
+	    }
 	  op1 = gen_lowpart (V4SFmode, op1);
 	  emit_insn (gen_sse_loadups (op0, op1));
+	  if (orig_op0)
+	    emit_move_insn (orig_op0,
+			    gen_lowpart (GET_MODE (orig_op0), op0));
 	  return;
 	}

	Jakub