Hello! The attached patch converts non-variable DImode shifts to SSE shifts on 32-bit targets.
Please note that the patch doesn't convert variable shifts. We can't simply take the QImode count register used by integer shifts and use it in SImode to implement SSE shifts. The bits outside QImode can be non-zero, the narrowest mode in which a value can be copied from an integer register to an SSE register is SImode, and since SSE shifts zero the result for count values outside the allowed range, using the count register in a wider mode (SImode) could truncate the shifted value to zero. The problem above can be solved by zero-extending the count value from QImode to SImode first, but since we are saving only *one* shift operation (out of two), I think this additional operation would make the conversion no longer profitable. The patch also converts only those non-variable counts that would otherwise require two shift operations (e.g. shifts by more than 31 bits would originally result in one SImode register being zero, so they are not converted). The patch noticeably improves the assembly generated for crypto code in libgo and for random generators in libgfortran, resulting in longer STV sequences on 32-bit targets. 2016-11-08 Uros Bizjak <ubiz...@gmail.com> * config/i386/i386.c (dimode_scalar_to_vector_candidate_p): Handle ASHIFT and LSHIFTRT. (dimode_scalar_chain::compute_convert_gain): Ditto. (dimode_scalar_chain::convert_insn): Ditto. testsuite/ChangeLog: 2016-11-08 Uros Bizjak <ubiz...@gmail.com> * gcc.target/i386/pr70799-2.c: New test. Bootstrapped and regression tested on x86_64-linux-gnu {,-m32}. Committed to mainline SVN. Uros.
Index: config/i386/i386.c =================================================================== --- config/i386/i386.c (revision 241929) +++ config/i386/i386.c (working copy) @@ -2805,11 +2805,24 @@ dimode_scalar_to_vector_candidate_p (rtx_insn *ins switch (GET_CODE (src)) { + case ASHIFT: + case LSHIFTRT: + /* Consider only non-variable shifts narrower + than general register width. */ + if (!(CONST_INT_P (XEXP (src, 1)) + && IN_RANGE (INTVAL (XEXP (src, 1)), 0, 31))) + return false; + break; + case PLUS: case MINUS: case IOR: case XOR: case AND: + if (!REG_P (XEXP (src, 1)) + && !MEM_P (XEXP (src, 1)) + && !CONST_INT_P (XEXP (src, 1))) + return false; break; case REG: @@ -2832,11 +2845,6 @@ dimode_scalar_to_vector_candidate_p (rtx_insn *ins || !REG_P (XEXP (XEXP (src, 0), 0)))) return false; - if (!REG_P (XEXP (src, 1)) - && !MEM_P (XEXP (src, 1)) - && !CONST_INT_P (XEXP (src, 1))) - return false; - if ((GET_MODE (XEXP (src, 0)) != DImode && !CONST_INT_P (XEXP (src, 0))) || (GET_MODE (XEXP (src, 1)) != DImode @@ -3387,6 +3395,13 @@ dimode_scalar_chain::compute_convert_gain () gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1]; else if (MEM_P (src) && REG_P (dst)) gain += 2 * ix86_cost->int_load[2] - ix86_cost->sse_load[1]; + else if (GET_CODE (src) == ASHIFT + || GET_CODE (src) == LSHIFTRT) + { + gain += ix86_cost->add; + if (CONST_INT_P (XEXP (src, 0))) + gain -= vector_const_cost (XEXP (src, 0)); + } else if (GET_CODE (src) == PLUS || GET_CODE (src) == MINUS || GET_CODE (src) == IOR @@ -3738,6 +3753,12 @@ dimode_scalar_chain::convert_insn (rtx_insn *insn) switch (GET_CODE (src)) { + case ASHIFT: + case LSHIFTRT: + convert_op (&XEXP (src, 0), insn); + PUT_MODE (src, V2DImode); + break; + case PLUS: case MINUS: case IOR: Index: testsuite/gcc.target/i386/pr70799-2.c =================================================================== --- testsuite/gcc.target/i386/pr70799-2.c (nonexistent) +++ testsuite/gcc.target/i386/pr70799-2.c (working copy) @@ -0,0 
+1,17 @@ +/* PR target/pr70799 */ +/* { dg-do compile { target { ia32 } } } */ +/* { dg-options "-O2 -march=slm -mno-stackrealign" } */ +/* { dg-final { scan-assembler "psllq" } } */ +/* { dg-final { scan-assembler "psrlq" } } */ + +unsigned long long a, b; + +void test1 (void) +{ + a = b << 21; +} + +void test2 (void) +{ + a = b >> 21; +}