Hi!
I think it is at least more readable, and perhaps could be faster on
some CPUs (on SandyBridge it is the same speed), if we emit a more
specialized insn rather than a more generic one.
Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
Attached is my first attempt to do this in the expander; unfortunately,
that turned out to be a pessimization - it seems that IRA or reload has
issues with the subregs, and on
#include <immintrin.h>
#include <stdio.h>
__m256i a, b, c, d, e, f;
/* Reproducer for the vperm2[fi]128 -> vinsert[fi]128 change discussed in
   this mail.  Permutes the globals e and f with the two immediates the
   patches special-case (0x12 and 0x20) and stores the results in the
   globals a and b.  The function is marked noinline/noclone, presumably
   so that the code generated for this body can be inspected on its own.  */
__attribute__((noinline, noclone)) void
f1 (void)
{
/* Mask 0x12: the first patch below prints this as vinsert<i128> with
   immediate 0.  */
a = _mm256_permute2f128_si256 (e, f, 0x12);
/* Mask 0x20: the first patch below prints this as vinsert<i128> with
   immediate 1.  */
b = _mm256_permute2f128_si256 (e, f, 0x20);
}
both vinsert* insns were using a memory operand instead of
loading it into a register first (as done in vanilla gcc as well
as with the patch right below).
2011-11-07 Jakub Jelinek <[email protected]>
* config/i386/sse.md (*avx_vperm2f128<mode>_nozero): Emit mask
0x12 and 0x20 as vinsert[fi]128 instead of vperm2[fi]128.
--- gcc/config/i386/sse.md.jj 2011-11-07 12:40:55.000000000 +0100
+++ gcc/config/i386/sse.md 2011-11-07 17:50:37.000000000 +0100
@@ -12073,6 +12073,10 @@ (define_insn "*avx_vperm2f128<mode>_noze
&& avx_vperm2f128_parallel (operands[3], <MODE>mode)"
{
int mask = avx_vperm2f128_parallel (operands[3], <MODE>mode) - 1;
+ if (mask == 0x12)
+ return "vinsert<i128>\t{$0, %x2, %1, %0|%0, %1, %x2, 0}";
+ if (mask == 0x20)
+ return "vinsert<i128>\t{$1, %x2, %1, %0|%0, %1, %x2, 1}";
operands[3] = GEN_INT (mask);
return "vperm2<i128>\t{%3, %2, %1, %0|%0, %1, %2, %3}";
}
Jakub
2011-11-07 Jakub Jelinek <[email protected]>
* config/i386/sse.md (avx_vperm2f128<mode>3): Emit vinsert[fi]128
for mask 0x12 or 0x20.
--- gcc/config/i386/sse.md.jj 2011-11-07 12:40:55.000000000 +0100
+++ gcc/config/i386/sse.md 2011-11-07 16:40:47.000000000 +0100
@@ -12019,6 +12019,18 @@ (define_expand "avx_vperm2f128<mode>3"
"TARGET_AVX"
{
int mask = INTVAL (operands[3]);
+ if (mask == 0x12 || mask == 0x20)
+ {
+ /* Optimize these two using vinsert[fi]128. */
+ operands[2] = gen_lowpart (<ssehalfvecmode>mode, operands[2]);
+ if (mask == 0x12)
+ emit_insn (gen_vec_set_lo_<mode> (operands[0], operands[1],
+ operands[2]));
+ else
+ emit_insn (gen_vec_set_hi_<mode> (operands[0], operands[1],
+ operands[2]));
+ DONE;
+ }
if ((mask & 0x88) == 0)
{
rtx perm[<ssescalarnum>], t1, t2;