Hello! The attached patch enhances the fix for PR target/48678 to generate pinsr{b,w,d,q} insns when a value is inserted into a vector register.
2011-04-20 Uros Bizjak <ubiz...@gmail.com> PR target/48678 * config/i386/i386.md (insv): Change operand 0 predicate to "register_operand". Change operand 1 and 2 predicates to "const_int_operand". Expand to pinsr{b,w,d,q} when appropriate. * config/i386/sse.md (sse4_1_pinsrb): Export. (sse2_pinsrw): Ditto. (sse4_1_pinsrd): Ditto. (sse4_1_pinsrq): Ditto. * config/i386/i386-protos.h (ix86_expand_pinsr): Add prototype. * config/i386/i386.c (ix86_expand_pinsr): New. testsuite/ChangeLog: 2011-04-20 Uros Bizjak <ubiz...@gmail.com> PR target/48678 * gcc.target/i386/sse2-pinsrw.c: New test. * gcc.target/i386/avx-vpinsrw-1.c: Ditto. * gcc.target/i386/sse4_1-insvqi.c: Ditto. * gcc.target/i386/sse2-insvhi.c: Ditto. * gcc.target/i386/sse4_1-insvsi.c: Ditto. * gcc.target/i386/sse4_1-insvdi.c: Ditto. Patch was bootstrapped and regression tested on x86_64-pc-linux-gnu {-m32}. Committed to mainline SVN. Uros.
Index: config/i386/i386.md =================================================================== --- config/i386/i386.md (revision 172780) +++ config/i386/i386.md (working copy) @@ -10393,14 +10393,17 @@ }) (define_expand "insv" - [(set (zero_extract (match_operand 0 "ext_register_operand" "") - (match_operand 1 "const8_operand" "") - (match_operand 2 "const8_operand" "")) + [(set (zero_extract (match_operand 0 "register_operand" "") + (match_operand 1 "const_int_operand" "") + (match_operand 2 "const_int_operand" "")) (match_operand 3 "register_operand" ""))] "" { rtx (*gen_mov_insv_1) (rtx, rtx); + if (ix86_expand_pinsr (operands)) + DONE; + /* Handle insertions to %ah et al. */ if (INTVAL (operands[1]) != 8 || INTVAL (operands[2]) != 8) FAIL; Index: config/i386/sse.md =================================================================== --- config/i386/sse.md (revision 172780) +++ config/i386/sse.md (working copy) @@ -6051,7 +6051,7 @@ (set_attr "prefix" "orig,vex") (set_attr "mode" "TI")]) -(define_insn "*sse4_1_pinsrb" +(define_insn "sse4_1_pinsrb" [(set (match_operand:V16QI 0 "register_operand" "=x,x,x,x") (vec_merge:V16QI (vec_duplicate:V16QI @@ -6083,7 +6083,7 @@ (set_attr "prefix" "orig,orig,vex,vex") (set_attr "mode" "TI")]) -(define_insn "*sse2_pinsrw" +(define_insn "sse2_pinsrw" [(set (match_operand:V8HI 0 "register_operand" "=x,x,x,x") (vec_merge:V8HI (vec_duplicate:V8HI @@ -6117,7 +6117,7 @@ (set_attr "mode" "TI")]) ;; It must come before sse2_loadld since it is preferred. 
-(define_insn "*sse4_1_pinsrd" +(define_insn "sse4_1_pinsrd" [(set (match_operand:V4SI 0 "register_operand" "=x,x") (vec_merge:V4SI (vec_duplicate:V4SI @@ -6145,7 +6145,7 @@ (set_attr "prefix" "orig,vex") (set_attr "mode" "TI")]) -(define_insn "*sse4_1_pinsrq" +(define_insn "sse4_1_pinsrq" [(set (match_operand:V2DI 0 "register_operand" "=x,x") (vec_merge:V2DI (vec_duplicate:V2DI Index: config/i386/i386-protos.h =================================================================== --- config/i386/i386-protos.h (revision 172780) +++ config/i386/i386-protos.h (working copy) @@ -203,6 +203,7 @@ extern void ix86_expand_vector_extract ( extern void ix86_expand_reduc_v4sf (rtx (*)(rtx, rtx, rtx), rtx, rtx); extern void ix86_expand_vec_extract_even_odd (rtx, rtx, rtx, unsigned); +extern bool ix86_expand_pinsr (rtx *); /* In i386-c.c */ extern void ix86_target_macros (void); Index: config/i386/i386.c =================================================================== --- config/i386/i386.c (revision 172780) +++ config/i386/i386.c (working copy) @@ -34106,6 +34106,88 @@ ix86_expand_vec_extract_even_odd (rtx ta /* ... or we use the special-case patterns. */ expand_vec_perm_even_odd_1 (&d, odd); } + +/* Expand an insert into a vector register through pinsr insn. + Return true if successful. 
*/ + +bool +ix86_expand_pinsr (rtx *operands) +{ + rtx dst = operands[0]; + rtx src = operands[3]; + + unsigned int size = INTVAL (operands[1]); + unsigned int pos = INTVAL (operands[2]); + + if (GET_CODE (dst) == SUBREG) + { + pos += SUBREG_BYTE (dst) * BITS_PER_UNIT; + dst = SUBREG_REG (dst); + } + + if (GET_CODE (src) == SUBREG) + src = SUBREG_REG (src); + + switch (GET_MODE (dst)) + { + case V16QImode: + case V8HImode: + case V4SImode: + case V2DImode: + { + enum machine_mode srcmode, dstmode; + rtx (*pinsr)(rtx, rtx, rtx, rtx); + + srcmode = mode_for_size (size, MODE_INT, 0); + + switch (srcmode) + { + case QImode: + if (!TARGET_SSE4_1) + return false; + dstmode = V16QImode; + pinsr = gen_sse4_1_pinsrb; + break; + + case HImode: + if (!TARGET_SSE2) + return false; + dstmode = V8HImode; + pinsr = gen_sse2_pinsrw; + break; + + case SImode: + if (!TARGET_SSE4_1) + return false; + dstmode = V4SImode; + pinsr = gen_sse4_1_pinsrd; + break; + + case DImode: + gcc_assert (TARGET_64BIT); + if (!TARGET_SSE4_1) + return false; + dstmode = V2DImode; + pinsr = gen_sse4_1_pinsrq; + break; + + default: + return false; + } + + dst = gen_lowpart (dstmode, dst); + src = gen_lowpart (srcmode, src); + + pos /= size; + + emit_insn (pinsr (dst, dst, src, GEN_INT (1 << pos))); + return true; + } + + default: + return false; + } +} /* This function returns the calling abi specific va_list type node. It returns the FNDECL specific va_list type. 
*/ Index: testsuite/gcc.target/i386/sse2-pinsrw.c =================================================================== --- testsuite/gcc.target/i386/sse2-pinsrw.c (revision 0) +++ testsuite/gcc.target/i386/sse2-pinsrw.c (revision 0) @@ -0,0 +1,86 @@ +/* { dg-do run } */ +/* { dg-require-effective-target sse2 } */ +/* { dg-options "-O2 -msse2" } */ + +#ifndef CHECK_H +#define CHECK_H "sse2-check.h" +#endif + +#ifndef TEST +#define TEST sse2_test +#endif + +#include CHECK_H + +#include <emmintrin.h> +#include <string.h> + +#define msk0 0x00 +#define msk1 0x01 +#define msk2 0x02 +#define msk3 0x03 +#define msk4 0x04 +#define msk5 0x05 +#define msk6 0x06 +#define msk7 0x07 + +static void +TEST (void) +{ + union + { + __m128i x; + unsigned int i[4]; + unsigned short s[8]; + } res [8], val, tmp; + int masks[8]; + unsigned short ins[4] = { 3, 4, 5, 6 }; + int i; + + val.i[0] = 0x35251505; + val.i[1] = 0x75655545; + val.i[2] = 0xB5A59585; + val.i[3] = 0xF5E5D5C5; + + /* Check pinsrw imm8, r32, xmm. */ + res[0].x = _mm_insert_epi16 (val.x, ins[0], msk0); + res[1].x = _mm_insert_epi16 (val.x, ins[0], msk1); + res[2].x = _mm_insert_epi16 (val.x, ins[0], msk2); + res[3].x = _mm_insert_epi16 (val.x, ins[0], msk3); + res[4].x = _mm_insert_epi16 (val.x, ins[0], msk4); + res[5].x = _mm_insert_epi16 (val.x, ins[0], msk5); + res[6].x = _mm_insert_epi16 (val.x, ins[0], msk6); + res[7].x = _mm_insert_epi16 (val.x, ins[0], msk7); + + masks[0] = msk0; + masks[1] = msk1; + masks[2] = msk2; + masks[3] = msk3; + masks[4] = msk4; + masks[5] = msk5; + masks[6] = msk6; + masks[7] = msk7; + + for (i = 0; i < 8; i++) + { + tmp.x = val.x; + tmp.s[masks[i]] = ins[0]; + if (memcmp (&tmp, &res[i], sizeof (tmp))) + abort (); + } + + /* Check pinsrw imm8, m16, xmm. 
*/ + for (i = 0; i < 8; i++) + { + res[i].x = _mm_insert_epi16 (val.x, ins[i % 2], msk0); + masks[i] = msk0; + } + + for (i = 0; i < 8; i++) + { + tmp.x = val.x; + tmp.s[masks[i]] = ins[i % 2]; + if (memcmp (&tmp, &res[i], sizeof (tmp))) + abort (); + } +} Index: testsuite/gcc.target/i386/sse4_1-insvdi.c =================================================================== --- testsuite/gcc.target/i386/sse4_1-insvdi.c (revision 0) +++ testsuite/gcc.target/i386/sse4_1-insvdi.c (revision 0) @@ -0,0 +1,55 @@ +/* { dg-do run } */ +/* { dg-require-effective-target lp64 } */ +/* { dg-require-effective-target sse4 } */ +/* { dg-options "-O2 -msse4.1" } */ + +#ifndef CHECK_H +#define CHECK_H "sse4_1-check.h" +#endif + +#ifndef TEST +#define TEST sse4_1_test +#endif + +#include CHECK_H + +#include <smmintrin.h> +#include <string.h> + +typedef long T __attribute__((may_alias)); +struct S { __m128i d; }; + +__m128i +__attribute__((noinline)) +foo (__m128i y, long x) +{ + struct S s; + + s.d = y; + ((T *) &s.d)[1] = x; + return s.d; +} + +static void +TEST (void) +{ + union + { + __m128i x; + unsigned int i[4]; + unsigned long l[2]; + } res, val, tmp; + unsigned long ins[4] = { 3, 4, 5, 6 }; + + val.i[0] = 0x35251505; + val.i[1] = 0x75655545; + val.i[2] = 0xB5A59585; + val.i[3] = 0xF5E5D5C5; + + res.x = foo (val.x, ins[3]); + + tmp.x = val.x; + tmp.l[1] = ins[3]; + if (memcmp (&tmp, &res, sizeof (tmp))) + abort (); +} Index: testsuite/gcc.target/i386/sse4_1-insvqi.c =================================================================== --- testsuite/gcc.target/i386/sse4_1-insvqi.c (revision 0) +++ testsuite/gcc.target/i386/sse4_1-insvqi.c (revision 0) @@ -0,0 +1,54 @@ +/* { dg-do run } */ +/* { dg-require-effective-target sse4 } */ +/* { dg-options "-O2 -msse4.1" } */ + +#ifndef CHECK_H +#define CHECK_H "sse4_1-check.h" +#endif + +#ifndef TEST +#define TEST sse4_1_test +#endif + +#include CHECK_H + +#include <smmintrin.h> +#include <string.h> + +typedef char T 
__attribute__((may_alias)); +struct S { __m128i d; }; + +__m128i +__attribute__((noinline)) +foo (__m128i y, char x) +{ + struct S s; + + s.d = y; + ((T *) &s.d)[1] = x; + return s.d; +} + +static void +TEST (void) +{ + union + { + __m128i x; + unsigned int i[4]; + unsigned char c[16]; + } res, val, tmp; + unsigned char ins[4] = { 3, 4, 5, 6 }; + + val.i[0] = 0x35251505; + val.i[1] = 0x75655545; + val.i[2] = 0xB5A59585; + val.i[3] = 0xF5E5D5C5; + + res.x = foo (val.x, ins[3]); + + tmp.x = val.x; + tmp.c[1] = ins[3]; + if (memcmp (&tmp, &res, sizeof (tmp))) + abort (); +} Index: testsuite/gcc.target/i386/sse2-insvhi.c =================================================================== --- testsuite/gcc.target/i386/sse2-insvhi.c (revision 0) +++ testsuite/gcc.target/i386/sse2-insvhi.c (revision 0) @@ -0,0 +1,54 @@ +/* { dg-do run } */ +/* { dg-require-effective-target sse2 } */ +/* { dg-options "-O2 -msse2" } */ + +#ifndef CHECK_H +#define CHECK_H "sse2-check.h" +#endif + +#ifndef TEST +#define TEST sse2_test +#endif + +#include CHECK_H + +#include <emmintrin.h> +#include <string.h> + +typedef short T __attribute__((may_alias)); +struct S { __m128i d; }; + +__m128i +__attribute__((noinline)) +foo (__m128i y, short x) +{ + struct S s; + + s.d = y; + ((T *) &s.d)[1] = x; + return s.d; +} + +static void +TEST (void) +{ + union + { + __m128i x; + unsigned int i[4]; + unsigned short s[8]; + } res, val, tmp; + unsigned short ins[4] = { 3, 4, 5, 6 }; + + val.i[0] = 0x35251505; + val.i[1] = 0x75655545; + val.i[2] = 0xB5A59585; + val.i[3] = 0xF5E5D5C5; + + res.x = foo (val.x, ins[3]); + + tmp.x = val.x; + tmp.s[1] = ins[3]; + if (memcmp (&tmp, &res, sizeof (tmp))) + abort (); +} Index: testsuite/gcc.target/i386/avx-vpinsrw-1.c =================================================================== --- testsuite/gcc.target/i386/avx-vpinsrw-1.c (revision 0) +++ testsuite/gcc.target/i386/avx-vpinsrw-1.c (revision 0) @@ -0,0 +1,8 @@ +/* { dg-do run } */ +/* { 
dg-require-effective-target avx } */ +/* { dg-options "-O2 -mavx" } */ + +#define CHECK_H "avx-check.h" +#define TEST avx_test + +#include "sse2-pinsrw.c" Index: testsuite/gcc.target/i386/sse4_1-insvsi.c =================================================================== --- testsuite/gcc.target/i386/sse4_1-insvsi.c (revision 0) +++ testsuite/gcc.target/i386/sse4_1-insvsi.c (revision 0) @@ -0,0 +1,53 @@ +/* { dg-do run } */ +/* { dg-require-effective-target sse4 } */ +/* { dg-options "-O2 -msse4.1" } */ + +#ifndef CHECK_H +#define CHECK_H "sse4_1-check.h" +#endif + +#ifndef TEST +#define TEST sse4_1_test +#endif + +#include CHECK_H + +#include <smmintrin.h> +#include <string.h> + +typedef int T __attribute__((may_alias)); +struct S { __m128i d; }; + +__m128i +__attribute__((noinline)) +foo (__m128i y, int x) +{ + struct S s; + + s.d = y; + ((T *) &s.d)[1] = x; + return s.d; +} + +static void +TEST (void) +{ + union + { + __m128i x; + unsigned int i[4]; + } res, val, tmp; + unsigned int ins[4] = { 3, 4, 5, 6 }; + + val.i[0] = 0x35251505; + val.i[1] = 0x75655545; + val.i[2] = 0xB5A59585; + val.i[3] = 0xF5E5D5C5; + + res.x = foo (val.x, ins[3]); + + tmp.x = val.x; + tmp.i[1] = ins[3]; + if (memcmp (&tmp, &res, sizeof (tmp))) + abort (); +}