hasting2    02/12/02 14:58:17

  Modified:    live/gcc3 GNUmakefile config.guess
               live/gcc3/gcc/config/i386 i386.c xmmintrin.h
  Log:
  SSE fixes for Darwin/x86.  Packaged as gcc3-1301.
  Submitted by: stuart
  Reviewed by: geoff
  
  Revision  Changes    Path
  1.32      +3 -0      src/live/gcc3/GNUmakefile
  
  Index: GNUmakefile
  ===================================================================
  RCS file: /cvs/Darwin/src/live/gcc3/GNUmakefile,v
  retrieving revision 1.31
  retrieving revision 1.32
  diff -u -r1.31 -r1.32
  --- GNUmakefile       2002/08/21 18:07:59     1.31
  +++ GNUmakefile       2002/12/02 22:58:08     1.32
  @@ -208,6 +208,9 @@
              ln -s gcc/darwin/default/$$file1 $(std_include_dir); \
            fi \
        done
  +     case "${TARGETS}" in \
  +         *i386*) install -c -m 444 gcc/config/i386/*mmintrin.h $(gcc_hdr_dir) ;;\
  +     esac
        rm -f /tmp/float.$$$$ && \
        more-hdrs/synthesize-float $(SRC) /tmp/float.$$$$ $(RC_RELEASE) && \
        install -c -m 444 /tmp/float.$$$$ $(gcc_hdr_dir)/float.h && \
  
  
  
  1.8       +6 -2      src/live/gcc3/config.guess
  
  Index: config.guess
  ===================================================================
  RCS file: /cvs/Darwin/src/live/gcc3/config.guess,v
  retrieving revision 1.7
  retrieving revision 1.8
  diff -u -r1.7 -r1.8
  --- config.guess      2002/08/26 05:39:21     1.7
  +++ config.guess      2002/12/02 22:58:08     1.8
  @@ -3,7 +3,7 @@
   #   Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
   #   2000, 2001, 2002 Free Software Foundation, Inc.
   
  -timestamp='2002-08-19'
  +timestamp='2002-11-26'
   
   # This file is free software; you can redistribute it and/or modify it
   # under the terms of the GNU General Public License as published by
  @@ -1092,7 +1092,11 @@
        echo ${UNAME_MACHINE}-apple-rhapsody${UNAME_RELEASE}
        exit 0 ;;
       *:Darwin:*:*)
  -     echo `uname -p`-apple-darwin${UNAME_RELEASE}
  +     case `uname -p` in
  +         *86) UNAME_PROCESSOR=i686 ;;
  +         powerpc) UNAME_PROCESSOR=powerpc ;;
  +     esac
  +     echo ${UNAME_PROCESSOR}-apple-darwin${UNAME_RELEASE}
        exit 0 ;;
       *:procnto*:*:* | *:QNX:[0123456789]*:*)
        UNAME_PROCESSOR=`uname -p`
  
  
  
  1.36      +54 -5     src/live/gcc3/gcc/config/i386/i386.c
  
  Index: i386.c
  ===================================================================
  RCS file: /cvs/Darwin/src/live/gcc3/gcc/config/i386/i386.c,v
  retrieving revision 1.35
  retrieving revision 1.36
  diff -u -r1.35 -r1.36
  --- i386.c    2002/11/20 20:49:20     1.35
  +++ i386.c    2002/12/02 22:58:10     1.36
  @@ -7719,9 +7719,24 @@
         && register_operand (operands[0], mode)
         && CONSTANT_P (operands[1]))
       {
  -      rtx addr = gen_reg_rtx (Pmode);
  -      emit_move_insn (addr, XEXP (force_const_mem (mode, operands[1]), 0));
  -      operands[1] = gen_rtx_MEM (mode, addr);
  +      if (const0_operand (operands[1], mode))
  +     {
  +       rtx clr;
  +       if (MMX_REG_MODE_P (mode))
  +         clr = gen_mmx_clrdi (operands[0]);
  +       else if (SSE_REG_MODE_P (mode))
  +         clr = gen_sse_clrv4sf (operands[0]);
  +       else
  +         abort ();
  +       emit_insn (clr);
  +       return;
  +     }
  +      else
  +     {
  +       rtx addr = gen_reg_rtx (Pmode);
  +       emit_move_insn (addr, XEXP (force_const_mem (mode, operands[1]), 0));
  +       operands[1] = gen_rtx_MEM (mode, addr);
  +     }
       }
   
     /* Make operand1 a register if it isn't already.  */
  @@ -12200,10 +12215,10 @@
     /* @@@ the type is bogus */
     tree v4sf_ftype_v4sf_pv2si
       = build_function_type_list (V4SF_type_node,
  -                             V4SF_type_node, pv2di_type_node, NULL_TREE);
  +                             V4SF_type_node, pv2si_type_node, NULL_TREE);
     tree void_ftype_pv2si_v4sf
       = build_function_type_list (void_type_node,
  -                             pv2di_type_node, V4SF_type_node, NULL_TREE);
  +                             pv2si_type_node, V4SF_type_node, NULL_TREE);
     tree void_ftype_pfloat_v4sf
       = build_function_type_list (void_type_node,
                                pfloat_type_node, V4SF_type_node, NULL_TREE);
  @@ -13242,6 +13257,40 @@
        }
         target = gen_reg_rtx (V2DImode);
         pat = GEN_FCN (icode) (simplify_gen_subreg (tmode, target, V2DImode, 0), op0, 
op1);
  +      if (! pat)
  +     return 0;
  +      emit_insn (pat);
  +      return target;
  +
  +    case IX86_BUILTIN_PSRLWI128:
  +    case IX86_BUILTIN_PSRLW:
  +    case IX86_BUILTIN_PSRLD:
  +    case IX86_BUILTIN_PSRLQ:
  +      for (i = 0, d = bdesc_2arg;
  +        i < ARRAY_SIZE (bdesc_2arg) && d->code != fcode;
  +        i++, d++)
  +     ;
  +      icode = d->icode;
  +      arg0 = TREE_VALUE (arglist);
  +      arg1 = TREE_VALUE (TREE_CHAIN (arglist));
  +      op0 = expand_expr (arg0, NULL_RTX, VOIDmode, 0);
  +      op1 = expand_expr (arg1, NULL_RTX, VOIDmode, 0);
  +      tmode = insn_data[icode].operand[0].mode;
  +      mode1 = insn_data[icode].operand[1].mode;
  +      mode2 = insn_data[icode].operand[2].mode;
  +
  +      if (! (*insn_data[icode].operand[1].predicate) (op0, mode1))
  +     {
  +       op0 = copy_to_reg (op0);
  +       op0 = simplify_gen_subreg (mode1, op0, GET_MODE (op0), 0);
  +     }
  +      if (! (*insn_data[icode].operand[2].predicate) (op1, mode2))
  +     {
  +       op1 = copy_to_reg (op1);
  +       op1 = simplify_gen_subreg (mode1, op1, GET_MODE (op1), 0);
  +     }
  +      target = gen_reg_rtx (mode1);
  +      pat = GEN_FCN (icode) (simplify_gen_subreg (tmode, target, mode1, 0), op0, 
op1);
         if (! pat)
        return 0;
         emit_insn (pat);
  
  
  
  1.2       +1135 -7   src/live/gcc3/gcc/config/i386/xmmintrin.h
  
  Index: xmmintrin.h
  ===================================================================
  RCS file: /cvs/Darwin/src/live/gcc3/gcc/config/i386/xmmintrin.h,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- xmmintrin.h       2002/03/14 03:46:33     1.1
  +++ xmmintrin.h       2002/12/02 22:58:12     1.2
  @@ -30,6 +30,10 @@
   #ifndef _XMMINTRIN_H_INCLUDED
   #define _XMMINTRIN_H_INCLUDED
   
  +#ifndef __SSE__
  +# error "SSE instruction set not enabled"
  +#else
  +
   /* We need type definitions from the MMX header file.  */
   #include <mmintrin.h>
   
  @@ -245,13 +249,21 @@
   static __inline __m128
   _mm_cmpgt_ss (__m128 __A, __m128 __B)
   {
  -  return (__m128) __builtin_ia32_cmpgtss ((__v4sf)__A, (__v4sf)__B);
  +  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
  +                                     (__v4sf)
  +                                     __builtin_ia32_cmpltss ((__v4sf) __B,
  +                                                             (__v4sf)
  +                                                             __A));
   }
   
   static __inline __m128
   _mm_cmpge_ss (__m128 __A, __m128 __B)
   {
  -  return (__m128) __builtin_ia32_cmpgess ((__v4sf)__A, (__v4sf)__B);
  +  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
  +                                     (__v4sf)
  +                                     __builtin_ia32_cmpless ((__v4sf) __B,
  +                                                             (__v4sf)
  +                                                             __A));
   }
   
   static __inline __m128
  @@ -275,13 +287,21 @@
   static __inline __m128
   _mm_cmpngt_ss (__m128 __A, __m128 __B)
   {
  -  return (__m128) __builtin_ia32_cmpngtss ((__v4sf)__A, (__v4sf)__B);
  +  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
  +                                     (__v4sf)
  +                                     __builtin_ia32_cmpnltss ((__v4sf) __B,
  +                                                              (__v4sf)
  +                                                              __A));
   }
   
   static __inline __m128
   _mm_cmpnge_ss (__m128 __A, __m128 __B)
   {
  -  return (__m128) __builtin_ia32_cmpngess ((__v4sf)__A, (__v4sf)__B);
  +  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
  +                                     (__v4sf)
  +                                     __builtin_ia32_cmpnless ((__v4sf) __B,
  +                                                              (__v4sf)
  +                                                              __A));
   }
   
   static __inline __m128
  @@ -586,7 +606,7 @@
     __v4sf __losf = __builtin_ia32_movhlps (__hisf, __hisf);
     __v2si __hisi = __builtin_ia32_cvtps2pi (__hisf);
     __v2si __losi = __builtin_ia32_cvtps2pi (__losf);
  -  return (__m64) __builtin_ia32_packssdw (__losi, __hisi);
  +  return (__m64) __builtin_ia32_packssdw (__hisi, __losi);
   }
   
   /* Convert the four SPFP values in A to four signed 8-bit integers.  */
  @@ -871,7 +891,7 @@
     __builtin_ia32_storeups (__P, (__v4sf)__A);
   }
   
  -/* Store four SPFP values in reverse order.  The addres must be aligned.  */
  +/* Store four SPFP values in reverse order.  The address must be aligned.  */
   static __inline void
   _mm_storer_ps (float *__P, __m128 __A)
   {
  @@ -1017,7 +1037,7 @@
   static __inline void
   _mm_stream_pi (__m64 *__P, __m64 __A)
   {
  -  __builtin_ia32_movntq (__P, __A);
  +  __builtin_ia32_movntq ((long long *)__P, (long long)__A);
   }
   
   /* Likewise.  The address must be 16-byte aligned.  */
  @@ -1058,4 +1078,1112 @@
     (row3) = __builtin_ia32_shufps (__t2, __t3, 0xDD);                 \
   } while (0)
   
  +#ifdef __SSE2__
  +/* SSE2 */
  +typedef int __v2df __attribute__ ((mode (V2DF)));
  +typedef int __v2di __attribute__ ((mode (V2DI)));
  +typedef int __v4si __attribute__ ((mode (V4SI)));
  +typedef int __v8hi __attribute__ ((mode (V8HI)));
  +typedef int __v16qi __attribute__ ((mode (V16QI)));
  +
  +/* Create a selector for use with the SHUFPD instruction.  */
  +#define _MM_SHUFFLE2(fp1,fp0) \
  + (((fp1) << 1) | (fp0))
  +
  +#define __m128i __v2di
  +#define __m128d __v2df
  +
  +/* Create a vector with element 0 as *P and the rest zero.  */
  +static __inline __m128d
  +_mm_load_sd (double *__P)
  +{
  +  return (__m128d) __builtin_ia32_loadsd (__P);
  +}
  +
  +/* Create a vector with all two elements equal to *P.  */
  +static __inline __m128d
  +_mm_load1_pd (double *__P)
  +{
  +  __v2df __tmp = __builtin_ia32_loadsd (__P);
  +  return (__m128d) __builtin_ia32_shufpd (__tmp, __tmp, _MM_SHUFFLE2 (0,0));
  +}
  +
  +static __inline __m128d
  +_mm_load_pd1 (double *__P)
  +{
  +  return _mm_load1_pd (__P);
  +}
  +
   +/* Load two DPFP values from P.  The address must be 16-byte aligned.  */
  +static __inline __m128d
  +_mm_load_pd (double *__P)
  +{
  +  return (__m128d) __builtin_ia32_loadapd (__P);
  +}
  +
   +/* Load two DPFP values from P.  The address need not be 16-byte aligned.  */
  +static __inline __m128d
  +_mm_loadu_pd (double *__P)
  +{
  +  return (__m128d) __builtin_ia32_loadupd (__P);
  +}
  +
   +/* Load two DPFP values in reverse order.  The address must be aligned.  */
  +static __inline __m128d
  +_mm_loadr_pd (double *__P)
  +{
  +  __v2df __tmp = __builtin_ia32_loadapd (__P);
  +  return (__m128d) __builtin_ia32_shufpd (__tmp, __tmp, _MM_SHUFFLE2 (0,1));
  +}
  +
  +/* Create a vector with element 0 as F and the rest zero.  */
  +static __inline __m128d
  +_mm_set_sd (double __F)
  +{
  +  return (__m128d) __builtin_ia32_loadsd (&__F);
  +}
  +
  +/* Create a vector with all two elements equal to F.  */
  +static __inline __m128d
  +_mm_set1_pd (double __F)
  +{
  +  __v2df __tmp = __builtin_ia32_loadsd (&__F);
  +  return (__m128d) __builtin_ia32_shufpd (__tmp, __tmp, _MM_SHUFFLE2 (0,0));
  +}
  +
  +static __inline __m128d
  +_mm_set_pd1 (double __F)
  +{
  +  return _mm_set1_pd (__F);
  +}
  +
  +/* Create the vector [Z Y].  */
  +static __inline __m128d
  +_mm_set_pd (double __Z, double __Y)
  +{
  +  union {
  +    double __a[2];
  +    __m128d __v;
  +  } __u;
  +
  +  __u.__a[0] = __Y;
  +  __u.__a[1] = __Z;
  +
  +  return __u.__v;
  +}
  +
  +/* Create the vector [Y Z].  */
  +static __inline __m128d
  +_mm_setr_pd (double __Z, double __Y)
  +{
  +  return _mm_set_pd (__Y, __Z);
  +}
  +
  +/* Create a vector of zeros.  */
  +static __inline __m128d
  +_mm_setzero_pd (void)
  +{
  +  return (__m128d) __builtin_ia32_setzeropd ();
  +}
  +
  +/* Stores the lower DPFP value.  */
  +static __inline void
  +_mm_store_sd (double *__P, __m128d __A)
  +{
  +  __builtin_ia32_storesd (__P, (__v2df)__A);
  +}
  +
   +/* Store the lower DPFP value across two words.  */
  +static __inline void
  +_mm_store1_pd (double *__P, __m128d __A)
  +{
  +  __v2df __va = (__v2df)__A;
  +  __v2df __tmp = __builtin_ia32_shufpd (__va, __va, _MM_SHUFFLE2 (0,0));
  +  __builtin_ia32_storeapd (__P, __tmp);
  +}
  +
  +static __inline void
  +_mm_store_pd1 (double *__P, __m128d __A)
  +{
  +  _mm_store1_pd (__P, __A);
  +}
  +
   +/* Store two DPFP values.  The address must be 16-byte aligned.  */
  +static __inline void
  +_mm_store_pd (double *__P, __m128d __A)
  +{
  +  __builtin_ia32_storeapd (__P, (__v2df)__A);
  +}
  +
   +/* Store two DPFP values.  The address need not be 16-byte aligned.  */
  +static __inline void
  +_mm_storeu_pd (double *__P, __m128d __A)
  +{
  +  __builtin_ia32_storeupd (__P, (__v2df)__A);
  +}
  +
   +/* Store two DPFP values in reverse order.  The address must be aligned.  */
  +static __inline void
  +_mm_storer_pd (double *__P, __m128d __A)
  +{
  +  __v2df __va = (__v2df)__A;
  +  __v2df __tmp = __builtin_ia32_shufpd (__va, __va, _MM_SHUFFLE2 (0,1));
  +  __builtin_ia32_storeapd (__P, __tmp);
  +}
  +
  +/* Sets the low DPFP value of A from the low value of B.  */
  +static __inline __m128d
  +_mm_move_sd (__m128d __A, __m128d __B)
  +{
  +  return (__m128d) __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
  +}
  +
  +
  +static __inline __m128d
  +_mm_add_pd (__m128d __A, __m128d __B)
  +{
  +  return (__m128d)__builtin_ia32_addpd ((__v2df)__A, (__v2df)__B);
  +}
  +
  +static __inline __m128d
  +_mm_add_sd (__m128d __A, __m128d __B)
  +{
  +  return (__m128d)__builtin_ia32_addsd ((__v2df)__A, (__v2df)__B);
  +}
  +
  +static __inline __m128d
  +_mm_sub_pd (__m128d __A, __m128d __B)
  +{
  +  return (__m128d)__builtin_ia32_subpd ((__v2df)__A, (__v2df)__B);
  +}
  +
  +static __inline __m128d
  +_mm_sub_sd (__m128d __A, __m128d __B)
  +{
  +  return (__m128d)__builtin_ia32_subsd ((__v2df)__A, (__v2df)__B);
  +}
  +
  +static __inline __m128d
  +_mm_mul_pd (__m128d __A, __m128d __B)
  +{
  +  return (__m128d)__builtin_ia32_mulpd ((__v2df)__A, (__v2df)__B);
  +}
  +
  +static __inline __m128d
  +_mm_mul_sd (__m128d __A, __m128d __B)
  +{
  +  return (__m128d)__builtin_ia32_mulsd ((__v2df)__A, (__v2df)__B);
  +}
  +
  +static __inline __m128d
  +_mm_div_pd (__m128d __A, __m128d __B)
  +{
  +  return (__m128d)__builtin_ia32_divpd ((__v2df)__A, (__v2df)__B);
  +}
  +
  +static __inline __m128d
  +_mm_div_sd (__m128d __A, __m128d __B)
  +{
  +  return (__m128d)__builtin_ia32_divsd ((__v2df)__A, (__v2df)__B);
  +}
  +
  +static __inline __m128d
  +_mm_sqrt_pd (__m128d __A)
  +{
  +  return (__m128d)__builtin_ia32_sqrtpd ((__v2df)__A);
  +}
  +
   +/* Return pair {sqrt (A[0]), B[1]}.  */
  +static __inline __m128d
  +_mm_sqrt_sd (__m128d __A, __m128d __B)
  +{
  +  __v2df __tmp = __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
  +  return (__m128d)__builtin_ia32_sqrtsd ((__v2df)__tmp);
  +}
  +
  +static __inline __m128d
  +_mm_min_pd (__m128d __A, __m128d __B)
  +{
  +  return (__m128d)__builtin_ia32_minpd ((__v2df)__A, (__v2df)__B);
  +}
  +
  +static __inline __m128d
  +_mm_min_sd (__m128d __A, __m128d __B)
  +{
  +  return (__m128d)__builtin_ia32_minsd ((__v2df)__A, (__v2df)__B);
  +}
  +
  +static __inline __m128d
  +_mm_max_pd (__m128d __A, __m128d __B)
  +{
  +  return (__m128d)__builtin_ia32_maxpd ((__v2df)__A, (__v2df)__B);
  +}
  +
  +static __inline __m128d
  +_mm_max_sd (__m128d __A, __m128d __B)
  +{
  +  return (__m128d)__builtin_ia32_maxsd ((__v2df)__A, (__v2df)__B);
  +}
  +
  +static __inline __m128d
  +_mm_and_pd (__m128d __A, __m128d __B)
  +{
  +  return (__m128d)__builtin_ia32_andpd ((__v2df)__A, (__v2df)__B);
  +}
  +
  +static __inline __m128d
  +_mm_andnot_pd (__m128d __A, __m128d __B)
  +{
  +  return (__m128d)__builtin_ia32_andnpd ((__v2df)__A, (__v2df)__B);
  +}
  +
  +static __inline __m128d
  +_mm_or_pd (__m128d __A, __m128d __B)
  +{
  +  return (__m128d)__builtin_ia32_orpd ((__v2df)__A, (__v2df)__B);
  +}
  +
  +static __inline __m128d
  +_mm_xor_pd (__m128d __A, __m128d __B)
  +{
  +  return (__m128d)__builtin_ia32_xorpd ((__v2df)__A, (__v2df)__B);
  +}
  +
  +static __inline __m128d
  +_mm_cmpeq_pd (__m128d __A, __m128d __B)
  +{
  +  return (__m128d)__builtin_ia32_cmpeqpd ((__v2df)__A, (__v2df)__B);
  +}
  +
  +static __inline __m128d
  +_mm_cmplt_pd (__m128d __A, __m128d __B)
  +{
  +  return (__m128d)__builtin_ia32_cmpltpd ((__v2df)__A, (__v2df)__B);
  +}
  +
  +static __inline __m128d
  +_mm_cmple_pd (__m128d __A, __m128d __B)
  +{
  +  return (__m128d)__builtin_ia32_cmplepd ((__v2df)__A, (__v2df)__B);
  +}
  +
  +static __inline __m128d
  +_mm_cmpgt_pd (__m128d __A, __m128d __B)
  +{
  +  return (__m128d)__builtin_ia32_cmpgtpd ((__v2df)__A, (__v2df)__B);
  +}
  +
  +static __inline __m128d
  +_mm_cmpge_pd (__m128d __A, __m128d __B)
  +{
  +  return (__m128d)__builtin_ia32_cmpgepd ((__v2df)__A, (__v2df)__B);
  +}
  +
  +static __inline __m128d
  +_mm_cmpneq_pd (__m128d __A, __m128d __B)
  +{
  +  return (__m128d)__builtin_ia32_cmpneqpd ((__v2df)__A, (__v2df)__B);
  +}
  +
  +static __inline __m128d
  +_mm_cmpnlt_pd (__m128d __A, __m128d __B)
  +{
  +  return (__m128d)__builtin_ia32_cmpnltpd ((__v2df)__A, (__v2df)__B);
  +}
  +
  +static __inline __m128d
  +_mm_cmpnle_pd (__m128d __A, __m128d __B)
  +{
  +  return (__m128d)__builtin_ia32_cmpnlepd ((__v2df)__A, (__v2df)__B);
  +}
  +
  +static __inline __m128d
  +_mm_cmpngt_pd (__m128d __A, __m128d __B)
  +{
  +  return (__m128d)__builtin_ia32_cmpngtpd ((__v2df)__A, (__v2df)__B);
  +}
  +
  +static __inline __m128d
  +_mm_cmpnge_pd (__m128d __A, __m128d __B)
  +{
  +  return (__m128d)__builtin_ia32_cmpngepd ((__v2df)__A, (__v2df)__B);
  +}
  +
  +static __inline __m128d
  +_mm_cmpord_pd (__m128d __A, __m128d __B)
  +{
  +  return (__m128d)__builtin_ia32_cmpordpd ((__v2df)__A, (__v2df)__B);
  +}
  +
  +static __inline __m128d
  +_mm_cmpunord_pd (__m128d __A, __m128d __B)
  +{
  +  return (__m128d)__builtin_ia32_cmpunordpd ((__v2df)__A, (__v2df)__B);
  +}
  +
  +static __inline __m128d
  +_mm_cmpeq_sd (__m128d __A, __m128d __B)
  +{
  +  return (__m128d)__builtin_ia32_cmpeqsd ((__v2df)__A, (__v2df)__B);
  +}
  +
  +static __inline __m128d
  +_mm_cmplt_sd (__m128d __A, __m128d __B)
  +{
  +  return (__m128d)__builtin_ia32_cmpltsd ((__v2df)__A, (__v2df)__B);
  +}
  +
  +static __inline __m128d
  +_mm_cmple_sd (__m128d __A, __m128d __B)
  +{
  +  return (__m128d)__builtin_ia32_cmplesd ((__v2df)__A, (__v2df)__B);
  +}
  +
  +static __inline __m128d
  +_mm_cmpgt_sd (__m128d __A, __m128d __B)
  +{
  +  return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
  +                                      (__v2df)
  +                                      __builtin_ia32_cmpltsd ((__v2df) __B,
  +                                                              (__v2df)
  +                                                              __A));
  +}
  +
  +static __inline __m128d
  +_mm_cmpge_sd (__m128d __A, __m128d __B)
  +{
  +  return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
  +                                      (__v2df)
  +                                      __builtin_ia32_cmplesd ((__v2df) __B,
  +                                                              (__v2df)
  +                                                              __A));
  +}
  +
  +static __inline __m128d
  +_mm_cmpneq_sd (__m128d __A, __m128d __B)
  +{
  +  return (__m128d)__builtin_ia32_cmpneqsd ((__v2df)__A, (__v2df)__B);
  +}
  +
  +static __inline __m128d
  +_mm_cmpnlt_sd (__m128d __A, __m128d __B)
  +{
  +  return (__m128d)__builtin_ia32_cmpnltsd ((__v2df)__A, (__v2df)__B);
  +}
  +
  +static __inline __m128d
  +_mm_cmpnle_sd (__m128d __A, __m128d __B)
  +{
  +  return (__m128d)__builtin_ia32_cmpnlesd ((__v2df)__A, (__v2df)__B);
  +}
  +
  +static __inline __m128d
  +_mm_cmpngt_sd (__m128d __A, __m128d __B)
  +{
  +  return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
  +                                      (__v2df)
  +                                      __builtin_ia32_cmpnltsd ((__v2df) __B,
  +                                                               (__v2df)
  +                                                               __A));
  +}
  +
  +static __inline __m128d
  +_mm_cmpnge_sd (__m128d __A, __m128d __B)
  +{
  +  return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
  +                                      (__v2df)
  +                                      __builtin_ia32_cmpnlesd ((__v2df) __B,
  +                                                               (__v2df)
  +                                                               __A));
  +}
  +
  +static __inline __m128d
  +_mm_cmpord_sd (__m128d __A, __m128d __B)
  +{
  +  return (__m128d)__builtin_ia32_cmpordsd ((__v2df)__A, (__v2df)__B);
  +}
  +
  +static __inline __m128d
  +_mm_cmpunord_sd (__m128d __A, __m128d __B)
  +{
  +  return (__m128d)__builtin_ia32_cmpunordsd ((__v2df)__A, (__v2df)__B);
  +}
  +
  +static __inline int
  +_mm_comieq_sd (__m128d __A, __m128d __B)
  +{
  +  return __builtin_ia32_comisdeq ((__v2df)__A, (__v2df)__B);
  +}
  +
  +static __inline int
  +_mm_comilt_sd (__m128d __A, __m128d __B)
  +{
  +  return __builtin_ia32_comisdlt ((__v2df)__A, (__v2df)__B);
  +}
  +
  +static __inline int
  +_mm_comile_sd (__m128d __A, __m128d __B)
  +{
  +  return __builtin_ia32_comisdle ((__v2df)__A, (__v2df)__B);
  +}
  +
  +static __inline int
  +_mm_comigt_sd (__m128d __A, __m128d __B)
  +{
  +  return __builtin_ia32_comisdgt ((__v2df)__A, (__v2df)__B);
  +}
  +
  +static __inline int
  +_mm_comige_sd (__m128d __A, __m128d __B)
  +{
  +  return __builtin_ia32_comisdge ((__v2df)__A, (__v2df)__B);
  +}
  +
  +static __inline int
  +_mm_comineq_sd (__m128d __A, __m128d __B)
  +{
  +  return __builtin_ia32_comisdneq ((__v2df)__A, (__v2df)__B);
  +}
  +
  +static __inline int
  +_mm_ucomieq_sd (__m128d __A, __m128d __B)
  +{
  +  return __builtin_ia32_ucomisdeq ((__v2df)__A, (__v2df)__B);
  +}
  +
  +static __inline int
  +_mm_ucomilt_sd (__m128d __A, __m128d __B)
  +{
  +  return __builtin_ia32_ucomisdlt ((__v2df)__A, (__v2df)__B);
  +}
  +
  +static __inline int
  +_mm_ucomile_sd (__m128d __A, __m128d __B)
  +{
  +  return __builtin_ia32_ucomisdle ((__v2df)__A, (__v2df)__B);
  +}
  +
  +static __inline int
  +_mm_ucomigt_sd (__m128d __A, __m128d __B)
  +{
  +  return __builtin_ia32_ucomisdgt ((__v2df)__A, (__v2df)__B);
  +}
  +
  +static __inline int
  +_mm_ucomige_sd (__m128d __A, __m128d __B)
  +{
  +  return __builtin_ia32_ucomisdge ((__v2df)__A, (__v2df)__B);
  +}
  +
  +static __inline int
  +_mm_ucomineq_sd (__m128d __A, __m128d __B)
  +{
  +  return __builtin_ia32_ucomisdneq ((__v2df)__A, (__v2df)__B);
  +}
  +
  +static __inline __m128d
  +_mm_cvtepi32_pd (__m128i __A)
  +{
  +  return (__m128d)__builtin_ia32_cvtdq2pd ((__v4si) __A);
  +}
  +
  +static __inline __m128d
  +_mm_cvtepi32_ps (__m128i __A)
  +{
  +  return (__m128d)__builtin_ia32_cvtdq2ps ((__v4si) __A);
  +}
  +
  +static __inline __m128d
  +_mm_cvtpd_epi32 (__m128d __A)
  +{
  +  return (__m128d)__builtin_ia32_cvtpd2dq ((__v2df) __A);
  +}
  +
  +static __inline __m64
  +_mm_cvtpd_pi32 (__m128d __A)
  +{
  +  return (__m64)__builtin_ia32_cvtpd2pi ((__v2df) __A);
  +}
  +
  +static __inline __m128d
  +_mm_cvtpd_ps (__m128d __A)
  +{
  +  return (__m128d)__builtin_ia32_cvtpd2ps ((__v2df) __A);
  +}
  +
  +static __inline __m128d
  +_mm_cvttpd_epi32 (__m128d __A)
  +{
  +  return (__m128d)__builtin_ia32_cvttpd2dq ((__v2df) __A);
  +}
  +
  +static __inline __m64
  +_mm_cvttpd_pi32 (__m128d __A)
  +{
  +  return (__m64)__builtin_ia32_cvttpd2pi ((__v2df) __A);
  +}
  +
  +static __inline __m128d
  +_mm_cvtpi32_pd (__m64 __A)
  +{
  +  return (__m128d)__builtin_ia32_cvtpi2pd ((__v2si) __A);
  +}
  +
  +static __inline __m128d
  +_mm_cvtps_epi32 (__m128d __A)
  +{
  +  return (__m128d)__builtin_ia32_cvtps2dq ((__v4sf) __A);
  +}
  +
  +static __inline __m128d
  +_mm_cvttps_epi32 (__m128d __A)
  +{
  +  return (__m128d)__builtin_ia32_cvttps2dq ((__v4sf) __A);
  +}
  +
  +static __inline __m128d
  +_mm_cvtps_pd (__m128d __A)
  +{
  +  return (__m128d)__builtin_ia32_cvtps2pd ((__v4sf) __A);
  +}
  +
  +static __inline int
  +_mm_cvtsd_si32 (__m128d __A)
  +{
  +  return __builtin_ia32_cvtsd2si ((__v2df) __A);
  +}
  +
  +static __inline int
  +_mm_cvttsd_si32 (__m128d __A)
  +{
  +  return __builtin_ia32_cvttsd2si ((__v2df) __A);
  +}
  +
  +static __inline __m128d
  +_mm_cvtsd_ss (__m128d __A, __m128d __B)
  +{
  +  return (__m128d)__builtin_ia32_cvtsd2ss ((__v4sf) __A, (__v2df) __B);
  +}
  +
  +static __inline __m128d
  +_mm_cvtsi32_sd (__m128d __A, int __B)
  +{
  +  return (__m128d)__builtin_ia32_cvtsi2sd ((__v2df) __A, __B);
  +}
  +
  +static __inline __m128d
  +_mm_cvtss_sd (__m128d __A, __m128d __B)
  +{
  +  return (__m128d)__builtin_ia32_cvtss2sd ((__v2df) __A, (__v4sf)__B);
  +}
  +
  +#define _mm_shuffle_pd(__A, __B, __C) ((__m128d)__builtin_ia32_shufpd ((__v2df)__A, 
(__v2df)__B, (__C)))
  +
  +static __inline __m128d
  +_mm_unpackhi_pd (__m128d __A, __m128d __B)
  +{
  +  return (__m128d)__builtin_ia32_unpckhpd ((__v2df)__A, (__v2df)__B);
  +}
  +
  +static __inline __m128d
  +_mm_unpacklo_pd (__m128d __A, __m128d __B)
  +{
  +  return (__m128d)__builtin_ia32_unpcklpd ((__v2df)__A, (__v2df)__B);
  +}
  +
  +static __inline __m128d
  +_mm_loadh_pd (__m128d __A, double *__B)
  +{
  +  return (__m128d)__builtin_ia32_loadhpd ((__v2df)__A, (__v2si *)__B);
  +}
  +
  +static __inline void
  +_mm_storeh_pd (double *__A, __m128d __B)
  +{
  +  __builtin_ia32_storehpd ((__v2si *)__A, (__v2df)__B);
  +}
  +
  +static __inline __m128d
  +_mm_loadl_pd (__m128d __A, double *__B)
  +{
  +  return (__m128d)__builtin_ia32_loadlpd ((__v2df)__A, (__v2si *)__B);
  +}
  +
  +static __inline void
  +_mm_storel_pd (double *__A, __m128d __B)
  +{
  +  __builtin_ia32_storelpd ((__v2si *)__A, (__v2df)__B);
  +}
  +
  +static __inline int
  +_mm_movemask_pd (__m128d __A)
  +{
  +  return __builtin_ia32_movmskpd ((__v2df)__A);
  +}
  +
  +static __inline __m128i
  +_mm_packs_epi16 (__m128i __A, __m128i __B)
  +{
  +  return (__m128i)__builtin_ia32_packsswb128 ((__v8hi)__A, (__v8hi)__B);
  +}
  +
  +static __inline __m128i
  +_mm_packs_epi32 (__m128i __A, __m128i __B)
  +{
  +  return (__m128i)__builtin_ia32_packssdw128 ((__v4si)__A, (__v4si)__B);
  +}
  +
  +static __inline __m128i
  +_mm_packus_epi16 (__m128i __A, __m128i __B)
  +{
  +  return (__m128i)__builtin_ia32_packuswb128 ((__v8hi)__A, (__v8hi)__B);
  +}
  +
  +static __inline __m128i
  +_mm_unpackhi_epi8 (__m128i __A, __m128i __B)
  +{
  +  return (__m128i)__builtin_ia32_punpckhbw128 ((__v16qi)__A, (__v16qi)__B);
  +}
  +
  +static __inline __m128i
  +_mm_unpackhi_epi16 (__m128i __A, __m128i __B)
  +{
  +  return (__m128i)__builtin_ia32_punpckhwd128 ((__v8hi)__A, (__v8hi)__B);
  +}
  +
  +static __inline __m128i
  +_mm_unpackhi_epi32 (__m128i __A, __m128i __B)
  +{
  +  return (__m128i)__builtin_ia32_punpckhdq128 ((__v4si)__A, (__v4si)__B);
  +}
  +
  +static __inline __m128i
  +_mm_unpacklo_epi8 (__m128i __A, __m128i __B)
  +{
  +  return (__m128i)__builtin_ia32_punpcklbw128 ((__v16qi)__A, (__v16qi)__B);
  +}
  +
  +static __inline __m128i
  +_mm_unpacklo_epi16 (__m128i __A, __m128i __B)
  +{
  +  return (__m128i)__builtin_ia32_punpcklwd128 ((__v8hi)__A, (__v8hi)__B);
  +}
  +
  +static __inline __m128i
  +_mm_unpacklo_epi32 (__m128i __A, __m128i __B)
  +{
  +  return (__m128i)__builtin_ia32_punpckldq128 ((__v4si)__A, (__v4si)__B);
  +}
  +
  +static __inline __m128i
  +_mm_add_epi8 (__m128i __A, __m128i __B)
  +{
  +  return (__m128i)__builtin_ia32_paddb128 ((__v16qi)__A, (__v16qi)__B);
  +}
  +
  +static __inline __m128i
  +_mm_add_epi16 (__m128i __A, __m128i __B)
  +{
  +  return (__m128i)__builtin_ia32_paddw128 ((__v8hi)__A, (__v8hi)__B);
  +}
  +
  +static __inline __m128i
  +_mm_add_epi32 (__m128i __A, __m128i __B)
  +{
  +  return (__m128i)__builtin_ia32_paddd128 ((__v4si)__A, (__v4si)__B);
  +}
  +
  +static __inline __m128i
  +_mm_add_epi64 (__m128i __A, __m128i __B)
  +{
  +  return (__m128i)__builtin_ia32_paddq128 ((__v4si)__A, (__v4si)__B);
  +}
  +
  +static __inline __m128i
  +_mm_adds_epi8 (__m128i __A, __m128i __B)
  +{
  +  return (__m128i)__builtin_ia32_paddsb128 ((__v16qi)__A, (__v16qi)__B);
  +}
  +
  +static __inline __m128i
  +_mm_adds_epi16 (__m128i __A, __m128i __B)
  +{
  +  return (__m128i)__builtin_ia32_paddsw128 ((__v8hi)__A, (__v8hi)__B);
  +}
  +
  +static __inline __m128i
  +_mm_adds_epu8 (__m128i __A, __m128i __B)
  +{
  +  return (__m128i)__builtin_ia32_paddusb128 ((__v16qi)__A, (__v16qi)__B);
  +}
  +
  +static __inline __m128i
  +_mm_adds_epu16 (__m128i __A, __m128i __B)
  +{
  +  return (__m128i)__builtin_ia32_paddusw128 ((__v8hi)__A, (__v8hi)__B);
  +}
  +
  +static __inline __m128i
  +_mm_sub_epi8 (__m128i __A, __m128i __B)
  +{
  +  return (__m128i)__builtin_ia32_psubb128 ((__v16qi)__A, (__v16qi)__B);
  +}
  +
  +static __inline __m128i
  +_mm_sub_epi16 (__m128i __A, __m128i __B)
  +{
  +  return (__m128i)__builtin_ia32_psubw128 ((__v8hi)__A, (__v8hi)__B);
  +}
  +
  +static __inline __m128i
  +_mm_sub_epi32 (__m128i __A, __m128i __B)
  +{
  +  return (__m128i)__builtin_ia32_psubd128 ((__v4si)__A, (__v4si)__B);
  +}
  +
  +static __inline __m128i
  +_mm_sub_epi64 (__m128i __A, __m128i __B)
  +{
  +  return (__m128i)__builtin_ia32_psubq128 ((__v4si)__A, (__v4si)__B);
  +}
  +
  +static __inline __m128i
  +_mm_subs_epi8 (__m128i __A, __m128i __B)
  +{
  +  return (__m128i)__builtin_ia32_psubsb128 ((__v16qi)__A, (__v16qi)__B);
  +}
  +
  +static __inline __m128i
  +_mm_subs_epi16 (__m128i __A, __m128i __B)
  +{
  +  return (__m128i)__builtin_ia32_psubsw128 ((__v8hi)__A, (__v8hi)__B);
  +}
  +
  +static __inline __m128i
  +_mm_subs_epu8 (__m128i __A, __m128i __B)
  +{
  +  return (__m128i)__builtin_ia32_psubusb128 ((__v16qi)__A, (__v16qi)__B);
  +}
  +
  +static __inline __m128i
  +_mm_subs_epu16 (__m128i __A, __m128i __B)
  +{
  +  return (__m128i)__builtin_ia32_psubusw128 ((__v8hi)__A, (__v8hi)__B);
  +}
  +
  +static __inline __m128i
  +_mm_madd_epi16 (__m128i __A, __m128i __B)
  +{
  +  return (__m128i)__builtin_ia32_pmaddwd128 ((__v8hi)__A, (__v8hi)__B);
  +}
  +
  +static __inline __m128i
  +_mm_mulhi_epi16 (__m128i __A, __m128i __B)
  +{
  +  return (__m128i)__builtin_ia32_pmulhw128 ((__v8hi)__A, (__v8hi)__B);
  +}
  +
  +static __inline __m128i
  +_mm_mullo_epi16 (__m128i __A, __m128i __B)
  +{
  +  return (__m128i)__builtin_ia32_pmullw128 ((__v8hi)__A, (__v8hi)__B);
  +}
  +
  +static __inline __m64
  +_mm_mul_pu16 (__m64 __A, __m64 __B)
  +{
  +  return (__m64)__builtin_ia32_pmuludq ((__v2si)__A, (__v2si)__B);
  +}
  +
  +static __inline __m128i
  +_mm_mul_epu16 (__m128i __A, __m128i __B)
  +{
  +  return (__m128i)__builtin_ia32_pmuludq128 ((__v4si)__A, (__v4si)__B);
  +}
  +
  +/* Per-element shifts where the shift count is taken from the low
  +   64 bits of __B.  _sll_ shifts left filling with zeros, _srl_ shifts
  +   right filling with zeros, _sra_ shifts right filling with copies of
  +   the sign bit.  */
  +static __inline __m128i
  +_mm_sll_epi16 (__m128i __A, __m128i __B)
  +{
  +  return (__m128i)__builtin_ia32_psllw128 ((__v8hi)__A, (__v2di)__B);
  +}
  +
  +static __inline __m128i
  +_mm_sll_epi32 (__m128i __A, __m128i __B)
  +{
  +  return (__m128i)__builtin_ia32_pslld128 ((__v4si)__A, (__v2di)__B);
  +}
  +
  +static __inline __m128i
  +_mm_sll_epi64 (__m128i __A, __m128i __B)
  +{
  +  return (__m128i)__builtin_ia32_psllq128 ((__v2di)__A, (__v2di)__B);
  +}
  +
  +static __inline __m128i
  +_mm_sra_epi16 (__m128i __A, __m128i __B)
  +{
  +  return (__m128i)__builtin_ia32_psraw128 ((__v8hi)__A, (__v2di)__B);
  +}
  +
  +static __inline __m128i
  +_mm_sra_epi32 (__m128i __A, __m128i __B)
  +{
  +  return (__m128i)__builtin_ia32_psrad128 ((__v4si)__A, (__v2di)__B);
  +}
  +
  +static __inline __m128i
  +_mm_srl_epi16 (__m128i __A, __m128i __B)
  +{
  +  return (__m128i)__builtin_ia32_psrlw128 ((__v8hi)__A, (__v2di)__B);
  +}
  +
  +static __inline __m128i
  +_mm_srl_epi32 (__m128i __A, __m128i __B)
  +{
  +  return (__m128i)__builtin_ia32_psrld128 ((__v4si)__A, (__v2di)__B);
  +}
  +
  +static __inline __m128i
  +_mm_srl_epi64 (__m128i __A, __m128i __B)
  +{
  +  return (__m128i)__builtin_ia32_psrlq128 ((__v2di)__A, (__v2di)__B);
  +}
  +
  +/* Per-element shifts by an integer count __B.  _slli_ shifts left,
  +   _srai_ shifts right arithmetically (sign-filling).  */
  +static __inline __m128i
  +_mm_slli_epi16 (__m128i __A, int __B)
  +{
  +  return (__m128i)__builtin_ia32_psllwi128 ((__v8hi)__A, __B);
  +}
  +
  +static __inline __m128i
  +_mm_slli_epi32 (__m128i __A, int __B)
  +{
  +  return (__m128i)__builtin_ia32_pslldi128 ((__v4si)__A, __B);
  +}
  +
  +static __inline __m128i
  +_mm_slli_epi64 (__m128i __A, int __B)
  +{
  +  return (__m128i)__builtin_ia32_psllqi128 ((__v2di)__A, __B);
  +}
  +
  +static __inline __m128i
  +_mm_srai_epi16 (__m128i __A, int __B)
  +{
  +  return (__m128i)__builtin_ia32_psrawi128 ((__v8hi)__A, __B);
  +}
  +
  +static __inline __m128i
  +_mm_srai_epi32 (__m128i __A, int __B)
  +{
  +  return (__m128i)__builtin_ia32_psradi128 ((__v4si)__A, __B);
  +}
  +
  +/* Shift the whole 128-bit value right/left by __B *bytes*
  +   (psrldq/pslldq).  The byte count must be an immediate, so these are
  +   provided as macros; the disabled inline versions are kept for
  +   reference only.  */
  +#if 0
  +static __m128i __attribute__((__always_inline__))
  +_mm_srli_si128 (__m128i __A, const int __B)
  +{
  +  return ((__m128i)__builtin_ia32_psrldqi128 (__A, __B));
  +}
  +
  +static __m128i __attribute__((__always_inline__))
  +_mm_slli_si128 (__m128i __A, const int __B)
  +{
  +  return ((__m128i)__builtin_ia32_pslldqi128 (__A, __B));
  +}
  +#endif
  +#define _mm_srli_si128(__A, __B) ((__m128i)__builtin_ia32_psrldqi128 (__A, __B))
  +#define _mm_slli_si128(__A, __B) ((__m128i)__builtin_ia32_pslldqi128 (__A, __B))
  +
  +/* Per-element logical (zero-filling) right shifts by an integer
  +   count __B.  */
  +static __inline __m128i
  +_mm_srli_epi16 (__m128i __A, int __B)
  +{
  +  return (__m128i)__builtin_ia32_psrlwi128 ((__v8hi)__A, __B);
  +}
  +
  +static __inline __m128i
  +_mm_srli_epi32 (__m128i __A, int __B)
  +{
  +  return (__m128i)__builtin_ia32_psrldi128 ((__v4si)__A, __B);
  +}
  +
  +static __inline __m128i
  +_mm_srli_epi64 (__m128i __A, int __B)
  +{
  +  return (__m128i)__builtin_ia32_psrlqi128 ((__v2di)__A, __B);
  +}
  +
  +/* Bitwise logical operations on the full 128-bit value.  Note that
  +   _mm_andnot_si128 computes (~__A) & __B, i.e. the complement is
  +   applied to the FIRST operand.  */
  +static __inline __m128i
  +_mm_and_si128 (__m128i __A, __m128i __B)
  +{
  +  return (__m128i)__builtin_ia32_pand128 ((__v2di)__A, (__v2di)__B);
  +}
  +
  +static __inline __m128i
  +_mm_andnot_si128 (__m128i __A, __m128i __B)
  +{
  +  return (__m128i)__builtin_ia32_pandn128 ((__v2di)__A, (__v2di)__B);
  +}
  +
  +static __inline __m128i
  +_mm_or_si128 (__m128i __A, __m128i __B)
  +{
  +  return (__m128i)__builtin_ia32_por128 ((__v2di)__A, (__v2di)__B);
  +}
  +
  +static __inline __m128i
  +_mm_xor_si128 (__m128i __A, __m128i __B)
  +{
  +  return (__m128i)__builtin_ia32_pxor128 ((__v2di)__A, (__v2di)__B);
  +}
  +
  +/* Per-element comparisons (pcmpeq*/pcmpgt*).  Each result element is
  +   all ones where the comparison holds and all zeros where it does
  +   not; the _cmpgt_ forms are signed comparisons.  */
  +static __inline __m128i
  +_mm_cmpeq_epi8 (__m128i __A, __m128i __B)
  +{
  +  return (__m128i)__builtin_ia32_pcmpeqb128 ((__v16qi)__A, (__v16qi)__B);
  +}
  +
  +static __inline __m128i
  +_mm_cmpeq_epi16 (__m128i __A, __m128i __B)
  +{
  +  return (__m128i)__builtin_ia32_pcmpeqw128 ((__v8hi)__A, (__v8hi)__B);
  +}
  +
  +static __inline __m128i
  +_mm_cmpeq_epi32 (__m128i __A, __m128i __B)
  +{
  +  return (__m128i)__builtin_ia32_pcmpeqd128 ((__v4si)__A, (__v4si)__B);
  +}
  +
  +static __inline __m128i
  +_mm_cmpgt_epi8 (__m128i __A, __m128i __B)
  +{
  +  return (__m128i)__builtin_ia32_pcmpgtb128 ((__v16qi)__A, (__v16qi)__B);
  +}
  +
  +static __inline __m128i
  +_mm_cmpgt_epi16 (__m128i __A, __m128i __B)
  +{
  +  return (__m128i)__builtin_ia32_pcmpgtw128 ((__v8hi)__A, (__v8hi)__B);
  +}
  +
  +static __inline __m128i
  +_mm_cmpgt_epi32 (__m128i __A, __m128i __B)
  +{
  +  return (__m128i)__builtin_ia32_pcmpgtd128 ((__v4si)__A, (__v4si)__B);
  +}
  +
  +/* Extract the __B-th 16-bit element of __A as an int (pextrw).  */
  +#define _mm_extract_epi16(__A, __B) __builtin_ia32_pextrw128 ((__v8hi)__A, __B)
  +
  +/* Insert the 16-bit value __B into the __C-th element of __A
  +   (pinsrw).  Note: no space is allowed between the macro name and the
  +   parameter list, otherwise this becomes an object-like macro that
  +   never accepts arguments.  */
  +#define _mm_insert_epi16(__A, __B, __C) ((__m128i)__builtin_ia32_pinsrw128 ((__v8hi)__A, __B, __C))
  +
  +/* Per-element maximum/minimum: signed for 16-bit elements
  +   (pmaxsw/pminsw), unsigned for 8-bit elements (pmaxub/pminub).  */
  +static __inline __m128i
  +_mm_max_epi16 (__m128i __A, __m128i __B)
  +{
  +  return (__m128i)__builtin_ia32_pmaxsw128 ((__v8hi)__A, (__v8hi)__B);
  +}
  +
  +static __inline __m128i
  +_mm_max_epu8 (__m128i __A, __m128i __B)
  +{
  +  return (__m128i)__builtin_ia32_pmaxub128 ((__v16qi)__A, (__v16qi)__B);
  +}
  +
  +static __inline __m128i
  +_mm_min_epi16 (__m128i __A, __m128i __B)
  +{
  +  return (__m128i)__builtin_ia32_pminsw128 ((__v8hi)__A, (__v8hi)__B);
  +}
  +
  +static __inline __m128i
  +_mm_min_epu8 (__m128i __A, __m128i __B)
  +{
  +  return (__m128i)__builtin_ia32_pminub128 ((__v16qi)__A, (__v16qi)__B);
  +}
  +
  +/* Gather the most significant bit of each of the 16 bytes of __A into
  +   the low 16 bits of an int (pmovmskb).  */
  +static __inline int
  +_mm_movemask_epi8 (__m128i __A)
  +{
  +  return __builtin_ia32_pmovmskb128 ((__v16qi)__A);
  +}
  +
  +/* Multiply packed unsigned 16-bit elements, keeping the high 16 bits
  +   of each product (pmulhuw).  */
  +static __inline __m128i
  +_mm_mulhi_epu16 (__m128i __A, __m128i __B)
  +{
  +  return (__m128i)__builtin_ia32_pmulhuw128 ((__v8hi)__A, (__v8hi)__B);
  +}
  +
  +/* Shuffle the four high 16-bit elements of __A according to the
  +   immediate selector __B (pshufhw).  */
  +#define _mm_shufflehi_epi16(__A, __B) ((__m128i)__builtin_ia32_pshufhw128 ((__v8hi)__A, __B))
  +/* Shuffle the four low 16-bit elements of __A according to the
  +   immediate selector __B (pshuflw).  */
  +#define _mm_shufflelo_epi16(__A, __B) ((__m128i)__builtin_ia32_pshuflw128 ((__v8hi)__A, __B))
  +/* Shuffle the four 32-bit elements of __A according to the immediate
  +   selector __B (pshufd).  */
  +#define _mm_shuffle_epi32(__A, __B) ((__m128i)__builtin_ia32_pshufd ((__v4si)__A, __B))
  +
  +/* Conditionally store the bytes of __A to __C, using the top bit of
  +   each byte of __B as the per-byte store mask (maskmovdqu).  */
  +static __inline void
  +_mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C)
  +{
  +  __builtin_ia32_maskmovdqu ((__v16qi)__A, (__v16qi)__B, __C);
  +}
  +
  +/* Rounded average of packed unsigned 8-bit elements (pavgb).  */
  +static __inline __m128i
  +_mm_avg_epu8 (__m128i __A, __m128i __B)
  +{
  +  return (__m128i)__builtin_ia32_pavgb128 ((__v16qi)__A, (__v16qi)__B);
  +}
  +
  +/* Rounded average of packed unsigned 16-bit elements (pavgw).  */
  +static __inline __m128i
  +_mm_avg_epu16 (__m128i __A, __m128i __B)
  +{
  +  return (__m128i)__builtin_ia32_pavgw128 ((__v8hi)__A, (__v8hi)__B);
  +}
  +
  +/* Sums of absolute differences of packed unsigned bytes (psadbw).  */
  +static __inline __m128i
  +_mm_sad_epu8 (__m128i __A, __m128i __B)
  +{
  +  return (__m128i)__builtin_ia32_psadbw128 ((__v16qi)__A, (__v16qi)__B);
  +}
  +
  +/* Non-temporal (cache-bypassing) stores: movnti/movntdq/movntpd.
  +   NOTE(review): Intel declares _mm_stream_pd as taking double *;
  +   the __m128d * parameter here differs -- confirm against the
  +   Intel/ICC headers.  */
  +static __inline void
  +_mm_stream_si32 (int *__A, int __B)
  +{
  +  __builtin_ia32_movnti (__A, __B);
  +}
  +
  +static __inline void
  +_mm_stream_si128 (__m128i *__A, __m128i __B)
  +{
  +  __builtin_ia32_movntdq ((__v2di *)__A, (__v2di)__B);
  +}
  +
  +static __inline void
  +_mm_stream_pd (__m128d *__A, __m128d __B)
  +{
  +  __builtin_ia32_movntpd ((double *)__A, (__v2df)__B);
  +}
  +
  +/* Move a 64-bit __m64 value into the low half of an __m128i
  +   (movq2dq).  */
  +static __inline __m128i
  +_mm_movpi64_epi64 (__m64 __A)
  +{
  +  return (__m128i)__builtin_ia32_movq2dq ((unsigned long long)__A);
  +}
  +
  +/* Flush the cache line containing the address __A (clflush).  The
  +   original body returned the void expression, which is an ISO C
  +   constraint violation in a void function; just call the builtin.  */
  +static __inline void
  +_mm_clflush (void *__A)
  +{
  +  __builtin_ia32_clflush (__A);
  +}
  +
  +/* Load-fence: serialize all prior load instructions (lfence).  */
  +static __inline void
  +_mm_lfence (void)
  +{
  +  __builtin_ia32_lfence ();
  +}
  +
  +/* Memory-fence: serialize all prior load and store instructions
  +   (mfence).  */
  +static __inline void
  +_mm_mfence (void)
  +{
  +  __builtin_ia32_mfence ();
  +}
  +
  +#endif /* __SSE2__  */
  +
  +#endif /* __SSE__ */
   #endif /* _XMMINTRIN_H_INCLUDED */
  
  
  


Reply via email to