Hello, this patch implements <, > and == for integer vectors of size 128 using vector extensions instead of builtins. I was surprised not to find _mm_cmplt_epi64 anywhere. Note that I can do the same for size 256, but not for size 512: there is no corresponding intrinsic there, only _mask versions that return a mask.
For gcc-5, we should stop either after 5/n or after 7/n (avx2 version of 6/n).
Regtested with 5/n.

2014-11-10  Marc Glisse  <marc.gli...@inria.fr>

	* config/i386/emmintrin.h (_mm_cmpeq_epi8, _mm_cmpeq_epi16,
	_mm_cmpeq_epi32, _mm_cmplt_epi8, _mm_cmplt_epi16, _mm_cmplt_epi32,
	_mm_cmpgt_epi8, _mm_cmpgt_epi16, _mm_cmpgt_epi32): Use vector
	extensions instead of builtins.
	* config/i386/smmintrin.h (_mm_cmpeq_epi64, _mm_cmpgt_epi64): Likewise.

-- 
Marc Glisse
Index: emmintrin.h =================================================================== --- emmintrin.h (revision 217263) +++ emmintrin.h (working copy) @@ -1268,69 +1268,69 @@ _mm_or_si128 (__m128i __A, __m128i __B) extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_xor_si128 (__m128i __A, __m128i __B) { return (__m128i) ((__v2du)__A ^ (__v2du)__B); } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpeq_epi8 (__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_pcmpeqb128 ((__v16qi)__A, (__v16qi)__B); + return (__m128i) ((__v16qi)__A == (__v16qi)__B); } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpeq_epi16 (__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_pcmpeqw128 ((__v8hi)__A, (__v8hi)__B); + return (__m128i) ((__v8hi)__A == (__v8hi)__B); } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpeq_epi32 (__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_pcmpeqd128 ((__v4si)__A, (__v4si)__B); + return (__m128i) ((__v4si)__A == (__v4si)__B); } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmplt_epi8 (__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_pcmpgtb128 ((__v16qi)__B, (__v16qi)__A); + return (__m128i) ((__v16qi)__A < (__v16qi)__B); } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmplt_epi16 (__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_pcmpgtw128 ((__v8hi)__B, (__v8hi)__A); + return (__m128i) ((__v8hi)__A < (__v8hi)__B); } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmplt_epi32 (__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_pcmpgtd128 ((__v4si)__B, (__v4si)__A); + return (__m128i) ((__v4si)__A < (__v4si)__B); } extern __inline __m128i 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpgt_epi8 (__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_pcmpgtb128 ((__v16qi)__A, (__v16qi)__B); + return (__m128i) ((__v16qi)__A > (__v16qi)__B); } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpgt_epi16 (__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_pcmpgtw128 ((__v8hi)__A, (__v8hi)__B); + return (__m128i) ((__v8hi)__A > (__v8hi)__B); } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpgt_epi32 (__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_pcmpgtd128 ((__v4si)__A, (__v4si)__B); + return (__m128i) ((__v4si)__A > (__v4si)__B); } #ifdef __OPTIMIZE__ extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_extract_epi16 (__m128i const __A, int const __N) { return (unsigned short) __builtin_ia32_vec_ext_v8hi ((__v8hi)__A, __N); } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) Index: smmintrin.h =================================================================== --- smmintrin.h (revision 217259) +++ smmintrin.h (working copy) @@ -260,21 +260,21 @@ _mm_dp_pd (__m128d __X, __m128d __Y, con #define _mm_dp_pd(X, Y, M) \ ((__m128d) __builtin_ia32_dppd ((__v2df)(__m128d)(X), \ (__v2df)(__m128d)(Y), (int)(M))) #endif /* Packed integer 64-bit comparison, zeroing or filling with ones corresponding parts of result. */ extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpeq_epi64 (__m128i __X, __m128i __Y) { - return (__m128i) __builtin_ia32_pcmpeqq ((__v2di)__X, (__v2di)__Y); + return (__m128i) ((__v2di)__X == (__v2di)__Y); } /* Min/max packed integer instructions. 
*/ extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_min_epi8 (__m128i __X, __m128i __Y) { return (__m128i) __builtin_ia32_pminsb128 ((__v16qi)__X, (__v16qi)__Y); } @@ -788,21 +788,21 @@ _mm_cmpestrz (__m128i __X, int __LX, __m ((int) __builtin_ia32_pcmpestriz128 ((__v16qi)(__m128i)(X), (int)(LX), \ (__v16qi)(__m128i)(Y), (int)(LY), \ (int)(M))) #endif /* Packed integer 64-bit comparison, zeroing or filling with ones corresponding parts of result. */ extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpgt_epi64 (__m128i __X, __m128i __Y) { - return (__m128i) __builtin_ia32_pcmpgtq ((__v2di)__X, (__v2di)__Y); + return (__m128i) ((__v2di)__X > (__v2di)__Y); } #ifdef __DISABLE_SSE4_2__ #undef __DISABLE_SSE4_2__ #pragma GCC pop_options #endif /* __DISABLE_SSE4_2__ */ #ifdef __DISABLE_SSE4_1__ #undef __DISABLE_SSE4_1__ #pragma GCC pop_options