Hi!

On Tue, Oct 17, 2017 at 01:24:45PM -0500, Steven Munroe wrote:
> Some inline assembler is required. There a several cases where we need
> to generate Data Cache Block instruction. There are no existing builtin
> for flush and touch for store transient.
Would builtins for those help?  Would anything else want to use such
builtins, I mean?

> +   For PowerISA Scalar double in FPRs (left most 64-bits of the
> +   low 32 VSRs), while X86_64 SSE2 uses the right most 64-bits of
> +   the XMM. These differences require extra steps on POWER to match
> +   the SSE2 scalar double semantics.

Maybe say "is in FPRs"?  (And two spaces after a full stop, here and
elsewhere).

> +/* We need definitions from the SSE header files*/

Dot space space.

> +/* Sets the low DPFP value of A from the low value of B. */
> +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_move_sd (__m128d __A, __m128d __B)
> +{
> +#if 1
> +  __v2df result = (__v2df) __A;
> +  result [0] = ((__v2df) __B)[0];
> +  return (__m128d) result;
> +#else
> +  return (vec_xxpermdi(__A, __B, 1));
> +#endif
> +}

You probably forgot to finish this?  Or, what are the two versions, and
why are they both here?  Same question later a few times.

> +/* Add the lower double-precision (64-bit) floating-point element in
> + * a and b, store the result in the lower element of dst, and copy
> + * the upper element from a to the upper element of dst. */

No leading stars on block comments please.

> +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_cmpnge_pd (__m128d __A, __m128d __B)
> +{
> +  return ((__m128d)vec_cmplt ((__v2df ) __A, (__v2df ) __B));
> +}

You have some spaces before closing parentheses here (and elsewhere --
please check).

> +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_cvtpd_epi32 (__m128d __A)
> +{
> +  __v2df rounded = vec_rint (__A);
> +  __v4si result, temp;
> +  const __v4si vzero =
> +    { 0, 0, 0, 0 };
> +
> +  /* VSX Vector truncate Double-Precision to integer and Convert to
> +     Signed Integer Word format with Saturate. */
> +  __asm__(
> +      "xvcvdpsxws %x0,%x1;\n"
> +      : "=wa" (temp)
> +      : "wa" (rounded)
> +      : );

Why the ";\n"?  And no empty clobber list please.  (There is a short
sketch of what I mean at the end of this mail.)

> +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_cvtps_pd (__m128 __A)
> +{
> +  /* Check if vec_doubleh is defined by <altivec.h>. If so use that. */
> +#ifdef vec_doubleh
> +  return (__m128d) vec_doubleh ((__v4sf)__A);
> +#else
> +  /* Otherwise the compiler is not current and so need to generate the
> +     equivalent code. */

Do we need this?  The compiler will always be current.

> +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_loadl_pd (__m128d __A, double const *__B)
> +{
> +  __v2df result = (__v2df)__A;
> +  result [0] = *__B;
> +  return (__m128d)result;
> +}
> +#ifdef _ARCH_PWR8
> +/* Intrinsic functions that require PowerISA 2.07 minimum. */

You want an empty line before that #ifdef.

> +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_movemask_pd (__m128d __A)
> +{
> +  __vector __m64 result;
> +  static const __vector unsigned int perm_mask =
> +    {
> +#ifdef __LITTLE_ENDIAN__
> +	0x80800040, 0x80808080, 0x80808080, 0x80808080
> +#elif __BIG_ENDIAN__
> +      0x80808080, 0x80808080, 0x80808080, 0x80800040

Wrong indent in the LE case?

> +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_slli_epi16 (__m128i __A, int __B)
> +{
> +  __v8hu lshift;
> +  __v8hi result =
> +    { 0, 0, 0, 0, 0, 0, 0, 0 };

Could as well fit that on the same line.
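That is, just:

  __v8hi result = { 0, 0, 0, 0, 0, 0, 0, 0 };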
> +  if (__B < 16)
> +    {
> +      if (__builtin_constant_p(__B))
> +	{
> +	  lshift = (__v8hu) vec_splat_s16(__B);
> +	}
> +      else
> +	{
> +	  lshift = vec_splats ((unsigned short) __B);
> +	}

No blocks please, for single-line cases.

> +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_cmpeq_epi8 (__m128i __A, __m128i __B)
> +{
> +#if 1
> +  return (__m128i ) vec_cmpeq ((__v16qi) __A, (__v16qi)__B);
> +#else
> +  return (__m128i) ((__v16qi)__A == (__v16qi)__B);
> +#endif
> +}

Here's another #if 1.

> +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_sad_epu8 (__m128i __A, __m128i __B)
> +{
> +  __v16qu a, b;
> +  __v16qu vmin, vmax, vabsdiff;
> +  __v4si vsum;
> +  const __v4su zero = { 0, 0, 0, 0 };
> +  __v4si result;
> +
> +  a = (__v16qu) __A;
> +  b = (__v16qu) __B;
> +  vmin = vec_min (a, b);
> +  vmax = vec_max (a, b);
> +  vabsdiff = vec_sub (vmax, vmin);
> +  /* Sum four groups of bytes into integers. */
> +  vsum = (__vector signed int) vec_sum4s (vabsdiff, zero);
> +  /* Sum across four integers with two integer results. */
> +  result = vec_sum2s (vsum, (__vector signed int) zero);
> +  /* Rotate the sums into the correct position. */
> +#ifdef __LITTLE_ENDIAN__
> +  result = vec_sld (result, result, 4);
> +#elif __BIG_ENDIAN__
> +  result = vec_sld (result, result, 6);
> +#endif
> +  /* Rotate the sums into the correct position. */
> +  return (__m128i) result;
> +}

That last comment is a pasto I think.

> +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_stream_si32 (int *__A, int __B)
> +{
> +  /* Use the data cache block touch for store transient. */
> +  __asm__ (
> +    "	dcbtstt	0,%0"

No tabs in the asm please.

> +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_clflush (void const *__A)
> +{
> +  /* Use the data cache block flush. */
> +  __asm__ (
> +    "	dcbf	0,%0,0"

Does that syntax (with the "L" field) work with all supported
assemblers?


Looks good otherwise; please fix the nits (most happen more than once),
and it's okay for trunk.

Thanks!


Segher
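P.S.: To be concrete about the ";\n" / empty clobber nit above -- this is
only a sketch of the shape I mean, reusing the types and operand
constraints from your patch (the helper name here is made up, it is not
part of the patch):

  /* VSX convert vector double to vector signed int words, with saturation
     (xvcvdpsxws).  Sketch only: no trailing ";\n" in the asm template, and
     no clobber section at all when nothing is clobbered.  __v2df and
     __v4si are the typedefs from the posted emmintrin.h.  */
  static inline __v4si
  convert_dp_to_sxws (__v2df rounded)
  {
    __v4si temp;
    __asm__ ("xvcvdpsxws %x0,%x1"
	     : "=wa" (temp)
	     : "wa" (rounded));
    return temp;
  }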