Hi!
On Tue, Oct 17, 2017 at 01:24:45PM -0500, Steven Munroe wrote:
> Some inline assembler is required. There are several cases where we need
> to generate Data Cache Block instructions. There are no existing builtins
> for flush and touch for store transient.
Would builtins for those help? I mean, would anything else want to use
such builtins?
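For example (the builtin name here is made up, just to show what I mean),
something like

  /* Hypothetical builtin -- the name is illustrative only.  */
  extern __inline void __attribute__((__gnu_inline__, __always_inline__,
  __artificial__))
  _mm_clflush (void const *__A)
  {
    __builtin_ppc_dcbf (__A);  /* Data cache block flush.  */
  }

would avoid the inline asm entirely, and anything else that wants cache
control hints could share the same builtins.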
> + For PowerISA Scalar double in FPRs (left most 64-bits of the
> + low 32 VSRs), while X86_64 SSE2 uses the right most 64-bits of
> + the XMM. These differences require extra steps on POWER to match
> + the SSE2 scalar double semantics.
Maybe say "is in FPRs"? (And two spaces after a full stop, here and
elsewhere).
> +/* We need definitions from the SSE header files*/
Dot space space.
> +/* Sets the low DPFP value of A from the low value of B. */
> +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> +_mm_move_sd (__m128d __A, __m128d __B)
> +{
> +#if 1
> + __v2df result = (__v2df) __A;
> + result [0] = ((__v2df) __B)[0];
> + return (__m128d) result;
> +#else
> + return (vec_xxpermdi(__A, __B, 1));
> +#endif
> +}
You probably forgot to finish this? Or, what are the two versions,
and why are they both here? Same question later a few times.
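If the element-assignment variant is the one you intend to keep, the
function could simply read

  extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__,
  __artificial__))
  _mm_move_sd (__m128d __A, __m128d __B)
  {
    __v2df result = (__v2df) __A;
    result[0] = ((__v2df) __B)[0];
    return (__m128d) result;
  }

with the #if/#else removed.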
> +/* Add the lower double-precision (64-bit) floating-point element in
> + * a and b, store the result in the lower element of dst, and copy
> + * the upper element from a to the upper element of dst. */
No leading stars on block comments please.
> +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> +_mm_cmpnge_pd (__m128d __A, __m128d __B)
> +{
> + return ((__m128d)vec_cmplt ((__v2df ) __A, (__v2df ) __B));
> +}
You have some spaces before closing parentheses here (and elsewhere --
please check).
> +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> +_mm_cvtpd_epi32 (__m128d __A)
> +{
> + __v2df rounded = vec_rint (__A);
> + __v4si result, temp;
> + const __v4si vzero =
> + { 0, 0, 0, 0 };
> +
> + /* VSX Vector truncate Double-Precision to integer and Convert to
> + Signed Integer Word format with Saturate. */
> + __asm__(
> + "xvcvdpsxws %x0,%x1;\n"
> + : "=wa" (temp)
> + : "wa" (rounded)
> + : );
Why the ";\n"? And no empty clobber list please.
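I.e. just

  __asm__ ("xvcvdpsxws %x0,%x1"
           : "=wa" (temp)
           : "wa" (rounded));

should do.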
> +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> +_mm_cvtps_pd (__m128 __A)
> +{
> + /* Check if vec_doubleh is defined by <altivec.h>. If so use that. */
> +#ifdef vec_doubleh
> + return (__m128d) vec_doubleh ((__v4sf)__A);
> +#else
> + /* Otherwise the compiler is not current and so need to generate the
> + equivalent code. */
Do we need this? The compiler will always be current.
> +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> +_mm_loadl_pd (__m128d __A, double const *__B)
> +{
> + __v2df result = (__v2df)__A;
> + result [0] = *__B;
> + return (__m128d)result;
> +}
> +#ifdef _ARCH_PWR8
> +/* Intrinsic functions that require PowerISA 2.07 minimum. */
You want an empty line before that #ifdef.
> +extern __inline int __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> +_mm_movemask_pd (__m128d __A)
> +{
> + __vector __m64 result;
> + static const __vector unsigned int perm_mask =
> + {
> +#ifdef __LITTLE_ENDIAN__
> + 0x80800040, 0x80808080, 0x80808080, 0x80808080
> +#elif __BIG_ENDIAN__
> + 0x80808080, 0x80808080, 0x80808080, 0x80800040
Wrong indent in the LE case?
> +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> +_mm_slli_epi16 (__m128i __A, int __B)
> +{
> + __v8hu lshift;
> + __v8hi result =
> + { 0, 0, 0, 0, 0, 0, 0, 0 };
Could as well fit that on the same line.
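I.e.

  __v8hi result = { 0, 0, 0, 0, 0, 0, 0, 0 };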
> + if (__B < 16)
> + {
> + if (__builtin_constant_p(__B))
> + {
> + lshift = (__v8hu) vec_splat_s16(__B);
> + }
> + else
> + {
> + lshift = vec_splats ((unsigned short) __B);
> + }
No blocks please, for single-line cases.
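So just

  if (__builtin_constant_p (__B))
    lshift = (__v8hu) vec_splat_s16 (__B);
  else
    lshift = vec_splats ((unsigned short) __B);

here.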
> +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> +_mm_cmpeq_epi8 (__m128i __A, __m128i __B)
> +{
> +#if 1
> + return (__m128i ) vec_cmpeq ((__v16qi) __A, (__v16qi)__B);
> +#else
> + return (__m128i) ((__v16qi)__A == (__v16qi)__B);
> +#endif
> +}
Here's another #if 1.
> +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> +_mm_sad_epu8 (__m128i __A, __m128i __B)
> +{
> + __v16qu a, b;
> + __v16qu vmin, vmax, vabsdiff;
> + __v4si vsum;
> + const __v4su zero = { 0, 0, 0, 0 };
> + __v4si result;
> +
> + a = (__v16qu) __A;
> + b = (__v16qu) __B;
> + vmin = vec_min (a, b);
> + vmax = vec_max (a, b);
> + vabsdiff = vec_sub (vmax, vmin);
> + /* Sum four groups of bytes into integers. */
> + vsum = (__vector signed int) vec_sum4s (vabsdiff, zero);
> + /* Sum across four integers with two integer results. */
> + result = vec_sum2s (vsum, (__vector signed int) zero);
> + /* Rotate the sums into the correct position. */
> +#ifdef __LITTLE_ENDIAN__
> + result = vec_sld (result, result, 4);
> +#elif __BIG_ENDIAN__
> + result = vec_sld (result, result, 6);
> +#endif
> + /* Rotate the sums into the correct position. */
> + return (__m128i) result;
> +}
That last comment is a pasto (copy-and-paste leftover), I think.
> +extern __inline void __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> +_mm_stream_si32 (int *__A, int __B)
> +{
> + /* Use the data cache block touch for store transient. */
> + __asm__ (
> + " dcbtstt 0,%0"
No tabs in the asm please.
> +extern __inline void __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> +_mm_clflush (void const *__A)
> +{
> + /* Use the data cache block flush. */
> + __asm__ (
> + " dcbf 0,%0,0"
Does that syntax (with the "L" field) work with all supported assemblers?
Looks good otherwise; please fix the nits (most happen more than once),
and it's okay for trunk. Thanks!
Segher