Hi!

On Tue, Oct 17, 2017 at 01:24:45PM -0500, Steven Munroe wrote:
> Some inline assembler is required. There a several cases where we need
> to generate Data Cache Block instruction. There are no existing builtin
> for flush and touch for store transient.
Would builtins for those help?  Would anything else want to use such
builtins, I mean?

> +   For PowerISA Scalar double in FPRs (left most 64-bits of the
> +   low 32 VSRs), while X86_64 SSE2 uses the right most 64-bits of
> +   the XMM. These differences require extra steps on POWER to match
> +   the SSE2 scalar double semantics.

Maybe say "is in FPRs"?  (And two spaces after a full stop, here and
elsewhere).

> +/* We need definitions from the SSE header files*/

Dot space space.

> +/* Sets the low DPFP value of A from the low value of B. */
> +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_move_sd (__m128d __A, __m128d __B)
> +{
> +#if 1
> +  __v2df result = (__v2df) __A;
> +  result [0] = ((__v2df) __B)[0];
> +  return (__m128d) result;
> +#else
> +  return (vec_xxpermdi(__A, __B, 1));
> +#endif
> +}

You probably forgot to finish this?  Or, what are the two versions, and
why are they both here?  Same question later a few times.

> +/* Add the lower double-precision (64-bit) floating-point element in
> + * a and b, store the result in the lower element of dst, and copy
> + * the upper element from a to the upper element of dst. */

No leading stars on block comments please.

> +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_cmpnge_pd (__m128d __A, __m128d __B)
> +{
> +  return ((__m128d)vec_cmplt ((__v2df ) __A, (__v2df ) __B));
> +}

You have some spaces before closing parentheses here (and elsewhere --
please check).

> +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_cvtpd_epi32 (__m128d __A)
> +{
> +  __v2df rounded = vec_rint (__A);
> +  __v4si result, temp;
> +  const __v4si vzero =
> +    { 0, 0, 0, 0 };
> +
> +  /* VSX Vector truncate Double-Precision to integer and Convert to
> +     Signed Integer Word format with Saturate. */
> +  __asm__(
> +      "xvcvdpsxws %x0,%x1;\n"
> +      : "=wa" (temp)
> +      : "wa" (rounded)
> +      : );

Why the ";\n"?  And no empty clobber list please.  (There is a short
sketch of what I mean at the end of this mail.)

> +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_cvtps_pd (__m128 __A)
> +{
> +  /* Check if vec_doubleh is defined by <altivec.h>. If so use that. */
> +#ifdef vec_doubleh
> +  return (__m128d) vec_doubleh ((__v4sf)__A);
> +#else
> +  /* Otherwise the compiler is not current and so need to generate the
> +     equivalent code. */

Do we need this?  The compiler will always be current.

> +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_loadl_pd (__m128d __A, double const *__B)
> +{
> +  __v2df result = (__v2df)__A;
> +  result [0] = *__B;
> +  return (__m128d)result;
> +}
> +#ifdef _ARCH_PWR8
> +/* Intrinsic functions that require PowerISA 2.07 minimum. */

You want an empty line before that #ifdef.

> +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_movemask_pd (__m128d __A)
> +{
> +  __vector __m64 result;
> +  static const __vector unsigned int perm_mask =
> +    {
> +#ifdef __LITTLE_ENDIAN__
> +	0x80800040, 0x80808080, 0x80808080, 0x80808080
> +#elif __BIG_ENDIAN__
> +      0x80808080, 0x80808080, 0x80808080, 0x80800040

Wrong indent in the LE case?

> +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_slli_epi16 (__m128i __A, int __B)
> +{
> +  __v8hu lshift;
> +  __v8hi result =
> +    { 0, 0, 0, 0, 0, 0, 0, 0 };

Could as well fit that on the same line.
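That is, just:

  __v8hi result = { 0, 0, 0, 0, 0, 0, 0, 0 };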
> +  if (__B < 16)
> +    {
> +      if (__builtin_constant_p(__B))
> +	{
> +	  lshift = (__v8hu) vec_splat_s16(__B);
> +	}
> +      else
> +	{
> +	  lshift = vec_splats ((unsigned short) __B);
> +	}

No blocks please, for single-line cases.

> +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_cmpeq_epi8 (__m128i __A, __m128i __B)
> +{
> +#if 1
> +  return (__m128i ) vec_cmpeq ((__v16qi) __A, (__v16qi)__B);
> +#else
> +  return (__m128i) ((__v16qi)__A == (__v16qi)__B);
> +#endif
> +}

Here's another #if 1.

> +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_sad_epu8 (__m128i __A, __m128i __B)
> +{
> +  __v16qu a, b;
> +  __v16qu vmin, vmax, vabsdiff;
> +  __v4si vsum;
> +  const __v4su zero = { 0, 0, 0, 0 };
> +  __v4si result;
> +
> +  a = (__v16qu) __A;
> +  b = (__v16qu) __B;
> +  vmin = vec_min (a, b);
> +  vmax = vec_max (a, b);
> +  vabsdiff = vec_sub (vmax, vmin);
> +  /* Sum four groups of bytes into integers. */
> +  vsum = (__vector signed int) vec_sum4s (vabsdiff, zero);
> +  /* Sum across four integers with two integer results. */
> +  result = vec_sum2s (vsum, (__vector signed int) zero);
> +  /* Rotate the sums into the correct position. */
> +#ifdef __LITTLE_ENDIAN__
> +  result = vec_sld (result, result, 4);
> +#elif __BIG_ENDIAN__
> +  result = vec_sld (result, result, 6);
> +#endif
> +  /* Rotate the sums into the correct position. */
> +  return (__m128i) result;
> +}

That last comment is a pasto I think.

> +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_stream_si32 (int *__A, int __B)
> +{
> +  /* Use the data cache block touch for store transient. */
> +  __asm__ (
> +    "	dcbtstt	0,%0"

No tabs in the asm please.

> +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_clflush (void const *__A)
> +{
> +  /* Use the data cache block flush. */
> +  __asm__ (
> +    "	dcbf	0,%0,0"

Does that syntax (with the "L" field) work with all supported
assemblers?


Looks good otherwise; please fix the nits (most happen more than once),
and it's okay for trunk.

Thanks!


Segher
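P.S.: To be concrete about the ";\n" / empty clobber nit above -- this is
only a sketch of the shape I mean, reusing the types and operand
constraints from your patch (the helper name here is made up, it is not
part of the patch):

  /* VSX convert vector double to vector signed int words, with saturation
     (xvcvdpsxws).  Sketch only: no trailing ";\n" in the asm template, and
     no clobber section at all when nothing is clobbered.  __v2df and
     __v4si are the typedefs from the posted emmintrin.h.  */
  static inline __v4si
  convert_dp_to_sxws (__v2df rounded)
  {
    __v4si temp;
    __asm__ ("xvcvdpsxws %x0,%x1"
	     : "=wa" (temp)
	     : "wa" (rounded));
    return temp;
  }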