Re: [PATCH] i386: Fix vpblendm{b,w} intrins and insns

2023-04-18 Thread Hongtao Liu via Gcc-patches
On Tue, Apr 18, 2023 at 3:15 PM Haochen Jiang via Gcc-patches
 wrote:
>
> Hi all,
>
> For vpblendm{b,w}, they actually do not have constant parameters.
> Therefore, there is no need for them been wrapped in __OPTIMIZE__.
>
> Also, we should check TARGET_AVX512VL for 128/256 bit vectors in patterns.
>
> This patch did the fixes mentioned above. Tested on x86_64-pc-linux-gnu.
> Ok for trunk?
Ok.
>
> BRs,
> Haochen
>
> gcc/ChangeLog:
>
> * config/i386/avx512vlbwintrin.h
> (_mm_mask_blend_epi16): Remove __OPTIMIZE__ wrapper.
> (_mm_mask_blend_epi8): Ditto.
> (_mm256_mask_blend_epi16): Ditto.
> (_mm256_mask_blend_epi8): Ditto.
> * config/i386/avx512vlintrin.h
> (_mm256_mask_blend_pd): Ditto.
> (_mm256_mask_blend_ps): Ditto.
> (_mm256_mask_blend_epi64): Ditto.
> (_mm256_mask_blend_epi32): Ditto.
> (_mm_mask_blend_pd): Ditto.
> (_mm_mask_blend_ps): Ditto.
> (_mm_mask_blend_epi64): Ditto.
> (_mm_mask_blend_epi32): Ditto.
> * config/i386/sse.md (VF_AVX512BWHFBF16): Removed.
> (VF_AVX512HFBFVL): Move it before the first usage.
> (_blendm): Change iterator from VF_AVX512BWHFBF16
> to VF_AVX512HFBFVL.
> ---
>  gcc/config/i386/avx512vlbwintrin.h |  92 ++-
>  gcc/config/i386/avx512vlintrin.h   | 184 +++--
>  gcc/config/i386/sse.md |  17 ++-
>  3 files changed, 115 insertions(+), 178 deletions(-)
>
> diff --git a/gcc/config/i386/avx512vlbwintrin.h 
> b/gcc/config/i386/avx512vlbwintrin.h
> index 0232783a362..9d2aba2a8ff 100644
> --- a/gcc/config/i386/avx512vlbwintrin.h
> +++ b/gcc/config/i386/avx512vlbwintrin.h
> @@ -257,6 +257,42 @@ _mm_maskz_loadu_epi8 (__mmask16 __U, void const *__P)
>  (__mmask16) __U);
>  }
>
> +extern __inline __m128i
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_mask_blend_epi16 (__mmask8 __U, __m128i __A, __m128i __W)
> +{
> +  return (__m128i) __builtin_ia32_blendmw_128_mask ((__v8hi) __A,
> +   (__v8hi) __W,
> +   (__mmask8) __U);
> +}
> +
> +extern __inline __m128i
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_mask_blend_epi8 (__mmask16 __U, __m128i __A, __m128i __W)
> +{
> +  return (__m128i) __builtin_ia32_blendmb_128_mask ((__v16qi) __A,
> +   (__v16qi) __W,
> +   (__mmask16) __U);
> +}
> +
> +extern __inline __m256i
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_mask_blend_epi16 (__mmask16 __U, __m256i __A, __m256i __W)
> +{
> +  return (__m256i) __builtin_ia32_blendmw_256_mask ((__v16hi) __A,
> +   (__v16hi) __W,
> +   (__mmask16) __U);
> +}
> +
> +extern __inline __m256i
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_mask_blend_epi8 (__mmask32 __U, __m256i __A, __m256i __W)
> +{
> +  return (__m256i) __builtin_ia32_blendmb_256_mask ((__v32qi) __A,
> +   (__v32qi) __W,
> +   (__mmask32) __U);
> +}
> +
>  extern __inline __m128i
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
>  _mm256_cvtepi16_epi8 (__m256i __A)
> @@ -1442,42 +1478,6 @@ _mm_maskz_dbsad_epu8 (__mmask8 __U, __m128i __A, 
> __m128i __B,
> (__mmask8) __U);
>  }
>
> -extern __inline __m128i
> -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm_mask_blend_epi16 (__mmask8 __U, __m128i __A, __m128i __W)
> -{
> -  return (__m128i) __builtin_ia32_blendmw_128_mask ((__v8hi) __A,
> -   (__v8hi) __W,
> -   (__mmask8) __U);
> -}
> -
> -extern __inline __m128i
> -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm_mask_blend_epi8 (__mmask16 __U, __m128i __A, __m128i __W)
> -{
> -  return (__m128i) __builtin_ia32_blendmb_128_mask ((__v16qi) __A,
> -   (__v16qi) __W,
> -   (__mmask16) __U);
> -}
> -
> -extern __inline __m256i
> -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_mask_blend_epi16 (__mmask16 __U, __m256i __A, __m256i __W)
> -{
> -  return (__m256i) __builtin_ia32_blendmw_256_mask ((__v16hi) __A,
> -   (__v16hi) __W,
> -   (__mmask16) __U);
> -}
> -
> -extern __inline __m256i
> -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> 

[PATCH] i386: Fix vpblendm{b,w} intrins and insns

2023-04-18 Thread Haochen Jiang via Gcc-patches
Hi all,

The vpblendm{b,w} intrinsics do not actually take constant parameters.
Therefore, there is no need for them to be wrapped in __OPTIMIZE__.

Also, we should check TARGET_AVX512VL for 128/256 bit vectors in patterns.

This patch makes the fixes mentioned above. Tested on x86_64-pc-linux-gnu.
Ok for trunk?

BRs,
Haochen

gcc/ChangeLog:

* config/i386/avx512vlbwintrin.h
(_mm_mask_blend_epi16): Remove __OPTIMIZE__ wrapper.
(_mm_mask_blend_epi8): Ditto.
(_mm256_mask_blend_epi16): Ditto.
(_mm256_mask_blend_epi8): Ditto.
* config/i386/avx512vlintrin.h
(_mm256_mask_blend_pd): Ditto.
(_mm256_mask_blend_ps): Ditto.
(_mm256_mask_blend_epi64): Ditto.
(_mm256_mask_blend_epi32): Ditto.
(_mm_mask_blend_pd): Ditto.
(_mm_mask_blend_ps): Ditto.
(_mm_mask_blend_epi64): Ditto.
(_mm_mask_blend_epi32): Ditto.
* config/i386/sse.md (VF_AVX512BWHFBF16): Removed.
(VF_AVX512HFBFVL): Move it before the first usage.
(_blendm): Change iterator from VF_AVX512BWHFBF16
to VF_AVX512HFBFVL.
---
 gcc/config/i386/avx512vlbwintrin.h |  92 ++-
 gcc/config/i386/avx512vlintrin.h   | 184 +++--
 gcc/config/i386/sse.md |  17 ++-
 3 files changed, 115 insertions(+), 178 deletions(-)

diff --git a/gcc/config/i386/avx512vlbwintrin.h b/gcc/config/i386/avx512vlbwintrin.h
index 0232783a362..9d2aba2a8ff 100644
--- a/gcc/config/i386/avx512vlbwintrin.h
+++ b/gcc/config/i386/avx512vlbwintrin.h
@@ -257,6 +257,42 @@ _mm_maskz_loadu_epi8 (__mmask16 __U, void const *__P)
 (__mmask16) __U);
 }
 
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_blend_epi16 (__mmask8 __U, __m128i __A, __m128i __W)
+{
+  return (__m128i) __builtin_ia32_blendmw_128_mask ((__v8hi) __A,
+   (__v8hi) __W,
+   (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_blend_epi8 (__mmask16 __U, __m128i __A, __m128i __W)
+{
+  return (__m128i) __builtin_ia32_blendmb_128_mask ((__v16qi) __A,
+   (__v16qi) __W,
+   (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_blend_epi16 (__mmask16 __U, __m256i __A, __m256i __W)
+{
+  return (__m256i) __builtin_ia32_blendmw_256_mask ((__v16hi) __A,
+   (__v16hi) __W,
+   (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_blend_epi8 (__mmask32 __U, __m256i __A, __m256i __W)
+{
+  return (__m256i) __builtin_ia32_blendmb_256_mask ((__v32qi) __A,
+   (__v32qi) __W,
+   (__mmask32) __U);
+}
+
 extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_cvtepi16_epi8 (__m256i __A)
@@ -1442,42 +1478,6 @@ _mm_maskz_dbsad_epu8 (__mmask8 __U, __m128i __A, __m128i __B,
(__mmask8) __U);
 }
 
-extern __inline __m128i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_blend_epi16 (__mmask8 __U, __m128i __A, __m128i __W)
-{
-  return (__m128i) __builtin_ia32_blendmw_128_mask ((__v8hi) __A,
-   (__v8hi) __W,
-   (__mmask8) __U);
-}
-
-extern __inline __m128i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_blend_epi8 (__mmask16 __U, __m128i __A, __m128i __W)
-{
-  return (__m128i) __builtin_ia32_blendmb_128_mask ((__v16qi) __A,
-   (__v16qi) __W,
-   (__mmask16) __U);
-}
-
-extern __inline __m256i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_blend_epi16 (__mmask16 __U, __m256i __A, __m256i __W)
-{
-  return (__m256i) __builtin_ia32_blendmw_256_mask ((__v16hi) __A,
-   (__v16hi) __W,
-   (__mmask16) __U);
-}
-
-extern __inline __m256i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_blend_epi8 (__mmask32 __U, __m256i __A, __m256i __W)
-{
-  return (__m256i) __builtin_ia32_blendmb_256_mask ((__v32qi) __A,
-   (__v32qi) __W,
-   (__mmask32) __U);
-}
-
 extern __inline __mmask8