llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-backend-x86 Author: BaiXilin (BaiXilin) <details> <summary>Changes</summary> Fixed the argument types of the following intrinsics to match with the ISA: - vpdpwssd_128, vpdpwssd_256, vpdpwssd_512, - vpdpwssds_128, vpdpwssds_256, vpdpwssds_512 - vpdpwsud_128, vpdpwsud_256, vpdowsud_512 - vpdpwsuds_128, vpdpwsuds_256, vpdpwsuds_512 - vpdpwusd_128, vpdpwusd_256, vpdpwusd_512 - vpdpwusds_128, vpdpwusds_256, vpdpwusds_512 - vpdpwuud_128, vpdpwuud_256, vpdpwuud_512 - vpdpwuuds_128, vpdpwuuds_256, vpdpwuuds_512 Fixes part of #<!-- -->97271. Note that this is the last PR for the issue. --- Patch is 360.35 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/169456.diff 35 Files Affected: - (modified) clang/include/clang/Basic/BuiltinsX86.td (+24-24) - (modified) clang/lib/Headers/avx10_2_512niintrin.h (+12-12) - (modified) clang/lib/Headers/avx512vlvnniintrin.h (+9-8) - (modified) clang/lib/Headers/avx512vnniintrin.h (+4-4) - (modified) clang/lib/Headers/avxvnniint16intrin.h (+61-36) - (modified) clang/lib/Headers/avxvnniintrin.h (+8-4) - (modified) clang/test/CodeGen/X86/avx10_2_512ni-builtins.c (+18-18) - (modified) clang/test/CodeGen/X86/avx10_2ni-builtins.c (+24-24) - (modified) clang/test/CodeGen/X86/avx512vlvnni-builtins.c (+12-12) - (modified) clang/test/CodeGen/X86/avx512vnni-builtins.c (+6-6) - (modified) clang/test/CodeGen/X86/avxvnni-builtins.c (+8-8) - (modified) clang/test/CodeGen/X86/avxvnniint16-builtins.c (+12-12) - (modified) llvm/include/llvm/IR/IntrinsicsX86.td (+30-30) - (modified) llvm/lib/IR/AutoUpgrade.cpp (+153-30) - (modified) llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp (+84-18) - (modified) llvm/test/CodeGen/X86/avx10.2-intrinsic-upgrade.ll (+96) - (modified) llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll (+9-9) - (modified) llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll (+12-12) - (modified) llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics.ll (+29-29) - (modified) llvm/test/CodeGen/X86/avx512vnni-intrinsics-upgrade.ll (+32-10) - (modified) llvm/test/CodeGen/X86/avx512vnni-intrinsics.ll (+14-14) - (modified) llvm/test/CodeGen/X86/avx_vnni-intrinsics-upgrade.ll (+44) - (modified) llvm/test/CodeGen/X86/avx_vnni-intrinsics.ll (+12-12) - (added) llvm/test/CodeGen/X86/avxvnniint16-intrinsics-upgrade.ll (+185) - (modified) llvm/test/CodeGen/X86/avxvnniint16-intrinsics.ll (+36-36) - (modified) llvm/test/CodeGen/X86/stack-folding-int-avxvnni.ll (+20-20) - (modified) llvm/test/CodeGen/X86/stack-folding-int-avxvnniint16.ll (+12-12) - (modified) llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2_512ni-intrinsics.ll (+199-91) - (modified) llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2ni-intrinsics.ll (+252-108) - (modified) llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics-upgrade.ll (+36-36) - (modified) llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics.ll (+36-36) - (modified) llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics-upgrade.ll (+18-18) - (modified) llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics.ll (+18-18) - (modified) llvm/test/Instrumentation/MemorySanitizer/X86/avx_vnni-intrinsics.ll (+12-12) - (modified) llvm/test/Instrumentation/MemorySanitizer/X86/avxvnniint16-intrinsics.ll (+260-116) ``````````diff diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td index cb08e2107f072..da34d4123628f 100644 --- a/clang/include/clang/Basic/BuiltinsX86.td +++ b/clang/include/clang/Basic/BuiltinsX86.td @@ -1132,27 +1132,27 @@ let Features = "avx512vnni", Attributes = [NoThrow, Const, RequiredVectorWidth<5 } let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { - def vpdpwssd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">; + def vpdpwssd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, short>, _Vector<8, short>)">; } let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { - def vpdpwssd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">; + def vpdpwssd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, short>, _Vector<16, short>)">; } let Features = "avx512vnni", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { - def vpdpwssd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">; + def vpdpwssd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, short>, _Vector<32, short>)">; } let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { - def vpdpwssds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">; + def vpdpwssds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, short>, _Vector<8, short>)">; } let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { - def vpdpwssds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">; + def vpdpwssds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, short>, _Vector<16, short>)">; } let Features = "avx512vnni", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { - def vpdpwssds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">; + def vpdpwssds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, short>, _Vector<32, short>)">; } let Features = "avxvnniint8|avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { @@ -4325,12 +4325,12 @@ let Features = "avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<512> } let Features = "avx10.2", Attributes = [NoThrow, RequiredVectorWidth<512>] in { - def vpdpwsud512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">; - def vpdpwsuds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">; - def vpdpwusd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">; - def vpdpwusds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">; - def vpdpwuud512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">; - def vpdpwuuds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">; + def vpdpwsud512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, short>, _Vector<32, unsigned short>)">; + def vpdpwsuds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, short>, _Vector<32, unsigned short>)">; + def vpdpwusd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, unsigned short>, _Vector<32, short>)">; + def vpdpwusds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, unsigned short>, _Vector<32, short>)">; + def vpdpwuud512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, unsigned short>, _Vector<32, unsigned short>)">; + def vpdpwuuds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, unsigned short>, _Vector<32, unsigned short>)">; } let Features = "avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { @@ -4338,51 +4338,51 @@ let Features = "avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<512> } let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<128>] in { - def vpdpwsud128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">; + def vpdpwsud128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, short>, _Vector<8, unsigned short>)">; } let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<256>] in { - def vpdpwsud256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">; + def vpdpwsud256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, short>, _Vector<16, unsigned short>)">; } let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<128>] in { - def vpdpwsuds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">; + def vpdpwsuds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, short>, _Vector<8, unsigned short>)">; } let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<256>] in { - def vpdpwsuds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">; + def vpdpwsuds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, short>, _Vector<16, unsigned short>)">; } let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<128>] in { - def vpdpwusd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">; + def vpdpwusd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, unsigned short>, _Vector<8, short>)">; } let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<256>] in { - def vpdpwusd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">; + def vpdpwusd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, unsigned short>, _Vector<16, short>)">; } let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<128>] in { - def vpdpwusds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">; + def vpdpwusds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, unsigned short>, _Vector<8, short>)">; } let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<256>] in { - def vpdpwusds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">; + def vpdpwusds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, unsigned short>, _Vector<16, short>)">; } let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<128>] in { - def vpdpwuud128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">; + def vpdpwuud128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, unsigned short>, _Vector<8, unsigned short>)">; } let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<256>] in { - def vpdpwuud256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">; + def vpdpwuud256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, unsigned short>, _Vector<16, unsigned short>)">; } let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<128>] in { - def vpdpwuuds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">; + def vpdpwuuds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, unsigned short>, _Vector<8, unsigned short>)">; } let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<256>] in { - def vpdpwuuds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">; + def vpdpwuuds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, unsigned short>, _Vector<16, unsigned short>)">; } let Features = "avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { diff --git a/clang/lib/Headers/avx10_2_512niintrin.h b/clang/lib/Headers/avx10_2_512niintrin.h index fdb57c7c9e27b..b2215b72c57bc 100644 --- a/clang/lib/Headers/avx10_2_512niintrin.h +++ b/clang/lib/Headers/avx10_2_512niintrin.h @@ -185,8 +185,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbuuds_epi32( static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwsud_epi32(__m512i __A, __m512i __B, __m512i __C) { - return (__m512i)__builtin_ia32_vpdpwsud512((__v16si)__A, (__v16si)__B, - (__v16si)__C); + return (__m512i)__builtin_ia32_vpdpwsud512((__v16si)__A, (__v32hi)__B, + (__v32hu)__C); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -206,8 +206,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwsud_epi32( static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwsuds_epi32(__m512i __A, __m512i __B, __m512i __C) { - return (__m512i)__builtin_ia32_vpdpwsuds512((__v16si)__A, (__v16si)__B, - (__v16si)__C); + return (__m512i)__builtin_ia32_vpdpwsuds512((__v16si)__A, (__v32hi)__B, + (__v32hu)__C); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpwsuds_epi32( @@ -227,8 +227,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwsuds_epi32( static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwusd_epi32(__m512i __A, __m512i __B, __m512i __C) { - return (__m512i)__builtin_ia32_vpdpwusd512((__v16si)__A, (__v16si)__B, - (__v16si)__C); + return (__m512i)__builtin_ia32_vpdpwusd512((__v16si)__A, (__v32hu)__B, + (__v32hi)__C); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -248,8 +248,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwusd_epi32( static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwusds_epi32(__m512i __A, __m512i __B, __m512i __C) { - return (__m512i)__builtin_ia32_vpdpwusds512((__v16si)__A, (__v16si)__B, - (__v16si)__C); + return (__m512i)__builtin_ia32_vpdpwusds512((__v16si)__A, (__v32hu)__B, + (__v32hi)__C); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpwusds_epi32( @@ -269,8 +269,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwusds_epi32( static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwuud_epi32(__m512i __A, __m512i __B, __m512i __C) { - return (__m512i)__builtin_ia32_vpdpwuud512((__v16si)__A, (__v16si)__B, - (__v16si)__C); + return (__m512i)__builtin_ia32_vpdpwuud512((__v16si)__A, (__v32hu)__B, + (__v32hu)__C); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -290,8 +290,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwuud_epi32( static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwuuds_epi32(__m512i __A, __m512i __B, __m512i __C) { - return (__m512i)__builtin_ia32_vpdpwuuds512((__v16si)__A, (__v16si)__B, - (__v16si)__C); + return (__m512i)__builtin_ia32_vpdpwuuds512((__v16si)__A, (__v32hu)__B, + (__v32hu)__C); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpwuuds_epi32( diff --git a/clang/lib/Headers/avx512vlvnniintrin.h b/clang/lib/Headers/avx512vlvnniintrin.h index a1a0338a69e0d..4b8a199af32e5 100644 --- a/clang/lib/Headers/avx512vlvnniintrin.h +++ b/clang/lib/Headers/avx512vlvnniintrin.h @@ -80,8 +80,8 @@ /// ENDFOR /// DST[MAX:256] := 0 /// \endcode -#define _mm256_dpwssd_epi32(S, A, B) \ - ((__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v8si)(A), (__v8si)(B))) +#define _mm256_dpwssd_epi32(S, A, B) \ + ((__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v16hi)(A), (__v16hi)(B))) /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with /// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit @@ -98,8 +98,9 @@ /// ENDFOR /// DST[MAX:256] := 0 /// \endcode -#define _mm256_dpwssds_epi32(S, A, B) \ - ((__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v8si)(A), (__v8si)(B))) +#define _mm256_dpwssds_epi32(S, A, B) \ + ((__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v16hi)(A), \ + (__v16hi)(B))) /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with /// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed @@ -157,8 +158,8 @@ /// ENDFOR /// DST[MAX:128] := 0 /// \endcode -#define _mm_dpwssd_epi32(S, A, B) \ - ((__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v4si)(A), (__v4si)(B))) +#define _mm_dpwssd_epi32(S, A, B) \ + ((__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v8hi)(A), (__v8hi)(B))) /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with /// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit @@ -175,8 +176,8 @@ /// ENDFOR /// DST[MAX:128] := 0 /// \endcode -#define _mm_dpwssds_epi32(S, A, B) \ - ((__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v4si)(A), (__v4si)(B))) +#define _mm_dpwssds_epi32(S, A, B) \ + ((__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v8hi)(A), (__v8hi)(B))) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) diff --git a/clang/lib/Headers/avx512vnniintrin.h b/clang/lib/Headers/avx512vnniintrin.h index c386923360de6..2ce88efe4a04f 100644 --- a/clang/lib/Headers/avx512vnniintrin.h +++ b/clang/lib/Headers/avx512vnniintrin.h @@ -68,8 +68,8 @@ _mm512_maskz_dpbusds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwssd_epi32(__m512i __S, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_vpdpwssd512((__v16si)__S, (__v16si)__A, - (__v16si)__B); + return (__m512i)__builtin_ia32_vpdpwssd512((__v16si)__S, (__v32hi)__A, + (__v32hi)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -91,8 +91,8 @@ _mm512_maskz_dpwssd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwssds_epi32(__m512i __S, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_vpdpwssds512((__v16si)__S, (__v16si)__A, - (__v16si)__B); + return (__m512i)__builtin_ia32_vpdpwssds512((__v16si)__S, (__v32hi)__A, + (__v32hi)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS diff --git a/clang/lib/Headers/avxvnniint16intrin.h b/clang/lib/Headers/avxvnniint16intrin.h index 805d249911c17..801795ac543dd 100644 --- a/clang/lib/Headers/avxvnniint16intrin.h +++ b/clang/lib/Headers/avxvnniint16intrin.h @@ -15,6 +15,7 @@ #ifndef __AVXVNNIINT16INTRIN_H #define __AVXVNNIINT16INTRIN_H +// clang-format off /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate /// signed 16-bit results. Sum these 2 results with the corresponding @@ -45,10 +46,12 @@ /// ENDFOR /// dst[MAX:128] := 0 /// \endcode +// clang-format on #define _mm_dpwsud_epi32(__W, __A, __B) \ - ((__m128i)__builtin_ia32_vpdpwsud128((__v4si)(__W), (__v4si)(__A), \ - (__v4si)(__B))) + ((__m128i)__builtin_ia32_vpdpwsud128((__v4si)(__W), (__v8hi)(__A), \ + (__v8hu)(__B))) +// clang-format off /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate /// signed 16-bit results. Sum these 2 results with the corresponding @@ -79,10 +82,12 @@ /// ENDFOR /// dst[MAX:256] := 0 /// \endcode +// clang-format on #define _mm256_dpwsud_epi32(__W, __A, __B) \ - ((__m256i)__builtin_ia32_vpdpwsud256((__v8si)(__W), (__v8si)(__A), \ - (__v8si)(__B))) + ((__m256i)__builtin_ia32_vpdpwsud256((__v8si)(__W), (__v16hi)(__A), \ + (__v16hu)(__B))) +// clang-format off /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate /// signed 16-bit results. Sum these 2 results with the corresponding @@ -114,10 +119,13 @@ /// ENDFOR /// dst[MAX:128] := 0 /// \endcode +// clang-format on +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with #define _mm_dpwsuds_epi32(__W, __A, __B) \ - ((__m128i)__builtin_ia32_vpdpwsuds128((__v4si)(__W), (__v4si)(__A), \ - (__v4si)(__B))) + ((__m128i)__builtin_ia32_vpdpwsuds128((__v4si)(__W), (__v8hi)(__A), \ + (__v8hu)(__B))) +// clang-format off /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate /// signed 16-bit results. Sum these 2 results with the corresponding @@ -149,10 +157,12 @@ /// ENDFOR /// dst[MAX:256] := 0 /// \endcode +// clang-format on #define _mm256_dpwsuds_epi32(__W, __A, __B) \ - ((__m256i)__builtin_ia32_vpdpwsuds256((__v8si)(__W), (__v8si)(__A), \ - (__v8si)(__B))) + ((__m256i)__builtin_ia32_vpdpwsuds256((__v8si)(__W), (__v16hi)(__A), \ + (__v16hu)(__B))) +// clang-format off /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with /// corresponding signed 16-bit integers in \a __B, producing 2 intermediate /// signed 16-bit results. Sum these 2 results with the corresponding @@ -183,10 +193,12 @@ /// ENDFOR /// dst[MAX:128] := 0 /// \endcode +// clang-format on #define _mm_dpwusd_epi32(__W, __A, __B) \ - ((__m128i)__builtin_ia32_vpdpwusd128((__v4si)(__W), (__v4si)(__A), \ - (__v4si)(__B))) + ((__m128i)__builtin_ia32_vpdpwusd128((__v4si)(__W), (__v8hu)(__A), \ + (__v8hi)(__B))) +// clang-format off /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with /// corresponding signed 16-bit integers in \a __B, producing 2 intermediate /// signed 16-bit results. Sum these 2 results with the corresponding @@ -217,10 +229,12 @@ /// ENDFOR /// dst[MAX:256] := 0 /// \endcode +// clang-format on #define _mm256_dpwusd_epi32(__W, __A, __B) \ - ((__m256i)__builtin_ia32_vpdpwusd256((__v8si)(__W), (__v8si)(__A), \ - (_... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/169456 _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
