llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-compiler-rt-sanitizer Author: BaiXilin (BaiXilin) <details> <summary>Changes</summary> Fixed intrinsic VPDP[SS,SU,UU]D[,S]_128/256/512's argument types to match with the ISA. Fixes part of #<!-- -->97271. --- Patch is 278.91 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/159222.diff 17 Files Affected: - (modified) clang/include/clang/Basic/BuiltinsX86.td (+18-18) - (modified) clang/lib/Headers/avx10_2_512niintrin.h (+12-12) - (modified) clang/lib/Headers/avxvnniint8intrin.h (+56-32) - (modified) clang/test/CodeGen/X86/avxvnniint8-builtins.c (+12-12) - (modified) llvm/include/llvm/IR/IntrinsicsX86.td (+18-18) - (modified) llvm/lib/IR/AutoUpgrade.cpp (+63-1) - (modified) llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp (+13-1) - (added) llvm/test/CodeGen/X86/avx10.2-intrinsic-upgrade.ll (+291) - (modified) llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll (+27-27) - (modified) llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll (+36-36) - (modified) llvm/test/CodeGen/X86/avxvnniint8-intrinsics.ll (+60-60) - (modified) llvm/test/CodeGen/X86/stack-folding-int-avxvnniint8.ll (+60-60) - (modified) llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2_512ni-intrinsics.ll (+178-118) - (modified) llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2ni-intrinsics.ll (+216-136) - (modified) llvm/test/Instrumentation/MemorySanitizer/X86/avxvnniint8-intrinsics.ll (+408-248) - (modified) mlir/include/mlir/Dialect/X86Vector/X86Vector.td (-5) - (modified) mlir/lib/Dialect/X86Vector/IR/X86VectorDialect.cpp (-23) ``````````diff diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td index aac502091b57e..3be3a5e375ae8 100644 --- a/clang/include/clang/Basic/BuiltinsX86.td +++ b/clang/include/clang/Basic/BuiltinsX86.td @@ -1106,51 +1106,51 @@ let Features = "avx512vnni", Attributes = [NoThrow, Const, RequiredVectorWidth<5 } let Features = "avxvnniint8|avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { - def vpdpbssd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">; + def vpdpbssd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<16, char>, _Vector<16, char>)">; } let Features = "avxvnniint8|avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { - def vpdpbssd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">; + def vpdpbssd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<32, char>, _Vector<32, char>)">; } let Features = "avxvnniint8|avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { - def vpdpbssds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">; + def vpdpbssds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<16, char>, _Vector<16, char>)">; } let Features = "avxvnniint8|avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { - def vpdpbssds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">; + def vpdpbssds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<32, char>, _Vector<32, char>)">; } let Features = "avxvnniint8|avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { - def vpdpbsud128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">; + def vpdpbsud128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<16, char>, _Vector<16, unsigned char>)">; } let Features = "avxvnniint8|avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { - def vpdpbsud256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">; + def vpdpbsud256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<32, char>, _Vector<32, unsigned char>)">; } let Features = "avxvnniint8|avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { - def vpdpbsuds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">; + def vpdpbsuds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<16, char>, _Vector<16, unsigned char>)">; } let Features = "avxvnniint8|avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { - def vpdpbsuds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">; + def vpdpbsuds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<32, char>, _Vector<32, unsigned char>)">; } let Features = "avxvnniint8|avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { - def vpdpbuud128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">; + def vpdpbuud128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<16, unsigned char>, _Vector<16, unsigned char>)">; } let Features = "avxvnniint8|avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { - def vpdpbuud256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">; + def vpdpbuud256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<32, unsigned char>, _Vector<32, unsigned char>)">; } let Features = "avxvnniint8|avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { - def vpdpbuuds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">; + def vpdpbuuds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<16, unsigned char>, _Vector<16, unsigned char>)">; } let Features = "avxvnniint8|avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { - def vpdpbuuds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">; + def vpdpbuuds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<32, unsigned char>, _Vector<32, unsigned char>)">; } let Features = "movrs", Attributes = [NoThrow, Const] in { @@ -4276,12 +4276,12 @@ let Features = "avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<256> let Features = "avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { def vdpphps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<32, _Float16>, _Vector<32, _Float16>)">; - def vpdpbssd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">; - def vpdpbssds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">; - def vpdpbsud512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">; - def vpdpbsuds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">; - def vpdpbuud512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">; - def vpdpbuuds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">; + def vpdpbssd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<64, char>, _Vector<64, char>)">; + def vpdpbssds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<64, char>, _Vector<64, char>)">; + def vpdpbsud512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<64, char>, _Vector<64, unsigned char>)">; + def vpdpbsuds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<64, char>, _Vector<64, unsigned char>)">; + def vpdpbuud512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<64, unsigned char>, _Vector<64, unsigned char>)">; + def vpdpbuuds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<64, unsigned char>, _Vector<64, unsigned char>)">; } let Features = "avx10.2", Attributes = [NoThrow, RequiredVectorWidth<512>] in { diff --git a/clang/lib/Headers/avx10_2_512niintrin.h b/clang/lib/Headers/avx10_2_512niintrin.h index 67679fce82296..fdb57c7c9e27b 100644 --- a/clang/lib/Headers/avx10_2_512niintrin.h +++ b/clang/lib/Headers/avx10_2_512niintrin.h @@ -64,8 +64,8 @@ static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_maskz_dpph_ps(__mmask16 __U, static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbssd_epi32(__m512i __W, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_vpdpbssd512((__v16si)__W, (__v16si)__A, - (__v16si)__B); + return (__m512i)__builtin_ia32_vpdpbssd512((__v16si)__W, (__v64qi)__A, + (__v64qi)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -84,8 +84,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbssd_epi32( static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbssds_epi32(__m512i __W, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_vpdpbssds512((__v16si)__W, (__v16si)__A, - (__v16si)__B); + return (__m512i)__builtin_ia32_vpdpbssds512((__v16si)__W, (__v64qi)__A, + (__v64qi)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpbssds_epi32( @@ -104,8 +104,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbssds_epi32( static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbsud_epi32(__m512i __W, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_vpdpbsud512((__v16si)__W, (__v16si)__A, - (__v16si)__B); + return (__m512i)__builtin_ia32_vpdpbsud512((__v16si)__W, (__v64qi)__A, + (__v64qu)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -124,8 +124,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbsud_epi32( static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbsuds_epi32(__m512i __W, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_vpdpbsuds512((__v16si)__W, (__v16si)__A, - (__v16si)__B); + return (__m512i)__builtin_ia32_vpdpbsuds512((__v16si)__W, (__v64qi)__A, + (__v64qu)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpbsuds_epi32( @@ -144,8 +144,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbsuds_epi32( static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbuud_epi32(__m512i __W, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_vpdpbuud512((__v16si)__W, (__v16si)__A, - (__v16si)__B); + return (__m512i)__builtin_ia32_vpdpbuud512((__v16si)__W, (__v64qu)__A, + (__v64qu)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -164,8 +164,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbuud_epi32( static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbuuds_epi32(__m512i __W, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_vpdpbuuds512((__v16si)__W, (__v16si)__A, - (__v16si)__B); + return (__m512i)__builtin_ia32_vpdpbuuds512((__v16si)__W, (__v64qu)__A, + (__v64qu)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpbuuds_epi32( diff --git a/clang/lib/Headers/avxvnniint8intrin.h b/clang/lib/Headers/avxvnniint8intrin.h index c211620c68f07..858b66b138f31 100644 --- a/clang/lib/Headers/avxvnniint8intrin.h +++ b/clang/lib/Headers/avxvnniint8intrin.h @@ -14,6 +14,7 @@ #ifndef __AVXVNNIINT8INTRIN_H #define __AVXVNNIINT8INTRIN_H +// clang-format off /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate /// signed 16-bit results. Sum these 4 results with the corresponding @@ -44,10 +45,12 @@ /// ENDFOR /// dst[MAX:128] := 0 /// \endcode +// clang-format on #define _mm_dpbssd_epi32(__W, __A, __B) \ - ((__m128i)__builtin_ia32_vpdpbssd128((__v4si)(__W), (__v4si)(__A), \ - (__v4si)(__B))) + ((__m128i)__builtin_ia32_vpdpbssd128((__v4si)(__W), (__v16qi)(__A), \ + (__v16qi)(__B))) +// clang-format off /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate /// signed 16-bit results. Sum these 4 results with the corresponding @@ -78,10 +81,12 @@ /// ENDFOR /// dst[MAX:256] := 0 /// \endcode +// clang-format on #define _mm256_dpbssd_epi32(__W, __A, __B) \ - ((__m256i)__builtin_ia32_vpdpbssd256((__v8si)(__W), (__v8si)(__A), \ - (__v8si)(__B))) + ((__m256i)__builtin_ia32_vpdpbssd256((__v8si)(__W), (__v32qi)(__A), \ + (__v32qi)(__B))) +// clang-format off /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate /// signed 16-bit results. Sum these 4 results with the corresponding @@ -94,7 +99,7 @@ /// _mm_dpbssds_epi32( __m128i __W, __m128i __A, __m128i __B); /// \endcode /// -/// This intrinsic corresponds to the \c VPDPBSSD instruction. +/// This intrinsic corresponds to the \c VPDPBSSDS instruction. /// /// \param __A /// A 128-bit vector of [16 x char]. @@ -113,10 +118,12 @@ /// ENDFOR /// dst[MAX:128] := 0 /// \endcode +// clang-format on #define _mm_dpbssds_epi32(__W, __A, __B) \ - ((__m128i)__builtin_ia32_vpdpbssds128((__v4si)(__W), (__v4si)(__A), \ - (__v4si)(__B))) + ((__m128i)__builtin_ia32_vpdpbssds128((__v4si)(__W), (__v16qi)(__A), \ + (__v16qi)(__B))) +// clang-format off /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate /// signed 16-bit results. Sum these 4 results with the corresponding @@ -129,7 +136,7 @@ /// _mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B); /// \endcode /// -/// This intrinsic corresponds to the \c VPDPBSSD instruction. +/// This intrinsic corresponds to the \c VPDPBSSDS instruction. /// /// \param __A /// A 256-bit vector of [32 x char]. @@ -148,10 +155,12 @@ /// ENDFOR /// dst[MAX:256] := 0 /// \endcode +// clang-format on #define _mm256_dpbssds_epi32(__W, __A, __B) \ - ((__m256i)__builtin_ia32_vpdpbssds256((__v8si)(__W), (__v8si)(__A), \ - (__v8si)(__B))) + ((__m256i)__builtin_ia32_vpdpbssds256((__v8si)(__W), (__v32qi)(__A), \ + (__v32qi)(__B))) +// clang-format off /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate /// signed 16-bit results. Sum these 4 results with the corresponding @@ -163,7 +172,7 @@ /// _mm_dpbsud_epi32(__m128i __W, __m128i __A, __m128i __B); /// \endcode /// -/// This intrinsic corresponds to the \c VPDPBSSD instruction. +/// This intrinsic corresponds to the \c VPDPBSUD instruction. /// /// \param __A /// A 128-bit vector of [16 x char]. @@ -182,10 +191,12 @@ /// ENDFOR /// dst[MAX:128] := 0 /// \endcode +// clang-format on #define _mm_dpbsud_epi32(__W, __A, __B) \ - ((__m128i)__builtin_ia32_vpdpbsud128((__v4si)(__W), (__v4si)(__A), \ - (__v4si)(__B))) + ((__m128i)__builtin_ia32_vpdpbsud128((__v4si)(__W), (__v16qi)(__A), \ + (__v16qu)(__B))) +// clang-format off /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate /// signed 16-bit results. Sum these 4 results with the corresponding @@ -197,7 +208,7 @@ /// _mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B); /// \endcode /// -/// This intrinsic corresponds to the \c VPDPBSSD instruction. +/// This intrinsic corresponds to the \c VPDPBSUD instruction. /// /// \param __A /// A 256-bit vector of [32 x char]. @@ -216,10 +227,12 @@ /// ENDFOR /// dst[MAX:256] := 0 /// \endcode +// clang-format on #define _mm256_dpbsud_epi32(__W, __A, __B) \ - ((__m256i)__builtin_ia32_vpdpbsud256((__v8si)(__W), (__v8si)(__A), \ - (__v8si)(__B))) + ((__m256i)__builtin_ia32_vpdpbsud256((__v8si)(__W), (__v32qi)(__A), \ + (__v32qu)(__B))) +// clang-format off /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate /// signed 16-bit results. Sum these 4 results with the corresponding @@ -232,7 +245,7 @@ /// _mm_dpbsuds_epi32( __m128i __W, __m128i __A, __m128i __B); /// \endcode /// -/// This intrinsic corresponds to the \c VPDPBSSD instruction. +/// This intrinsic corresponds to the \c VPDPBSUDS instruction. /// /// \param __A /// A 128-bit vector of [16 x char]. @@ -251,10 +264,12 @@ /// ENDFOR /// dst[MAX:128] := 0 /// \endcode +// clang-format on #define _mm_dpbsuds_epi32(__W, __A, __B) \ - ((__m128i)__builtin_ia32_vpdpbsuds128((__v4si)(__W), (__v4si)(__A), \ - (__v4si)(__B))) + ((__m128i)__builtin_ia32_vpdpbsuds128((__v4si)(__W), (__v16qi)(__A), \ + (__v16qu)(__B))) +// clang-format off /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate /// signed 16-bit results. Sum these 4 results with the corresponding @@ -267,7 +282,7 @@ /// _mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B); /// \endcode /// -/// This intrinsic corresponds to the \c VPDPBSSD instruction. +/// This intrinsic corresponds to the \c VPDPBSUDS instruction. /// /// \param __A /// A 256-bit vector of [32 x char]. @@ -286,10 +301,12 @@ /// ENDFOR /// dst[MAX:256] := 0 /// \endcode +// clang-format on #define _mm256_dpbsuds_epi32(__W, __A, __B) \ - ((__m256i)__builtin_ia32_vpdpbsuds256((__v8si)(__W), (__v8si)(__A), \ - (__v8si)(__B))) + ((__m256i)__builtin_ia32_vpdpbsuds256((__v8si)(__W), (__v32qi)(__A), \ + (__v32qu)(__B))) +// clang-format off /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate /// signed 16-bit results. Sum these 4 results with the corresponding @@ -301,7 +318,7 @@ /// _mm_dpbuud_epi32(__m128i __W, __m128i __A, __m128i __B); /// \endcode /// -/// This intrinsic corresponds to the \c VPDPBSSD instruction. +/// This intrinsic corresponds to the \c VPDPBUUD instruction. /// /// \param __A /// A 128-bit vector of [16 x unsigned char]. @@ -320,10 +337,12 @@ /// ENDFOR /// dst[MAX:128] := 0 /// \endcode +// clang-format on #define _mm_dpbuud_epi32(__W, __A, __B) \ - ((__m128i)__builtin_ia32_vpdpbuud128((__v4si)(__W), (__v4si)(__A), \ - (__v4si)(__B))) + ((__m128i)__builtin_ia32_vpdpbuud128((__v4si)(__W), (__v16qu)(__A), \ + (__v16qu)(__B))) +// clang-format off /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate /// signed 16-bit results. Sum these 4 results with the corresponding @@ -335,7 +354,7 @@ /// _mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B); /// \endcode /// -/// This intrinsic corresponds to the \c VPDPBSSD instruction. +/// This intrinsic corresponds to the \c VPDPBUUD instruction. /// /// \param __A /// A 256-bit vector of [32 x unsigned char]. @@ -354,10 +373,12 @@ /// ENDFOR /// dst[MAX:256] := 0 /// \endcode +// clang-format on #define _mm256_dpbuud_epi32(__W, __A, __B) \ - ((__m256i)__builtin_ia32_vpdpbuud256((__v8si)(__W), (__v8si)(__A), \ - (__v8si)(__B))) + ((__m256i)__builtin_ia32_vpdpbuud256((__v8si)(__W), (__v32qu)(__A), \ + (__v32qu)(__B))) +// clang-format off /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate /// signed 16-bit results. Sum these 4 results with the corresponding @@ -389,10 +410,12 @@ /// ENDFOR /// dst[MAX:128] := 0 /// \endcode +// clang-format on #define _mm_dpbuuds_epi32(__W, __A, __B) ... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/159222 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits