llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-clang-codegen @llvm/pr-subscribers-backend-x86 Author: Ganesh (ganeshgit) <details> <summary>Changes</summary> This patch adds support for AVX512BMM (Bit Matrix Multiply) instruction set extension for AMD Zen 6 processors. AVX512BMM includes three instructions: - VBITREVB: Bit reverse within each byte - VBMACOR: Bit matrix multiply with OR accumulation - VBMACXOR: Bit matrix multiply with XOR accumulation The following implementations for AVX512BMM are added: - Define __AVX512BMM__ macro for znver6 - avx512bmmintrin.h, avx512bmmvlintrin.h header files - Implement _mm_bitrev_epi8, _mm256_bitrev_epi8, _mm512_bitrev_epi8 - Implement _mm256/512_bmacor16x16x16 and bmacxor intrinsics --- Patch is 64.66 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/182556.diff 28 Files Affected: - (modified) clang/docs/ReleaseNotes.rst (+14) - (modified) clang/include/clang/Basic/BuiltinsX86.td (+10) - (modified) clang/lib/Basic/Targets/X86.cpp (+6) - (modified) clang/lib/Basic/Targets/X86.h (+1) - (modified) clang/lib/CodeGen/TargetBuiltins/X86.cpp (+24) - (modified) clang/lib/Headers/CMakeLists.txt (+2) - (added) clang/lib/Headers/avx512bmmintrin.h (+176) - (added) clang/lib/Headers/avx512bmmvlintrin.h (+245) - (modified) clang/lib/Headers/immintrin.h (+4) - (modified) clang/test/CodeGen/attr-target-x86.c (+2-2) - (modified) compiler-rt/lib/builtins/cpu_model/x86.c (+3) - (modified) llvm/include/llvm/IR/IntrinsicsX86.td (+18) - (modified) llvm/include/llvm/TargetParser/X86TargetParser.def (+1) - (modified) llvm/lib/Target/X86/X86.td (+5-1) - (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+12-2) - (modified) llvm/lib/Target/X86/X86ISelLowering.h (+4) - (modified) llvm/lib/Target/X86/X86InstrAVX512.td (+61) - (modified) llvm/lib/Target/X86/X86InstrFragmentsSIMD.td (+3) - (modified) llvm/lib/Target/X86/X86InstrPredicates.td (+2) - (modified) llvm/lib/Target/X86/X86IntrinsicsInfo.h (+6) - 
(modified) llvm/lib/TargetParser/Host.cpp (+1) - (modified) llvm/lib/TargetParser/X86TargetParser.cpp (+2-1) - (added) llvm/test/CodeGen/X86/avx512bmm-vbitrevb-bitreverse.ll (+85) - (added) llvm/test/CodeGen/X86/avx512bmm-vbitrevb-intrinsics-mem.ll (+141) - (added) llvm/test/CodeGen/X86/avx512bmm-vbitrevb-intrinsics.ll (+139) - (added) llvm/test/CodeGen/X86/avx512bmm-vbmac-intrinsics.ll (+63) - (modified) llvm/test/TableGen/x86-fold-tables.inc (+33) - (modified) llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn (+2) ``````````diff diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 8d5d704c1766a..84461b2fc211b 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -335,6 +335,20 @@ NVPTX Support X86 Support ^^^^^^^^^^^ - ``march=znver6`` is now supported. +- Support ISA of ``AVX512BMM``. + * Support intrinsic of ``_mm512_bmacor16x16x16_v32hi``. + * Support intrinsic of ``_mm512_bmacxor16x16x16_v32hi``. + * Support intrinsic of ``_mm512_mask_bitrev_epi8``. + * Support intrinsic of ``_mm512_maskz_bitrev_epi8``. + * Support intrinsic of ``_mm512_bitrev_epi8``. + * Support intrinsic of ``_mm256_bmacor16x16x16_v16hi``. + * Support intrinsic of ``_mm256_bmacxor16x16x16_v16hi``. + * Support intrinsic of ``_mm_mask_bitrev_epi8``. + * Support intrinsic of ``_mm256_mask_bitrev_epi8``. + * Support intrinsic of ``_mm_maskz_bitrev_epi8``. + * Support intrinsic of ``_mm256_maskz_bitrev_epi8``. + * Support intrinsic of ``_mm_bitrev_epi8``. + * Support intrinsic of ``_mm256_bitrev_epi8``. 
Arm and AArch64 Support ^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td index 23eac47eb5e4c..0d4b40cc1791f 100644 --- a/clang/include/clang/Basic/BuiltinsX86.td +++ b/clang/include/clang/Basic/BuiltinsX86.td @@ -5055,3 +5055,13 @@ let Features = "avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<256> let Features = "avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { def vgetmantbf16512_mask : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Constant int, _Vector<32, __bf16>, unsigned int)">; } + +let Features = "avx512bmm", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { + def bmacor16x16x16_v32hi : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>, _Vector<32, short>)">; + def bmacxor16x16x16_v32hi : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>, _Vector<32, short>)">; +} + +let Features = "avx512bmm,avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { + def bmacor16x16x16_v16hi : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>, _Vector<16, short>)">; + def bmacxor16x16x16_v16hi : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>, _Vector<16, short>)">; +} diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp index 6f88a428b1230..2c66d14f5f7b7 100644 --- a/clang/lib/Basic/Targets/X86.cpp +++ b/clang/lib/Basic/Targets/X86.cpp @@ -301,6 +301,8 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features, HasAVX512DQ = true; } else if (Feature == "+avx512bitalg") { HasAVX512BITALG = true; + } else if (Feature == "+avx512bmm") { + HasAVX512BMM = true; } else if (Feature == "+avx512bw") { HasAVX512BW = true; } else if (Feature == "+avx512vl") { @@ -841,6 +843,8 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts, Builder.defineMacro("__AVX512DQ__"); if (HasAVX512BITALG) 
Builder.defineMacro("__AVX512BITALG__"); + if (HasAVX512BMM) + Builder.defineMacro("__AVX512BMM__"); if (HasAVX512BW) Builder.defineMacro("__AVX512BW__"); if (HasAVX512VL) { @@ -1085,6 +1089,7 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const { .Case("avx512fp16", true) .Case("avx512dq", true) .Case("avx512bitalg", true) + .Case("avx512bmm", true) .Case("avx512bw", true) .Case("avx512vl", true) .Case("avx512vbmi", true) @@ -1205,6 +1210,7 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const { .Case("avx512fp16", HasAVX512FP16) .Case("avx512dq", HasAVX512DQ) .Case("avx512bitalg", HasAVX512BITALG) + .Case("avx512bmm", HasAVX512BMM) .Case("avx512bw", HasAVX512BW) .Case("avx512vl", HasAVX512VL) .Case("avx512vbmi", HasAVX512VBMI) diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h index 922e32906cd04..6bd55f9fbf4bb 100644 --- a/clang/lib/Basic/Targets/X86.h +++ b/clang/lib/Basic/Targets/X86.h @@ -104,6 +104,7 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo { bool HasAVX512BF16 = false; bool HasAVX512DQ = false; bool HasAVX512BITALG = false; + bool HasAVX512BMM = false; bool HasAVX512BW = false; bool HasAVX512VL = false; bool HasAVX512VBMI = false; diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp index 9645ed87b8ef3..4807c66442c92 100644 --- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp @@ -2678,6 +2678,30 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, return EmitX86MaskedCompareResult(*this, Shufbit, NumElts, MaskIn); } + case X86::BI__builtin_ia32_bmacor16x16x16_v16hi: + case X86::BI__builtin_ia32_bmacor16x16x16_v32hi: + case X86::BI__builtin_ia32_bmacxor16x16x16_v16hi: + case X86::BI__builtin_ia32_bmacxor16x16x16_v32hi: { + Intrinsic::ID ID; + switch (BuiltinID) { + default: + llvm_unreachable("Unsupported intrinsic!"); + case X86::BI__builtin_ia32_bmacor16x16x16_v16hi: + ID = 
Intrinsic::x86_avx512_vbmacor_v16hi; + break; + case X86::BI__builtin_ia32_bmacor16x16x16_v32hi: + ID = Intrinsic::x86_avx512_vbmacor_v32hi; + break; + case X86::BI__builtin_ia32_bmacxor16x16x16_v16hi: + ID = Intrinsic::x86_avx512_vbmacxor_v16hi; + break; + case X86::BI__builtin_ia32_bmacxor16x16x16_v32hi: + ID = Intrinsic::x86_avx512_vbmacxor_v32hi; + break; + } + + return Builder.CreateCall(CGM.getIntrinsic(ID), Ops); + } // packed comparison intrinsics case X86::BI__builtin_ia32_cmpeqps: case X86::BI__builtin_ia32_cmpeqpd: diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt index 95d20bbca79ac..5ea3cfa588f82 100644 --- a/clang/lib/Headers/CMakeLists.txt +++ b/clang/lib/Headers/CMakeLists.txt @@ -186,6 +186,8 @@ set(x86_files avx2intrin.h avx512bf16intrin.h avx512bitalgintrin.h + avx512bmmintrin.h + avx512bmmvlintrin.h avx512bwintrin.h avx512cdintrin.h avx512dqintrin.h diff --git a/clang/lib/Headers/avx512bmmintrin.h b/clang/lib/Headers/avx512bmmintrin.h new file mode 100644 index 0000000000000..ce4ada8742766 --- /dev/null +++ b/clang/lib/Headers/avx512bmmintrin.h @@ -0,0 +1,176 @@ +/*===-------- avx512bmmintrin.h - AVX512BMM intrinsics *------------------=== + * + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===---------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error "Never use <avx512bmmintrin.h> directly; include <immintrin.h> instead." +#endif + +#ifndef _AVX512BMMINTRIN_H +#define _AVX512BMMINTRIN_H + +/* Define the default attributes for the functions in this file. 
*/ +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("avx512bmm"), \ + __min_vector_width__(512))) + +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr +#else +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS +#endif + +/// Multiplies two 16x16 bit matrices using OR reduction and ORs the product +/// into a third 16x16 bit matrix (which is also the destination). +/// +/// For the 512-bit ZMM form, each register contains two 16x16 (256-bit) +/// matrices in bits [255:0] and [511:256]. The operation performs: +/// \code{.operation} +/// for i in 0 to 15 +/// for j in 0 to 15 +/// reduction_bit = __C[16*i+j] +/// for k in 0 to 15 +/// reduction_bit |= __A[16*i+k] & __B[16*k+j] +/// end for k +/// dest[16*i+j] = reduction_bit +/// end for j +/// end for i +/// \endcode +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the <c> VBMACOR16X16X16 </c> instruction. +/// +/// \param __A +/// A 512-bit vector containing two 16x16 bit matrices (one per 256-bit +/// lane). +/// \param __B +/// A 512-bit vector containing two 16x16 bit matrices (one per 256-bit +/// lane). +/// \param __C +/// A 512-bit accumulator vector containing the initial values to OR with. +/// \returns A 512-bit vector containing the accumulated result for each lane. +/// \note This instruction does not support masking. +static __inline __m512i __DEFAULT_FN_ATTRS _mm512_bmacor16x16x16(__m512i __A, + __m512i __B, + __m512i __C) { + return (__m512i)__builtin_ia32_bmacor16x16x16_v32hi( + (__v32hi)__A, (__v32hi)__B, (__v32hi)__C); +} + +/// Multiplies two 16x16 bit matrices using XOR reduction and XORs the product +/// into a third 16x16 bit matrix (which is also the destination). +/// +/// For the 512-bit ZMM form, each register contains two 16x16 (256-bit) +/// matrices in bits [255:0] and [511:256]. 
The operation performs: +/// \code{.operation} +/// for i in 0 to 15 +/// for j in 0 to 15 +/// reduction_bit = __C[16*i+j] +/// for k in 0 to 15 +/// reduction_bit ^= __A[16*i+k] & __B[16*k+j] +/// end for k +/// dest[16*i+j] = reduction_bit +/// end for j +/// end for i +/// \endcode +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the <c> VBMACXOR16X16X16 </c> instruction. +/// +/// \param __A +/// A 512-bit vector containing two 16x16 bit matrices (one per 256-bit +/// lane). +/// \param __B +/// A 512-bit vector containing two 16x16 bit matrices (one per 256-bit +/// lane). +/// \param __C +/// A 512-bit accumulator vector containing the initial values to XOR with. +/// \returns A 512-bit vector containing the accumulated result for each lane. +/// \note This instruction does not support masking. +static __inline __m512i __DEFAULT_FN_ATTRS _mm512_bmacxor16x16x16(__m512i __A, + __m512i __B, + __m512i __C) { + return (__m512i)__builtin_ia32_bmacxor16x16x16_v32hi( + (__v32hi)__A, (__v32hi)__B, (__v32hi)__C); +} + +/// Reverses the bits within each byte of the source vector. +/// +/// For each byte in the source, reverses the order of its 8 bits to generate +/// the corresponding destination byte. For example, 0b10110001 becomes +/// 0b10001101. +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the <c> VBITREV </c> instruction. +/// +/// \param __A +/// A 512-bit vector of [64 x i8] where each byte will have its bits +/// reversed. +/// \returns A 512-bit vector of [64 x i8] with bit-reversed bytes. +static __inline __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_bitrev_epi8(__m512i __A) { + return (__m512i)__builtin_elementwise_bitreverse((__v64qi)__A); +} + +/// Reverses the bits within each byte of the source vector, using a writemask +/// to conditionally select elements. +/// +/// For each byte position, if the corresponding mask bit is 1, the byte from +/// \a A has its bits reversed and stored in the result. 
If the mask bit is 0, +/// the corresponding byte from \a B is copied to the result (merge masking). +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the <c> VBITREV </c> instruction. +/// +/// \param __U +/// A 64-bit mask value where each bit controls one byte (per 8-bit element). +/// A 1 performs bit reversal; a 0 selects the passthrough byte from __B. +/// \param __A +/// A 512-bit vector of [64 x i8] to be bit-reversed. +/// \param __B +/// A 512-bit vector of [64 x i8] providing passthrough values. +/// \returns A 512-bit vector combining bit-reversed and passthrough bytes. +static __inline __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_mask_bitrev_epi8(__mmask64 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectb_512( + (__mmask64)__U, (__v64qi)_mm512_bitrev_epi8(__A), (__v64qi)__B); +} + +/// Reverses the bits within each byte of the source vector, zeroing elements +/// based on the writemask. +/// +/// For each byte position, if the corresponding mask bit is 1, the byte from +/// \a A has its bits reversed and stored in the result. If the mask bit is 0, +/// the result byte is set to zero (zero masking). +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the <c> VBITREV </c> instruction. +/// +/// \param __U +/// A 64-bit mask value where each bit controls one byte (per 8-bit element). +/// A 1 performs bit reversal; a 0 sets the byte to zero. +/// \param __A +/// A 512-bit vector of [64 x i8] to be bit-reversed. +/// \returns A 512-bit vector with bit-reversed or zeroed bytes. 
+static __inline __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_maskz_bitrev_epi8(__mmask64 __U, __m512i __A) { + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, + (__v64qi)_mm512_bitrev_epi8(__A), + (__v64qi)_mm512_setzero_si512()); +} + +#undef __DEFAULT_FN_ATTRS +#undef __DEFAULT_FN_ATTRS_CONSTEXPR + +#endif diff --git a/clang/lib/Headers/avx512bmmvlintrin.h b/clang/lib/Headers/avx512bmmvlintrin.h new file mode 100644 index 0000000000000..68a04db460047 --- /dev/null +++ b/clang/lib/Headers/avx512bmmvlintrin.h @@ -0,0 +1,245 @@ +/*===------------- avx512bmmvlintrin.h - BMM intrinsics ------------------=== + * + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error \ + "Never use <avx512bmmvlintrin.h> directly; include <immintrin.h> instead." +#endif + +#ifndef __BMMVLINTRIN_H +#define __BMMVLINTRIN_H +  +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS128 \ + __attribute__((__always_inline__, __nodebug__, \ + __target__("avx512bmm,avx512vl"), __min_vector_width__(128))) +#define __DEFAULT_FN_ATTRS256 \ + __attribute__((__always_inline__, __nodebug__, \ + __target__("avx512bmm,avx512vl"), __min_vector_width__(256))) + +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr +#else +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 +#endif + +/// Multiplies two 16x16 bit matrices using OR reduction and ORs the product +/// into a third 16x16 bit matrix (which is also the destination). 
+/// +/// For the 256-bit YMM form, the source registers/memory each contain a single +/// 16x16 (256-bit) matrix in bits [255:0]. The operation performs: +/// \code{.operation} +/// for i in 0 to 15 +/// for j in 0 to 15 +/// reduction_bit = __C[16*i+j] +/// for k in 0 to 15 +/// reduction_bit |= __A[16*i+k] & __B[16*k+j] +/// end for k +/// dest[16*i+j] = reduction_bit +/// end for j +/// end for i +/// \endcode +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the <c> VBMACOR16X16X16 </c> instruction. +/// +/// \param __A +/// A 256-bit vector containing a 16x16 bit matrix. +/// \param __B +/// A 256-bit vector containing a 16x16 bit matrix. +/// \param __C +/// A 256-bit accumulator vector containing the initial values to OR with. +/// \returns A 256-bit vector containing the accumulated result. +/// \note This instruction does not support masking. +static __inline __m256i __DEFAULT_FN_ATTRS256 +_mm256_bmacor16x16x16(__m256i __A, __m256i __B, __m256i __C) { + return (__m256i)__builtin_ia32_bmacor16x16x16_v16hi( + (__v16hi)__A, (__v16hi)__B, (__v16hi)__C); +} + +/// Multiplies two 16x16 bit matrices using XOR reduction and XORs the product +/// into a third 16x16 bit matrix (which is also the destination). +/// +/// For the 256-bit YMM form, the source registers/memory each contain a single +/// 16x16 (256-bit) matrix in bits [255:0]. The operation performs: +/// \code{.operation} +/// for i in 0 to 15 +/// for j in 0 to 15 +/// reduction_bit = __C[16*i+j] +/// for k in 0 to 15 +/// reduction_bit ^= __A[16*i+k] & __B[16*k+j] +/// end for k +/// dest[16*i+j] = reduction_bit +/// end for j +/// end for i +/// \endcode +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the <c> VBMACXOR16X16X16 </c> instruction. +/// +/// \param __A +/// A 256-bit vector containing a 16x16 bit matrix. +/// \param __B +/// A 256-bit vector containing a 16x16 bit matrix. 
+/// \param __C +/// A 256-bit accumulator vector containing the initial values to XOR with. +/// \returns A 256-bit vector containing the accumulated result. +/// \note This instruction does not support masking. +static __inline __m256i __DEFAULT_FN_ATTRS256 +_mm256_bmacxor16x16x16(__m256i __A, __m256i __B, __m256i __C) { + return (__m256i)__builtin_ia32_bmacxor16x16x16_v16hi( + (__v16hi)__A, (__v16hi)__B, (__v16hi)__C); +} + +/// Reverses the bits within each byte of the source vector. +/// +/// For each byte in the source, reverses the order of its 8 bits to generate +/// the corresponding destination byte. For example, 0b10110001 becomes +/// 0b10001101. +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the <c> VBITREV </c> instruction. +/// +/// \param __A +/// A 128-bit vector of [16 x i8] where each byte will have its bits +/// reversed. +/// \returns A 128-bit vector of [16 x i8] with bit-reversed bytes. +static __inline __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm128_bitrev_epi8(__m128i __A) { + return (__m128i)__builtin_elementwise_bitreverse((__v16qi)__A); +} + +/// Reverses the bits within each byte of the source vector. +/// +/// For each byte in the source, reverses the order of its 8 bits to generate +/// the corresponding destination byte. For example, 0b10110001 becomes +/// 0b10001101. +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the <c> VBITREV </c> instruction. +/// +/// \param __A +/// A 256-bit vector of [32 x i8] where each byte will have its bits +/// reversed. +/// \returns A 256-bit vector of [32 x i8] with bit-reversed bytes. +static __inline __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_bitrev_epi8(__m256i __A) { + return (__m256i)__builtin_elementwise_bitreverse((__v32qi)__A); +} + +/// Reverses the bits within each byte of the source vector, using a writemask +/// to conditionally select elements. 
+/// +/// For each byte position, if the corresponding mask bit is 1, the byte from +/// \a A has its bits reversed and stored in the result. If the mask bit is 0, +/// the corresponding byte from \a B is copied to the result (merge masking). +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the <c> VBITREV </c> instruction. +/// +/// \param __U +/// A 16-bit mask value where each bit controls one byte (per 8-bit element). +/// A 1 performs bit reversal; a 0 selects the passthrough byte from __B. +/// \param __A +/// A 128-bit vector of [16 x i8] to be bit-reversed. +/// \param __B +/// A 128-bit vector of [16 x i8] providing passthrough values. +/// \returns A 128-bit vector combining bit-reversed and passthrough bytes. +static __inline __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm128_mask_bitrev_epi8(__mmask16 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectb_128( + (__mmask16)__U, (__v16qi)_mm128_bitrev_epi8(__A), (__v16qi)__B); +} + +/// Reverses the bits within each byte of the source vector, using a writemask +/// to con... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/182556 _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
