https://github.com/ganeshgit updated https://github.com/llvm/llvm-project/pull/182556
>From 2612eaf48a4ac7b53489aed24295c7e37e275ffc Mon Sep 17 00:00:00 2001 From: Ganesh Gopalasubramanian <[email protected]> Date: Fri, 20 Feb 2026 15:34:41 +0530 Subject: [PATCH] [X86] Add AVX512BMM support for AMD Zen 6 (znver6) This patch adds support for AVX512BMM (Bit Matrix Multiply) instruction set extension for AMD Zen 6 processors. AVX512BMM includes three instructions: - VBITREVB: Bit reverse within each byte - VBMACOR: Bit matrix multiply with OR accumulation - VBMACXOR: Bit matrix multiply with XOR accumulation The following implementations for AVX512BMM are added: - Define __AVX512BMM__ macro for znver6 - avx512bmmintrin.h, avx512bmmvlintrin.h header files - Implement _mm_bitrev_epi8, _mm256_bitrev_epi8, _mm512_bitrev_epi8 - Implement _mm256/512_bmacor16x16x16 and bmacxor intrinsics --- clang/docs/ReleaseNotes.rst | 14 + clang/include/clang/Basic/BuiltinsX86.td | 10 + clang/lib/AST/ByteCode/InterpBuiltin.cpp | 57 ++++ clang/lib/AST/ExprConstant.cpp | 58 +++++ clang/lib/Basic/Targets/X86.cpp | 6 + clang/lib/Basic/Targets/X86.h | 1 + clang/lib/CodeGen/TargetBuiltins/X86.cpp | 24 ++ clang/lib/Headers/CMakeLists.txt | 2 + clang/lib/Headers/avx512bmmintrin.h | 174 +++++++++++++ clang/lib/Headers/avx512bmmvlintrin.h | 245 ++++++++++++++++++ clang/lib/Headers/immintrin.h | 4 + clang/test/CodeGen/X86/avx512bmm-builtins.c | 75 ++++++ clang/test/CodeGen/X86/avx512bmmvl-builtins.c | 88 +++++++ clang/test/CodeGen/attr-target-x86.c | 4 +- .../Preprocessor/predefined-arch-macros.c | 2 + compiler-rt/lib/builtins/cpu_model/x86.c | 3 + llvm/include/llvm/IR/IntrinsicsX86.td | 18 ++ .../llvm/TargetParser/X86TargetParser.def | 1 + llvm/lib/Target/X86/X86.td | 6 +- llvm/lib/Target/X86/X86ISelLowering.cpp | 10 + llvm/lib/Target/X86/X86InstrAVX512.td | 46 ++++ llvm/lib/Target/X86/X86InstrFragmentsSIMD.td | 4 + llvm/lib/Target/X86/X86InstrPredicates.td | 1 + llvm/lib/Target/X86/X86IntrinsicsInfo.h | 6 + llvm/lib/TargetParser/Host.cpp | 1 + 
llvm/lib/TargetParser/X86TargetParser.cpp | 3 +- .../X86/avx512bmm-vbitrevb-bitreverse.ll | 117 +++++++++ .../X86/avx512bmm-vbitrevb-intrinsics-mem.ll | 141 ++++++++++ .../X86/avx512bmm-vbitrevb-intrinsics.ll | 139 ++++++++++ .../CodeGen/X86/avx512bmm-vbmac-intrinsics.ll | 123 +++++++++ llvm/test/MC/Disassembler/X86/avx512bmm.txt | 78 ++++++ llvm/test/MC/X86/avx512bmm-att.s | 85 ++++++ llvm/test/MC/X86/avx512bmm-intel.s | 85 ++++++ llvm/test/TableGen/x86-fold-tables.inc | 33 +++ .../gn/secondary/clang/lib/Headers/BUILD.gn | 2 + 35 files changed, 1662 insertions(+), 4 deletions(-) create mode 100644 clang/lib/Headers/avx512bmmintrin.h create mode 100644 clang/lib/Headers/avx512bmmvlintrin.h create mode 100644 clang/test/CodeGen/X86/avx512bmm-builtins.c create mode 100644 clang/test/CodeGen/X86/avx512bmmvl-builtins.c create mode 100644 llvm/test/CodeGen/X86/avx512bmm-vbitrevb-bitreverse.ll create mode 100644 llvm/test/CodeGen/X86/avx512bmm-vbitrevb-intrinsics-mem.ll create mode 100644 llvm/test/CodeGen/X86/avx512bmm-vbitrevb-intrinsics.ll create mode 100644 llvm/test/CodeGen/X86/avx512bmm-vbmac-intrinsics.ll create mode 100644 llvm/test/MC/Disassembler/X86/avx512bmm.txt create mode 100644 llvm/test/MC/X86/avx512bmm-att.s create mode 100644 llvm/test/MC/X86/avx512bmm-intel.s diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 7f056abfbbe242..9c6f2081899156 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -870,6 +870,20 @@ NVPTX Support X86 Support ^^^^^^^^^^^ - ``march=znver6`` is now supported. +- Support ISA of ``AVX512BMM``. + * Support intrinsic of ``_mm512_bmacor16x16x16``. + * Support intrinsic of ``_mm512_bmacxor16x16x16``. + * Support intrinsic of ``_mm512_mask_bitrev_epi8``. + * Support intrinsic of ``_mm512_maskz_bitrev_epi8``. + * Support intrinsic of ``_mm512_bitrev_epi8``. + * Support intrinsic of ``_mm256_bmacor16x16x16``. + * Support intrinsic of ``_mm256_bmacxor16x16x16``. 
+ * Support intrinsic of ``_mm_mask_bitrev_epi8``. + * Support intrinsic of ``_mm256_mask_bitrev_epi8``. + * Support intrinsic of ``_mm_maskz_bitrev_epi8``. + * Support intrinsic of ``_mm256_maskz_bitrev_epi8``. + * Support intrinsic of ``_mm_bitrev_epi8``. + * Support intrinsic of ``_mm256_bitrev_epi8``. Arm and AArch64 Support ^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td index c8c371625b5684..dd5f820099bdd5 100644 --- a/clang/include/clang/Basic/BuiltinsX86.td +++ b/clang/include/clang/Basic/BuiltinsX86.td @@ -5062,3 +5062,13 @@ let Features = "avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<256> let Features = "avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { def vgetmantbf16512_mask : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Constant int, _Vector<32, __bf16>, unsigned int)">; } + +let Features = "avx512bmm", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { + def bmacor16x16x16_v32hi : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>, _Vector<32, short>)">; + def bmacxor16x16x16_v32hi : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>, _Vector<32, short>)">; +} + +let Features = "avx512bmm,avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { + def bmacor16x16x16_v16hi : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>, _Vector<16, short>)">; + def bmacxor16x16x16_v16hi : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>, _Vector<16, short>)">; +} diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index b16a34543757b4..c1f9e0588ae8c2 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -4441,6 +4441,56 @@ static bool interp__builtin_ia32_gfni_mul(InterpState &S, CodePtr OpPC, return true; } +// Bit Matrix Multiply 
and Accumulate (AVX512BMM). Each 256-bit lane holds a +// 16x16 bit matrix as 16 x i16 elements; element i is row i and bit j of that +// element is entry [i][j]. The accumulator (third argument, src1 in the AMD +// ISA) provides the initial value of each result bit, into which the bit-matrix +// product of the first two arguments (src2 * src3) is reduced with OR (vbmacor) +// or XOR (vbmacxor): +// for i in 0..15, j in 0..15: +// bit = C[16*i+j] +// for k in 0..15: bit OP= A[16*i+k] & B[16*k+j] +// dest[16*i+j] = bit +static bool interp__builtin_ia32_bmac(InterpState &S, CodePtr OpPC, + const CallExpr *Call, bool IsXor) { + assert(Call->getNumArgs() == 3); + const Pointer &C = S.Stk.pop<Pointer>(); + const Pointer &B = S.Stk.pop<Pointer>(); + const Pointer &A = S.Stk.pop<Pointer>(); + const Pointer &Dst = S.Stk.peek<Pointer>(); + + unsigned NumElems = A.getNumElems(); + QualType ElemQT = getElemType(A); + OptPrimType ElemT = S.getContext().classify(ElemQT); + bool DstUnsigned = ElemQT->isUnsignedIntegerOrEnumerationType(); + + INT_TYPE_SWITCH_NO_BOOL(*ElemT, { + for (unsigned Lane = 0; Lane < NumElems; Lane += 16) { + for (unsigned I = 0; I < 16; ++I) { + uint16_t AVal = (uint16_t)A.elem<T>(Lane + I).toAPSInt().getZExtValue(); + uint16_t DVal = (uint16_t)C.elem<T>(Lane + I).toAPSInt().getZExtValue(); + for (unsigned J = 0; J < 16; ++J) { + // Seed the reduction with the accumulator bit, then fold in each + // product term with the same operator (OR for vbmacor, XOR for + // vbmacxor). + unsigned Bit = (DVal >> J) & 1u; + for (unsigned K = 0; K < 16; ++K) { + uint16_t BVal = + (uint16_t)B.elem<T>(Lane + K).toAPSInt().getZExtValue(); + unsigned Product = ((AVal >> K) & 1u) & ((BVal >> J) & 1u); + Bit = IsXor ? 
(Bit ^ Product) : (Bit | Product); + } + DVal = (DVal & ~(uint16_t(1) << J)) | (uint16_t(Bit) << J); + } + Dst.elem<T>(Lane + I) = + static_cast<T>(APSInt(APInt(16, DVal), DstUnsigned)); + } + } + }); + Dst.initializeAllElements(); + return true; +} + bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, uint32_t BuiltinID) { if (!S.getASTContext().BuiltinInfo.isConstantEvaluated(BuiltinID)) @@ -5802,6 +5852,13 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, case X86::BI__builtin_ia32_vgf2p8mulb_v64qi: return interp__builtin_ia32_gfni_mul(S, OpPC, Call); + case X86::BI__builtin_ia32_bmacor16x16x16_v16hi: + case X86::BI__builtin_ia32_bmacor16x16x16_v32hi: + return interp__builtin_ia32_bmac(S, OpPC, Call, /*IsXor=*/false); + case X86::BI__builtin_ia32_bmacxor16x16x16_v16hi: + case X86::BI__builtin_ia32_bmacxor16x16x16_v32hi: + return interp__builtin_ia32_bmac(S, OpPC, Call, /*IsXor=*/true); + case X86::BI__builtin_ia32_insertps128: return interp__builtin_ia32_shuffle_generic( S, OpPC, Call, [](unsigned DstIdx, unsigned Mask) { diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 6ac16c2b831d26..a50fa6a10f9b0b 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -12638,6 +12638,64 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { return Success(APValue(ResultElements.data(), ResultElements.size()), E); } + case clang::X86::BI__builtin_ia32_bmacor16x16x16_v16hi: + case clang::X86::BI__builtin_ia32_bmacor16x16x16_v32hi: + case clang::X86::BI__builtin_ia32_bmacxor16x16x16_v16hi: + case clang::X86::BI__builtin_ia32_bmacxor16x16x16_v32hi: { + // Bit Matrix Multiply and Accumulate (AVX512BMM). Each 256-bit lane holds + // a 16x16 bit matrix as 16 x i16 elements; element i is row i and bit j of + // that element is entry [i][j]. 
The accumulator (third argument, src1 in + // the AMD ISA) provides the initial value of each result bit, into which + // the bit-matrix product of the first two arguments (src2 * src3) is + // reduced with OR (vbmacor) or XOR (vbmacxor): + // for i in 0..15, j in 0..15: + // bit = C[16*i+j] + // for k in 0..15: bit OP= A[16*i+k] & B[16*k+j] + // dest[16*i+j] = bit + APValue SourceA, SourceB, SourceC; + if (!EvaluateAsRValue(Info, E->getArg(0), SourceA) || + !EvaluateAsRValue(Info, E->getArg(1), SourceB) || + !EvaluateAsRValue(Info, E->getArg(2), SourceC)) + return false; + + bool IsXor = E->getBuiltinCallee() == + clang::X86::BI__builtin_ia32_bmacxor16x16x16_v16hi || + E->getBuiltinCallee() == + clang::X86::BI__builtin_ia32_bmacxor16x16x16_v32hi; + + unsigned SourceLen = SourceA.getVectorLength(); + auto *DestTy = E->getType()->castAs<VectorType>(); + QualType DestEltTy = DestTy->getElementType(); + bool DestUnsigned = DestEltTy->isUnsignedIntegerOrEnumerationType(); + + SmallVector<APValue, 32> ResultElements(SourceLen); + for (unsigned Lane = 0; Lane < SourceLen; Lane += 16) { + for (unsigned I = 0; I < 16; ++I) { + uint16_t A = + (uint16_t)SourceA.getVectorElt(Lane + I).getInt().getZExtValue(); + uint16_t Dst = + (uint16_t)SourceC.getVectorElt(Lane + I).getInt().getZExtValue(); + for (unsigned J = 0; J < 16; ++J) { + // Seed the reduction with the accumulator bit, then fold in each + // product term with the same operator (OR for vbmacor, XOR for + // vbmacxor). + unsigned Bit = (Dst >> J) & 1u; + for (unsigned K = 0; K < 16; ++K) { + uint16_t B = (uint16_t)SourceB.getVectorElt(Lane + K) + .getInt() + .getZExtValue(); + unsigned Product = ((A >> K) & 1u) & ((B >> J) & 1u); + Bit = IsXor ? 
(Bit ^ Product) : (Bit | Product); + } + Dst = (Dst & ~(uint16_t(1) << J)) | (uint16_t(Bit) << J); + } + ResultElements[Lane + I] = + APValue(APSInt(APInt(16, Dst), DestUnsigned)); + } + } + return Success(APValue(ResultElements.data(), ResultElements.size()), E); + } + case clang::X86::BI__builtin_ia32_dbpsadbw128: case clang::X86::BI__builtin_ia32_dbpsadbw256: case clang::X86::BI__builtin_ia32_dbpsadbw512: { diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp index 18036ba109db0b..28882dd7f36baa 100644 --- a/clang/lib/Basic/Targets/X86.cpp +++ b/clang/lib/Basic/Targets/X86.cpp @@ -296,6 +296,8 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features, HasAVX512DQ = true; } else if (Feature == "+avx512bitalg") { HasAVX512BITALG = true; + } else if (Feature == "+avx512bmm") { + HasAVX512BMM = true; } else if (Feature == "+avx512bw") { HasAVX512BW = true; } else if (Feature == "+avx512vl") { @@ -847,6 +849,8 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts, Builder.defineMacro("__AVX512DQ__"); if (HasAVX512BITALG) Builder.defineMacro("__AVX512BITALG__"); + if (HasAVX512BMM) + Builder.defineMacro("__AVX512BMM__"); if (HasAVX512BW) Builder.defineMacro("__AVX512BW__"); if (HasAVX512VL) { @@ -1093,6 +1097,7 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const { .Case("avx512fp16", true) .Case("avx512dq", true) .Case("avx512bitalg", true) + .Case("avx512bmm", true) .Case("avx512bw", true) .Case("avx512vl", true) .Case("avx512vbmi", true) @@ -1214,6 +1219,7 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const { .Case("avx512fp16", HasAVX512FP16) .Case("avx512dq", HasAVX512DQ) .Case("avx512bitalg", HasAVX512BITALG) + .Case("avx512bmm", HasAVX512BMM) .Case("avx512bw", HasAVX512BW) .Case("avx512vl", HasAVX512VL) .Case("avx512vbmi", HasAVX512VBMI) diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h index c8c5d280754b4a..ec94944603c308 100644 --- 
a/clang/lib/Basic/Targets/X86.h +++ b/clang/lib/Basic/Targets/X86.h @@ -105,6 +105,7 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo { bool HasAVX512BF16 = false; bool HasAVX512DQ = false; bool HasAVX512BITALG = false; + bool HasAVX512BMM = false; bool HasAVX512BW = false; bool HasAVX512VL = false; bool HasAVX512VBMI = false; diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp index 50125a71fcd5f3..a9fa20fae3fe5b 100644 --- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp @@ -2686,6 +2686,30 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, return EmitX86MaskedCompareResult(*this, Shufbit, NumElts, MaskIn); } + case X86::BI__builtin_ia32_bmacor16x16x16_v16hi: + case X86::BI__builtin_ia32_bmacor16x16x16_v32hi: + case X86::BI__builtin_ia32_bmacxor16x16x16_v16hi: + case X86::BI__builtin_ia32_bmacxor16x16x16_v32hi: { + Intrinsic::ID ID; + switch (BuiltinID) { + case X86::BI__builtin_ia32_bmacor16x16x16_v16hi: + ID = Intrinsic::x86_avx512_vbmacor_v16hi; + break; + case X86::BI__builtin_ia32_bmacor16x16x16_v32hi: + ID = Intrinsic::x86_avx512_vbmacor_v32hi; + break; + case X86::BI__builtin_ia32_bmacxor16x16x16_v16hi: + ID = Intrinsic::x86_avx512_vbmacxor_v16hi; + break; + case X86::BI__builtin_ia32_bmacxor16x16x16_v32hi: + ID = Intrinsic::x86_avx512_vbmacxor_v32hi; + break; + default: + llvm_unreachable("Unsupported intrinsic!"); + } + + return Builder.CreateCall(CGM.getIntrinsic(ID), Ops); + } // packed comparison intrinsics case X86::BI__builtin_ia32_cmpeqps: case X86::BI__builtin_ia32_cmpeqpd: diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt index 439f2725168ba1..fc8ddb4284f317 100644 --- a/clang/lib/Headers/CMakeLists.txt +++ b/clang/lib/Headers/CMakeLists.txt @@ -193,6 +193,8 @@ set(x86_files avx2intrin.h avx512bf16intrin.h avx512bitalgintrin.h + avx512bmmintrin.h + avx512bmmvlintrin.h avx512bwintrin.h 
avx512cdintrin.h avx512dqintrin.h diff --git a/clang/lib/Headers/avx512bmmintrin.h b/clang/lib/Headers/avx512bmmintrin.h new file mode 100644 index 00000000000000..97d6e03e972625 --- /dev/null +++ b/clang/lib/Headers/avx512bmmintrin.h @@ -0,0 +1,174 @@ +/*===-------- avx512bmmintrin.h - AVX512BMM intrinsics *------------------=== + * + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===---------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error "Never use <avx512bmmintrin.h> directly; include <immintrin.h> instead." +#endif + +#ifndef _AVX512BMMINTRIN_H +#define _AVX512BMMINTRIN_H + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("avx512bmm"), \ + __min_vector_width__(512))) + +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr +#else +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS +#endif + +/// Multiplies two 16x16 bit matrices using OR reduction and ORs the product +/// into a third 16x16 bit matrix (which is also the destination). +/// +/// For the 512-bit ZMM form, each register contains two 16x16 (256-bit) +/// matrices in bits [255:0] and [511:256]. The operation performs: +/// \code{.operation} +/// for i in 0 to 15 +/// for j in 0 to 15 +/// reduction_bit = __C[16*i+j] +/// for k in 0 to 15 +/// reduction_bit |= __A[16*i+k] & __B[16*k+j] +/// end for k +/// dest[16*i+j] = reduction_bit +/// end for j +/// end for i +/// \endcode +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the <c> VBMACOR16X16X16 </c> instruction. +/// +/// \param __A +/// A 512-bit vector containing two 16x16 bit matrices (one per 256-bit +/// lane). 
+/// \param __B +/// A 512-bit vector containing two 16x16 bit matrices (one per 256-bit +/// lane). +/// \param __C +/// A 512-bit accumulator vector containing the initial values to OR with. +/// \returns A 512-bit vector containing the accumulated result for each lane. +/// \note This instruction does not support masking. +static __inline __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_bmacor16x16x16(__m512i __A, __m512i __B, __m512i __C) { + return (__m512i)__builtin_ia32_bmacor16x16x16_v32hi( + (__v32hi)__A, (__v32hi)__B, (__v32hi)__C); +} + +/// Multiplies two 16x16 bit matrices using XOR reduction and XORs the product +/// into a third 16x16 bit matrix (which is also the destination). +/// +/// For the 512-bit ZMM form, each register contains two 16x16 (256-bit) +/// matrices in bits [255:0] and [511:256]. The operation performs: +/// \code{.operation} +/// for i in 0 to 15 +/// for j in 0 to 15 +/// reduction_bit = __C[16*i+j] +/// for k in 0 to 15 +/// reduction_bit ^= __A[16*i+k] & __B[16*k+j] +/// end for k +/// dest[16*i+j] = reduction_bit +/// end for j +/// end for i +/// \endcode +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the <c> VBMACXOR16X16X16 </c> instruction. +/// +/// \param __A +/// A 512-bit vector containing two 16x16 bit matrices (one per 256-bit +/// lane). +/// \param __B +/// A 512-bit vector containing two 16x16 bit matrices (one per 256-bit +/// lane). +/// \param __C +/// A 512-bit accumulator vector containing the initial values to XOR with. +/// \returns A 512-bit vector containing the accumulated result for each lane. +/// \note This instruction does not support masking. +static __inline __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_bmacxor16x16x16(__m512i __A, __m512i __B, __m512i __C) { + return (__m512i)__builtin_ia32_bmacxor16x16x16_v32hi( + (__v32hi)__A, (__v32hi)__B, (__v32hi)__C); +} + +/// Reverses the bits within each byte of the source vector. 
+/// +/// For each byte in the source, reverses the order of its 8 bits to generate +/// the corresponding destination byte. For example, 0b10110001 becomes +/// 0b10001101. +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the <c> VBITREVB </c> instruction. +/// +/// \param __A +/// A 512-bit vector of [64 x i8] where each byte will have its bits +/// reversed. +/// \returns A 512-bit vector of [64 x i8] with bit-reversed bytes. +static __inline __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_bitrev_epi8(__m512i __A) { + return (__m512i)__builtin_elementwise_bitreverse((__v64qi)__A); +} + +/// Reverses the bits within each byte of the source vector, using a writemask +/// to conditionally select elements. +/// +/// For each byte position, if the corresponding mask bit is 1, the byte from +/// \a __A has its bits reversed and stored in the result. If the mask bit is 0, +/// the corresponding byte from \a __B is copied to the result (merge masking). +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the <c> VBITREVB </c> instruction. +/// +/// \param __U +/// A 64-bit mask value where each bit controls one byte (per 8-bit element). +/// A 1 performs bit reversal; a 0 selects the passthrough byte from __B. +/// \param __A +/// A 512-bit vector of [64 x i8] to be bit-reversed. +/// \param __B +/// A 512-bit vector of [64 x i8] providing passthrough values. +/// \returns A 512-bit vector combining bit-reversed and passthrough bytes. +static __inline __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_mask_bitrev_epi8(__mmask64 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectb_512( + (__mmask64)__U, (__v64qi)_mm512_bitrev_epi8(__A), (__v64qi)__B); +} + +/// Reverses the bits within each byte of the source vector, zeroing elements +/// based on the writemask. +/// +/// For each byte position, if the corresponding mask bit is 1, the byte from +/// \a __A has its bits reversed and stored in the result. 
If the mask bit is 0, +/// the result byte is set to zero (zero masking). +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the <c> VBITREVB </c> instruction. +/// +/// \param __U +/// A 64-bit mask value where each bit controls one byte (per 8-bit element). +/// A 1 performs bit reversal; a 0 sets the byte to zero. +/// \param __A +/// A 512-bit vector of [64 x i8] to be bit-reversed. +/// \returns A 512-bit vector with bit-reversed or zeroed bytes. +static __inline __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_maskz_bitrev_epi8(__mmask64 __U, __m512i __A) { + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, + (__v64qi)_mm512_bitrev_epi8(__A), + (__v64qi)_mm512_setzero_si512()); +} + +#undef __DEFAULT_FN_ATTRS +#undef __DEFAULT_FN_ATTRS_CONSTEXPR + +#endif diff --git a/clang/lib/Headers/avx512bmmvlintrin.h b/clang/lib/Headers/avx512bmmvlintrin.h new file mode 100644 index 00000000000000..b9bad4c844f546 --- /dev/null +++ b/clang/lib/Headers/avx512bmmvlintrin.h @@ -0,0 +1,245 @@ +/*===------------- avx512bmmvlintrin.h - BMM intrinsics ------------------=== + * + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error \ + "Never use <avx512bmmvlintrin.h> directly; include <immintrin.h> instead." +#endif + +#ifndef __BMMVLINTRIN_H +#define __BMMVLINTRIN_H + +/* Define the default attributes for the functions in this file. 
*/ +#define __DEFAULT_FN_ATTRS128 \ + __attribute__((__always_inline__, __nodebug__, \ + __target__("avx512bmm,avx512vl"), __min_vector_width__(128))) +#define __DEFAULT_FN_ATTRS256 \ + __attribute__((__always_inline__, __nodebug__, \ + __target__("avx512bmm,avx512vl"), __min_vector_width__(256))) + +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr +#else +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 +#endif + +/// Multiplies two 16x16 bit matrices using OR reduction and ORs the product +/// into a third 16x16 bit matrix (which is also the destination). +/// +/// For the 256-bit YMM form, the source registers/memory each contain a single +/// 16x16 (256-bit) matrix in bits [255:0]. The operation performs: +/// \code{.operation} +/// for i in 0 to 15 +/// for j in 0 to 15 +/// reduction_bit = __C[16*i+j] +/// for k in 0 to 15 +/// reduction_bit |= __A[16*i+k] & __B[16*k+j] +/// end for k +/// dest[16*i+j] = reduction_bit +/// end for j +/// end for i +/// \endcode +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the <c> VBMACOR16X16X16 </c> instruction. +/// +/// \param __A +/// A 256-bit vector containing a 16x16 bit matrix. +/// \param __B +/// A 256-bit vector containing a 16x16 bit matrix. +/// \param __C +/// A 256-bit accumulator vector containing the initial values to OR with. +/// \returns A 256-bit vector containing the accumulated result. +/// \note This instruction does not support masking. 
+static __inline __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_bmacor16x16x16(__m256i __A, __m256i __B, __m256i __C) { + return (__m256i)__builtin_ia32_bmacor16x16x16_v16hi( + (__v16hi)__A, (__v16hi)__B, (__v16hi)__C); +} + +/// Multiplies two 16x16 bit matrices using XOR reduction and XORs the product +/// into a third 16x16 bit matrix (which is also the destination). +/// +/// For the 256-bit YMM form, the source registers/memory each contain a single +/// 16x16 (256-bit) matrix in bits [255:0]. The operation performs: +/// \code{.operation} +/// for i in 0 to 15 +/// for j in 0 to 15 +/// reduction_bit = __C[16*i+j] +/// for k in 0 to 15 +/// reduction_bit ^= __A[16*i+k] & __B[16*k+j] +/// end for k +/// dest[16*i+j] = reduction_bit +/// end for j +/// end for i +/// \endcode +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the <c> VBMACXOR16X16X16 </c> instruction. +/// +/// \param __A +/// A 256-bit vector containing a 16x16 bit matrix. +/// \param __B +/// A 256-bit vector containing a 16x16 bit matrix. +/// \param __C +/// A 256-bit accumulator vector containing the initial values to XOR with. +/// \returns A 256-bit vector containing the accumulated result. +/// \note This instruction does not support masking. +static __inline __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_bmacxor16x16x16(__m256i __A, __m256i __B, __m256i __C) { + return (__m256i)__builtin_ia32_bmacxor16x16x16_v16hi( + (__v16hi)__A, (__v16hi)__B, (__v16hi)__C); +} + +/// Reverses the bits within each byte of the source vector. +/// +/// For each byte in the source, reverses the order of its 8 bits to generate +/// the corresponding destination byte. For example, 0b10110001 becomes +/// 0b10001101. +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the <c> VBITREVB </c> instruction. +/// +/// \param __A +/// A 128-bit vector of [16 x i8] where each byte will have its bits +/// reversed. 
+/// \returns A 128-bit vector of [16 x i8] with bit-reversed bytes. +static __inline __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm128_bitrev_epi8(__m128i __A) { + return (__m128i)__builtin_elementwise_bitreverse((__v16qi)__A); +} + +/// Reverses the bits within each byte of the source vector. +/// +/// For each byte in the source, reverses the order of its 8 bits to generate +/// the corresponding destination byte. For example, 0b10110001 becomes +/// 0b10001101. +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the <c> VBITREVB </c> instruction. +/// +/// \param __A +/// A 256-bit vector of [32 x i8] where each byte will have its bits +/// reversed. +/// \returns A 256-bit vector of [32 x i8] with bit-reversed bytes. +static __inline __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_bitrev_epi8(__m256i __A) { + return (__m256i)__builtin_elementwise_bitreverse((__v32qi)__A); +} + +/// Reverses the bits within each byte of the source vector, using a writemask +/// to conditionally select elements. +/// +/// For each byte position, if the corresponding mask bit is 1, the byte from +/// \a __A has its bits reversed and stored in the result. If the mask bit is 0, +/// the corresponding byte from \a __B is copied to the result (merge masking). +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the <c> VBITREVB </c> instruction. +/// +/// \param __U +/// A 16-bit mask value where each bit controls one byte (per 8-bit element). +/// A 1 performs bit reversal; a 0 selects the passthrough byte from __B. +/// \param __A +/// A 128-bit vector of [16 x i8] to be bit-reversed. +/// \param __B +/// A 128-bit vector of [16 x i8] providing passthrough values. +/// \returns A 128-bit vector combining bit-reversed and passthrough bytes. 
+static __inline __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm128_mask_bitrev_epi8(__mmask16 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectb_128( + (__mmask16)__U, (__v16qi)_mm128_bitrev_epi8(__A), (__v16qi)__B); +} + +/// Reverses the bits within each byte of the source vector, using a writemask +/// to conditionally select elements. +/// +/// For each byte position, if the corresponding mask bit is 1, the byte from +/// \a __A has its bits reversed and stored in the result. If the mask bit is 0, +/// the corresponding byte from \a __B is copied to the result (merge masking). +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the <c> VBITREVB </c> instruction. +/// +/// \param __U +/// A 32-bit mask value where each bit controls one byte (per 8-bit element). +/// A 1 performs bit reversal; a 0 selects the passthrough byte from __B. +/// \param __A +/// A 256-bit vector of [32 x i8] to be bit-reversed. +/// \param __B +/// A 256-bit vector of [32 x i8] providing passthrough values. +/// \returns A 256-bit vector combining bit-reversed and passthrough bytes. +static __inline __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_bitrev_epi8(__mmask32 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectb_256( + (__mmask32)__U, (__v32qi)_mm256_bitrev_epi8(__A), (__v32qi)__B); +} + +/// Reverses the bits within each byte of the source vector, zeroing elements +/// based on the writemask. +/// +/// For each byte position, if the corresponding mask bit is 1, the byte from +/// \a __A has its bits reversed and stored in the result. If the mask bit is 0, +/// the result byte is set to zero (zero masking). +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the <c> VBITREVB </c> instruction. +/// +/// \param __U +/// A 16-bit mask value where each bit controls one byte (per 8-bit element). +/// A 1 performs bit reversal; a 0 sets the byte to zero. 
+/// \param __A +/// A 128-bit vector of [16 x i8] to be bit-reversed. +/// \returns A 128-bit vector with bit-reversed or zeroed bytes. +static __inline __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm128_maskz_bitrev_epi8(__mmask16 __U, __m128i __A) { + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, + (__v16qi)_mm128_bitrev_epi8(__A), + (__v16qi)_mm_setzero_si128()); +} + +/// Reverses the bits within each byte of the source vector, zeroing elements +/// based on the writemask. +/// +/// For each byte position, if the corresponding mask bit is 1, the byte from +/// \a __A has its bits reversed and stored in the result. If the mask bit is 0, +/// the result byte is set to zero (zero masking). +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the <c> VBITREVB </c> instruction. +/// +/// \param __U +/// A 32-bit mask value where each bit controls one byte (per 8-bit element). +/// A 1 performs bit reversal; a 0 sets the byte to zero. +/// \param __A +/// A 256-bit vector of [32 x i8] to be bit-reversed. +/// \returns A 256-bit vector with bit-reversed or zeroed bytes. 
+static __inline __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_bitrev_epi8(__mmask32 __U, __m256i __A) { + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, + (__v32qi)_mm256_bitrev_epi8(__A), + (__v32qi)_mm256_setzero_si256()); +} + +#undef __DEFAULT_FN_ATTRS128_CONSTEXPR +#undef __DEFAULT_FN_ATTRS256_CONSTEXPR +#undef __DEFAULT_FN_ATTRS128 +#undef __DEFAULT_FN_ATTRS256 + +#endif diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h index 19064a4ff5cea3..00107c44c3a55f 100644 --- a/clang/lib/Headers/immintrin.h +++ b/clang/lib/Headers/immintrin.h @@ -58,6 +58,10 @@ #include <avx512bitalgintrin.h> +#include <avx512bmmintrin.h> + +#include <avx512bmmvlintrin.h> + #include <avx512cdintrin.h> #include <avx512vpopcntdqintrin.h> diff --git a/clang/test/CodeGen/X86/avx512bmm-builtins.c b/clang/test/CodeGen/X86/avx512bmm-builtins.c new file mode 100644 index 00000000000000..75c1ca0c5951ab --- /dev/null +++ b/clang/test/CodeGen/X86/avx512bmm-builtins.c @@ -0,0 +1,75 @@ +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bmm -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512bmm -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bmm -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512bmm -emit-llvm -o - -Wall -Werror | FileCheck %s + +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bmm -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s 
-triple=i386-apple-darwin -target-feature +avx512bmm -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bmm -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512bmm -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s + +#include <immintrin.h> +#include "builtin_test_helpers.h" + +__m512i test_mm512_bmacor16x16x16(__m512i __A, __m512i __B, __m512i __C) { + // CHECK-LABEL: test_mm512_bmacor16x16x16 + // CHECK: @llvm.x86.avx512.vbmacor.v32hi + return _mm512_bmacor16x16x16(__A, __B, __C); +} +// All-ones * all-ones with OR reduction sets every result bit (C = 0). +TEST_CONSTEXPR(match_v32hi(_mm512_bmacor16x16x16(_mm512_set1_epi16(-1), _mm512_set1_epi16(-1), _mm512_setzero_si512()), + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1)); +// A == 0 yields a zero product, so the accumulator passes through unchanged. +TEST_CONSTEXPR(match_v32hi(_mm512_bmacor16x16x16(_mm512_setzero_si512(), _mm512_set1_epi16(-1), _mm512_set1_epi16(0x1234)), + 0x1234, 0x1234, 0x1234, 0x1234, 0x1234, 0x1234, 0x1234, 0x1234, + 0x1234, 0x1234, 0x1234, 0x1234, 0x1234, 0x1234, 0x1234, 0x1234, + 0x1234, 0x1234, 0x1234, 0x1234, 0x1234, 0x1234, 0x1234, 0x1234, + 0x1234, 0x1234, 0x1234, 0x1234, 0x1234, 0x1234, 0x1234, 0x1234)); + +__m512i test_mm512_bmacxor16x16x16(__m512i __A, __m512i __B, __m512i __C) { + // CHECK-LABEL: test_mm512_bmacxor16x16x16 + // CHECK: @llvm.x86.avx512.vbmacxor.v32hi + return _mm512_bmacxor16x16x16(__A, __B, __C); +} +// All-ones * all-ones with XOR reduction: 16 product terms per bit cancel to 0. 
+TEST_CONSTEXPR(match_v32hi(_mm512_bmacxor16x16x16(_mm512_set1_epi16(-1), _mm512_set1_epi16(-1), _mm512_setzero_si512()), + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); + +__m512i test_mm512_bitrev_epi8(__m512i __A) { + // CHECK-LABEL: test_mm512_bitrev_epi8 + // CHECK: @llvm.bitreverse.v64i8 + return _mm512_bitrev_epi8(__A); +} +TEST_CONSTEXPR(match_v64qi(_mm512_bitrev_epi8((__m512i)(__v64qi){ + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, (char)0x80, + 0x00, (char)0xFF, 0x0F, (char)0xF0, 0x33, (char)0xCC, 0x55, (char)0xAA, + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, (char)0x80, + 0x00, (char)0xFF, 0x0F, (char)0xF0, 0x33, (char)0xCC, 0x55, (char)0xAA, + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, (char)0x80, + 0x00, (char)0xFF, 0x0F, (char)0xF0, 0x33, (char)0xCC, 0x55, (char)0xAA, + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, (char)0x80, + 0x00, (char)0xFF, 0x0F, (char)0xF0, 0x33, (char)0xCC, 0x55, (char)0xAA}), + (char)0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01, + 0x00, (char)0xFF, (char)0xF0, 0x0F, (char)0xCC, 0x33, (char)0xAA, 0x55, + (char)0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01, + 0x00, (char)0xFF, (char)0xF0, 0x0F, (char)0xCC, 0x33, (char)0xAA, 0x55, + (char)0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01, + 0x00, (char)0xFF, (char)0xF0, 0x0F, (char)0xCC, 0x33, (char)0xAA, 0x55, + (char)0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01, + 0x00, (char)0xFF, (char)0xF0, 0x0F, (char)0xCC, 0x33, (char)0xAA, 0x55)); + +__m512i test_mm512_mask_bitrev_epi8(__mmask64 __U, __m512i __A, __m512i __B) { + // CHECK-LABEL: test_mm512_mask_bitrev_epi8 + // CHECK: @llvm.bitreverse.v64i8 + // CHECK: select <64 x i1> %{{[0-9]+}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} + return _mm512_mask_bitrev_epi8(__U, __A, __B); +} + +__m512i test_mm512_maskz_bitrev_epi8(__mmask64 __U, __m512i __A) { + // CHECK-LABEL: test_mm512_maskz_bitrev_epi8 + // CHECK: @llvm.bitreverse.v64i8 + // CHECK: select <64 x i1> %{{[0-9]+}}, <64 x i8> 
%{{.*}}, <64 x i8> %{{.*}} + return _mm512_maskz_bitrev_epi8(__U, __A); +} diff --git a/clang/test/CodeGen/X86/avx512bmmvl-builtins.c b/clang/test/CodeGen/X86/avx512bmmvl-builtins.c new file mode 100644 index 00000000000000..3ece2c2b5ed86a --- /dev/null +++ b/clang/test/CodeGen/X86/avx512bmmvl-builtins.c @@ -0,0 +1,88 @@ +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bmm -target-feature +avx512vl -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512bmm -target-feature +avx512vl -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bmm -target-feature +avx512vl -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512bmm -target-feature +avx512vl -emit-llvm -o - -Wall -Werror | FileCheck %s + +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bmm -target-feature +avx512vl -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512bmm -target-feature +avx512vl -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bmm -target-feature +avx512vl -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512bmm -target-feature +avx512vl 
-emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s + +#include <immintrin.h> +#include "builtin_test_helpers.h" + +__m256i test_mm256_bmacor16x16x16(__m256i __A, __m256i __B, __m256i __C) { + // CHECK-LABEL: test_mm256_bmacor16x16x16 + // CHECK: @llvm.x86.avx512.vbmacor.v16hi + return _mm256_bmacor16x16x16(__A, __B, __C); +} +// All-ones * all-ones with OR reduction sets every result bit (C = 0). +TEST_CONSTEXPR(match_v16hi(_mm256_bmacor16x16x16(_mm256_set1_epi16(-1), _mm256_set1_epi16(-1), _mm256_setzero_si256()), + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1)); +// A == 0 yields a zero product, so the accumulator passes through unchanged. +TEST_CONSTEXPR(match_v16hi(_mm256_bmacor16x16x16(_mm256_setzero_si256(), _mm256_set1_epi16(-1), _mm256_set1_epi16(0x1234)), + 0x1234, 0x1234, 0x1234, 0x1234, 0x1234, 0x1234, 0x1234, 0x1234, + 0x1234, 0x1234, 0x1234, 0x1234, 0x1234, 0x1234, 0x1234, 0x1234)); + +__m256i test_mm256_bmacxor16x16x16(__m256i __A, __m256i __B, __m256i __C) { + // CHECK-LABEL: test_mm256_bmacxor16x16x16 + // CHECK: @llvm.x86.avx512.vbmacxor.v16hi + return _mm256_bmacxor16x16x16(__A, __B, __C); +} +// All-ones * all-ones with XOR reduction: 16 product terms per bit cancel to 0. 
+TEST_CONSTEXPR(match_v16hi(_mm256_bmacxor16x16x16(_mm256_set1_epi16(-1), _mm256_set1_epi16(-1), _mm256_setzero_si256()), + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); + +__m128i test_mm128_bitrev_epi8(__m128i __A) { + // CHECK-LABEL: test_mm128_bitrev_epi8 + // CHECK: @llvm.bitreverse.v16i8 + return _mm128_bitrev_epi8(__A); +} +TEST_CONSTEXPR(match_v16qi(_mm128_bitrev_epi8((__m128i)(__v16qi){ + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, (char)0x80, + 0x00, (char)0xFF, 0x0F, (char)0xF0, 0x33, (char)0xCC, 0x55, (char)0xAA}), + (char)0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01, + 0x00, (char)0xFF, (char)0xF0, 0x0F, (char)0xCC, 0x33, (char)0xAA, 0x55)); + +__m256i test_mm256_bitrev_epi8(__m256i __A) { + // CHECK-LABEL: test_mm256_bitrev_epi8 + // CHECK: @llvm.bitreverse.v32i8 + return _mm256_bitrev_epi8(__A); +} +TEST_CONSTEXPR(match_v32qi(_mm256_bitrev_epi8((__m256i)(__v32qi){ + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, (char)0x80, + 0x00, (char)0xFF, 0x0F, (char)0xF0, 0x33, (char)0xCC, 0x55, (char)0xAA, + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, (char)0x80, + 0x00, (char)0xFF, 0x0F, (char)0xF0, 0x33, (char)0xCC, 0x55, (char)0xAA}), + (char)0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01, + 0x00, (char)0xFF, (char)0xF0, 0x0F, (char)0xCC, 0x33, (char)0xAA, 0x55, + (char)0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01, + 0x00, (char)0xFF, (char)0xF0, 0x0F, (char)0xCC, 0x33, (char)0xAA, 0x55)); + +__m128i test_mm128_mask_bitrev_epi8(__mmask16 __U, __m128i __A, __m128i __B) { + // CHECK-LABEL: test_mm128_mask_bitrev_epi8 + // CHECK: @llvm.bitreverse.v16i8 + // CHECK: select <16 x i1> %{{[0-9]+}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} + return _mm128_mask_bitrev_epi8(__U, __A, __B); +} + +__m128i test_mm128_maskz_bitrev_epi8(__mmask16 __U, __m128i __A) { + // CHECK-LABEL: test_mm128_maskz_bitrev_epi8 + // CHECK: @llvm.bitreverse.v16i8 + // CHECK: select <16 x i1> %{{[0-9]+}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} + return _mm128_maskz_bitrev_epi8(__U, __A); +} + 
+__m256i test_mm256_mask_bitrev_epi8(__mmask32 __U, __m256i __A, __m256i __B) { + // CHECK-LABEL: test_mm256_mask_bitrev_epi8 + // CHECK: @llvm.bitreverse.v32i8 + // CHECK: select <32 x i1> %{{[0-9]+}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} + return _mm256_mask_bitrev_epi8(__U, __A, __B); +} + +__m256i test_mm256_maskz_bitrev_epi8(__mmask32 __U, __m256i __A) { + // CHECK-LABEL: test_mm256_maskz_bitrev_epi8 + // CHECK: @llvm.bitreverse.v32i8 + // CHECK: select <32 x i1> %{{[0-9]+}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} + return _mm256_maskz_bitrev_epi8(__U, __A); +} diff --git a/clang/test/CodeGen/attr-target-x86.c b/clang/test/CodeGen/attr-target-x86.c index 474fa93629d897..6a110ce38605b7 100644 --- a/clang/test/CodeGen/attr-target-x86.c +++ b/clang/test/CodeGen/attr-target-x86.c @@ -33,7 +33,7 @@ __attribute__((target("fpmath=387"))) void f_fpmath_387(void) {} // CHECK-NOT: tune-cpu -// CHECK: [[f_no_sse2]] = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87,-aes,-amx-avx512,-avx,-avx10.1,-avx10.2,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512f,-avx512fp16,-avx512ifma,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxneconvert,-avxvnni,-avxvnniint16,-avxvnniint8,-f16c,-fma,-fma4,-gfni,-kl,-pclmul,-sha,-sha512,-sm3,-sm4,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-xop" "tune-cpu"="i686" +// CHECK: [[f_no_sse2]] = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87,-aes,-amx-avx512,-avx,-avx10.1,-avx10.2,-avx2,-avx512bf16,-avx512bitalg,-avx512bmm,-avx512bw,-avx512cd,-avx512dq,-avx512f,-avx512fp16,-avx512ifma,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxneconvert,-avxvnni,-avxvnniint16,-avxvnniint8,-f16c,-fma,-fma4,-gfni,-kl,-pclmul,-sha,-sha512,-sm3,-sm4,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-xop" "tune-cpu"="i686" __attribute__((target("no-sse2"))) void f_no_sse2(void) {} 
@@ -41,7 +41,7 @@ void f_no_sse2(void) {} __attribute__((target("sse4"))) void f_sse4(void) {} -// CHECK: [[f_no_sse4]] = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87,-amx-avx512,-avx,-avx10.1,-avx10.2,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512f,-avx512fp16,-avx512ifma,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxneconvert,-avxvnni,-avxvnniint16,-avxvnniint8,-f16c,-fma,-fma4,-sha512,-sm3,-sm4,-sse4.1,-sse4.2,-vaes,-vpclmulqdq,-xop" "tune-cpu"="i686" +// CHECK: [[f_no_sse4]] = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87,-amx-avx512,-avx,-avx10.1,-avx10.2,-avx2,-avx512bf16,-avx512bitalg,-avx512bmm,-avx512bw,-avx512cd,-avx512dq,-avx512f,-avx512fp16,-avx512ifma,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxneconvert,-avxvnni,-avxvnniint16,-avxvnniint8,-f16c,-fma,-fma4,-sha512,-sm3,-sm4,-sse4.1,-sse4.2,-vaes,-vpclmulqdq,-xop" "tune-cpu"="i686" __attribute__((target("no-sse4"))) void f_no_sse4(void) {} diff --git a/clang/test/Preprocessor/predefined-arch-macros.c b/clang/test/Preprocessor/predefined-arch-macros.c index 96f7f68694adbe..2672a599d78ee6 100644 --- a/clang/test/Preprocessor/predefined-arch-macros.c +++ b/clang/test/Preprocessor/predefined-arch-macros.c @@ -4147,6 +4147,7 @@ // CHECK_ZNVER6_M32: #define __AVX2__ 1 // CHECK_ZNVER6_M32: #define __AVX512BF16__ 1 // CHECK_ZNVER6_M32: #define __AVX512BITALG__ 1 +// CHECK_ZNVER6_M32: #define __AVX512BMM__ 1 // CHECK_ZNVER6_M32: #define __AVX512BW__ 1 // CHECK_ZNVER6_M32: #define __AVX512CD__ 1 // CHECK_ZNVER6_M32: #define __AVX512DQ__ 1 @@ -4220,6 +4221,7 @@ // CHECK_ZNVER6_M64: #define __AVX2__ 1 // CHECK_ZNVER6_M64: #define __AVX512BF16__ 1 // CHECK_ZNVER6_M64: #define __AVX512BITALG__ 1 +// CHECK_ZNVER6_M64: #define __AVX512BMM__ 1 // CHECK_ZNVER6_M64: #define __AVX512BW__ 1 // CHECK_ZNVER6_M64: #define __AVX512CD__ 1 // CHECK_ZNVER6_M64: #define 
__AVX512DQ__ 1 diff --git a/compiler-rt/lib/builtins/cpu_model/x86.c b/compiler-rt/lib/builtins/cpu_model/x86.c index b0e1f0bdcc5c49..8876ada548f938 100644 --- a/compiler-rt/lib/builtins/cpu_model/x86.c +++ b/compiler-rt/lib/builtins/cpu_model/x86.c @@ -238,6 +238,7 @@ enum ProcessorFeatures { FEATURE_AMX_FP8 = 120, FEATURE_MOVRS, FEATURE_AMX_MOVRS, + FEATURE_AVX512BMM, CPU_FEATURE_MAX }; @@ -1197,6 +1198,8 @@ static void getAvailableFeatures(unsigned ECX, unsigned EDX, unsigned MaxLeaf, // AMD cpuid bit for prefetchi is different from Intel if (HasExtLeaf21 && ((EAX >> 20) & 1)) setFeature(FEATURE_PREFETCHI); + if (HasExtLeaf21 && ((EAX >> 23) & 1) && HasAVX512Save) + setFeature(FEATURE_AVX512BMM); bool HasLeaf14 = MaxLevel >= 0x14 && !getX86CpuIDAndInfoEx(0x14, 0x0, &EAX, &EBX, &ECX, &EDX); diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td index 5c7785731111cf..fc2298f26716ca 100644 --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -7329,4 +7329,22 @@ def int_x86_movrsdi : ClangBuiltin<"__builtin_ia32_movrsdi">, [IntrReadMem]>; def int_x86_prefetchrs : ClangBuiltin<"__builtin_ia32_prefetchrs">, Intrinsic<[], [llvm_ptr_ty], []>; + +//===----------------------------------------------------------------------===// +// BMM intrinsics + +def int_x86_avx512_vbmacor_v16hi : + DefaultAttrsIntrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty, llvm_v16i16_ty], + [IntrNoMem]>; +def int_x86_avx512_vbmacor_v32hi : + DefaultAttrsIntrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty, llvm_v32i16_ty], + [IntrNoMem]>; + +def int_x86_avx512_vbmacxor_v16hi : + DefaultAttrsIntrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty, llvm_v16i16_ty], + [IntrNoMem]>; +def int_x86_avx512_vbmacxor_v32hi : + DefaultAttrsIntrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty, llvm_v32i16_ty], + [IntrNoMem]>; } +//===----------------------------------------------------------------------===// 
diff --git a/llvm/include/llvm/TargetParser/X86TargetParser.def b/llvm/include/llvm/TargetParser/X86TargetParser.def index 0194941bb70e0e..310350de5cabb7 100644 --- a/llvm/include/llvm/TargetParser/X86TargetParser.def +++ b/llvm/include/llvm/TargetParser/X86TargetParser.def @@ -252,6 +252,7 @@ X86_FEATURE_COMPAT(AMX_TF32, "amx-tf32", 0, 118) X86_FEATURE_COMPAT(AMX_FP8, "amx-fp8", 0, 120) X86_FEATURE_COMPAT(MOVRS, "movrs", 0, 121) X86_FEATURE_COMPAT(AMX_MOVRS, "amx-movrs", 0, 122) +X86_FEATURE_COMPAT(AVX512BMM, "avx512bmm", 0, 123) // Features we don't multiversion on. X86_FEATURE (NF, "nf") diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index 7551966cb8e158..9244778a811ff5 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -154,6 +154,9 @@ def FeatureVBMI : SubtargetFeature<"avx512vbmi", "HasVBMI", "true", def FeatureVBMI2 : SubtargetFeature<"avx512vbmi2", "HasVBMI2", "true", "Enable AVX-512 further Vector Byte Manipulation Instructions", [FeatureBWI]>; +def FeatureBMM : SubtargetFeature<"avx512bmm", "HasBMM", "true", + "Enable AVX512 Bit Matrix Multiply", + [FeatureBWI]>; def FeatureAVXIFMA : SubtargetFeature<"avxifma", "HasAVXIFMA", "true", "Enable AVX-IFMA", [FeatureAVX2]>; @@ -1671,7 +1674,8 @@ def ProcessorFeatures { list<SubtargetFeature> ZN6AdditionalFeatures = [FeatureFP16, FeatureAVXVNNIINT8, FeatureAVXNECONVERT, - FeatureAVXIFMA + FeatureAVXIFMA, + FeatureBMM ]; list<SubtargetFeature> ZN6Features = !listconcat(ZN5Features, ZN6AdditionalFeatures); diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index e9ba1c05df361a..0cb23854624008 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -2355,6 +2355,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 }) setOperationAction(ISD::CTPOP, VT, Legal); } + + if (Subtarget.hasBMM()) { + for 
(auto VT : {MVT::v16i8, MVT::v32i8, MVT::v64i8}) + setOperationAction(ISD::BITREVERSE, VT, Legal); + } } if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) { @@ -33758,6 +33763,11 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget, unsigned NumElts = VT.getVectorNumElements(); + // If we have BMM, BITREVERSE on vXi8 is marked Legal and will be handled + // by TableGen pattern matching to VPBITREVB instruction. We should not + // reach here in that case. + assert(!Subtarget.hasBMM() && "BMM should use Legal operation action"); + // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits. if (Subtarget.hasGFNI()) { SDValue Matrix = getGFNICtrlMask(ISD::BITREVERSE, DAG, DL, VT); diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 44782de2b46487..2da9abce211d3d 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -11278,6 +11278,26 @@ multiclass avx512_unary_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, Sched<[sched.Folded]>; } +// Variant of avx512_unary_rm that requires aligned memory operands +multiclass avx512_unary_rm_aligned<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86FoldableSchedWrite sched, X86VectorVTInfo _> { + let ExeDomain = _.ExeDomain in { + defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1), OpcodeStr, + "$src1", "$src1", + (_.VT (OpNode (_.VT _.RC:$src1)))>, EVEX, AVX5128IBase, + Sched<[sched]>; + + let mayLoad = 1 in + defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.MemOp:$src1), OpcodeStr, + "$src1", "$src1", + (_.VT (OpNode (_.VT (bitconvert (_.AlignedLdFrag addr:$src1)))))>, + EVEX, AVX5128IBase, EVEX_CD8<_.EltSize, CD8VF>, + Sched<[sched.Folded]>; + } +} + multiclass avx512_unary_rm_vl<bits<8> opc, string OpcodeStr, SDNode OpNode, X86SchedWriteWidths sched, AVX512VLVectorVTInfo VTInfo, Predicate prd> { @@ -13815,3 +13835,29 @@ let Uses = [MXCSR] in { defm 
VFCMULCSHZ : avx512_cfmbinop_sh_common<0xD7, "vfcmulcsh", x86vfcmulcSh, x86vfcmulcShRnd, 0>, T_MAP6, XD, EVEX_CD8<32, CD8VT1>, EVEX_V128, VEX_LIG, EVEX, VVVV; } + +// VPBITREVB - BMM bit reverse instructions +// Basic instruction patterns for BMM (Bit Matrix Multiply) +defm VPBITREVB : avx512_unary_rm_vl<0x81, "vbitrevb", bitreverse, SchedWriteVecALU, + avx512vl_i8_info, HasBMM>, T_MAP6, PS; + +defm : avx512_unary_lowering<"VPBITREVB", bitreverse, avx512vl_i8_info, HasBMM>; + +// VBMACOR, VBMACXOR - BMM matrix multiplication instructions +// VBMACOR: EVEX.256.NP.MAP6.W0 80 /r, EVEX.512.NP.MAP6.W0 80 /r +let Predicates = [HasBMM, HasVLX] in +defm VBMACORZ256 : VNNI_rmb<0x80, "vbmacor16x16x16", x86vbmacor, SchedWriteVecIMul.YMM, v16i16x_info, 0>, + EVEX_V256, T_MAP6; + +let Predicates = [HasBMM] in +defm VBMACORZ : VNNI_rmb<0x80, "vbmacor16x16x16", x86vbmacor, SchedWriteVecIMul.ZMM, v32i16_info, 0>, + EVEX_V512, T_MAP6; + +// VBMACXOR: EVEX.256.NP.MAP6.W1 80 /r, EVEX.512.NP.MAP6.W1 80 /r +let Predicates = [HasBMM, HasVLX] in +defm VBMACXORZ256 : VNNI_rmb<0x80, "vbmacxor16x16x16", x86vbmacxor, SchedWriteVecIMul.YMM, v16i16x_info, 0>, + EVEX_V256, T_MAP6, REX_W; + +let Predicates = [HasBMM] in +defm VBMACXORZ : VNNI_rmb<0x80, "vbmacxor16x16x16", x86vbmacxor, SchedWriteVecIMul.ZMM, v32i16_info, 0>, + EVEX_V512, T_MAP6, REX_W; diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td index 1a75381aaaa24c..572309941c2040 100644 --- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -1781,3 +1781,7 @@ def X86vpmaddwd_su : PatFrag<(ops node:$lhs, node:$rhs), (X86vpmaddwd node:$lhs, node:$rhs), [{ return N->hasOneUse(); }]>; + +// BMM matrix multiplication operations +def x86vbmacor : SDNode<"X86ISD::VBMACOR", SDTVnni>; +def x86vbmacxor : SDNode<"X86ISD::VBMACXOR", SDTVnni>; diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td index 
afca2e6eafd2c5..49dd6cfc27b2cb 100644 --- a/llvm/lib/Target/X86/X86InstrPredicates.td +++ b/llvm/lib/Target/X86/X86InstrPredicates.td @@ -86,6 +86,7 @@ def HasVPOPCNTDQ : Predicate<"Subtarget->hasVPOPCNTDQ()">; def HasDQI : Predicate<"Subtarget->hasDQI()">; def NoDQI : Predicate<"!Subtarget->hasDQI()">; def HasBWI : Predicate<"Subtarget->hasBWI()">; +def HasBMM : Predicate<"Subtarget->hasBMM()">; def NoBWI : Predicate<"!Subtarget->hasBWI()">; def HasVLX : Predicate<"Subtarget->hasVLX()">; def NoVLX : Predicate<"!Subtarget->hasVLX()">; diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h index a6b0db0230cf3f..d2c9947f30f4b4 100644 --- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h +++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -1398,6 +1398,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::FSUB_RND), X86_INTRINSIC_DATA(avx512_uitofp_round, INTR_TYPE_1OP, ISD::UINT_TO_FP, X86ISD::UINT_TO_FP_RND), + X86_INTRINSIC_DATA(avx512_vbmacor_v16hi, INTR_TYPE_3OP, X86ISD::VBMACOR, 0), + X86_INTRINSIC_DATA(avx512_vbmacor_v32hi, INTR_TYPE_3OP, X86ISD::VBMACOR, 0), + X86_INTRINSIC_DATA(avx512_vbmacxor_v16hi, INTR_TYPE_3OP, X86ISD::VBMACXOR, + 0), + X86_INTRINSIC_DATA(avx512_vbmacxor_v32hi, INTR_TYPE_3OP, X86ISD::VBMACXOR, + 0), X86_INTRINSIC_DATA(avx512_vcomi_sd, COMI_RM, X86ISD::COMI, X86ISD::UCOMI), X86_INTRINSIC_DATA(avx512_vcomi_ss, COMI_RM, X86ISD::COMI, X86ISD::UCOMI), X86_INTRINSIC_DATA(avx512_vcvtsd2si32, INTR_TYPE_1OP, X86ISD::CVTS2SI, diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp index f15fc818a24179..457a9e7220459e 100644 --- a/llvm/lib/TargetParser/Host.cpp +++ b/llvm/lib/TargetParser/Host.cpp @@ -2136,6 +2136,7 @@ StringMap<bool> sys::getHostCPUFeatures() { !getX86CpuIDAndInfo(0x80000021, &EAX, &EBX, &ECX, &EDX); // AMD cpuid bit for prefetchi is different from Intel Features["prefetchi"] = HasExtLeaf21 && ((EAX >> 20) & 1); + Features["avx512bmm"] = HasExtLeaf21 && ((EAX >> 23) 
& 1) && HasAVX512Save; bool HasLeaf7 = MaxLevel >= 7 && !getX86CpuIDAndInfoEx(0x7, 0x0, &EAX, &EBX, &ECX, &EDX); diff --git a/llvm/lib/TargetParser/X86TargetParser.cpp b/llvm/lib/TargetParser/X86TargetParser.cpp index b924851cd0c532..e3637557ced0d8 100644 --- a/llvm/lib/TargetParser/X86TargetParser.cpp +++ b/llvm/lib/TargetParser/X86TargetParser.cpp @@ -257,7 +257,7 @@ static constexpr FeatureBitset FeaturesZNVER5 = static constexpr FeatureBitset FeaturesZNVER6 = FeaturesZNVER5 | FeatureAVXVNNIINT8 | FeatureAVX512FP16 | FeatureAVXIFMA | - FeatureAVXNECONVERT; + FeatureAVXNECONVERT | FeatureAVX512BMM; // Hygon architecture processors. constexpr FeatureBitset FeaturesC86_4G_M4 = @@ -622,6 +622,7 @@ constexpr FeatureBitset ImpliedFeaturesAVX512VL = FeatureAVX512F; constexpr FeatureBitset ImpliedFeaturesAVX512BF16 = FeatureAVX512BW; constexpr FeatureBitset ImpliedFeaturesAVX512BITALG = FeatureAVX512BW; +constexpr FeatureBitset ImpliedFeaturesAVX512BMM = FeatureAVX512BW; constexpr FeatureBitset ImpliedFeaturesAVX512IFMA = FeatureAVX512F; constexpr FeatureBitset ImpliedFeaturesAVX512VNNI = FeatureAVX512F; constexpr FeatureBitset ImpliedFeaturesAVX512VPOPCNTDQ = FeatureAVX512F; diff --git a/llvm/test/CodeGen/X86/avx512bmm-vbitrevb-bitreverse.ll b/llvm/test/CodeGen/X86/avx512bmm-vbitrevb-bitreverse.ll new file mode 100644 index 00000000000000..834a1561211de6 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512bmm-vbitrevb-bitreverse.ll @@ -0,0 +1,117 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512bmm,+avx512vl,+avx512bw --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,VLX +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512bmm,+avx512bw --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,NOVLX + +; Test vbitrevb instruction generation from bitreverse intrinsic +; This test verifies that the bitreverse intrinsic generates vbitrevb instructions +; when 
AVX512BMM is available. +; Without VLX, 128/256-bit operations are widened to 512-bit zmm registers. + +; Test 512-bit vector bit reversal with aligned memory load +define <64 x i8> @bitrev_zmm_aligned_load(ptr %ptr) { +; CHECK-LABEL: bitrev_zmm_aligned_load: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vbitrevb (%rdi), %zmm0 # encoding: [0x62,0xf6,0x7c,0x48,0x81,0x07] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %0 = load <64 x i8>, ptr %ptr, align 64 + %1 = tail call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %0) + ret <64 x i8> %1 +} + +; Test 256-bit with aligned memory load (AVX512VL) +define <32 x i8> @bitrev_ymm_aligned_load(ptr %ptr) { +; VLX-LABEL: bitrev_ymm_aligned_load: +; VLX: # %bb.0: # %entry +; VLX-NEXT: vbitrevb (%rdi), %ymm0 # encoding: [0x62,0xf6,0x7c,0x28,0x81,0x07] +; VLX-NEXT: retq # encoding: [0xc3] +; +; NOVLX-LABEL: bitrev_ymm_aligned_load: +; NOVLX: # %bb.0: # %entry +; NOVLX-NEXT: vmovdqa (%rdi), %ymm0 # encoding: [0xc5,0xfd,0x6f,0x07] +; NOVLX-NEXT: vbitrevb %zmm0, %zmm0 # encoding: [0x62,0xf6,0x7c,0x48,0x81,0xc0] +; NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; NOVLX-NEXT: retq # encoding: [0xc3] +entry: + %0 = load <32 x i8>, ptr %ptr, align 32 + %1 = tail call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %0) + ret <32 x i8> %1 +} + +; Test 128-bit with aligned memory load (AVX512VL + AVX512BW) +define <16 x i8> @bitrev_xmm_aligned_load(ptr %ptr) { +; VLX-LABEL: bitrev_xmm_aligned_load: +; VLX: # %bb.0: # %entry +; VLX-NEXT: vbitrevb (%rdi), %xmm0 # encoding: [0x62,0xf6,0x7c,0x08,0x81,0x07] +; VLX-NEXT: retq # encoding: [0xc3] +; +; NOVLX-LABEL: bitrev_xmm_aligned_load: +; NOVLX: # %bb.0: # %entry +; NOVLX-NEXT: vmovdqa (%rdi), %xmm0 # encoding: [0xc5,0xf9,0x6f,0x07] +; NOVLX-NEXT: vbitrevb %zmm0, %zmm0 # encoding: [0x62,0xf6,0x7c,0x48,0x81,0xc0] +; NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; NOVLX-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; NOVLX-NEXT: retq # encoding: [0xc3] +entry: + %0 = load <16 x 
i8>, ptr %ptr, align 16 + %1 = tail call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %0) + ret <16 x i8> %1 +} + +; Test 512-bit with unaligned memory load +; Memory operand can be folded directly into vbitrevb (no alignment required) +define <64 x i8> @bitrev_zmm_unaligned_load(ptr %ptr) { +; CHECK-LABEL: bitrev_zmm_unaligned_load: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vbitrevb (%rdi), %zmm0 # encoding: [0x62,0xf6,0x7c,0x48,0x81,0x07] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %0 = load <64 x i8>, ptr %ptr, align 1 + %1 = tail call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %0) + ret <64 x i8> %1 +} + +; Test 256-bit with unaligned memory load +; Memory operand can be folded directly into vbitrevb (no alignment required) +define <32 x i8> @bitrev_ymm_unaligned_load(ptr %ptr) { +; VLX-LABEL: bitrev_ymm_unaligned_load: +; VLX: # %bb.0: # %entry +; VLX-NEXT: vbitrevb (%rdi), %ymm0 # encoding: [0x62,0xf6,0x7c,0x28,0x81,0x07] +; VLX-NEXT: retq # encoding: [0xc3] +; +; NOVLX-LABEL: bitrev_ymm_unaligned_load: +; NOVLX: # %bb.0: # %entry +; NOVLX-NEXT: vmovdqu (%rdi), %ymm0 # encoding: [0xc5,0xfe,0x6f,0x07] +; NOVLX-NEXT: vbitrevb %zmm0, %zmm0 # encoding: [0x62,0xf6,0x7c,0x48,0x81,0xc0] +; NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; NOVLX-NEXT: retq # encoding: [0xc3] +entry: + %0 = load <32 x i8>, ptr %ptr, align 1 + %1 = tail call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %0) + ret <32 x i8> %1 +} + +; Test 128-bit with unaligned memory load +; Memory operand can be folded directly into vbitrevb (no alignment required) +define <16 x i8> @bitrev_xmm_unaligned_load(ptr %ptr) { +; VLX-LABEL: bitrev_xmm_unaligned_load: +; VLX: # %bb.0: # %entry +; VLX-NEXT: vbitrevb (%rdi), %xmm0 # encoding: [0x62,0xf6,0x7c,0x08,0x81,0x07] +; VLX-NEXT: retq # encoding: [0xc3] +; +; NOVLX-LABEL: bitrev_xmm_unaligned_load: +; NOVLX: # %bb.0: # %entry +; NOVLX-NEXT: vmovdqu (%rdi), %xmm0 # encoding: [0xc5,0xfa,0x6f,0x07] +; NOVLX-NEXT: vbitrevb %zmm0, %zmm0 # 
encoding: [0x62,0xf6,0x7c,0x48,0x81,0xc0] +; NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; NOVLX-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; NOVLX-NEXT: retq # encoding: [0xc3] +entry: + %0 = load <16 x i8>, ptr %ptr, align 1 + %1 = tail call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %0) + ret <16 x i8> %1 +} + +declare <64 x i8> @llvm.bitreverse.v64i8(<64 x i8>) +declare <32 x i8> @llvm.bitreverse.v32i8(<32 x i8>) +declare <16 x i8> @llvm.bitreverse.v16i8(<16 x i8>) diff --git a/llvm/test/CodeGen/X86/avx512bmm-vbitrevb-intrinsics-mem.ll b/llvm/test/CodeGen/X86/avx512bmm-vbitrevb-intrinsics-mem.ll new file mode 100644 index 00000000000000..a1396bbc33ecd5 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512bmm-vbitrevb-intrinsics-mem.ll @@ -0,0 +1,141 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bmm,+avx512vl,+avx512bw --show-mc-encoding | FileCheck %s + + +define <2 x i64> @test_mm128_vbitrevb_epi8_mem(ptr %ptr) { +; CHECK-LABEL: test_mm128_vbitrevb_epi8_mem: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vbitrevb (%rdi), %xmm0 # encoding: [0x62,0xf6,0x7c,0x08,0x81,0x07] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %0 = load <16 x i8>, ptr %ptr, align 16 + %1 = tail call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %0) + %2 = bitcast <16 x i8> %1 to <2 x i64> + ret <2 x i64> %2 +} + +define <2 x i64> @test_mm128_mask_vbitrevb_epi8_mem(<2 x i64> %src, i16 %mask, ptr %ptr) { +; CHECK-LABEL: test_mm128_mask_vbitrevb_epi8_mem: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vbitrevb (%rsi), %xmm0 {%k1} # encoding: [0x62,0xf6,0x7c,0x09,0x81,0x06] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %0 = load <16 x i8>, ptr %ptr, align 16 + %1 = tail call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %0) + %2 = bitcast <2 x i64> %src to <16 x i8> + %3 = bitcast i16 %mask to <16 x i1> + %4 = select <16 x i1> 
%3, <16 x i8> %1, <16 x i8> %2 + %5 = bitcast <16 x i8> %4 to <2 x i64> + ret <2 x i64> %5 +} + +define <2 x i64> @test_mm128_maskz_vbitrevb_epi8_mem(i16 %mask, ptr %ptr) { +; CHECK-LABEL: test_mm128_maskz_vbitrevb_epi8_mem: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vbitrevb (%rsi), %xmm0 {%k1} {z} # encoding: [0x62,0xf6,0x7c,0x89,0x81,0x06] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %0 = load <16 x i8>, ptr %ptr, align 16 + %1 = tail call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %0) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer + %4 = bitcast <16 x i8> %3 to <2 x i64> + ret <2 x i64> %4 +} + +define <4 x i64> @test_mm256_vbitrevb_epi8_mem(ptr %ptr) { +; CHECK-LABEL: test_mm256_vbitrevb_epi8_mem: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vbitrevb (%rdi), %ymm0 # encoding: [0x62,0xf6,0x7c,0x28,0x81,0x07] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %0 = load <32 x i8>, ptr %ptr, align 32 + %1 = tail call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %0) + %2 = bitcast <32 x i8> %1 to <4 x i64> + ret <4 x i64> %2 +} + +define <4 x i64> @test_mm256_mask_vbitrevb_epi8_mem(<4 x i64> %src, i32 %mask, ptr %ptr) { +; CHECK-LABEL: test_mm256_mask_vbitrevb_epi8_mem: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vbitrevb (%rsi), %ymm0 {%k1} # encoding: [0x62,0xf6,0x7c,0x29,0x81,0x06] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %0 = load <32 x i8>, ptr %ptr, align 32 + %1 = tail call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %0) + %2 = bitcast <4 x i64> %src to <32 x i8> + %3 = bitcast i32 %mask to <32 x i1> + %4 = select <32 x i1> %3, <32 x i8> %1, <32 x i8> %2 + %5 = bitcast <32 x i8> %4 to <4 x i64> + ret <4 x i64> %5 +} + +define <4 x i64> @test_mm256_maskz_vbitrevb_epi8_mem(i32 %mask, ptr %ptr) { +; CHECK-LABEL: test_mm256_maskz_vbitrevb_epi8_mem: +; CHECK: # %bb.0: # 
%entry +; CHECK-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vbitrevb (%rsi), %ymm0 {%k1} {z} # encoding: [0x62,0xf6,0x7c,0xa9,0x81,0x06] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %0 = load <32 x i8>, ptr %ptr, align 32 + %1 = tail call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %0) + %2 = bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer + %4 = bitcast <32 x i8> %3 to <4 x i64> + ret <4 x i64> %4 +} + +define <8 x i64> @test_mm512_vbitrevb_epi8_mem(ptr %ptr) { +; CHECK-LABEL: test_mm512_vbitrevb_epi8_mem: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vbitrevb (%rdi), %zmm0 # encoding: [0x62,0xf6,0x7c,0x48,0x81,0x07] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %0 = load <64 x i8>, ptr %ptr, align 64 + %1 = tail call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %0) + %2 = bitcast <64 x i8> %1 to <8 x i64> + ret <8 x i64> %2 +} + +define <8 x i64> @test_mm512_mask_vbitrevb_epi8_mem(<8 x i64> %src, i64 %mask, ptr %ptr) { +; CHECK-LABEL: test_mm512_mask_vbitrevb_epi8_mem: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] +; CHECK-NEXT: vbitrevb (%rsi), %zmm0 {%k1} # encoding: [0x62,0xf6,0x7c,0x49,0x81,0x06] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %0 = load <64 x i8>, ptr %ptr, align 64 + %1 = tail call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %0) + %2 = bitcast <8 x i64> %src to <64 x i8> + %3 = bitcast i64 %mask to <64 x i1> + %4 = select <64 x i1> %3, <64 x i8> %1, <64 x i8> %2 + %5 = bitcast <64 x i8> %4 to <8 x i64> + ret <8 x i64> %5 +} + +define <8 x i64> @test_mm512_maskz_vbitrevb_epi8_mem(i64 %mask, ptr %ptr) { +; CHECK-LABEL: test_mm512_maskz_vbitrevb_epi8_mem: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] +; CHECK-NEXT: vbitrevb (%rsi), %zmm0 {%k1} {z} # encoding: [0x62,0xf6,0x7c,0xc9,0x81,0x06] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %0 = load <64 x i8>, ptr 
%ptr, align 64 + %1 = tail call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %0) + %2 = bitcast i64 %mask to <64 x i1> + %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> zeroinitializer + %4 = bitcast <64 x i8> %3 to <8 x i64> + ret <8 x i64> %4 +} + +declare <16 x i8> @llvm.bitreverse.v16i8(<16 x i8>) + +declare <32 x i8> @llvm.bitreverse.v32i8(<32 x i8>) + +declare <64 x i8> @llvm.bitreverse.v64i8(<64 x i8>) + + + diff --git a/llvm/test/CodeGen/X86/avx512bmm-vbitrevb-intrinsics.ll b/llvm/test/CodeGen/X86/avx512bmm-vbitrevb-intrinsics.ll new file mode 100644 index 00000000000000..ad1f000792cc16 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512bmm-vbitrevb-intrinsics.ll @@ -0,0 +1,139 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bmm,+avx512vl --show-mc-encoding | FileCheck %s + +define <2 x i64> @test_mm128_vbitrev_epi8(<2 x i64> %a) { +; CHECK-LABEL: test_mm128_vbitrev_epi8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vbitrevb %xmm0, %xmm0 # encoding: [0x62,0xf6,0x7c,0x08,0x81,0xc0] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %0 = bitcast <2 x i64> %a to <16 x i8> + %1 = tail call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %0) + %2 = bitcast <16 x i8> %1 to <2 x i64> + ret <2 x i64> %2 +} + +define <4 x i64> @test_mm256_vbitrev_epi8(<4 x i64> %a) { +; CHECK-LABEL: test_mm256_vbitrev_epi8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vbitrevb %ymm0, %ymm0 # encoding: [0x62,0xf6,0x7c,0x28,0x81,0xc0] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %0 = bitcast <4 x i64> %a to <32 x i8> + %1 = tail call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %0) + %2 = bitcast <32 x i8> %1 to <4 x i64> + ret <4 x i64> %2 +} + +define <8 x i64> @test_mm512_vbitrev_epi8(<8 x i64> %a) { +; CHECK-LABEL: test_mm512_vbitrev_epi8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vbitrevb %zmm0, %zmm0 # encoding: [0x62,0xf6,0x7c,0x48,0x81,0xc0] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %0 
= bitcast <8 x i64> %a to <64 x i8> + %1 = tail call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %0) + %2 = bitcast <64 x i8> %1 to <8 x i64> + ret <8 x i64> %2 +} + +define <4 x float> @test_mm128_mask_vbitrevb_epi8(<2 x i64> %a, i64 %mask, <2 x i64> %b) { +; CHECK-LABEL: test_mm128_mask_vbitrevb_epi8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vbitrevb %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf6,0x7c,0x09,0x81,0xc1] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %conv = trunc i64 %mask to i16 + %0 = bitcast <2 x i64> %b to <16 x i8> + %1 = tail call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %0) + %2 = bitcast <2 x i64> %a to <16 x i8> + %3 = bitcast i16 %conv to <16 x i1> + %4 = select <16 x i1> %3, <16 x i8> %1, <16 x i8> %2 + %5 = bitcast <16 x i8> %4 to <4 x float> + ret <4 x float> %5 +} + +define <8 x float> @test_mm256_mask_vbitrevb_epi8(<4 x i64> %a, i64 %mask, <4 x i64> %b) { +; CHECK-LABEL: test_mm256_mask_vbitrevb_epi8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vbitrevb %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf6,0x7c,0x29,0x81,0xc1] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %conv = trunc i64 %mask to i32 + %0 = bitcast <4 x i64> %b to <32 x i8> + %1 = tail call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %0) + %2 = bitcast <4 x i64> %a to <32 x i8> + %3 = bitcast i32 %conv to <32 x i1> + %4 = select <32 x i1> %3, <32 x i8> %1, <32 x i8> %2 + %5 = bitcast <32 x i8> %4 to <8 x float> + ret <8 x float> %5 +} + +define <8 x i64> @test_mm512_mask_vbitrevb_epi8(<8 x i64> %a, i64 %mask, <8 x i64> %b) { +; CHECK-LABEL: test_mm512_mask_vbitrevb_epi8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] +; CHECK-NEXT: vbitrevb %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf6,0x7c,0x49,0x81,0xc1] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %0 = bitcast <8 x i64> %b to <64 x i8> + %1 = 
tail call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %0) + %2 = bitcast <8 x i64> %a to <64 x i8> + %3 = bitcast i64 %mask to <64 x i1> + %4 = select <64 x i1> %3, <64 x i8> %1, <64 x i8> %2 + %5 = bitcast <64 x i8> %4 to <8 x i64> + ret <8 x i64> %5 +} + +define <4 x float> @test_mm128_maskz_vbitrevb_epi8(i64 %mask, <2 x i64> %b) { +; CHECK-LABEL: test_mm128_maskz_vbitrevb_epi8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vbitrevb %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf6,0x7c,0x89,0x81,0xc0] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %conv = trunc i64 %mask to i16 + %0 = bitcast <2 x i64> %b to <16 x i8> + %1 = tail call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %0) + %2 = bitcast i16 %conv to <16 x i1> + %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer + %4 = bitcast <16 x i8> %3 to <4 x float> + ret <4 x float> %4 +} + +define <8 x float> @test_mm256_maskz_vbitrevb_epi8(i64 %mask, <4 x i64> %b) { +; CHECK-LABEL: test_mm256_maskz_vbitrevb_epi8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vbitrevb %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf6,0x7c,0xa9,0x81,0xc0] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %conv = trunc i64 %mask to i32 + %0 = bitcast <4 x i64> %b to <32 x i8> + %1 = tail call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %0) + %2 = bitcast i32 %conv to <32 x i1> + %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer + %4 = bitcast <32 x i8> %3 to <8 x float> + ret <8 x float> %4 +} + +define <8 x i64> @test_mm512_maskz_vbitrevb_epi8(i64 %mask, <8 x i64> %b) { +; CHECK-LABEL: test_mm512_maskz_vbitrevb_epi8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] +; CHECK-NEXT: vbitrevb %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf6,0x7c,0xc9,0x81,0xc0] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %0 = bitcast <8 x i64> %b to <64 x i8> 
+ %1 = tail call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %0) + %2 = bitcast i64 %mask to <64 x i1> + %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> zeroinitializer + %4 = bitcast <64 x i8> %3 to <8 x i64> + ret <8 x i64> %4 +} + +declare <16 x i8> @llvm.bitreverse.v16i8(<16 x i8>) +declare <32 x i8> @llvm.bitreverse.v32i8(<32 x i8>) +declare <64 x i8> @llvm.bitreverse.v64i8(<64 x i8>) diff --git a/llvm/test/CodeGen/X86/avx512bmm-vbmac-intrinsics.ll b/llvm/test/CodeGen/X86/avx512bmm-vbmac-intrinsics.ll new file mode 100644 index 00000000000000..5236a4fd84360e --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512bmm-vbmac-intrinsics.ll @@ -0,0 +1,123 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bmm,+avx512vl --show-mc-encoding | FileCheck %s + +define <4 x i64> @test_mm256_vbmacor(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c) { +; CHECK-LABEL: test_mm256_vbmacor: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vbmacor16x16x16 %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf6,0x74,0x28,0x80,0xc2] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %0 = bitcast <4 x i64> %a to <16 x i16> + %1 = bitcast <4 x i64> %b to <16 x i16> + %2 = bitcast <4 x i64> %c to <16 x i16> + %3 = tail call <16 x i16> @llvm.x86.avx512.vbmacor.v16hi(<16 x i16> %0, <16 x i16> %1, <16 x i16> %2) + %4 = bitcast <16 x i16> %3 to <4 x i64> + ret <4 x i64> %4 +} + +define <4 x i64> @test_mm256_vbmacxor(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c) { +; CHECK-LABEL: test_mm256_vbmacxor: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vbmacxor16x16x16 %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf6,0xf4,0x28,0x80,0xc2] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %0 = bitcast <4 x i64> %a to <16 x i16> + %1 = bitcast <4 x i64> %b to <16 x i16> + %2 = bitcast <4 x i64> %c to <16 x i16> + %3 = tail call <16 x i16> @llvm.x86.avx512.vbmacxor.v16hi(<16 x i16> %0, <16 x i16> %1, <16 x i16> %2) + %4 = bitcast <16 x i16> %3 to <4 x 
i64> + ret <4 x i64> %4 +} + +define <8 x i64> @test_mm512_vbmacor(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c) { +; CHECK-LABEL: test_mm512_vbmacor: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vbmacor16x16x16 %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf6,0x74,0x48,0x80,0xc2] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %0 = bitcast <8 x i64> %a to <32 x i16> + %1 = bitcast <8 x i64> %b to <32 x i16> + %2 = bitcast <8 x i64> %c to <32 x i16> + %3 = tail call <32 x i16> @llvm.x86.avx512.vbmacor.v32hi(<32 x i16> %0, <32 x i16> %1, <32 x i16> %2) + %4 = bitcast <32 x i16> %3 to <8 x i64> + ret <8 x i64> %4 +} + +define <8 x i64> @test_mm512_vbmacxor(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c) { +; CHECK-LABEL: test_mm512_vbmacxor: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vbmacxor16x16x16 %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf6,0xf4,0x48,0x80,0xc2] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %0 = bitcast <8 x i64> %a to <32 x i16> + %1 = bitcast <8 x i64> %b to <32 x i16> + %2 = bitcast <8 x i64> %c to <32 x i16> + %3 = tail call <32 x i16> @llvm.x86.avx512.vbmacxor.v32hi(<32 x i16> %0, <32 x i16> %1, <32 x i16> %2) + %4 = bitcast <32 x i16> %3 to <8 x i64> + ret <8 x i64> %4 +} + +define <4 x i64> @test_mm256_vbmacor_load(<4 x i64> %a, <4 x i64> %b, ptr %p) { +; CHECK-LABEL: test_mm256_vbmacor_load: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vbmacor16x16x16 (%rdi), %ymm1, %ymm0 # encoding: [0x62,0xf6,0x74,0x28,0x80,0x07] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %c = load <4 x i64>, ptr %p + %0 = bitcast <4 x i64> %a to <16 x i16> + %1 = bitcast <4 x i64> %b to <16 x i16> + %2 = bitcast <4 x i64> %c to <16 x i16> + %3 = tail call <16 x i16> @llvm.x86.avx512.vbmacor.v16hi(<16 x i16> %0, <16 x i16> %1, <16 x i16> %2) + %4 = bitcast <16 x i16> %3 to <4 x i64> + ret <4 x i64> %4 +} + +define <4 x i64> @test_mm256_vbmacxor_load(<4 x i64> %a, <4 x i64> %b, ptr %p) { +; CHECK-LABEL: test_mm256_vbmacxor_load: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: 
vbmacxor16x16x16 (%rdi), %ymm1, %ymm0 # encoding: [0x62,0xf6,0xf4,0x28,0x80,0x07] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %c = load <4 x i64>, ptr %p + %0 = bitcast <4 x i64> %a to <16 x i16> + %1 = bitcast <4 x i64> %b to <16 x i16> + %2 = bitcast <4 x i64> %c to <16 x i16> + %3 = tail call <16 x i16> @llvm.x86.avx512.vbmacxor.v16hi(<16 x i16> %0, <16 x i16> %1, <16 x i16> %2) + %4 = bitcast <16 x i16> %3 to <4 x i64> + ret <4 x i64> %4 +} + +define <8 x i64> @test_mm512_vbmacor_load(<8 x i64> %a, <8 x i64> %b, ptr %p) { +; CHECK-LABEL: test_mm512_vbmacor_load: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vbmacor16x16x16 (%rdi), %zmm1, %zmm0 # encoding: [0x62,0xf6,0x74,0x48,0x80,0x07] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %c = load <8 x i64>, ptr %p + %0 = bitcast <8 x i64> %a to <32 x i16> + %1 = bitcast <8 x i64> %b to <32 x i16> + %2 = bitcast <8 x i64> %c to <32 x i16> + %3 = tail call <32 x i16> @llvm.x86.avx512.vbmacor.v32hi(<32 x i16> %0, <32 x i16> %1, <32 x i16> %2) + %4 = bitcast <32 x i16> %3 to <8 x i64> + ret <8 x i64> %4 +} + +define <8 x i64> @test_mm512_vbmacxor_load(<8 x i64> %a, <8 x i64> %b, ptr %p) { +; CHECK-LABEL: test_mm512_vbmacxor_load: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vbmacxor16x16x16 (%rdi), %zmm1, %zmm0 # encoding: [0x62,0xf6,0xf4,0x48,0x80,0x07] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %c = load <8 x i64>, ptr %p + %0 = bitcast <8 x i64> %a to <32 x i16> + %1 = bitcast <8 x i64> %b to <32 x i16> + %2 = bitcast <8 x i64> %c to <32 x i16> + %3 = tail call <32 x i16> @llvm.x86.avx512.vbmacxor.v32hi(<32 x i16> %0, <32 x i16> %1, <32 x i16> %2) + %4 = bitcast <32 x i16> %3 to <8 x i64> + ret <8 x i64> %4 +} + +declare <16 x i16> @llvm.x86.avx512.vbmacor.v16hi(<16 x i16>, <16 x i16>, <16 x i16>) +declare <16 x i16> @llvm.x86.avx512.vbmacxor.v16hi(<16 x i16>, <16 x i16>, <16 x i16>) +declare <32 x i16> @llvm.x86.avx512.vbmacor.v32hi(<32 x i16>, <32 x i16>, <32 x i16>) +declare <32 x i16> 
@llvm.x86.avx512.vbmacxor.v32hi(<32 x i16>, <32 x i16>, <32 x i16>) diff --git a/llvm/test/MC/Disassembler/X86/avx512bmm.txt b/llvm/test/MC/Disassembler/X86/avx512bmm.txt new file mode 100644 index 00000000000000..08a706c5476f1b --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/avx512bmm.txt @@ -0,0 +1,78 @@ +# RUN: llvm-mc -triple x86_64 -disassemble %s | FileCheck %s --check-prefix=ATT +# RUN: llvm-mc -triple x86_64 -disassemble -output-asm-variant=1 %s | FileCheck %s --check-prefix=INTEL + +# ATT: vbmacor16x16x16 %ymm4, %ymm5, %ymm6 +# INTEL: vbmacor16x16x16 ymm6, ymm5, ymm4 +0x62,0xf6,0x54,0x28,0x80,0xf4 + +# ATT: vbmacor16x16x16 %zmm4, %zmm5, %zmm6 +# INTEL: vbmacor16x16x16 zmm6, zmm5, zmm4 +0x62,0xf6,0x54,0x48,0x80,0xf4 + +# ATT: vbmacor16x16x16 %zmm28, %zmm29, %zmm30 +# INTEL: vbmacor16x16x16 zmm30, zmm29, zmm28 +0x62,0x06,0x14,0x40,0x80,0xf4 + +# ATT: vbmacor16x16x16 (%rcx), %ymm5, %ymm6 +# INTEL: vbmacor16x16x16 ymm6, ymm5, ymmword ptr [rcx] +0x62,0xf6,0x54,0x28,0x80,0x31 + +# ATT: vbmacor16x16x16 291(%rax,%r14,8), %zmm5, %zmm6 +# INTEL: vbmacor16x16x16 zmm6, zmm5, zmmword ptr [rax + 8*r14 + 291] +0x62,0xb6,0x54,0x48,0x80,0xb4,0xf0,0x23,0x01,0x00,0x00 + +# ATT: vbmacxor16x16x16 %ymm4, %ymm5, %ymm6 +# INTEL: vbmacxor16x16x16 ymm6, ymm5, ymm4 +0x62,0xf6,0xd4,0x28,0x80,0xf4 + +# ATT: vbmacxor16x16x16 %zmm4, %zmm5, %zmm6 +# INTEL: vbmacxor16x16x16 zmm6, zmm5, zmm4 +0x62,0xf6,0xd4,0x48,0x80,0xf4 + +# ATT: vbmacxor16x16x16 %zmm28, %zmm29, %zmm30 +# INTEL: vbmacxor16x16x16 zmm30, zmm29, zmm28 +0x62,0x06,0x94,0x40,0x80,0xf4 + +# ATT: vbmacxor16x16x16 291(%rax,%r14,8), %zmm5, %zmm6 +# INTEL: vbmacxor16x16x16 zmm6, zmm5, zmmword ptr [rax + 8*r14 + 291] +0x62,0xb6,0xd4,0x48,0x80,0xb4,0xf0,0x23,0x01,0x00,0x00 + +# ATT: vbitrevb %xmm5, %xmm6 +# INTEL: vbitrevb xmm6, xmm5 +0x62,0xf6,0x7c,0x08,0x81,0xf5 + +# ATT: vbitrevb %ymm5, %ymm6 +# INTEL: vbitrevb ymm6, ymm5 +0x62,0xf6,0x7c,0x28,0x81,0xf5 + +# ATT: vbitrevb %zmm5, %zmm6 +# INTEL: vbitrevb zmm6, zmm5 
+0x62,0xf6,0x7c,0x48,0x81,0xf5 + +# ATT: vbitrevb %zmm29, %zmm30 +# INTEL: vbitrevb zmm30, zmm29 +0x62,0x06,0x7c,0x48,0x81,0xf5 + +# ATT: vbitrevb %zmm5, %zmm6 {%k7} +# INTEL: vbitrevb zmm6 {k7}, zmm5 +0x62,0xf6,0x7c,0x4f,0x81,0xf5 + +# ATT: vbitrevb %zmm5, %zmm6 {%k7} {z} +# INTEL: vbitrevb zmm6 {k7} {z}, zmm5 +0x62,0xf6,0x7c,0xcf,0x81,0xf5 + +# ATT: vbitrevb (%rcx), %xmm6 +# INTEL: vbitrevb xmm6, xmmword ptr [rcx] +0x62,0xf6,0x7c,0x08,0x81,0x31 + +# ATT: vbitrevb (%rcx), %ymm6 +# INTEL: vbitrevb ymm6, ymmword ptr [rcx] +0x62,0xf6,0x7c,0x28,0x81,0x31 + +# ATT: vbitrevb (%rcx), %zmm6 +# INTEL: vbitrevb zmm6, zmmword ptr [rcx] +0x62,0xf6,0x7c,0x48,0x81,0x31 + +# ATT: vbitrevb 291(%rax,%r14,8), %zmm6 {%k7} +# INTEL: vbitrevb zmm6 {k7}, zmmword ptr [rax + 8*r14 + 291] +0x62,0xb6,0x7c,0x4f,0x81,0xb4,0xf0,0x23,0x01,0x00,0x00 diff --git a/llvm/test/MC/X86/avx512bmm-att.s b/llvm/test/MC/X86/avx512bmm-att.s new file mode 100644 index 00000000000000..299d89e40b110e --- /dev/null +++ b/llvm/test/MC/X86/avx512bmm-att.s @@ -0,0 +1,85 @@ +# RUN: llvm-mc -triple x86_64 -show-encoding %s | FileCheck %s + +# CHECK: vbmacor16x16x16 %ymm4, %ymm5, %ymm6 +# CHECK: encoding: [0x62,0xf6,0x54,0x28,0x80,0xf4] + vbmacor16x16x16 %ymm4, %ymm5, %ymm6 + +# CHECK: vbmacor16x16x16 %zmm4, %zmm5, %zmm6 +# CHECK: encoding: [0x62,0xf6,0x54,0x48,0x80,0xf4] + vbmacor16x16x16 %zmm4, %zmm5, %zmm6 + +# CHECK: vbmacor16x16x16 %ymm28, %ymm29, %ymm30 +# CHECK: encoding: [0x62,0x06,0x14,0x20,0x80,0xf4] + vbmacor16x16x16 %ymm28, %ymm29, %ymm30 + +# CHECK: vbmacor16x16x16 %zmm28, %zmm29, %zmm30 +# CHECK: encoding: [0x62,0x06,0x14,0x40,0x80,0xf4] + vbmacor16x16x16 %zmm28, %zmm29, %zmm30 + +# CHECK: vbmacor16x16x16 (%rcx), %ymm5, %ymm6 +# CHECK: encoding: [0x62,0xf6,0x54,0x28,0x80,0x31] + vbmacor16x16x16 (%rcx), %ymm5, %ymm6 + +# CHECK: vbmacor16x16x16 291(%rax,%r14,8), %zmm5, %zmm6 +# CHECK: encoding: [0x62,0xb6,0x54,0x48,0x80,0xb4,0xf0,0x23,0x01,0x00,0x00] + vbmacor16x16x16 291(%rax,%r14,8), %zmm5, %zmm6 + +# 
CHECK: vbmacxor16x16x16 %ymm4, %ymm5, %ymm6 +# CHECK: encoding: [0x62,0xf6,0xd4,0x28,0x80,0xf4] + vbmacxor16x16x16 %ymm4, %ymm5, %ymm6 + +# CHECK: vbmacxor16x16x16 %zmm4, %zmm5, %zmm6 +# CHECK: encoding: [0x62,0xf6,0xd4,0x48,0x80,0xf4] + vbmacxor16x16x16 %zmm4, %zmm5, %zmm6 + +# CHECK: vbmacxor16x16x16 %zmm28, %zmm29, %zmm30 +# CHECK: encoding: [0x62,0x06,0x94,0x40,0x80,0xf4] + vbmacxor16x16x16 %zmm28, %zmm29, %zmm30 + +# CHECK: vbmacxor16x16x16 (%rcx), %ymm5, %ymm6 +# CHECK: encoding: [0x62,0xf6,0xd4,0x28,0x80,0x31] + vbmacxor16x16x16 (%rcx), %ymm5, %ymm6 + +# CHECK: vbmacxor16x16x16 291(%rax,%r14,8), %zmm5, %zmm6 +# CHECK: encoding: [0x62,0xb6,0xd4,0x48,0x80,0xb4,0xf0,0x23,0x01,0x00,0x00] + vbmacxor16x16x16 291(%rax,%r14,8), %zmm5, %zmm6 + +# CHECK: vbitrevb %xmm5, %xmm6 +# CHECK: encoding: [0x62,0xf6,0x7c,0x08,0x81,0xf5] + vbitrevb %xmm5, %xmm6 + +# CHECK: vbitrevb %ymm5, %ymm6 +# CHECK: encoding: [0x62,0xf6,0x7c,0x28,0x81,0xf5] + vbitrevb %ymm5, %ymm6 + +# CHECK: vbitrevb %zmm5, %zmm6 +# CHECK: encoding: [0x62,0xf6,0x7c,0x48,0x81,0xf5] + vbitrevb %zmm5, %zmm6 + +# CHECK: vbitrevb %zmm29, %zmm30 +# CHECK: encoding: [0x62,0x06,0x7c,0x48,0x81,0xf5] + vbitrevb %zmm29, %zmm30 + +# CHECK: vbitrevb %zmm5, %zmm6 {%k7} +# CHECK: encoding: [0x62,0xf6,0x7c,0x4f,0x81,0xf5] + vbitrevb %zmm5, %zmm6 {%k7} + +# CHECK: vbitrevb %zmm5, %zmm6 {%k7} {z} +# CHECK: encoding: [0x62,0xf6,0x7c,0xcf,0x81,0xf5] + vbitrevb %zmm5, %zmm6 {%k7} {z} + +# CHECK: vbitrevb (%rcx), %xmm6 +# CHECK: encoding: [0x62,0xf6,0x7c,0x08,0x81,0x31] + vbitrevb (%rcx), %xmm6 + +# CHECK: vbitrevb (%rcx), %ymm6 +# CHECK: encoding: [0x62,0xf6,0x7c,0x28,0x81,0x31] + vbitrevb (%rcx), %ymm6 + +# CHECK: vbitrevb (%rcx), %zmm6 +# CHECK: encoding: [0x62,0xf6,0x7c,0x48,0x81,0x31] + vbitrevb (%rcx), %zmm6 + +# CHECK: vbitrevb 291(%rax,%r14,8), %zmm6 {%k7} +# CHECK: encoding: [0x62,0xb6,0x7c,0x4f,0x81,0xb4,0xf0,0x23,0x01,0x00,0x00] + vbitrevb 291(%rax,%r14,8), %zmm6 {%k7} diff --git a/llvm/test/MC/X86/avx512bmm-intel.s 
b/llvm/test/MC/X86/avx512bmm-intel.s new file mode 100644 index 00000000000000..b01fd6f51d0e1d --- /dev/null +++ b/llvm/test/MC/X86/avx512bmm-intel.s @@ -0,0 +1,85 @@ +# RUN: llvm-mc -triple x86_64 -show-encoding -x86-asm-syntax=intel -output-asm-variant=1 %s | FileCheck %s + +# CHECK: vbmacor16x16x16 ymm6, ymm5, ymm4 +# CHECK: encoding: [0x62,0xf6,0x54,0x28,0x80,0xf4] + vbmacor16x16x16 ymm6, ymm5, ymm4 + +# CHECK: vbmacor16x16x16 zmm6, zmm5, zmm4 +# CHECK: encoding: [0x62,0xf6,0x54,0x48,0x80,0xf4] + vbmacor16x16x16 zmm6, zmm5, zmm4 + +# CHECK: vbmacor16x16x16 ymm30, ymm29, ymm28 +# CHECK: encoding: [0x62,0x06,0x14,0x20,0x80,0xf4] + vbmacor16x16x16 ymm30, ymm29, ymm28 + +# CHECK: vbmacor16x16x16 zmm30, zmm29, zmm28 +# CHECK: encoding: [0x62,0x06,0x14,0x40,0x80,0xf4] + vbmacor16x16x16 zmm30, zmm29, zmm28 + +# CHECK: vbmacor16x16x16 ymm6, ymm5, ymmword ptr [rcx] +# CHECK: encoding: [0x62,0xf6,0x54,0x28,0x80,0x31] + vbmacor16x16x16 ymm6, ymm5, ymmword ptr [rcx] + +# CHECK: vbmacor16x16x16 zmm6, zmm5, zmmword ptr [rax + 8*r14 + 291] +# CHECK: encoding: [0x62,0xb6,0x54,0x48,0x80,0xb4,0xf0,0x23,0x01,0x00,0x00] + vbmacor16x16x16 zmm6, zmm5, zmmword ptr [rax + 8*r14 + 291] + +# CHECK: vbmacxor16x16x16 ymm6, ymm5, ymm4 +# CHECK: encoding: [0x62,0xf6,0xd4,0x28,0x80,0xf4] + vbmacxor16x16x16 ymm6, ymm5, ymm4 + +# CHECK: vbmacxor16x16x16 zmm6, zmm5, zmm4 +# CHECK: encoding: [0x62,0xf6,0xd4,0x48,0x80,0xf4] + vbmacxor16x16x16 zmm6, zmm5, zmm4 + +# CHECK: vbmacxor16x16x16 zmm30, zmm29, zmm28 +# CHECK: encoding: [0x62,0x06,0x94,0x40,0x80,0xf4] + vbmacxor16x16x16 zmm30, zmm29, zmm28 + +# CHECK: vbmacxor16x16x16 ymm6, ymm5, ymmword ptr [rcx] +# CHECK: encoding: [0x62,0xf6,0xd4,0x28,0x80,0x31] + vbmacxor16x16x16 ymm6, ymm5, ymmword ptr [rcx] + +# CHECK: vbmacxor16x16x16 zmm6, zmm5, zmmword ptr [rax + 8*r14 + 291] +# CHECK: encoding: [0x62,0xb6,0xd4,0x48,0x80,0xb4,0xf0,0x23,0x01,0x00,0x00] + vbmacxor16x16x16 zmm6, zmm5, zmmword ptr [rax + 8*r14 + 291] + +# CHECK: vbitrevb xmm6, xmm5 +# 
CHECK: encoding: [0x62,0xf6,0x7c,0x08,0x81,0xf5] + vbitrevb xmm6, xmm5 + +# CHECK: vbitrevb ymm6, ymm5 +# CHECK: encoding: [0x62,0xf6,0x7c,0x28,0x81,0xf5] + vbitrevb ymm6, ymm5 + +# CHECK: vbitrevb zmm6, zmm5 +# CHECK: encoding: [0x62,0xf6,0x7c,0x48,0x81,0xf5] + vbitrevb zmm6, zmm5 + +# CHECK: vbitrevb zmm30, zmm29 +# CHECK: encoding: [0x62,0x06,0x7c,0x48,0x81,0xf5] + vbitrevb zmm30, zmm29 + +# CHECK: vbitrevb zmm6 {k7}, zmm5 +# CHECK: encoding: [0x62,0xf6,0x7c,0x4f,0x81,0xf5] + vbitrevb zmm6 {k7}, zmm5 + +# CHECK: vbitrevb zmm6 {k7} {z}, zmm5 +# CHECK: encoding: [0x62,0xf6,0x7c,0xcf,0x81,0xf5] + vbitrevb zmm6 {k7} {z}, zmm5 + +# CHECK: vbitrevb xmm6, xmmword ptr [rcx] +# CHECK: encoding: [0x62,0xf6,0x7c,0x08,0x81,0x31] + vbitrevb xmm6, xmmword ptr [rcx] + +# CHECK: vbitrevb ymm6, ymmword ptr [rcx] +# CHECK: encoding: [0x62,0xf6,0x7c,0x28,0x81,0x31] + vbitrevb ymm6, ymmword ptr [rcx] + +# CHECK: vbitrevb zmm6, zmmword ptr [rcx] +# CHECK: encoding: [0x62,0xf6,0x7c,0x48,0x81,0x31] + vbitrevb zmm6, zmmword ptr [rcx] + +# CHECK: vbitrevb zmm6 {k7}, zmmword ptr [rax + 8*r14 + 291] +# CHECK: encoding: [0x62,0xb6,0x7c,0x4f,0x81,0xb4,0xf0,0x23,0x01,0x00,0x00] + vbitrevb zmm6 {k7}, zmmword ptr [rax + 8*r14 + 291] diff --git a/llvm/test/TableGen/x86-fold-tables.inc b/llvm/test/TableGen/x86-fold-tables.inc index b99b6ef60eac75..07466b31e7bb7d 100644 --- a/llvm/test/TableGen/x86-fold-tables.inc +++ b/llvm/test/TableGen/x86-fold-tables.inc @@ -1635,6 +1635,9 @@ static const X86FoldTableEntry Table1[] = { {X86::VPABSWZ256rr, X86::VPABSWZ256rm, 0}, {X86::VPABSWZrr, X86::VPABSWZrm, 0}, {X86::VPABSWrr, X86::VPABSWrm, 0}, + {X86::VPBITREVBZ128rr, X86::VPBITREVBZ128rm, 0}, + {X86::VPBITREVBZ256rr, X86::VPBITREVBZ256rm, 0}, + {X86::VPBITREVBZrr, X86::VPBITREVBZrm, 0}, {X86::VPBROADCASTBYrr, X86::VPBROADCASTBYrm, TB_NO_REVERSE}, {X86::VPBROADCASTBZ128rr, X86::VPBROADCASTBZ128rm, TB_NO_REVERSE}, {X86::VPBROADCASTBZ256rr, X86::VPBROADCASTBZ256rm, TB_NO_REVERSE}, @@ -3314,6 +3317,9 @@ 
static const X86FoldTableEntry Table2[] = { {X86::VPAVGWZ256rr, X86::VPAVGWZ256rm, 0}, {X86::VPAVGWZrr, X86::VPAVGWZrm, 0}, {X86::VPAVGWrr, X86::VPAVGWrm, 0}, + {X86::VPBITREVBZ128rrkz, X86::VPBITREVBZ128rmkz, 0}, + {X86::VPBITREVBZ256rrkz, X86::VPBITREVBZ256rmkz, 0}, + {X86::VPBITREVBZrrkz, X86::VPBITREVBZrmkz, 0}, {X86::VPBLENDDYrri, X86::VPBLENDDYrmi, 0}, {X86::VPBLENDDrri, X86::VPBLENDDrmi, 0}, {X86::VPBLENDMBZ128rr, X86::VPBLENDMBZ128rm, 0}, @@ -4270,6 +4276,10 @@ static const X86FoldTableEntry Table3[] = { {X86::VBLENDMPSZ128rrk, X86::VBLENDMPSZ128rmk, 0}, {X86::VBLENDMPSZ256rrk, X86::VBLENDMPSZ256rmk, 0}, {X86::VBLENDMPSZrrk, X86::VBLENDMPSZrmk, 0}, + {X86::VBMACORZ256rr, X86::VBMACORZ256rm, 0}, + {X86::VBMACORZrr, X86::VBMACORZrm, 0}, + {X86::VBMACXORZ256rr, X86::VBMACXORZ256rm, 0}, + {X86::VBMACXORZrr, X86::VBMACXORZrm, 0}, {X86::VBROADCASTF32X2Z256rrk, X86::VBROADCASTF32X2Z256rmk, TB_NO_REVERSE}, {X86::VBROADCASTF32X2Zrrk, X86::VBROADCASTF32X2Zrmk, TB_NO_REVERSE}, {X86::VBROADCASTI32X2Z128rrk, X86::VBROADCASTI32X2Z128rmk, TB_NO_REVERSE}, @@ -5288,6 +5298,9 @@ static const X86FoldTableEntry Table3[] = { {X86::VPAVGWZ128rrkz, X86::VPAVGWZ128rmkz, 0}, {X86::VPAVGWZ256rrkz, X86::VPAVGWZ256rmkz, 0}, {X86::VPAVGWZrrkz, X86::VPAVGWZrmkz, 0}, + {X86::VPBITREVBZ128rrk, X86::VPBITREVBZ128rmk, 0}, + {X86::VPBITREVBZ256rrk, X86::VPBITREVBZ256rmk, 0}, + {X86::VPBITREVBZrrk, X86::VPBITREVBZrmk, 0}, {X86::VPBLENDMBZ128rrk, X86::VPBLENDMBZ128rmk, 0}, {X86::VPBLENDMBZ256rrk, X86::VPBLENDMBZ256rmk, 0}, {X86::VPBLENDMBZrrk, X86::VPBLENDMBZrmk, 0}, @@ -6114,6 +6127,14 @@ static const X86FoldTableEntry Table4[] = { {X86::VANDPSZ128rrk, X86::VANDPSZ128rmk, 0}, {X86::VANDPSZ256rrk, X86::VANDPSZ256rmk, 0}, {X86::VANDPSZrrk, X86::VANDPSZrmk, 0}, + {X86::VBMACORZ256rrk, X86::VBMACORZ256rmk, 0}, + {X86::VBMACORZ256rrkz, X86::VBMACORZ256rmkz, 0}, + {X86::VBMACORZrrk, X86::VBMACORZrmk, 0}, + {X86::VBMACORZrrkz, X86::VBMACORZrmkz, 0}, + {X86::VBMACXORZ256rrk, X86::VBMACXORZ256rmk, 0}, 
+ {X86::VBMACXORZ256rrkz, X86::VBMACXORZ256rmkz, 0}, + {X86::VBMACXORZrrk, X86::VBMACXORZrmk, 0}, + {X86::VBMACXORZrrkz, X86::VBMACXORZrmkz, 0}, {X86::VCVT2PH2BF8SZ128rrk, X86::VCVT2PH2BF8SZ128rmk, 0}, {X86::VCVT2PH2BF8SZ256rrk, X86::VCVT2PH2BF8SZ256rmk, 0}, {X86::VCVT2PH2BF8SZrrk, X86::VCVT2PH2BF8SZrmk, 0}, @@ -8678,6 +8699,10 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VBLENDMPSZ128rrk, X86::VBLENDMPSZ128rmbk, TB_BCAST_SS}, {X86::VBLENDMPSZ256rrk, X86::VBLENDMPSZ256rmbk, TB_BCAST_SS}, {X86::VBLENDMPSZrrk, X86::VBLENDMPSZrmbk, TB_BCAST_SS}, + {X86::VBMACORZ256rr, X86::VBMACORZ256rmb, TB_BCAST_W}, + {X86::VBMACORZrr, X86::VBMACORZrmb, TB_BCAST_W}, + {X86::VBMACXORZ256rr, X86::VBMACXORZ256rmb, TB_BCAST_W}, + {X86::VBMACXORZrr, X86::VBMACXORZrmb, TB_BCAST_W}, {X86::VCMPBF16Z128rrik, X86::VCMPBF16Z128rmbik, TB_BCAST_SH}, {X86::VCMPBF16Z256rrik, X86::VCMPBF16Z256rmbik, TB_BCAST_SH}, {X86::VCMPBF16Zrrik, X86::VCMPBF16Zrmbik, TB_BCAST_SH}, @@ -9790,6 +9815,14 @@ static const X86FoldTableEntry BroadcastTable4[] = { {X86::VANDPSZ128rrk, X86::VANDPSZ128rmbk, TB_BCAST_SS}, {X86::VANDPSZ256rrk, X86::VANDPSZ256rmbk, TB_BCAST_SS}, {X86::VANDPSZrrk, X86::VANDPSZrmbk, TB_BCAST_SS}, + {X86::VBMACORZ256rrk, X86::VBMACORZ256rmbk, TB_BCAST_W}, + {X86::VBMACORZ256rrkz, X86::VBMACORZ256rmbkz, TB_BCAST_W}, + {X86::VBMACORZrrk, X86::VBMACORZrmbk, TB_BCAST_W}, + {X86::VBMACORZrrkz, X86::VBMACORZrmbkz, TB_BCAST_W}, + {X86::VBMACXORZ256rrk, X86::VBMACXORZ256rmbk, TB_BCAST_W}, + {X86::VBMACXORZ256rrkz, X86::VBMACXORZ256rmbkz, TB_BCAST_W}, + {X86::VBMACXORZrrk, X86::VBMACXORZrmbk, TB_BCAST_W}, + {X86::VBMACXORZrrkz, X86::VBMACXORZrmbkz, TB_BCAST_W}, {X86::VCVT2PH2BF8SZ128rrk, X86::VCVT2PH2BF8SZ128rmbk, TB_BCAST_SH}, {X86::VCVT2PH2BF8SZ256rrk, X86::VCVT2PH2BF8SZ256rmbk, TB_BCAST_SH}, {X86::VCVT2PH2BF8SZrrk, X86::VCVT2PH2BF8SZrmbk, TB_BCAST_SH}, diff --git a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn index 
4af4d43d9bfc64..c48ee5887b9401 100644 --- a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn @@ -196,6 +196,8 @@ copy("Headers") { "avx512bf16intrin.h", "avx512bitalgintrin.h", "avx512bwintrin.h", + "avx512bmmintrin.h", + "avx512bmmvlintrin.h", "avx512cdintrin.h", "avx512dqintrin.h", "avx512fintrin.h", _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
