[clang] 939d830 - [X86-64] Support Intel AMX Intrinsic

Xiang1 Zhang via cfe-commits Mon, 06 Jul 2020 19:14:49 -0700

Author: Xiang1 Zhang
Date: 2020-07-07T10:13:40+08:00
New Revision: 939d8309dbd4ee6cf6e9ef3e8ea26df008b006b4


URL: 
https://github.com/llvm/llvm-project/commit/939d8309dbd4ee6cf6e9ef3e8ea26df008b006b4
DIFF: 
https://github.com/llvm/llvm-project/commit/939d8309dbd4ee6cf6e9ef3e8ea26df008b006b4.diff

LOG: [X86-64] Support Intel AMX Intrinsic

INTEL ADVANCED MATRIX EXTENSIONS (AMX).
AMX is a new programming paradigm, it has a set of 2-dimensional registers
(TILES) representing sub-arrays from a larger 2-dimensional memory image and
operate on TILES.

These intrinsics use direct TMM register number as its params.

Spec can be found in Chapter 3 here 
https://software.intel.com/content/www/us/en/develop/download/intel-architecture-instruction-set-extensions-programming-reference.html

Reviewed By: craig.topper

Differential Revision: https://reviews.llvm.org/D83111

Added: 
    clang/lib/Headers/amxintrin.h
    clang/test/CodeGen/AMX/amx.c
    clang/test/CodeGen/AMX/amx_errors.c
    clang/test/CodeGen/AMX/amx_inline_asm.c
    clang/test/Preprocessor/x86_amx_target_features.c
    llvm/test/CodeGen/X86/AMX/amx-bf16-intrinsics.ll
    llvm/test/CodeGen/X86/AMX/amx-int8-intrinsics.ll
    llvm/test/CodeGen/X86/AMX/amx-tile-intrinsics.ll

Modified: 
    clang/docs/ClangCommandLineReference.rst
    clang/include/clang/Basic/BuiltinsX86_64.def
    clang/include/clang/Basic/DiagnosticSemaKinds.td
    clang/include/clang/Driver/Options.td
    clang/include/clang/Sema/Sema.h
    clang/lib/Basic/Targets/X86.cpp
    clang/lib/Basic/Targets/X86.h
    clang/lib/Headers/CMakeLists.txt
    clang/lib/Headers/cpuid.h
    clang/lib/Headers/immintrin.h
    clang/lib/Sema/SemaChecking.cpp
    clang/test/Driver/x86-target-features.c
    llvm/include/llvm/IR/IntrinsicsX86.td
    llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/lib/Target/X86/X86InstrAMX.td

Removed: 
    


################################################################################
diff  --git a/clang/docs/ClangCommandLineReference.rst 
b/clang/docs/ClangCommandLineReference.rst
index 67c341feffbb..672c4ae80e73 100644
--- a/clang/docs/ClangCommandLineReference.rst
+++ b/clang/docs/ClangCommandLineReference.rst
@@ -3127,6 +3127,12 @@ X86
 
 .. option:: -maes, -mno-aes
 
+.. option:: -mamx-bf16, -mno-amx-bf16
+
+.. option:: -mamx-int8, -mno-amx-int8
+
+.. option:: -mamx-tile, -mno-amx-tile
+
 .. option:: -mavx, -mno-avx
 
 .. option:: -mavx2, -mno-avx2

diff  --git a/clang/include/clang/Basic/BuiltinsX86_64.def 
b/clang/include/clang/Basic/BuiltinsX86_64.def
index c535f43203e5..7feccd2a81a0 100644
--- a/clang/include/clang/Basic/BuiltinsX86_64.def
+++ b/clang/include/clang/Basic/BuiltinsX86_64.def
@@ -101,6 +101,22 @@ TARGET_BUILTIN(__builtin_ia32_cvtsi2ss64, "V4fV4fOiIi", 
"ncV:128:", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_cvtusi2sd64, "V2dV2dUOiIi", "ncV:128:", 
"avx512f")
 TARGET_BUILTIN(__builtin_ia32_cvtusi2ss64, "V4fV4fUOiIi", "ncV:128:", 
"avx512f")
 TARGET_BUILTIN(__builtin_ia32_directstore_u64, "vULi*ULi", "n", "movdiri")
+
+// AMX
+TARGET_BUILTIN(__builtin_ia32_tile_loadconfig, "vvC*", "n", "amx-tile")
+TARGET_BUILTIN(__builtin_ia32_tile_storeconfig, "vvC*", "n", "amx-tile")
+TARGET_BUILTIN(__builtin_ia32_tilerelease, "v", "n", "amx-tile")
+TARGET_BUILTIN(__builtin_ia32_tilezero, "vUc", "n", "amx-tile")
+
+TARGET_BUILTIN(__builtin_ia32_tileloadd64, "vIUcvC*z", "n", "amx-tile")
+TARGET_BUILTIN(__builtin_ia32_tileloaddt164, "vIUcvC*z", "n", "amx-tile")
+TARGET_BUILTIN(__builtin_ia32_tilestored64, "vIUcv*z", "n", "amx-tile")
+
+TARGET_BUILTIN(__builtin_ia32_tdpbssd, "vIUcIUcIUc", "n", "amx-int8")
+TARGET_BUILTIN(__builtin_ia32_tdpbsud, "vIUcIUcIUc", "n", "amx-int8")
+TARGET_BUILTIN(__builtin_ia32_tdpbusd, "vIUcIUcIUc", "n", "amx-int8")
+TARGET_BUILTIN(__builtin_ia32_tdpbuud, "vIUcIUcIUc", "n", "amx-int8")
+TARGET_BUILTIN(__builtin_ia32_tdpbf16ps, "vIUcIUcIUc", "n", "amx-bf16")
 TARGET_BUILTIN(__builtin_ia32_ptwrite64, "vUOi", "n", "ptwrite")
 
 #undef BUILTIN

diff  --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td 
b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 5b94aa8c4325..c935545610e0 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -9342,6 +9342,8 @@ def err_x86_builtin_invalid_rounding : Error<
   "invalid rounding argument">;
 def err_x86_builtin_invalid_scale : Error<
   "scale argument must be 1, 2, 4, or 8">;
+def err_x86_builtin_tile_arg_duplicate : Error<
+  "tile arguments must refer to 
diff erent tiles">;
 
 def err_builtin_target_unsupported : Error<
   "builtin is not supported on this target">;

diff  --git a/clang/include/clang/Driver/Options.td 
b/clang/include/clang/Driver/Options.td
index 50d18343f7d4..745c696bcaa3 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -3065,6 +3065,12 @@ def m3dnow : Flag<["-"], "m3dnow">, 
Group<m_x86_Features_Group>;
 def mno_3dnow : Flag<["-"], "mno-3dnow">, Group<m_x86_Features_Group>;
 def m3dnowa : Flag<["-"], "m3dnowa">, Group<m_x86_Features_Group>;
 def mno_3dnowa : Flag<["-"], "mno-3dnowa">, Group<m_x86_Features_Group>;
+def mamx_bf16 : Flag<["-"], "mamx-bf16">, Group<m_x86_Features_Group>;
+def mno_amx_bf16 : Flag<["-"], "mno-amx-bf16">, Group<m_x86_Features_Group>;
+def mtamx_int8 : Flag<["-"], "mamx-int8">, Group<m_x86_Features_Group>;
+def mno_amx_int8 : Flag<["-"], "mno-amx-int8">, Group<m_x86_Features_Group>;
+def mamx_tile : Flag<["-"], "mamx-tile">, Group<m_x86_Features_Group>;
+def mno_amx_tile : Flag<["-"], "mno-amx-tile">, Group<m_x86_Features_Group>;
 def msse : Flag<["-"], "msse">, Group<m_x86_Features_Group>;
 def mno_sse : Flag<["-"], "mno-sse">, Group<m_x86_Features_Group>;
 def msse2 : Flag<["-"], "msse2">, Group<m_x86_Features_Group>;

diff  --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index 9b82d2c984be..8ee7dd74712d 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -12142,6 +12142,13 @@ class Sema final {
   bool CheckSystemZBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall);
   bool CheckX86BuiltinRoundingOrSAE(unsigned BuiltinID, CallExpr *TheCall);
   bool CheckX86BuiltinGatherScatterScale(unsigned BuiltinID, CallExpr 
*TheCall);
+  bool CheckX86BuiltinTileArguments(unsigned BuiltinID, CallExpr *TheCall);
+  bool CheckX86BuiltinTileArgumentsRange(CallExpr *TheCall,
+                                         ArrayRef<int> ArgNums);
+  bool CheckX86BuiltinTileArgumentsRange(CallExpr *TheCall, int ArgNum);
+  bool CheckX86BuiltinTileDuplicate(CallExpr *TheCall, ArrayRef<int> ArgNums);
+  bool CheckX86BuiltinTileRangeAndDuplicate(CallExpr *TheCall,
+                                            ArrayRef<int> ArgNums);
   bool CheckX86BuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
                                    CallExpr *TheCall);
   bool CheckPPCBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,

diff  --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
index 2c6742b9042a..ed62848d8070 100644
--- a/clang/lib/Basic/Targets/X86.cpp
+++ b/clang/lib/Basic/Targets/X86.cpp
@@ -62,6 +62,7 @@ static const char *const GCCRegNames[] = {
     "cr0",   "cr2",   "cr3",   "cr4",   "cr8",
     "dr0",   "dr1",   "dr2",   "dr3",   "dr6",     "dr7",
     "bnd0",  "bnd1",  "bnd2",  "bnd3",
+    "tmm0",  "tmm1",  "tmm2",  "tmm3",  "tmm4",    "tmm5",  "tmm6",  "tmm7",
 };
 
 const TargetInfo::AddlRegName AddlRegNames[] = {
@@ -394,7 +395,10 @@ void 
X86TargetInfo::setFeatureEnabledImpl(llvm::StringMap<bool> &Features,
   } else if (Name == "xsaveopt" || Name == "xsavec" || Name == "xsaves") {
     if (Enabled)
       Features["xsave"] = true;
-  }
+  } else if (Name == "amx-tile" && !Enabled) {
+    Features["amx-bf16"] = Features["amx-int8"] = false;
+  } else if ((Name == "amx-bf16" || Name == "amx-int8") && Enabled)
+    Features["amx-tile"] = true;
 }
 
 /// handleTargetFeatures - Perform initialization based on the user
@@ -529,6 +533,12 @@ bool 
X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
       HasINVPCID = true;
     } else if (Feature == "+enqcmd") {
       HasENQCMD = true;
+    } else if (Feature == "+amx-bf16") {
+      HasAMXBF16 = true;
+    } else if (Feature == "+amx-int8") {
+      HasAMXINT8 = true;
+    } else if (Feature == "+amx-tile") {
+      HasAMXTILE = true;
     } else if (Feature == "+serialize") {
       HasSERIALIZE = true;
     } else if (Feature == "+tsxldtrk") {
@@ -924,6 +934,12 @@ void X86TargetInfo::getTargetDefines(const LangOptions 
&Opts,
     Builder.defineMacro("__INVPCID__");
   if (HasENQCMD)
     Builder.defineMacro("__ENQCMD__");
+  if (HasAMXTILE)
+    Builder.defineMacro("__AMXTILE__");
+  if (HasAMXINT8)
+    Builder.defineMacro("__AMXINT8__");
+  if (HasAMXBF16)
+    Builder.defineMacro("__AMXBF16__");
   if (HasSERIALIZE)
     Builder.defineMacro("__SERIALIZE__");
   if (HasTSXLDTRK)
@@ -1020,6 +1036,9 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) 
const {
       .Case("3dnowa", true)
       .Case("adx", true)
       .Case("aes", true)
+      .Case("amx-bf16", true)
+      .Case("amx-int8", true)
+      .Case("amx-tile", true)
       .Case("avx", true)
       .Case("avx2", true)
       .Case("avx512f", true)
@@ -1102,6 +1121,9 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const {
   return llvm::StringSwitch<bool>(Feature)
       .Case("adx", HasADX)
       .Case("aes", HasAES)
+      .Case("amx-bf16", HasAMXBF16)
+      .Case("amx-int8", HasAMXINT8)
+      .Case("amx-tile", HasAMXTILE)
       .Case("avx", SSELevel >= AVX)
       .Case("avx2", SSELevel >= AVX2)
       .Case("avx512f", SSELevel >= AVX512F)

diff  --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h
index c33c608e27c8..623ac9474b5c 100644
--- a/clang/lib/Basic/Targets/X86.h
+++ b/clang/lib/Basic/Targets/X86.h
@@ -125,6 +125,9 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public 
TargetInfo {
   bool HasPTWRITE = false;
   bool HasINVPCID = false;
   bool HasENQCMD = false;
+  bool HasAMXTILE = false;
+  bool HasAMXINT8 = false;
+  bool HasAMXBF16 = false;
   bool HasSERIALIZE = false;
   bool HasTSXLDTRK = false;
 

diff  --git a/clang/lib/Headers/CMakeLists.txt 
b/clang/lib/Headers/CMakeLists.txt
index fd9e3a0d672f..e7bee192d918 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -2,6 +2,7 @@ set(files
   adxintrin.h
   altivec.h
   ammintrin.h
+  amxintrin.h
   arm_acle.h
   arm_cmse.h
   armintr.h

diff  --git a/clang/lib/Headers/amxintrin.h b/clang/lib/Headers/amxintrin.h
new file mode 100644
index 000000000000..58254e21c81a
--- /dev/null
+++ b/clang/lib/Headers/amxintrin.h
@@ -0,0 +1,225 @@
+/*===--------------- amxintrin.h - AMX intrinsics -*- C/C++ 
-*---------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ 
*===------------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <amxintrin.h> directly; include <immintrin.h> instead."
+#endif /* __IMMINTRIN_H */
+
+#ifndef __AMXINTRIN_H
+#define __AMXINTRIN_H
+#ifdef __x86_64__
+
+#define __DEFAULT_FN_ATTRS \
+  __attribute__((__always_inline__, __nodebug__,  __target__("amx-tile")))
+
+/// Load tile configuration from a 64-byte memory location specified by
+/// "mem_addr". The tile configuration includes the tile type palette, the
+/// number of bytes per row, and the number of rows. If the specified
+/// palette_id is zero, that signifies the init state for both the tile
+/// config and the tile data, and the tiles are zeroed. Any invalid
+/// configurations will result in #GP fault.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> LDTILECFG </c> instruction.
+///
+/// \param __config
+///    A pointer to 512-bits configuration
+static __inline__ void __DEFAULT_FN_ATTRS
+_tile_loadconfig(const void *__config)
+{
+  __builtin_ia32_tile_loadconfig(__config);
+}
+
+/// Stores the current tile configuration to a 64-byte memory location
+/// specified by "mem_addr". The tile configuration includes the tile type
+/// palette, the number of bytes per row, and the number of rows. If tiles
+/// are not configured, all zeroes will be stored to memory.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> STTILECFG </c> instruction.
+///
+/// \param __config
+///    A pointer to 512-bits configuration
+static __inline__ void __DEFAULT_FN_ATTRS
+_tile_storeconfig(void *__config)
+{
+  __builtin_ia32_tile_storeconfig(__config);
+}
+
+/// Release the tile configuration to return to the init state, which
+/// releases all storage it currently holds.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> TILERELEASE </c> instruction.
+static __inline__ void __DEFAULT_FN_ATTRS
+_tile_release(void)
+{
+  __builtin_ia32_tilerelease();
+}
+
+/// Load tile rows from memory specifieid by "base" address and "stride" into
+/// destination tile "dst" using the tile configuration previously configured
+/// via "_tile_loadconfig".
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
+///
+/// \param dst
+///    A destination tile. Max size is 1024 Bytes.
+/// \param base
+///    A pointer to base address.
+/// \param stride
+///    The stride between the rows' data to be loaded in memory.
+#define _tile_loadd(dst, base, stride) \
+  __builtin_ia32_tileloadd64((dst), ((const void *)(base)), 
(__SIZE_TYPE__)(stride))
+
+/// Load tile rows from memory specifieid by "base" address and "stride" into
+/// destination tile "dst" using the tile configuration previously configured
+/// via "_tile_loadconfig". This intrinsic provides a hint to the 
implementation
+/// that the data will likely not be reused in the near future and the data
+/// caching can be optimized accordingly.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
+///
+/// \param dst
+///    A destination tile. Max size is 1024 Bytes.
+/// \param base
+///    A pointer to base address.
+/// \param stride
+///    The stride between the rows' data to be loaded in memory.
+#define _tile_stream_loadd(dst, base, stride) \
+  __builtin_ia32_tileloaddt164((dst), ((const void *)(base)), 
(__SIZE_TYPE__)(stride))
+
+/// Store the tile specified by "src" to memory specifieid by "base" address 
and
+/// "stride" using the tile configuration previously configured via
+/// "_tile_loadconfig".
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
+///
+/// \param dst
+///    A destination tile. Max size is 1024 Bytes.
+/// \param base
+///    A pointer to base address.
+/// \param stride
+///    The stride between the rows' data to be stored in memory.
+#define _tile_stored(dst, base, stride) \
+  __builtin_ia32_tilestored64((dst), ((void *)(base)), (__SIZE_TYPE__)(stride))
+
+/// Zero the tile specified by "tdest".
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
+///
+/// \param tile
+///    The destination tile to be zero. Max size is 1024 Bytes.
+#define _tile_zero(tile) __builtin_ia32_tilezero((tile))
+
+/// Compute dot-product of bytes in tiles with a source/destination 
accumulator.
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
+/// corresponding signed 8-bit integers in src1, producing 4 intermediate 
32-bit
+/// results. Sum these 4 results with the corresponding 32-bit integer in 
"dst",
+/// and store the 32-bit result back to tile "dst".
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param src0
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+///    The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_dpbssd(dst, src0, src1) __builtin_ia32_tdpbssd((dst), (src0), 
(src1))
+
+/// Compute dot-product of bytes in tiles with a source/destination 
accumulator.
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
+/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
+/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
+/// in "dst", and store the 32-bit result back to tile "dst".
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param src0
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+///    The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_dpbsud(dst, src0, src1) __builtin_ia32_tdpbsud((dst), (src0), 
(src1))
+
+/// Compute dot-product of bytes in tiles with a source/destination 
accumulator.
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
+/// corresponding signed 8-bit integers in src1, producing 4 intermediate 
32-bit
+/// results. Sum these 4 results with the corresponding 32-bit integer in 
"dst",
+/// and store the 32-bit result back to tile "dst".
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param src0
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+///    The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_dpbusd(dst, src0, src1) __builtin_ia32_tdpbusd((dst), (src0), 
(src1))
+
+/// Compute dot-product of bytes in tiles with a source/destination 
accumulator.
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
+/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
+/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer 
in
+/// "dst", and store the 32-bit result back to tile "dst".
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param src0
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+///    The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_dpbuud(dst, src0, src1) __builtin_ia32_tdpbuud((dst), (src0), 
(src1))
+
+/// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
+/// src1, accumulating the intermediate single-precision (32-bit) 
floating-point
+/// elements with elements in "dst", and store the 32-bit result back to tile
+/// "dst".
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param src0
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+///    The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_dpbf16ps(dst, src0, src1) \
+  __builtin_ia32_tdpbf16ps((dst), (src0), (src1))
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __x86_64__ */
+#endif /* __AMXINTRIN_H */

diff  --git a/clang/lib/Headers/cpuid.h b/clang/lib/Headers/cpuid.h
index 6c38b578b30e..2a88c042d046 100644
--- a/clang/lib/Headers/cpuid.h
+++ b/clang/lib/Headers/cpuid.h
@@ -190,6 +190,9 @@
 #define bit_TSXLDTRK      0x00010000
 #define bit_PCONFIG       0x00040000
 #define bit_IBT           0x00100000
+#define bit_AMXBF16       0x00400000
+#define bit_AMXTILE       0x01000000
+#define bit_AMXINT8       0x02000000
 
 /* Features in %eax for leaf 7 sub-leaf 1 */
 #define bit_AVX512BF16    0x00000020

diff  --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h
index dd27ca2f6605..e9dff2310fdf 100644
--- a/clang/lib/Headers/immintrin.h
+++ b/clang/lib/Headers/immintrin.h
@@ -471,6 +471,11 @@ _storebe_i64(void * __P, long long __D) {
 #include <invpcidintrin.h>
 #endif
 
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      
\
+    defined(__AMXTILE__) || defined(__AMXINT8__) || defined(__AMXBF16__)
+#include <amxintrin.h>
+#endif
+
 #if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      
\
     defined(__AVX512VP2INTERSECT__)
 #include <avx512vp2intersectintrin.h>

diff  --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 0ce84ea382b5..2b52415b2800 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -88,6 +88,7 @@
 #include "llvm/Support/SaveAndRestore.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
+#include <bitset>
 #include <cassert>
 #include <cstddef>
 #include <cstdint>
@@ -3607,6 +3608,64 @@ bool Sema::CheckX86BuiltinGatherScatterScale(unsigned 
BuiltinID,
          << Arg->getSourceRange();
 }
 
+enum { TileRegLow = 0, TileRegHigh = 7 };
+
+bool Sema::CheckX86BuiltinTileArgumentsRange(CallExpr *TheCall,
+                                    ArrayRef<int> ArgNums) {
+  for (int ArgNum : ArgNums) {
+    if (SemaBuiltinConstantArgRange(TheCall, ArgNum, TileRegLow, TileRegHigh))
+      return true;
+  }
+  return false;
+}
+
+bool Sema::CheckX86BuiltinTileArgumentsRange(CallExpr *TheCall, int ArgNum) {
+  return SemaBuiltinConstantArgRange(TheCall, ArgNum, TileRegLow, TileRegHigh);
+}
+
+bool Sema::CheckX86BuiltinTileDuplicate(CallExpr *TheCall,
+                                        ArrayRef<int> ArgNums) {
+  // Because the max number of tile register is TileRegHigh + 1, so here we use
+  // each bit to represent the usage of them in bitset.
+  std::bitset<TileRegHigh + 1> ArgValues;
+  for (int ArgNum : ArgNums) {
+    llvm::APSInt Arg;
+    SemaBuiltinConstantArg(TheCall, ArgNum, Arg);
+    int ArgExtValue = Arg.getExtValue();
+    assert((ArgExtValue >= TileRegLow || ArgExtValue <= TileRegHigh) &&
+           "Incorrect tile register num.");
+    if (ArgValues.test(ArgExtValue))
+      return Diag(TheCall->getBeginLoc(),
+                  diag::err_x86_builtin_tile_arg_duplicate)
+             << TheCall->getArg(ArgNum)->getSourceRange();
+    ArgValues.set(ArgExtValue);
+  }
+  return false;
+}
+
+bool Sema::CheckX86BuiltinTileRangeAndDuplicate(CallExpr *TheCall,
+                                                ArrayRef<int> ArgNums) {
+  return CheckX86BuiltinTileArgumentsRange(TheCall, ArgNums) ||
+         CheckX86BuiltinTileDuplicate(TheCall, ArgNums);
+}
+
+bool Sema::CheckX86BuiltinTileArguments(unsigned BuiltinID, CallExpr *TheCall) 
{
+  switch (BuiltinID) {
+  default:
+    return false;
+  case X86::BI__builtin_ia32_tileloadd64:
+  case X86::BI__builtin_ia32_tileloaddt164:
+  case X86::BI__builtin_ia32_tilestored64:
+  case X86::BI__builtin_ia32_tilezero:
+    return CheckX86BuiltinTileArgumentsRange(TheCall, 0);
+  case X86::BI__builtin_ia32_tdpbssd:
+  case X86::BI__builtin_ia32_tdpbsud:
+  case X86::BI__builtin_ia32_tdpbusd:
+  case X86::BI__builtin_ia32_tdpbuud:
+  case X86::BI__builtin_ia32_tdpbf16ps:
+    return CheckX86BuiltinTileRangeAndDuplicate(TheCall, {0, 1, 2});
+  }
+}
 static bool isX86_32Builtin(unsigned BuiltinID) {
   // These builtins only work on x86-32 targets.
   switch (BuiltinID) {
@@ -3640,6 +3699,10 @@ bool Sema::CheckX86BuiltinFunctionCall(const TargetInfo 
&TI, unsigned BuiltinID,
   if (CheckX86BuiltinGatherScatterScale(BuiltinID, TheCall))
     return true;
 
+  // If the intrinsic has a tile arguments, make sure they are valid.
+  if (CheckX86BuiltinTileArguments(BuiltinID, TheCall))
+    return true;
+
   // For intrinsics which take an immediate value as part of the instruction,
   // range check them here.
   int i = 0, l = 0, u = 0;

diff  --git a/clang/test/CodeGen/AMX/amx.c b/clang/test/CodeGen/AMX/amx.c
new file mode 100644
index 000000000000..89b486f7a601
--- /dev/null
+++ b/clang/test/CodeGen/AMX/amx.c
@@ -0,0 +1,32 @@
+// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown  
-target-feature +amx-int8  \
+// RUN: -target-feature +amx-bf16 -emit-llvm -o - -Werror -pedantic | 
FileCheck %s --check-prefixes=CHECK
+
+#include <immintrin.h>
+
+void test_amx(void *data) {
+  //CHECK-LABEL: @test_amx
+  //CHECK: call void @llvm.x86.ldtilecfg(i8* %{{.*}})
+  //CHECK: call void @llvm.x86.sttilecfg(i8* %{{.*}})
+  //CHECK: call void @llvm.x86.tilerelease()
+  //CHECK: call void @llvm.x86.tilezero(i8 3)
+  //CHECK: call void @llvm.x86.tileloadd64(i8 4, i8* %{{.*}}, i64 8)
+  //CHECK: call void @llvm.x86.tileloaddt164(i8 0, i8* %{{.*}}, i64 1)
+  //CHECK: call void @llvm.x86.tilestored64(i8 0, i8* %{{.*}}, i64 1)
+  //CHECK: call void @llvm.x86.tdpbssd(i8 1, i8 2, i8 3)
+  //CHECK: call void @llvm.x86.tdpbsud(i8 1, i8 2, i8 3)
+  //CHECK: call void @llvm.x86.tdpbusd(i8 1, i8 2, i8 3)
+  //CHECK: call void @llvm.x86.tdpbuud(i8 1, i8 2, i8 3)
+  //CHECK: call void @llvm.x86.tdpbf16ps(i8 1, i8 2, i8 3)
+  _tile_loadconfig(data);
+  _tile_storeconfig(data);
+  _tile_release();
+  _tile_zero(3);
+  _tile_loadd(4, data, 8);
+  _tile_stream_loadd(0, data, 1);
+  _tile_stored(0, data, 1);
+  _tile_dpbssd(1, 2, 3);
+  _tile_dpbsud(1, 2, 3);
+  _tile_dpbusd(1, 2, 3);
+  _tile_dpbuud(1, 2, 3);
+  _tile_dpbf16ps(1, 2, 3);
+}

diff  --git a/clang/test/CodeGen/AMX/amx_errors.c 
b/clang/test/CodeGen/AMX/amx_errors.c
new file mode 100644
index 000000000000..13a2b33b5a0a
--- /dev/null
+++ b/clang/test/CodeGen/AMX/amx_errors.c
@@ -0,0 +1,17 @@
+// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown 
-target-feature +amx-tile -target-feature +amx-int8 -target-feature +amx-bf16 
-emit-llvm -fsyntax-only -verify
+
+#include <immintrin.h>
+
+void test_amx(void *data) {
+  _tile_zero(16); // expected-error {{argument value 16 is outside the valid 
range [0, 7]}}
+  _tile_loadd(19, data, 16); // expected-error {{argument value 19 is outside 
the valid range [0, 7]}}
+  _tile_stream_loadd(23, data, 1); // expected-error {{argument value 23 is 
outside the valid range [0, 7]}}
+  _tile_stored(88, data, 1); // expected-error {{argument value 88 is outside 
the valid range [0, 7]}}
+  _tile_dpbssd(16, 2, 3); // expected-error {{argument value 16 is outside the 
valid range [0, 7]}}
+  _tile_dpbssd(0, 16, 3); // expected-error {{argument value 16 is outside the 
valid range [0, 7]}}
+  _tile_dpbuud(0, 2, 16); // expected-error {{argument value 16 is outside the 
valid range [0, 7]}}
+  _tile_dpbsud(1, 1, 3); // expected-error {{tile arguments must refer to 
diff erent tiles}}
+  _tile_dpbsud(7, 1, 7); // expected-error {{tile arguments must refer to 
diff erent tiles}}
+  _tile_dpbsud(4, 3, 3); // expected-error {{tile arguments must refer to 
diff erent tiles}}
+  _tile_dpbf16ps(4, 3, 3); // expected-error {{tile arguments must refer to 
diff erent tiles}}
+}

diff  --git a/clang/test/CodeGen/AMX/amx_inline_asm.c 
b/clang/test/CodeGen/AMX/amx_inline_asm.c
new file mode 100644
index 000000000000..9d828f8ac94e
--- /dev/null
+++ b/clang/test/CodeGen/AMX/amx_inline_asm.c
@@ -0,0 +1,11 @@
+// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown  
-target-feature +amx-int8 -target-feature +amx-bf16 -emit-llvm -o - -Wall 
-Werror -pedantic | FileCheck %s --check-prefixes=CHECK,X86_64
+
+void f_tilemul(short a)
+{
+  //CHECK:  call void asm sideeffect "tileloadd 0(%rsi,%r13,4), %tmm0   
\0A\09tileloadd 0(%rdx,%r14,4), %tmm6   \0A\09tdpbf16ps %tmm6, %tmm0, %tmm7    
\0A\09tilestored %tmm7, 0(%r12,%r15,4) \0A\09", 
"~{memory},~{tmm0},~{tmm6},~{tmm7},~{dirflag},~{fpsr},~{flags}"()
+  __asm__ volatile ("tileloadd 0(%%rsi,%%r13,4), %%tmm0   \n\t"
+                    "tileloadd 0(%%rdx,%%r14,4), %%tmm6   \n\t"
+                    "tdpbf16ps %%tmm6, %%tmm0, %%tmm7    \n\t"
+                    "tilestored %%tmm7, 0(%%r12,%%r15,4) \n\t"
+          ::: "memory", "tmm0", "tmm6", "tmm7");
+}

diff  --git a/clang/test/Driver/x86-target-features.c 
b/clang/test/Driver/x86-target-features.c
index b96eed287bd9..817caeecd71e 100644
--- a/clang/test/Driver/x86-target-features.c
+++ b/clang/test/Driver/x86-target-features.c
@@ -232,3 +232,18 @@
 // RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-tsxldtrk %s 
-### -o %t.o 2>&1 | FileCheck --check-prefix=NO-TSXLDTRK %s
 // TSXLDTRK: "-target-feature" "+tsxldtrk"
 // NO-TSXLDTRK: "-target-feature" "-tsxldtrk"
+
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mamx-tile %s -### 
-o %t.o 2>&1 | FileCheck --check-prefix=AMX-TILE %s
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-amx-tile %s 
-### -o %t.o 2>&1 | FileCheck --check-prefix=NO-AMX-TILE %s
+// AMX-TILE: "-target-feature" "+amx-tile"
+// NO-AMX-TILE: "-target-feature" "-amx-tile"
+
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mamx-bf16 %s -### 
-o %t.o 2>&1 | FileCheck --check-prefix=AMX-BF16 %s
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-amx-bf16 %s 
-### -o %t.o 2>&1 | FileCheck -check-prefix=NO-AMX-BF16 %s
+// AMX-BF16: "-target-feature" "+amx-bf16"
+// NO-AMX-BF16: "-target-feature" "-amx-bf16"
+
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mamx-int8 %s -### 
-o %t.o 2>&1 | FileCheck --check-prefix=AMX-INT8 %s
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-amx-int8 %s 
-### -o %t.o 2>&1 | FileCheck --check-prefix=NO-AMX-INT8 %s
+// AMX-INT8: "-target-feature" "+amx-int8"
+// NO-AMX-INT8: "-target-feature" "-amx-int8"

diff  --git a/clang/test/Preprocessor/x86_amx_target_features.c 
b/clang/test/Preprocessor/x86_amx_target_features.c
new file mode 100644
index 000000000000..68a3d7f950b1
--- /dev/null
+++ b/clang/test/Preprocessor/x86_amx_target_features.c
@@ -0,0 +1,35 @@
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mamx-tile -x c -E 
-dM -o - %s | FileCheck  -check-prefix=AMX-TILE %s
+
+// AMX-TILE: #define __AMXTILE__ 1
+
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mamx-bf16 -x c -E 
-dM -o - %s | FileCheck -check-prefix=AMX-BF16 %s
+
+// AMX-BF16: #define __AMXBF16__ 1
+// AMX-BF16: #define __AMXTILE__ 1
+
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mamx-int8 -x c -E 
-dM -o - %s | FileCheck -check-prefix=AMX-INT8 %s
+
+// AMX-INT8: #define __AMXINT8__ 1
+// AMX-INT8: #define __AMXTILE__ 1
+
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-amx-tile -x c 
-E -dM -o - %s | FileCheck  -check-prefix=NOAMX-TILE %s
+
+// NOAMX-TILE-NOT: #define __AMXTILE__ 1
+
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-amx-bf16 -x c 
-E -dM -o - %s | FileCheck  -check-prefix=NOAMX-BF16 %s
+
+// NOAMX-BF16-NOT: #define __AMXBF16__ 1
+
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -amx-bf16 
-mno-amx-tile -x c -E -dM -o - %s | FileCheck  -check-prefix=NOAMX-BF16 %s
+
+// NOAMX-BF16-NOT: #define __AMXTILE__ 1
+// NOAMX-BF16-NOT: #define __AMXBF16__ 1
+
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-amx-int8 -x c 
-E -dM -o - %s | FileCheck  -check-prefix=NOAMX-INT8 %s
+
+// NOAMX-INT8-NOT: #define __AMXINT8__ 1
+
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -amx-int8 
-mno-amx-tile -x c -E -dM -o - %s | FileCheck  -check-prefix=NOAMX-INT8 %s
+
+// NOAMX-INT8-NOT: #define __AMXTILE__ 1
+// NOAMX-INT8-NOT: #define __AMXINT8__ 1

diff  --git a/llvm/include/llvm/IR/IntrinsicsX86.td 
b/llvm/include/llvm/IR/IntrinsicsX86.td
index b3bf18720595..3f86fd075d3a 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -4948,3 +4948,32 @@ let TargetPrefix = "x86" in {
   def int_x86_xresldtrk : GCCBuiltin<"__builtin_ia32_xresldtrk">,
               Intrinsic<[], [], []>;
 }
+//===----------------------------------------------------------------------===//
+// AMX - Intel AMX extensions
+
+let TargetPrefix = "x86" in {
+  def int_x86_ldtilecfg : GCCBuiltin<"__builtin_ia32_tile_loadconfig">,
+              Intrinsic<[], [llvm_ptr_ty], []>;
+  def int_x86_sttilecfg : GCCBuiltin<"__builtin_ia32_tile_storeconfig">,
+              Intrinsic<[], [llvm_ptr_ty], []>;
+  def int_x86_tilerelease : GCCBuiltin<"__builtin_ia32_tilerelease">,
+              Intrinsic<[], [], []>;
+  def int_x86_tilezero : GCCBuiltin<"__builtin_ia32_tilezero">,
+              Intrinsic<[], [llvm_i8_ty], []>;
+  def int_x86_tileloadd64 : GCCBuiltin<"__builtin_ia32_tileloadd64">,
+              Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty], []>;
+  def int_x86_tileloaddt164 : GCCBuiltin<"__builtin_ia32_tileloaddt164">,
+              Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty], []>;
+  def int_x86_tilestored64 : GCCBuiltin<"__builtin_ia32_tilestored64">,
+              Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty], []>;
+  def int_x86_tdpbssd : GCCBuiltin<"__builtin_ia32_tdpbssd">,
+              Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], []>;
+  def int_x86_tdpbsud : GCCBuiltin<"__builtin_ia32_tdpbsud">,
+              Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], []>;
+  def int_x86_tdpbusd : GCCBuiltin<"__builtin_ia32_tdpbusd">,
+              Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], []>;
+  def int_x86_tdpbuud : GCCBuiltin<"__builtin_ia32_tdpbuud">,
+              Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], []>;
+  def int_x86_tdpbf16ps : GCCBuiltin<"__builtin_ia32_tdpbf16ps">,
+              Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], []>;
+}

diff  --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp 
b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 5a57ca7646ff..fb285376c580 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -4435,8 +4435,39 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
 
       break;
     }
+    case Intrinsic::x86_tileloadd64:
+    case Intrinsic::x86_tileloaddt164:
+    case Intrinsic::x86_tilestored64: {
+      if (!Subtarget->hasAMXTILE())
+        break;
+      unsigned Opc;
+      switch (IntNo) {
+      default: llvm_unreachable("Unexpected intrinsic!");
+      case Intrinsic::x86_tileloadd64:   Opc = X86::PTILELOADD; break;
+      case Intrinsic::x86_tileloaddt164: Opc = X86::PTILELOADDT1; break;
+      case Intrinsic::x86_tilestored64:  Opc = X86::PTILESTORED; break;
+      }
+      // FIXME: Match displacement and scale.
+      unsigned TIndex = Node->getConstantOperandVal(2);
+      SDValue TReg = getI8Imm(TIndex, dl);
+      SDValue Base = Node->getOperand(3);
+      SDValue Scale = getI8Imm(1, dl);
+      SDValue Index = Node->getOperand(4);
+      SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
+      SDValue Segment = CurDAG->getRegister(0, MVT::i16);
+      SDValue Chain = Node->getOperand(0);
+      MachineSDNode *CNode;
+      if (Opc == X86::PTILESTORED) {
+        SDValue Ops[] = { Base, Scale, Index, Disp, Segment, TReg, Chain };
+        CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
+      } else {
+        SDValue Ops[] = { TReg, Base, Scale, Index, Disp, Segment, Chain };
+        CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
+      }
+      ReplaceNode(Node, CNode);
+      return;
+    }
     }
-
     break;
   }
   case ISD::BRIND: {

diff  --git a/llvm/lib/Target/X86/X86ISelLowering.cpp 
b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 88a563720c2a..d7a45f6fb7c4 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -33044,6 +33044,10 @@ 
X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   DebugLoc DL = MI.getDebugLoc();
 
+  auto TMMImmToTMMReg = [](unsigned Imm) {
+    assert (Imm < 8 && "Illegal tmm index");
+    return X86::TMM0 + Imm;
+  };
   switch (MI.getOpcode()) {
   default: llvm_unreachable("Unexpected instr type to insert");
   case X86::TLS_addr32:
@@ -33326,6 +33330,67 @@ 
X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     MI.eraseFromParent();
     return BB;
   }
+  case X86::PTDPBSSD:
+  case X86::PTDPBSUD:
+  case X86::PTDPBUSD:
+  case X86::PTDPBUUD:
+  case X86::PTDPBF16PS: {
+    const DebugLoc &DL = MI.getDebugLoc();
+    unsigned Opc;
+    switch (MI.getOpcode()) {
+    case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
+    case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
+    case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
+    case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
+    case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
+    }
+
+    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
+    MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
+    MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
+    MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
+    MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
+
+    MI.eraseFromParent(); // The pseudo is gone now.
+    return BB;
+  }
+  case X86::PTILEZERO: {
+    const DebugLoc &DL = MI.getDebugLoc();
+    unsigned Imm = MI.getOperand(0).getImm();
+    BuildMI(*BB, MI, DL, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
+    MI.eraseFromParent(); // The pseudo is gone now.
+    return BB;
+  }
+  case X86::PTILELOADD:
+  case X86::PTILELOADDT1:
+  case X86::PTILESTORED: {
+    const DebugLoc &DL = MI.getDebugLoc();
+    unsigned Opc;
+    switch (MI.getOpcode()) {
+    case X86::PTILELOADD:   Opc = X86::TILELOADD;   break;
+    case X86::PTILELOADDT1: Opc = X86::TILELOADDT1; break;
+    case X86::PTILESTORED:  Opc = X86::TILESTORED;  break;
+    }
+
+    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
+    unsigned CurOp = 0;
+    if (Opc != X86::TILESTORED)
+      MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
+                 RegState::Define);
+
+    MIB.add(MI.getOperand(CurOp++)); // base
+    MIB.add(MI.getOperand(CurOp++)); // scale
+    MIB.add(MI.getOperand(CurOp++)); // index -- stride
+    MIB.add(MI.getOperand(CurOp++)); // displacement
+    MIB.add(MI.getOperand(CurOp++)); // segment
+
+    if (Opc == X86::TILESTORED)
+      MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
+                 RegState::Undef);
+
+    MI.eraseFromParent(); // The pseudo is gone now.
+    return BB;
+  }
   }
 }
 

diff  --git a/llvm/lib/Target/X86/X86InstrAMX.td 
b/llvm/lib/Target/X86/X86InstrAMX.td
index deefb3eecf39..e26dd5050a23 100644
--- a/llvm/lib/Target/X86/X86InstrAMX.td
+++ b/llvm/lib/Target/X86/X86InstrAMX.td
@@ -18,9 +18,11 @@ let Predicates = [HasAMXTILE, In64BitMode] in {
   let SchedRW = [WriteSystem] in {
     let Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in
     def LDTILECFG : I <0x49, MRM0m, (outs), (ins opaquemem:$src),
-                       "ldtilecfg\t$src", []>, VEX, T8PS;
+                       "ldtilecfg\t$src",
+                       [(int_x86_ldtilecfg addr:$src)]>, VEX, T8PS;
     def STTILECFG : I <0x49, MRM0m, (outs), (ins opaquemem:$src),
-                       "sttilecfg\t$src", []>, VEX, T8PD;
+                       "sttilecfg\t$src",
+                       [(int_x86_sttilecfg addr:$src)]>, VEX, T8PD;
     def TILELOADD : I<0x4b, MRMSrcMemFSIB, (outs TILE:$dst),
                       (ins sibmem:$src),
                       "tileloadd\t{$src, $dst|$dst, $src}", []>,
@@ -31,7 +33,7 @@ let Predicates = [HasAMXTILE, In64BitMode] in {
                         VEX, T8PD;
     let Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in
     def TILERELEASE : I<0x49, MRM_C0, (outs), (ins),
-                        "tilerelease", []>, VEX, T8PS;
+                        "tilerelease", [(int_x86_tilerelease)]>, VEX, T8PS;
     def TILESTORED : I<0x4b, MRMDestMemFSIB, (outs),
                        (ins sibmem:$dst, TILE:$src),
                        "tilestored\t{$src, $dst|$dst, $src}", []>,
@@ -39,6 +41,17 @@ let Predicates = [HasAMXTILE, In64BitMode] in {
     def TILEZERO : I<0x49, MRMr0, (outs TILE:$dst), (ins),
                      "tilezero\t$dst", []>,
                      VEX, T8XD;
+
+    let usesCustomInserter = 1 in {
+      // Pseudo instructions, using immediates instead of tile registers.
+      // To be translated to the actual instructions in X86ISelLowering.cpp
+      def PTILELOADD : PseudoI<(outs), (ins u8imm:$src1, sibmem:$src2), []>;
+      def PTILELOADDT1 : PseudoI<(outs), (ins u8imm:$src1,
+                                          sibmem:$src2), []>;
+      def PTILESTORED : PseudoI<(outs), (ins i8mem:$dst, u8imm:$src), []>;
+      def PTILEZERO : PseudoI<(outs), (ins u8imm:$src),
+                              [(int_x86_tilezero imm:$src)]>;
+    }
   } // SchedRW
 } // HasAMXTILE
 
@@ -62,6 +75,27 @@ let Predicates = [HasAMXINT8, In64BitMode] in {
                       "tdpbuud\t{$src3, $src2, $dst|$dst, $src2, $src3}", []>,
                       VEX_4V, T8PS;
     }
+
+    let usesCustomInserter = 1 in {
+      // Pseudo instructions, using immediates instead of tile registers.
+      // To be translated to the actual instructions in X86ISelLowering.cpp
+      def PTDPBSSD : PseudoI<(outs), (ins u8imm:$src1,
+                             u8imm:$src2, u8imm:$src3),
+                             [(int_x86_tdpbssd imm:$src1,
+                               imm:$src2, imm:$src3)]>;
+      def PTDPBSUD : PseudoI<(outs), (ins u8imm:$src1,
+                             u8imm:$src2, u8imm:$src3),
+                             [(int_x86_tdpbsud imm:$src1,
+                               imm:$src2, imm:$src3)]>;
+      def PTDPBUSD : PseudoI<(outs), (ins u8imm:$src1,
+                             u8imm:$src2, u8imm:$src3),
+                             [(int_x86_tdpbusd imm:$src1,
+                               imm:$src2, imm:$src3)]>;
+      def PTDPBUUD : PseudoI<(outs), (ins u8imm:$src1,
+                             u8imm:$src2, u8imm:$src3),
+                             [(int_x86_tdpbuud imm:$src1,
+                               imm:$src2, imm:$src3)]>;
+    }
   }
 } // HasAMXTILE
 
@@ -72,5 +106,14 @@ let Predicates = [HasAMXBF16, In64BitMode] in {
                       (ins TILE:$src1, TILE:$src2, TILE:$src3),
                       "tdpbf16ps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                       []>, VEX_4V, T8XS;
+
+    let usesCustomInserter = 1 in {
+      // Pseudo instructions, using immediates instead of tile registers.
+      // To be translated to the actual instructions in X86ISelLowering.cpp
+      def PTDPBF16PS : PseudoI<(outs), (ins u8imm:$src1,
+                               u8imm:$src2, u8imm:$src3),
+                               [(int_x86_tdpbf16ps imm:$src1,
+                                 imm:$src2, imm:$src3)]>;
+    }
   }
 } // HasAMXTILE, HasAMXBF16

diff  --git a/llvm/test/CodeGen/X86/AMX/amx-bf16-intrinsics.ll 
b/llvm/test/CodeGen/X86/AMX/amx-bf16-intrinsics.ll
new file mode 100644
index 000000000000..a415d9c15242
--- /dev/null
+++ b/llvm/test/CodeGen/X86/AMX/amx-bf16-intrinsics.ll
@@ -0,0 +1,13 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-tile 
-mattr=+amx-bf16 -verify-machineinstrs | FileCheck %s
+
+define void @test_amx() {
+; CHECK-LABEL: test_amx:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    tdpbf16ps %tmm7, %tmm4, %tmm3
+; CHECK-NEXT:    retq
+  call void @llvm.x86.tdpbf16ps(i8 3, i8 4, i8 7)
+  ret void
+}
+
+declare void @llvm.x86.tdpbf16ps(i8 %tile0, i8 %tile1, i8 %tile2)

diff  --git a/llvm/test/CodeGen/X86/AMX/amx-int8-intrinsics.ll 
b/llvm/test/CodeGen/X86/AMX/amx-int8-intrinsics.ll
new file mode 100644
index 000000000000..49e69aeab510
--- /dev/null
+++ b/llvm/test/CodeGen/X86/AMX/amx-int8-intrinsics.ll
@@ -0,0 +1,24 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 
-verify-machineinstrs | FileCheck %s
+
+define void @test_amx() {
+; CHECK-LABEL: test_amx:
+; CHECK:       # %bb.0:
+  call void @llvm.x86.tdpbssd(i8 3, i8 4, i8 7)
+; CHECK-NEXT:    tdpbssd %tmm7, %tmm4, %tmm3
+
+  call void @llvm.x86.tdpbsud(i8 3, i8 4, i8 7)
+; CHECK-NEXT:    tdpbsud %tmm7, %tmm4, %tmm3
+
+  call void @llvm.x86.tdpbusd(i8 3, i8 0, i8 7)
+; CHECK-NEXT:    tdpbusd %tmm7, %tmm0, %tmm3
+
+  call void @llvm.x86.tdpbuud(i8 3, i8 4, i8 1)
+; CHECK-NEXT:    tdpbuud %tmm1, %tmm4, %tmm3
+  ret void
+}
+
+declare void @llvm.x86.tdpbssd(i8 %tile0, i8 %tile1, i8 %tile2)
+declare void @llvm.x86.tdpbsud(i8 %tile0, i8 %tile1, i8 %tile2)
+declare void @llvm.x86.tdpbusd(i8 %tile0, i8 %tile1, i8 %tile2)
+declare void @llvm.x86.tdpbuud(i8 %tile0, i8 %tile1, i8 %tile2)

diff  --git a/llvm/test/CodeGen/X86/AMX/amx-tile-intrinsics.ll 
b/llvm/test/CodeGen/X86/AMX/amx-tile-intrinsics.ll
new file mode 100644
index 000000000000..6b8e040abb9a
--- /dev/null
+++ b/llvm/test/CodeGen/X86/AMX/amx-tile-intrinsics.ll
@@ -0,0 +1,36 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-tile 
-verify-machineinstrs | FileCheck %s
+
+define void @test_amx(i8* %pointer, i8* %base, i64 %stride) {
+; CHECK-LABEL: test_amx:
+; CHECK:       # %bb.0:
+  call void @llvm.x86.ldtilecfg(i8* %pointer)
+; CHECK-NEXT:    ldtilecfg (%rdi)
+
+  call void @llvm.x86.sttilecfg(i8* %pointer)
+; CHECK-NEXT:    sttilecfg (%rdi)
+
+  call void @llvm.x86.tilerelease()
+; CHECK-NEXT:    tilerelease
+
+  call void @llvm.x86.tilezero(i8 3)
+; CHECK-NEXT:    tilezero %tmm3
+
+  call void @llvm.x86.tileloadd64(i8 3, i8* %base, i64 %stride)
+; CHECK-NEXT:    tileloadd (%rsi,%rdx), %tmm3
+
+  call void @llvm.x86.tileloaddt164(i8 3, i8* %base, i64 %stride)
+; CHECK-NEXT:    tileloaddt1 (%rsi,%rdx), %tmm3
+
+  call void @llvm.x86.tilestored64(i8 3, i8* %base, i64 %stride)
+; CHECK-NEXT:    tilestored %tmm3, (%rsi,%rdx)
+  ret void
+}
+
+declare void @llvm.x86.tileloadd64(i8 %tile, i8* %base, i64 %stride)
+declare void @llvm.x86.tileloaddt164(i8 %tile, i8* %base, i64 %stride)
+declare void @llvm.x86.tilestored64(i8 %tile, i8* %base, i64 %stride)
+declare void @llvm.x86.ldtilecfg(i8* %pointer)
+declare void @llvm.x86.sttilecfg(i8* %pointer)
+declare void @llvm.x86.tilerelease()
+declare void @llvm.x86.tilezero(i8 %tile)


        
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] 939d830 - [X86-64] Support Intel AMX Intrinsic

Reply via email to