[PATCH] D79448: [X86] Allow Yz inline assembly constraint to choose ymm0 or zmm0 when avx/avx512 are enabled and type is 256 or 512 bits

2020-05-05 Thread Craig Topper via Phabricator via cfe-commits
This revision was automatically updated to reflect the committed changes.
Closed by commit rG0fac1c191281: [X86] Allow Yz inline assembly constraint to 
choose ymm0 or zmm0 when… (authored by craig.topper).
Herald added a project: clang.

Changed prior to commit:
  https://reviews.llvm.org/D79448?vs=262233&id=262289#toc

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D79448/new/

https://reviews.llvm.org/D79448

Files:
  clang/lib/Basic/Targets/X86.cpp
  clang/test/CodeGen/x86-inline-asm-v-constraint.c
  llvm/lib/Target/X86/X86ISelLowering.cpp
  llvm/test/CodeGen/X86/inline-asm-avx-v-constraint.ll
  llvm/test/CodeGen/X86/inline-asm-avx512f-v-constraint.ll

Index: llvm/test/CodeGen/X86/inline-asm-avx512f-v-constraint.ll
===
--- llvm/test/CodeGen/X86/inline-asm-avx512f-v-constraint.ll
+++ llvm/test/CodeGen/X86/inline-asm-avx512f-v-constraint.ll
@@ -70,3 +70,12 @@
   ret <16 x float> %0
 }
 
+define <16 x float> @testZMM0() {
+entry:
+; CHECK: vpternlogd $255, %zmm0, %zmm0, %zmm0
+  %zmm0 = alloca <16 x float>, align 64
+  %0 = call <16 x float> asm "vpternlogd $$255, $0, $0, $0", "=^Yz,~{dirflag},~{fpsr},~{flags}"()
+  store <16 x float> %0, <16 x float>* %zmm0, align 64
+  %1 = load <16 x float>, <16 x float>* %zmm0, align 64
+  ret <16 x float> %1
+}
Index: llvm/test/CodeGen/X86/inline-asm-avx-v-constraint.ll
===
--- llvm/test/CodeGen/X86/inline-asm-avx-v-constraint.ll
+++ llvm/test/CodeGen/X86/inline-asm-avx-v-constraint.ll
@@ -134,3 +134,13 @@
   ret <8 x float> %0
 }
 
+define <8 x float> @testYMM0() {
+; CHECK: vpcmpeqd %ymm0, %ymm0, %ymm0
+entry:
+  %ymm0 = alloca <8 x float>, align 32
+  %0 = call <8 x float> asm "vpcmpeqd $0, $0, $0", "=^Yz,~{dirflag},~{fpsr},~{flags}"()
+  store <8 x float> %0, <8 x float>* %ymm0, align 32
+  %1 = load <8 x float>, <8 x float>* %ymm0, align 32
+  ret <8 x float> %1
+}
+
Index: llvm/lib/Target/X86/X86ISelLowering.cpp
===
--- llvm/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -48046,7 +48046,9 @@
       // XMM0
       case 'z':
       case '0':
-        if ((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1())
+        if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
+            ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
+            ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
           return CW_SpecificReg;
         return CW_Invalid;
       // Conditional OpMask regs (AVX512)
@@ -48496,6 +48498,8 @@
         if (Subtarget.hasAVX())
           return std::make_pair(0U, &X86::VR256RegClass);
         break;
+      case MVT::v64i8:
+      case MVT::v32i16:
       case MVT::v8f64:
       case MVT::v16f32:
       case MVT::v16i32:
@@ -48521,7 +48525,42 @@
     case 'z':
     case '0':
       if (!Subtarget.hasSSE1()) break;
-      return std::make_pair(X86::XMM0, &X86::VR128RegClass);
+      switch (VT.SimpleTy) {
+      default: break;
+      // Scalar SSE types.
+      case MVT::f32:
+      case MVT::i32:
+        return std::make_pair(X86::XMM0, &X86::FR32RegClass);
+      case MVT::f64:
+      case MVT::i64:
+        return std::make_pair(X86::XMM0, &X86::FR64RegClass);
+      case MVT::f128:
+      case MVT::v16i8:
+      case MVT::v8i16:
+      case MVT::v4i32:
+      case MVT::v2i64:
+      case MVT::v4f32:
+      case MVT::v2f64:
+        return std::make_pair(X86::XMM0, &X86::VR128RegClass);
+      // AVX types.
+      case MVT::v32i8:
+      case MVT::v16i16:
+      case MVT::v8i32:
+      case MVT::v4i64:
+      case MVT::v8f32:
+      case MVT::v4f64:
+        if (Subtarget.hasAVX())
+          return std::make_pair(X86::YMM0, &X86::VR256RegClass);
+        break;
+      case MVT::v8f64:
+      case MVT::v16f32:
+      case MVT::v16i32:
+      case MVT::v8i64:
+        if (Subtarget.hasAVX512())
+          return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
+        break;
+      }
+      break;
     case 'k':
       // This register class doesn't allocate k0 for masked vector operation.
       if (Subtarget.hasAVX512()) {
Index: clang/test/CodeGen/x86-inline-asm-v-constraint.c
===
--- clang/test/CodeGen/x86-inline-asm-v-constraint.c
+++ clang/test/CodeGen/x86-inline-asm-v-constraint.c
@@ -28,3 +28,28 @@
 #endif
   return _zmm0;
 }
+
+// SSE: call <4 x float> asm "pcmpeqd $0, $0", "=^Yz,~{dirflag},~{fpsr},~{flags}"()
+__m128 testXMM0(void) {
+  __m128 xmm0;
+  __asm__("pcmpeqd %0, %0" :"=Yz"(xmm0));
+  return xmm0;
+}
+
+// AVX: call <8 x float> asm "vpcmpeqd $0, $0, $0", "=^Yz,~{dirflag},~{fpsr},~{flags}"()
+__m256 testYMM0(void) {
+  __m256 ymm0;
+#ifdef AVX
+  __asm__("vpcmpeqd %0, %0, %0" :"=Yz"(ymm0));
+#endif
+  return ymm0;
+}
+
+// AVX512: call <16 x float> 

[PATCH] D79448: [X86] Allow Yz inline assembly constraint to choose ymm0 or zmm0 when avx/avx512 are enabled and type is 256 or 512 bits

2020-05-05 Thread Craig Topper via Phabricator via cfe-commits
craig.topper created this revision.
craig.topper added reviewers: RKSimon, spatel, rnk, echristo.
Herald added a subscriber: hiraditya.
Herald added a project: LLVM.

GCC supports selecting ymm0/zmm0 for the Yz constraint when it is used with
256-bit or 512-bit vector types.

Fixes PR45806
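
As a point of reference, a small standalone C example (not part of the patch;
the function and variable names are made up for illustration) of the
user-facing behaviour this enables: built with -mavx, a 256-bit "Yz" output is
now tied to %ymm0, mirroring the testYMM0 case added to the clang test.

  #include <immintrin.h>

  /* With this change, a 256-bit operand constrained with "Yz" is assigned to
     %ymm0 under -mavx; previously only 128-bit operands were accepted. */
  __m256 all_ones_in_ymm0(void) {
    __m256 v;
    __asm__("vpcmpeqd %0, %0, %0" : "=Yz"(v));
    return v;
  }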


https://reviews.llvm.org/D79448

Files:
  clang/lib/Basic/Targets/X86.cpp
  clang/test/CodeGen/x86-inline-asm-v-constraint.c
  llvm/lib/Target/X86/X86ISelLowering.cpp
  llvm/test/CodeGen/X86/inline-asm-avx-v-constraint.ll
  llvm/test/CodeGen/X86/inline-asm-avx512f-v-constraint.ll

Index: llvm/test/CodeGen/X86/inline-asm-avx512f-v-constraint.ll
===
--- llvm/test/CodeGen/X86/inline-asm-avx512f-v-constraint.ll
+++ llvm/test/CodeGen/X86/inline-asm-avx512f-v-constraint.ll
@@ -70,3 +70,12 @@
   ret <16 x float> %0
 }
 
+define <16 x float> @testZMM0() {
+entry:
+; CHECK: vpternlogd $255, %zmm0, %zmm0, %zmm0
+  %zmm0 = alloca <16 x float>, align 64
+  %0 = call <16 x float> asm "vpternlogd $$255, $0, $0, $0", "=^Yz,~{dirflag},~{fpsr},~{flags}"()
+  store <16 x float> %0, <16 x float>* %zmm0, align 64
+  %1 = load <16 x float>, <16 x float>* %zmm0, align 64
+  ret <16 x float> %1
+}
Index: llvm/test/CodeGen/X86/inline-asm-avx-v-constraint.ll
===
--- llvm/test/CodeGen/X86/inline-asm-avx-v-constraint.ll
+++ llvm/test/CodeGen/X86/inline-asm-avx-v-constraint.ll
@@ -134,3 +134,13 @@
   ret <8 x float> %0
 }
 
+define <8 x float> @testYMM0() {
+; CHECK: vpcmpeqd %ymm0, %ymm0, %ymm0
+entry:
+  %ymm0 = alloca <8 x float>, align 32
+  %0 = call <8 x float> asm "vpcmpeqd $0, $0, $0", "=^Yz,~{dirflag},~{fpsr},~{flags}"()
+  store <8 x float> %0, <8 x float>* %ymm0, align 32
+  %1 = load <8 x float>, <8 x float>* %ymm0, align 32
+  ret <8 x float> %1
+}
+
Index: llvm/lib/Target/X86/X86ISelLowering.cpp
===
--- llvm/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -48041,7 +48041,9 @@
       // XMM0
       case 'z':
       case '0':
-        if ((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1())
+        if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
+            ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
+            ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
           return CW_SpecificReg;
         return CW_Invalid;
       // Conditional OpMask regs (AVX512)
@@ -48491,6 +48493,8 @@
         if (Subtarget.hasAVX())
           return std::make_pair(0U, &X86::VR256RegClass);
         break;
+      case MVT::v64i8:
+      case MVT::v32i16:
       case MVT::v8f64:
       case MVT::v16f32:
       case MVT::v16i32:
@@ -48516,7 +48520,42 @@
     case 'z':
     case '0':
       if (!Subtarget.hasSSE1()) break;
-      return std::make_pair(X86::XMM0, &X86::VR128RegClass);
+      switch (VT.SimpleTy) {
+      default: break;
+      // Scalar SSE types.
+      case MVT::f32:
+      case MVT::i32:
+        return std::make_pair(X86::XMM0, &X86::FR32RegClass);
+      case MVT::f64:
+      case MVT::i64:
+        return std::make_pair(X86::XMM0, &X86::FR64RegClass);
+      case MVT::f128:
+      case MVT::v16i8:
+      case MVT::v8i16:
+      case MVT::v4i32:
+      case MVT::v2i64:
+      case MVT::v4f32:
+      case MVT::v2f64:
+        return std::make_pair(X86::XMM0, &X86::VR128RegClass);
+      // AVX types.
+      case MVT::v32i8:
+      case MVT::v16i16:
+      case MVT::v8i32:
+      case MVT::v4i64:
+      case MVT::v8f32:
+      case MVT::v4f64:
+        if (Subtarget.hasAVX())
+          return std::make_pair(X86::YMM0, &X86::VR256RegClass);
+        break;
+      case MVT::v8f64:
+      case MVT::v16f32:
+      case MVT::v16i32:
+      case MVT::v8i64:
+        if (Subtarget.hasAVX512())
+          return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
+        break;
+      }
+      break;
     case 'k':
       // This register class doesn't allocate k0 for masked vector operation.
       if (Subtarget.hasAVX512()) {
Index: clang/test/CodeGen/x86-inline-asm-v-constraint.c
===
--- clang/test/CodeGen/x86-inline-asm-v-constraint.c
+++ clang/test/CodeGen/x86-inline-asm-v-constraint.c
@@ -28,3 +28,28 @@
 #endif
   return _zmm0;
 }
+
+// SSE: call <4 x float> asm "cmpeqd $0, $0", "=^Yz,~{dirflag},~{fpsr},~{flags}"()
+__m128 testXMM0(void) {
+  __m128 xmm0;
+  __asm__("pcmpeqd %0, %0" :"=Yz"(xmm0));
+  return xmm0;
+}
+
+// AVX: call <8 x float> asm "vcmpeqd $0, $0, $0", "=^Yz,~{dirflag},~{fpsr},~{flags}"()
+__m256 testYMM0(void) {
+  __m256 ymm0;
+#ifdef AVX
+  __asm__("vpcmpeqd %0, %0, %0" :"=Yz"(ymm0));
+#endif
+  return ymm0;
+}
+
+// AVX512: call <16 x float> asm "vpternlogd $$255, $0, $0, $0", "=^Yz,~{dirflag},~{fpsr},~{flags}"()
+__m512 testZMM0(void) {
+  __m512 zmm0;
+#ifdef AVX512
+