[PATCH] D144911: adding bf16 support to NVPTX

2023-06-28 Thread Benjamin Kramer via Phabricator via cfe-commits
bkramer accepted this revision.
bkramer added a comment.
This revision is now accepted and ready to land.

I don't have much knowledge about the PTX ABI, but everything here makes 
perfect sense to me.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D144911/new/

https://reviews.llvm.org/D144911

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D144911: adding bf16 support to NVPTX

2023-06-27 Thread Artem Belevich via Phabricator via cfe-commits
tra added a comment.

@bkramer Ben, PTAL when you get a chance.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D144911/new/

https://reviews.llvm.org/D144911

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D144911: adding bf16 support to NVPTX

2023-06-23 Thread Artem Belevich via Phabricator via cfe-commits
tra planned changes to this revision.
tra added a comment.

We're still missing clang-side tests for the new builtins.
Now that the intrinsics use `bfloat` we also need to change builtin signatures. 
Or change codegen to bitcast to/from bfloat to match the types.
To be continued next week.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D144911/new/

https://reviews.llvm.org/D144911

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D144911: adding bf16 support to NVPTX

2023-06-23 Thread Artem Belevich via Phabricator via cfe-commits
tra updated this revision to Diff 534130.
tra added a comment.

Fixed few missed places in bf16 lowering.
Changed intrinsic types to use bfloat type.
Auto-upgrade the old intrinsic variants.
Updated broken tests.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D144911/new/

https://reviews.llvm.org/D144911

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  llvm/include/llvm/IR/IntrinsicsNVVM.td
  llvm/lib/IR/AutoUpgrade.cpp
  llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
  llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
  llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp
  llvm/lib/Target/NVPTX/NVPTXMCExpr.h
  llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
  llvm/lib/Target/NVPTX/NVPTXSubtarget.h
  llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
  llvm/test/CodeGen/NVPTX/bf16-instructions.ll
  llvm/test/CodeGen/NVPTX/convert-sm80.ll
  llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70-autoupgrade.ll
  llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70.ll
  llvm/test/CodeGen/NVPTX/math-intrins-sm86-ptx72-autoupgrade.ll
  llvm/test/CodeGen/NVPTX/math-intrins-sm86-ptx72.ll

Index: llvm/test/CodeGen/NVPTX/math-intrins-sm86-ptx72.ll
===
--- llvm/test/CodeGen/NVPTX/math-intrins-sm86-ptx72.ll
+++ llvm/test/CodeGen/NVPTX/math-intrins-sm86-ptx72.ll
@@ -9,10 +9,10 @@
 declare <2 x half> @llvm.nvvm.fmin.ftz.xorsign.abs.f16x2(<2 x half> , <2 x half>)
 declare <2 x half> @llvm.nvvm.fmin.nan.xorsign.abs.f16x2(<2 x half> , <2 x half>)
 declare <2 x half> @llvm.nvvm.fmin.ftz.nan.xorsign.abs.f16x2(<2 x half> , <2 x half>)
-declare i16 @llvm.nvvm.fmin.xorsign.abs.bf16(i16, i16)
-declare i16 @llvm.nvvm.fmin.nan.xorsign.abs.bf16(i16, i16)
-declare i32 @llvm.nvvm.fmin.xorsign.abs.bf16x2(i32, i32)
-declare i32 @llvm.nvvm.fmin.nan.xorsign.abs.bf16x2(i32, i32)
+declare bfloat @llvm.nvvm.fmin.xorsign.abs.bf16(bfloat, bfloat)
+declare bfloat @llvm.nvvm.fmin.nan.xorsign.abs.bf16(bfloat, bfloat)
+declare <2 x bfloat> @llvm.nvvm.fmin.xorsign.abs.bf16x2(<2 x bfloat>, <2 x bfloat>)
+declare <2 x bfloat> @llvm.nvvm.fmin.nan.xorsign.abs.bf16x2(<2 x bfloat>, <2 x bfloat>)
 declare float @llvm.nvvm.fmin.xorsign.abs.f(float, float)
 declare float @llvm.nvvm.fmin.ftz.xorsign.abs.f(float, float)
 declare float @llvm.nvvm.fmin.nan.xorsign.abs.f(float, float)
@@ -26,10 +26,10 @@
 declare <2 x half> @llvm.nvvm.fmax.ftz.xorsign.abs.f16x2(<2 x half> , <2 x half>)
 declare <2 x half> @llvm.nvvm.fmax.nan.xorsign.abs.f16x2(<2 x half> , <2 x half>)
 declare <2 x half> @llvm.nvvm.fmax.ftz.nan.xorsign.abs.f16x2(<2 x half> , <2 x half>)
-declare i16 @llvm.nvvm.fmax.xorsign.abs.bf16(i16, i16)
-declare i16 @llvm.nvvm.fmax.nan.xorsign.abs.bf16(i16, i16)
-declare i32 @llvm.nvvm.fmax.xorsign.abs.bf16x2(i32, i32)
-declare i32 @llvm.nvvm.fmax.nan.xorsign.abs.bf16x2(i32, i32)
+declare bfloat @llvm.nvvm.fmax.xorsign.abs.bf16(bfloat, bfloat)
+declare bfloat @llvm.nvvm.fmax.nan.xorsign.abs.bf16(bfloat, bfloat)
+declare <2 x bfloat> @llvm.nvvm.fmax.xorsign.abs.bf16x2(<2 x bfloat>, <2 x bfloat>)
+declare <2 x bfloat> @llvm.nvvm.fmax.nan.xorsign.abs.bf16x2(<2 x bfloat>, <2 x bfloat>)
 declare float @llvm.nvvm.fmax.xorsign.abs.f(float, float)
 declare float @llvm.nvvm.fmax.ftz.xorsign.abs.f(float, float)
 declare float @llvm.nvvm.fmax.nan.xorsign.abs.f(float, float)
@@ -100,35 +100,35 @@
 }
 
 ; CHECK-LABEL: fmin_xorsign_abs_bf16
-define i16 @fmin_xorsign_abs_bf16(i16 %0, i16 %1) {
+define bfloat @fmin_xorsign_abs_bf16(bfloat %0, bfloat %1) {
   ; CHECK-NOT: call
   ; CHECK: min.xorsign.abs.bf16
-  %res = call i16 @llvm.nvvm.fmin.xorsign.abs.bf16(i16 %0, i16 %1)
-  ret i16 %res
+  %res = call bfloat @llvm.nvvm.fmin.xorsign.abs.bf16(bfloat %0, bfloat %1)
+  ret bfloat %res
 }
 
 ; CHECK-LABEL: fmin_nan_xorsign_abs_bf16
-define i16 @fmin_nan_xorsign_abs_bf16(i16 %0, i16 %1) {
+define bfloat @fmin_nan_xorsign_abs_bf16(bfloat %0, bfloat %1) {
   ; CHECK-NOT: call
   ; CHECK: min.NaN.xorsign.abs.bf16
-  %res = call i16 @llvm.nvvm.fmin.nan.xorsign.abs.bf16(i16 %0, i16 %1)
-  ret i16 %res
+  %res = call bfloat @llvm.nvvm.fmin.nan.xorsign.abs.bf16(bfloat %0, bfloat %1)
+  ret bfloat %res
 }
 
 ; CHECK-LABEL: fmin_xorsign_abs_bf16x2
-define i32 @fmin_xorsign_abs_bf16x2(i32 %0, i32 %1) {
+define <2 x bfloat> @fmin_xorsign_abs_bf16x2(<2 x bfloat> %0, <2 x bfloat> %1) {
   ; CHECK-NOT: call
   ; CHECK: min.xorsign.abs.bf16x2
-  %res = call i32 @llvm.nvvm.fmin.xorsign.abs.bf16x2(i32 %0, i32 %1)
-  ret i32 %res
+  %res = call <2 x bfloat> @llvm.nvvm.fmin.xorsign.abs.bf16x2(<2 x bfloat> %0, <2 x bfloat> %1)
+  ret <2 x bfloat> %res
 }
 
 ; CHECK-LABEL: fmin_nan_xorsign_abs_bf16x2
-define i32 @fmin_nan_xorsign_abs_bf16x2(i32 %0, i32 %1) {
+define <2 x bfloat> @fmin_nan_xorsign_abs_bf16x2(<2 x bfloat> %0, <2 x bfloat> %1) {
   ; C

[PATCH] D144911: adding bf16 support to NVPTX

2023-06-23 Thread Artem Belevich via Phabricator via cfe-commits
tra commandeered this revision.
tra edited reviewers, added: kushanam; removed: tra.
tra added a comment.
This revision now requires review to proceed.
Herald added a subscriber: bixia.

I've got a few more fixes for the patch.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D144911/new/

https://reviews.llvm.org/D144911

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D144911: adding bf16 support to NVPTX

2023-06-23 Thread Kushan Ahmadian via Phabricator via cfe-commits
kushanam updated this revision to Diff 534050.
kushanam added a comment.

fixing the f16x2-instructions test


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D144911/new/

https://reviews.llvm.org/D144911

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  llvm/include/llvm/IR/IntrinsicsNVVM.td
  llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
  llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
  llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp
  llvm/lib/Target/NVPTX/NVPTXMCExpr.h
  llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
  llvm/lib/Target/NVPTX/NVPTXSubtarget.h
  llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
  llvm/test/CodeGen/NVPTX/bf16-instructions.ll
  llvm/test/CodeGen/NVPTX/f16x2-instructions.ll

Index: llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
===
--- llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
+++ llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
@@ -31,7 +31,9 @@
 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
 
 ; CHECK-LABEL: test_ret_const(
-; CHECK: mov.b32 [[R:%r[0-9+]]], 1073757184;
+; CHECK: mov.b16 [[A:%rs[0-9+]]], 0x4000;
+; CHECK: mov.b16 [[B:%rs[0-9+]]], 0x3C00;
+; CHECK: mov.b32 [[R:%r[0-9+]]], {[[B]], [[A]]};
 ; CHECK: st.param.b32[func_retval0+0], [[R]];
 ; CHECK-NEXT: ret;
 define <2 x half> @test_ret_const() #0 {
@@ -100,8 +102,10 @@
 ; CHECK-LABEL: test_fadd_imm_0(
 ; CHECK-DAG:  ld.param.b32[[A:%r[0-9]+]], [test_fadd_imm_0_param_0];
 ;
-; CHECK-F16:mov.b32[[I:%r[0-9+]]], 1073757184;
-; CHECK-F16:add.rn.f16x2   [[R:%r[0-9]+]], [[A]], [[I]];
+; CHECK-F16:mov.b16[[B:%rs[0-9+]]], 0x4000;
+; CHECK-F16:mov.b16[[C:%rs[0-9+]]], 0x3C00;
+; CHECK-F16:mov.b32[[D:%r[0-9+]]], {[[C]], [[B]]};
+; CHECK-F16:add.rn.f16x2   [[R:%r[0-9]+]], [[A]], [[D]];
 ;
 ; CHECK-NOF16-DAG:  mov.b32{[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16[[FA0:%f[0-9]+]], [[A0]]
@@ -122,8 +126,10 @@
 ; CHECK-LABEL: test_fadd_imm_1(
 ; CHECK-DAG:  ld.param.b32[[B:%r[0-9]+]], [test_fadd_imm_1_param_0];
 ;
-; CHECK-F16:mov.b32[[I:%r[0-9+]]], 1073757184;
-; CHECK-F16:add.rn.f16x2   [[R:%r[0-9]+]], [[B]], [[I]];
+; CHECK-F16:mov.b16[[B:%rs[0-9+]]], 0x4000;
+; CHECK-F16:mov.b16[[C:%rs[0-9+]]], 0x3C00;
+; CHECK-F16:mov.b32[[D:%r[0-9+]]], {[[C]], [[B]]};
+; CHECK-F16:add.rn.f16x2   [[R:%r[0-9]+]], [[A]], [[D]];
 ;
 ; CHECK-NOF16-DAG:  mov.b32{[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16[[FA0:%f[0-9]+]], [[A0]]
@@ -169,8 +175,10 @@
 ; CHECK-LABEL: test_fneg(
 ; CHECK-DAG:  ld.param.b32[[A:%r[0-9]+]], [test_fneg_param_0];
 ;
-; CHECK-F16:mov.b32[[I:%r[0-9+]]], 0;
-; CHECK-F16-NEXT:   sub.rn.f16x2   [[R:%r[0-9]+]], [[I]], [[A]];
+; CHECK-F16:mov.b16[[B:%rs[0-9+]]], 0x;
+; CHECK-F16:mov.b32[[C:%r[0-9+]]], {[[B]], [[B]]};
+;
+; CHECK-F16-NEXT:   sub.rn.f16x2   [[R:%r[0-9]+]], [[C]], [[A]];
 ;
 ; CHECK-NOF16-DAG:  mov.b32{[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
 ; CHECK-NOF16-DAG:  cvt.f32.f16[[FA0:%f[0-9]+]], [[A0]]
Index: llvm/test/CodeGen/NVPTX/bf16-instructions.ll
===
--- /dev/null
+++ llvm/test/CodeGen/NVPTX/bf16-instructions.ll
@@ -0,0 +1,142 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s
+; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
+
+; LDST: .b8 bfloat_array[8] = {1, 2, 3, 4, 5, 6, 7, 8};
+@"bfloat_array" = addrspace(1) constant [4 x bfloat]
+[bfloat 0xR0201, bfloat 0xR0403, bfloat 0xR0605, bfloat 0xR0807]
+  
+; CHECK-LABEL: test_fadd(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fadd_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fadd_param_1];
+; CHECK-NEXT: add.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fadd(bfloat %0, bfloat %1) {
+  %3 = fadd bfloat %0, %1
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_fsub(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fsub_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fsub_param_1];
+; CHECK-NEXT: sub.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fsub(bfloat %0, bfloat %1) {
+  %3 = fsub bfloat %0, %1
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_faddx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_faddx2_param_0];
+; CHECK-DAG:  

[PATCH] D144911: adding bf16 support to NVPTX

2023-06-23 Thread Artem Belevich via Phabricator via cfe-commits
tra added a comment.

The latest patch revision still fails on a few LLVM tests:

  Failed Tests (3):
LLVM :: CodeGen/NVPTX/bf16-instructions.ll
LLVM :: CodeGen/NVPTX/f16x2-instructions.ll
LLVM :: CodeGen/NVPTX/math-intrins-sm80-ptx70.ll




Comment at: llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp:1308
 return TypeSplitVector;
-  if (VT == MVT::v2f16)
+  if (Isv2f16Orv2bf16Type((EVT)VT))
 return TypeLegal;

I do not think the cast is necessary. 




Comment at: llvm/lib/Target/NVPTX/NVPTXInstrInfo.td:595-596
 FromName, ".f16 \t$dst, $src;"), []>;
+def _bf16 :
+  NVPTXInst<(outs RC:$dst),
+(ins Int16Regs:$src, CvtMode:$mode),

tra wrote:
> While we're here, it also needs `Requires<[hasPTX<70>, hasSM<80>]>`
This is needed in *addition* to whatever predicate is supplied as an argument. 
E.g. when we do `defm CVT_f32 : CVT_FROM_ALL<"f32", Float32Regs>;` conversion 
from f32 to bf16 should still be predicated on `[hasPTX<70>, hasSM<80>]`.



Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D144911/new/

https://reviews.llvm.org/D144911

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D144911: adding bf16 support to NVPTX

2023-06-22 Thread Kushan Ahmadian via Phabricator via cfe-commits
kushanam updated this revision to Diff 533694.
kushanam added a comment.

Updating the bf16 conversion arc requirements


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D144911/new/

https://reviews.llvm.org/D144911

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  llvm/include/llvm/IR/IntrinsicsNVVM.td
  llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
  llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
  llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp
  llvm/lib/Target/NVPTX/NVPTXMCExpr.h
  llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
  llvm/lib/Target/NVPTX/NVPTXSubtarget.h
  llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
  llvm/test/CodeGen/NVPTX/bf16-instructions.ll

Index: llvm/test/CodeGen/NVPTX/bf16-instructions.ll
===
--- /dev/null
+++ llvm/test/CodeGen/NVPTX/bf16-instructions.ll
@@ -0,0 +1,142 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s
+; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
+
+; LDST: .b8 bfloat_array[8] = {1, 2, 3, 4, 5, 6, 7, 8};
+@"bfloat_array" = addrspace(1) constant [4 x bfloat]
+[bfloat 0xR0201, bfloat 0xR0403, bfloat 0xR0605, bfloat 0xR0807]
+  
+; CHECK-LABEL: test_fadd(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fadd_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fadd_param_1];
+; CHECK-NEXT: add.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fadd(bfloat %0, bfloat %1) {
+  %3 = fadd bfloat %0, %1
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_fsub(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fsub_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fsub_param_1];
+; CHECK-NEXT: sub.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fsub(bfloat %0, bfloat %1) {
+  %3 = fsub bfloat %0, %1
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_faddx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_faddx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_faddx2_param_1];
+; CHECK-NEXT: add.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fadd <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fsubx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fsubx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fsubx2_param_1];
+; CHECK-NEXT: sub.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fsub <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fmulx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fmulx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fmulx2_param_1];
+; CHECK-NEXT: mul.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fmul(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fmul <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fdiv(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fdiv_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fdiv_param_1];
+; CHECK-DAG:  mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG:  mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG:  cvt.f32.bf16 [[FA0:%f[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FA1:%f[0-9]+]], [[A1]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB0:%f[0-9]+]], [[B0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB1:%f[0-9]+]], [[B1]];
+; CHECK-DAG:  div.rn.f32  [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-DAG:  div.rn.f32  [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R0:%h[0-9]+]], [[FR0]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R1:%h[0-9]+]], [[FR1]];
+; CHECK-NEXT: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NEXT: st.param.b32[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fdiv <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_extract_0(
+; CHECK:  ld.param.b16[[A:%rs[0-9]+]], [test_extract_0_param_0];
+; CHECK:  st.param.b16[func_retval0+0], [[A]];
+; CHECK:  ret;
+
+define bfloat @test_extract_0(<2 x bfloat> %a) #0 {
+  %e = extractelement <2 x bfloat> %a, i32 0
+  ret bfloat %e
+}
+
+; CHECK-LABEL: test_ext

[PATCH] D144911: adding bf16 support to NVPTX

2023-06-20 Thread Artem Belevich via Phabricator via cfe-commits
tra added inline comments.



Comment at: llvm/lib/Target/NVPTX/NVPTXInstrInfo.td:559-568
-multiclass CVT_FROM_FLOAT_SM80 {
-def _f32 :
-  NVPTXInst<(outs RC:$dst),
-(ins Float32Regs:$src, CvtMode:$mode),
-!strconcat("cvt${mode:base}${mode:relu}.",
-FromName, ".f32 \t$dst, $src;"), []>,
-Requires<[hasPTX<70>, hasSM<80>]>;

This is where cvt.rn.relu.bf16.f32  was used to be generated before.

Now we've replaced it with `CVT_FROM_ALL` which does not know anything about 
`relu`.



Comment at: llvm/lib/Target/NVPTX/NVPTXInstrInfo.td:595-596
 FromName, ".f16 \t$dst, $src;"), []>;
+def _bf16 :
+  NVPTXInst<(outs RC:$dst),
+(ins Int16Regs:$src, CvtMode:$mode),

While we're here, it also needs `Requires<[hasPTX<70>, hasSM<80>]>`



Comment at: llvm/lib/Target/NVPTX/NVPTXInstrInfo.td:601
 def _f32 :
   NVPTXInst<(outs RC:$dst),
 (ins Float32Regs:$src, CvtMode:$mode),

We may add an optional `list` argument  to the multiclass and 
do`defm CVT_bf16<... [hasPTX<70>, hasSM<80>]>`



Comment at: llvm/lib/Target/NVPTX/NVPTXInstrInfo.td:603
 (ins Float32Regs:$src, CvtMode:$mode),
 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
 FromName, ".f32 \t$dst, $src;"), []>;

We also need to augment it with `${mode:relu}` 


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D144911/new/

https://reviews.llvm.org/D144911

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D144911: adding bf16 support to NVPTX

2023-06-20 Thread Kushan Ahmadian via Phabricator via cfe-commits
kushanam updated this revision to Diff 533051.
kushanam added a comment.

fixing bf16 test cases


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D144911/new/

https://reviews.llvm.org/D144911

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  llvm/include/llvm/IR/IntrinsicsNVVM.td
  llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
  llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
  llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp
  llvm/lib/Target/NVPTX/NVPTXMCExpr.h
  llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
  llvm/lib/Target/NVPTX/NVPTXSubtarget.h
  llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
  llvm/test/CodeGen/NVPTX/bf16-instructions.ll

Index: llvm/test/CodeGen/NVPTX/bf16-instructions.ll
===
--- /dev/null
+++ llvm/test/CodeGen/NVPTX/bf16-instructions.ll
@@ -0,0 +1,142 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s
+; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
+
+; LDST: .b8 bfloat_array[8] = {1, 2, 3, 4, 5, 6, 7, 8};
+@"bfloat_array" = addrspace(1) constant [4 x bfloat]
+[bfloat 0xR0201, bfloat 0xR0403, bfloat 0xR0605, bfloat 0xR0807]
+  
+; CHECK-LABEL: test_fadd(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fadd_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fadd_param_1];
+; CHECK-NEXT: add.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fadd(bfloat %0, bfloat %1) {
+  %3 = fadd bfloat %0, %1
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_fsub(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fsub_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fsub_param_1];
+; CHECK-NEXT: sub.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fsub(bfloat %0, bfloat %1) {
+  %3 = fsub bfloat %0, %1
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_faddx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_faddx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_faddx2_param_1];
+; CHECK-NEXT: add.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fadd <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fsubx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fsubx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fsubx2_param_1];
+; CHECK-NEXT: sub.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fsub <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fmulx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fmulx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fmulx2_param_1];
+; CHECK-NEXT: mul.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fmul(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fmul <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fdiv(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fdiv_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fdiv_param_1];
+; CHECK-DAG:  mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG:  mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG:  cvt.f32.bf16 [[FA0:%f[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FA1:%f[0-9]+]], [[A1]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB0:%f[0-9]+]], [[B0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB1:%f[0-9]+]], [[B1]];
+; CHECK-DAG:  div.rn.f32  [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-DAG:  div.rn.f32  [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R0:%h[0-9]+]], [[FR0]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R1:%h[0-9]+]], [[FR1]];
+; CHECK-NEXT: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NEXT: st.param.b32[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fdiv <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_extract_0(
+; CHECK:  ld.param.b16[[A:%rs[0-9]+]], [test_extract_0_param_0];
+; CHECK:  st.param.b16[func_retval0+0], [[A]];
+; CHECK:  ret;
+
+define bfloat @test_extract_0(<2 x bfloat> %a) #0 {
+  %e = extractelement <2 x bfloat> %a, i32 0
+  ret bfloat %e
+}
+
+; CHECK-LABEL: test_extract_1(
+; CHECK:  

[PATCH] D144911: adding bf16 support to NVPTX

2023-06-15 Thread Artem Belevich via Phabricator via cfe-commits
tra added inline comments.



Comment at: llvm/lib/Target/NVPTX/NVPTXIntrinsics.td:1271-1287
-def : Pat<(int_nvvm_ff2f16x2_rn Float32Regs:$a, Float32Regs:$b),
-  (CVT_f16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRN)>;
-def : Pat<(int_nvvm_ff2f16x2_rn_relu Float32Regs:$a, Float32Regs:$b),
-  (CVT_f16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRN_RELU)>;
-def : Pat<(int_nvvm_ff2f16x2_rz Float32Regs:$a, Float32Regs:$b),
-  (CVT_f16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRZ)>;
-def : Pat<(int_nvvm_ff2f16x2_rz_relu Float32Regs:$a, Float32Regs:$b),

tra wrote:
> Were these patterns removed intentionally? We still have intrinsics/builtins 
> defined in llvm/include/llvm/IR/IntrinsicsNVVM.td and still need to lower 
> them.
^^^ this question is still unanswered.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D144911/new/

https://reviews.llvm.org/D144911

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D144911: adding bf16 support to NVPTX

2023-06-13 Thread Kushan Ahmadian via Phabricator via cfe-commits
kushanam updated this revision to Diff 531089.
kushanam added a comment.

renaming useBF16Math to hasBF16Math


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D144911/new/

https://reviews.llvm.org/D144911

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  llvm/include/llvm/IR/IntrinsicsNVVM.td
  llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
  llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
  llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp
  llvm/lib/Target/NVPTX/NVPTXMCExpr.h
  llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
  llvm/lib/Target/NVPTX/NVPTXSubtarget.h
  llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
  llvm/test/CodeGen/NVPTX/bf16-instructions.ll

Index: llvm/test/CodeGen/NVPTX/bf16-instructions.ll
===
--- /dev/null
+++ llvm/test/CodeGen/NVPTX/bf16-instructions.ll
@@ -0,0 +1,142 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s
+; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
+
+; LDST: .b8 bfloat_array[8] = {1, 2, 3, 4, 5, 6, 7, 8};
+@"bfloat_array" = addrspace(1) constant [4 x bfloat]
+[bfloat 0xR0201, bfloat 0xR0403, bfloat 0xR0605, bfloat 0xR0807]
+  
+; CHECK-LABEL: test_fadd(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fadd_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fadd_param_1];
+; CHECK-NEXT: add.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fadd(bfloat %0, bfloat %1) {
+  %3 = fadd bfloat %0, %1
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_fsub(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fsub_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fsub_param_1];
+; CHECK-NEXT: sub.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fsub(bfloat %0, bfloat %1) {
+  %3 = fsub bfloat %0, %1
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_faddx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_faddx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_faddx2_param_1];
+; CHECK-NEXT: add.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fadd <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fsubx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fsubx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fsubx2_param_1];
+; CHECK-NEXT: sub.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fsub <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fmulx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fmulx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fmulx2_param_1];
+; CHECK-NEXT: mul.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fmul(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fmul <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fdiv(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fdiv_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fdiv_param_1];
+; CHECK-DAG:  mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG:  mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG:  cvt.f32.bf16 [[FA0:%f[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FA1:%f[0-9]+]], [[A1]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB0:%f[0-9]+]], [[B0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB1:%f[0-9]+]], [[B1]];
+; CHECK-DAG:  div.rn.f32  [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-DAG:  div.rn.f32  [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R0:%h[0-9]+]], [[FR0]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R1:%h[0-9]+]], [[FR1]];
+; CHECK-NEXT: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NEXT: st.param.b32[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fdiv <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_extract_0(
+; CHECK:  ld.param.b16[[A:%rs[0-9]+]], [test_extract_0_param_0];
+; CHECK:  st.param.b16[func_retval0+0], [[A]];
+; CHECK:  ret;
+
+define bfloat @test_extract_0(<2 x bfloat> %a) #0 {
+  %e = extractelement <2 x bfloat> %a, i32 0
+  ret bfloat %e
+}
+
+; CHECK-LABEL: test_extract_1(
+;

[PATCH] D144911: adding bf16 support to NVPTX

2023-06-13 Thread Artem Belevich via Phabricator via cfe-commits
tra accepted this revision.
tra added a comment.
This revision is now accepted and ready to land.

LGTM with few nits. Thank you for your patience with revising the patch.




Comment at: llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp:629-631
+  const bool IsBFP16FP16x2NegAvailable = STI.getSmVersion() >= 80 &&
+ STI.getPTXVersion() >= 70 &&
+ STI.hasBF16Math();

IsBFP16FP16x2NegAvailable is no longer used and can be removed.



Comment at: llvm/lib/Target/NVPTX/NVPTXInstrInfo.td:159
 def useFP16Math: Predicate<"Subtarget->allowFP16Math()">;
+def useBF16Math: Predicate<"Subtarget->hasBF16Math()">;
 

Nit: I'd rename the record to `hasBF16Math` as the decision is based purely on 
whether we have particular features enabled, and not on whether user input 
allows us to use those instructions or not on the hardware where they are 
present.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D144911/new/

https://reviews.llvm.org/D144911

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D144911: adding bf16 support to NVPTX

2023-06-13 Thread Kushan Ahmadian via Phabricator via cfe-commits
kushanam updated this revision to Diff 531068.
kushanam added a comment.

add constantFP for bf16


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D144911/new/

https://reviews.llvm.org/D144911

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  llvm/include/llvm/IR/IntrinsicsNVVM.td
  llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
  llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
  llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp
  llvm/lib/Target/NVPTX/NVPTXMCExpr.h
  llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
  llvm/lib/Target/NVPTX/NVPTXSubtarget.h
  llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
  llvm/test/CodeGen/NVPTX/bf16-instructions.ll

Index: llvm/test/CodeGen/NVPTX/bf16-instructions.ll
===
--- /dev/null
+++ llvm/test/CodeGen/NVPTX/bf16-instructions.ll
@@ -0,0 +1,142 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s
+; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
+
+; LDST: .b8 bfloat_array[8] = {1, 2, 3, 4, 5, 6, 7, 8};
+@"bfloat_array" = addrspace(1) constant [4 x bfloat]
+[bfloat 0xR0201, bfloat 0xR0403, bfloat 0xR0605, bfloat 0xR0807]
+  
+; CHECK-LABEL: test_fadd(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fadd_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fadd_param_1];
+; CHECK-NEXT: add.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fadd(bfloat %0, bfloat %1) {
+  %3 = fadd bfloat %0, %1
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_fsub(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fsub_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fsub_param_1];
+; CHECK-NEXT: sub.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fsub(bfloat %0, bfloat %1) {
+  %3 = fsub bfloat %0, %1
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_faddx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_faddx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_faddx2_param_1];
+; CHECK-NEXT: add.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fadd <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fsubx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fsubx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fsubx2_param_1];
+; CHECK-NEXT: sub.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fsub <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fmulx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fmulx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fmulx2_param_1];
+; CHECK-NEXT: mul.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fmul(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fmul <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fdiv(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fdiv_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fdiv_param_1];
+; CHECK-DAG:  mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG:  mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG:  cvt.f32.bf16 [[FA0:%f[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FA1:%f[0-9]+]], [[A1]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB0:%f[0-9]+]], [[B0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB1:%f[0-9]+]], [[B1]];
+; CHECK-DAG:  div.rn.f32  [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-DAG:  div.rn.f32  [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R0:%h[0-9]+]], [[FR0]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R1:%h[0-9]+]], [[FR1]];
+; CHECK-NEXT: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NEXT: st.param.b32[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fdiv <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_extract_0(
+; CHECK:  ld.param.b16[[A:%rs[0-9]+]], [test_extract_0_param_0];
+; CHECK:  st.param.b16[func_retval0+0], [[A]];
+; CHECK:  ret;
+
+define bfloat @test_extract_0(<2 x bfloat> %a) #0 {
+  %e = extractelement <2 x bfloat> %a, i32 0
+  ret bfloat %e
+}
+
+; CHECK-LABEL: test_extract_1(
+; CHECK: 

[PATCH] D144911: adding bf16 support to NVPTX

2023-06-12 Thread Artem Belevich via Phabricator via cfe-commits
tra added a comment.

Almost there. Just few cosmetic nits remaining.




Comment at: llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp:64-69
+  case 9:
 OS << "%h";
 break;
   case 8:
+  case 10:
 OS << "%hh";

tra wrote:
> Looks like I've forgot to remove those cases in my regclass patch. Will fix 
> it shortly.
Still not fixed.



Comment at: llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp:632-634
+  for (const auto &VT : {MVT::bf16, MVT::v2bf16})
+setOperationAction(ISD::FNEG, VT,
+   IsBFP16FP16x2NegAvailable ? Legal : Expand);

This could be just
```
setBF16OperationAction(ISD::FNEG, MVT::bf16, Legal, Expand);
setBF16OperationAction(ISD::FNEG, MVT::v2bf16, Legal, Expand);
```




Comment at: llvm/lib/Target/NVPTX/NVPTXInstrInfo.td:159
 def useFP16Math: Predicate<"Subtarget->allowFP16Math()">;
+def useBFP16Math: Predicate<"Subtarget->allowBF16Math()">;
 

Nit: `useBF16Math` as in fp16 -> bf16.



Comment at: llvm/lib/Target/NVPTX/NVPTXInstrInfo.td:1118
+[(set RC:$dst, (fneg (T RC:$src)))]>,
+Requires<[useFP16Math, hasPTX<70>, hasSM<80>, Pred]>;
+def BFNEG16_ftz   : FNEG_BF16_F16X2<"neg.ftz.bf16", bf16, Int16Regs, doF32FTZ>;

I think you need to use `useBF16Math` here.



Comment at: llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp:68
+
+bool NVPTXSubtarget::allowBF16Math() const { return hasBF16Math(); }

We do not need `allowBF16Math` any more.  Just use `hasBF16Math()`.



Comment at: llvm/lib/Target/NVPTX/NVPTXSubtarget.h:81
   bool allowFP16Math() const;
+  bool allowBF16Math() const;
   bool hasMaskOperator() const { return PTXVersion >= 71; }

Not needed.



Comment at: llvm/test/CodeGen/NVPTX/bf16-instructions.ll:16
+define bfloat @test_fadd(bfloat %0, bfloat %1) {
+  %3 = fadd bfloat %0, %1
+  ret bfloat %3

Another test that would be useful is for `fadd bfloat %0, 1.0`



Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D144911/new/

https://reviews.llvm.org/D144911

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D144911: adding bf16 support to NVPTX

2023-06-09 Thread Kushan Ahmadian via Phabricator via cfe-commits
kushanam updated this revision to Diff 530123.
kushanam added a comment.

Adding new bf16 tests and refactoring the code with new conversion


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D144911/new/

https://reviews.llvm.org/D144911

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  llvm/include/llvm/IR/IntrinsicsNVVM.td
  llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
  llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
  llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp
  llvm/lib/Target/NVPTX/NVPTXMCExpr.h
  llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
  llvm/lib/Target/NVPTX/NVPTXSubtarget.h
  llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
  llvm/test/CodeGen/NVPTX/bf16-instructions.ll

Index: llvm/test/CodeGen/NVPTX/bf16-instructions.ll
===
--- /dev/null
+++ llvm/test/CodeGen/NVPTX/bf16-instructions.ll
@@ -0,0 +1,131 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s
+; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
+
+; LDST: .b8 bfloat_array[8] = {1, 2, 3, 4, 5, 6, 7, 8};
+@"bfloat_array" = addrspace(1) constant [4 x bfloat]
+[bfloat 0xR0201, bfloat 0xR0403, bfloat 0xR0605, bfloat 0xR0807]
+  
+; CHECK-LABEL: test_fadd(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fadd_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fadd_param_1];
+; CHECK-NEXT: add.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fadd(bfloat %0, bfloat %1) {
+  %3 = fadd bfloat %0, %1
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_fsub(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fsub_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fsub_param_1];
+; CHECK-NEXT: sub.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fsub(bfloat %0, bfloat %1) {
+  %3 = fsub bfloat %0, %1
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_faddx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_faddx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_faddx2_param_1];
+; CHECK-NEXT: add.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fadd <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fsubx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fsubx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fsubx2_param_1];
+; CHECK-NEXT: sub.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fsub <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fmulx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fmulx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fmulx2_param_1];
+; CHECK-NEXT: mul.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fmul(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fmul <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fdiv(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fdiv_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fdiv_param_1];
+; CHECK-DAG:  mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG:  mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG:  cvt.f32.bf16 [[FA0:%f[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FA1:%f[0-9]+]], [[A1]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB0:%f[0-9]+]], [[B0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB1:%f[0-9]+]], [[B1]];
+; CHECK-DAG:  div.rn.f32  [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-DAG:  div.rn.f32  [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R0:%h[0-9]+]], [[FR0]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R1:%h[0-9]+]], [[FR1]];
+; CHECK-NEXT: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NEXT: st.param.b32[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fdiv <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_extract_0(
+; CHECK:  ld.param.b16[[A:%rs[0-9]+]], [test_extract_0_param_0];
+; CHECK:  st.param.b16[func_retval0+0], [[A]];
+; CHECK:  ret;
+
+define bfloat @test_extract_0(<2 x bfloat> %a) #0 {
+  %e = extra

[PATCH] D144911: adding bf16 support to NVPTX

2023-06-09 Thread Artem Belevich via Phabricator via cfe-commits
tra added inline comments.



Comment at: llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp:615
   // need to deal with.
   if (Vector.getSimpleValueType() != MVT::v2f16)
 return false;

This needs to be updated to include v2bf16


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D144911/new/

https://reviews.llvm.org/D144911

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D144911: adding bf16 support to NVPTX

2023-06-08 Thread Artem Belevich via Phabricator via cfe-commits
tra added a comment.

Overall looks good with few minor nits and a couple of questions.




Comment at: llvm/include/llvm/IR/IntrinsicsNVVM.td:604
   def int_nvvm_f # operation # variant :
 ClangBuiltin,
 DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_i16_ty, llvm_i16_ty],

tra wrote:
> tra wrote:
> > Availability of these new instructions is conditional on specific CUDA 
> > version and the GPU variant we're compiling for,
> > Such builtins are normally implemented on the clang size as a 
> > `TARGET_BUILTIN()` with appropriate constraints.
> > 
> > Without that `ClangBuiltin` may automatically add enough glue to make them 
> > available in clang unconditionally, which would result in compiler crashing 
> > if a user tries to use one of those builtins with a wrong GPU or CUDA 
> > version. We want to emit a diagnostics, not cause a compiler crash.
> > 
> > Usually such related LLVM and clang changes should be part of the same 
> > patch.
> > 
> > This applies to the new intrinsic variants added below, too.
> I do not think it's is done. 
> 
> Can you check what happens if you try to call any of bf16 builtins while 
> compiling for sm_60? Ideally we should produce a sensible error that the 
> builtin is not available.
> 
> I suspect we will fail in LLVM when we'll fail to lower the intrinsic, ot in 
> nvptx if we've managed to lower it to an instruction unsupported by sm_60.
OK. We'll leave conditional clang builtin handling to be fixed separately as 
it's not directly related to this patch.



Comment at: llvm/include/llvm/IR/IntrinsicsNVVM.td:878
 def int_nvvm_fma # variant : ClangBuiltin,
-  DefaultAttrsIntrinsic<[llvm_i16_ty],
-[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty],
+  DefaultAttrsIntrinsic<[llvm_bfloat_ty],
+[llvm_bfloat_ty, llvm_bfloat_ty, llvm_bfloat_ty],

This changes signatures of existing intrinsics and builtins. While the change 
is correct, we should at least check that MLIR tests are still passing.




Comment at: llvm/include/llvm/IR/IntrinsicsNVVM.td:1244-1251
   def int_nvvm_ff2bf16x2_rn : ClangBuiltin<"__nvvm_ff2bf16x2_rn">,
Intrinsic<[llvm_i32_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, 
IntrNoCallback]>;
   def int_nvvm_ff2bf16x2_rn_relu : ClangBuiltin<"__nvvm_ff2bf16x2_rn_relu">,
   Intrinsic<[llvm_i32_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, 
IntrNoCallback]>;
   def int_nvvm_ff2bf16x2_rz : ClangBuiltin<"__nvvm_ff2bf16x2_rz">,
   Intrinsic<[llvm_i32_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, 
IntrNoCallback]>;
   def int_nvvm_ff2bf16x2_rz_relu : ClangBuiltin<"__nvvm_ff2bf16x2_rz_relu">,

We've removed the patterns matching these intrinsics in 
lib/Target/NVPTX/NVPTXIntrinsics.td so there's nothing to lower them to an 
instruction now. Was that intentional?



Comment at: llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp:64-69
+  case 9:
 OS << "%h";
 break;
   case 8:
+  case 10:
 OS << "%hh";

Looks like I've forgot to remove those cases in my regclass patch. Will fix it 
shortly.



Comment at: llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp:640
  ISD::FROUNDEVEN, ISD::FTRUNC}) {
+setOperationAction(Op, MVT::bf16, Legal);
 setOperationAction(Op, MVT::f16, Legal);

Nit: sometimes bf16 variants are added above fp16 variants, sometimes after. It 
would be nice to do it consistently. I guess we should just do a cleanup patch 
sorting these blocks in type order.



Comment at: llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp:2514-2516
+  if ((Isv2f16Orv2bf16Type(VT.getSimpleVT())) &&
   !allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
   VT, *Store->getMemOperand()))

Unnecessary  `()`around `Isv2f16Orv2bf16Type(VT.getSimpleVT())`



Comment at: llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp:2601
   // store them with st.v4.b32.
-  assert((EltVT == MVT::f16 || EltVT == MVT::bf16) &&
+  assert((Isf16Orbf16Type(EltVT.getSimpleVT())) &&
  "Wrong type for the vector.");

Ditto.



Comment at: llvm/lib/Target/NVPTX/NVPTXInstrInfo.td:1316-1326
 defm FMA16_ftz : FMA_F16<"fma.rn.ftz.f16", f16, Int16Regs, doF32FTZ>;
 defm FMA16 : FMA_F16<"fma.rn.f16", f16, Int16Regs, True>;
 defm FMA16x2_ftz : FMA_F16<"fma.rn.ftz.f16x2", v2f16, Int32Regs, doF32FTZ>;
 defm FMA16x2 : FMA_F16<"fma.rn.f16x2", v2f16, Int32Regs, True>;
+defm BFMA16_ftz : FMA_BF16<"fma.rn.ftz.bf16", bf16, Int16Regs, doF32FTZ>;
+defm BFMA16 : FMA_BF16<"fma.rn.bf16", bf16, Int16Regs, True>;
+defm BFMA16x2_ftz : FMA_BF16<"fma.rn.ftz.bf16x2", v2bf16, Int32Regs, doF32FTZ>;

Nit: align ':' across the block.



Comment at: llvm/lib/Target/NVPTX/NVPTXInstrInfo.td:1

[PATCH] D144911: adding bf16 support to NVPTX

2023-06-08 Thread Kushan Ahmadian via Phabricator via cfe-commits
kushanam updated this revision to Diff 529515.
kushanam added a comment.

updating the bf16 resigter types


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D144911/new/

https://reviews.llvm.org/D144911

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  llvm/include/llvm/IR/IntrinsicsNVVM.td
  llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
  llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
  llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp
  llvm/lib/Target/NVPTX/NVPTXMCExpr.h
  llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
  llvm/lib/Target/NVPTX/NVPTXSubtarget.h
  llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
  llvm/test/CodeGen/NVPTX/bf16-instructions.ll

Index: llvm/test/CodeGen/NVPTX/bf16-instructions.ll
===
--- /dev/null
+++ llvm/test/CodeGen/NVPTX/bf16-instructions.ll
@@ -0,0 +1,89 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s
+; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
+
+
+; CHECK-LABEL: test_fadd(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fadd_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fadd_param_1];
+; CHECK-NEXT: add.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fadd(bfloat %0, bfloat %1) {
+  %3 = fadd bfloat %0, %1
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_fsub(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fsub_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fsub_param_1];
+; CHECK-NEXT: sub.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fsub(bfloat %0, bfloat %1) {
+  %3 = fsub bfloat %0, %1
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_faddx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_faddx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_faddx2_param_1];
+; CHECK-NEXT: add.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fadd <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fsubx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fsubx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fsubx2_param_1];
+; CHECK-NEXT: sub.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fsub <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fmulx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fmulx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fmulx2_param_1];
+; CHECK-NEXT: mul.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fmul(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fmul <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fdiv(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fdiv_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fdiv_param_1];
+; CHECK-DAG:  mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG:  mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG:  cvt.f32.bf16 [[FA0:%f[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FA1:%f[0-9]+]], [[A1]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB0:%f[0-9]+]], [[B0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB1:%f[0-9]+]], [[B1]];
+; CHECK-DAG:  div.rn.f32  [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-DAG:  div.rn.f32  [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R0:%h[0-9]+]], [[FR0]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R1:%h[0-9]+]], [[FR1]];
+; CHECK-NEXT: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NEXT: st.param.b32[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fdiv <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
Index: llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
===
--- llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -204,6 +204,14 @@
   return {Intrinsic::fma, FTZ_MustBeOff, true};
 case Intrinsic::nvvm_fma_rn_ftz_f16x2:
   return {Intrinsic::fma, FTZ_MustBeOn, true};
+case Intrinsic::nvvm_fma_rn_bf16:
+  return {Intrinsic::fma, FTZ_

[PATCH] D144911: adding bf16 support to NVPTX

2023-06-07 Thread Kushan Ahmadian via Phabricator via cfe-commits
kushanam updated this revision to Diff 529494.
kushanam added a comment.

rebasing and readding the first commit changes


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D144911/new/

https://reviews.llvm.org/D144911

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  llvm/include/llvm/IR/IntrinsicsNVVM.td
  llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
  llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
  llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp
  llvm/lib/Target/NVPTX/NVPTXMCExpr.h
  llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
  llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
  llvm/lib/Target/NVPTX/NVPTXSubtarget.h
  llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
  llvm/test/CodeGen/NVPTX/bf16-instructions.ll

Index: llvm/test/CodeGen/NVPTX/bf16-instructions.ll
===
--- /dev/null
+++ llvm/test/CodeGen/NVPTX/bf16-instructions.ll
@@ -0,0 +1,89 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s
+; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
+
+
+; CHECK-LABEL: test_fadd(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fadd_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fadd_param_1];
+; CHECK-NEXT: add.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fadd(bfloat %0, bfloat %1) {
+  %3 = fadd bfloat %0, %1
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_fsub(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fsub_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fsub_param_1];
+; CHECK-NEXT: sub.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fsub(bfloat %0, bfloat %1) {
+  %3 = fsub bfloat %0, %1
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_faddx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_faddx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_faddx2_param_1];
+; CHECK-NEXT: add.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fadd <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fsubx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fsubx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fsubx2_param_1];
+; CHECK-NEXT: sub.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fsub <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fmulx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fmulx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fmulx2_param_1];
+; CHECK-NEXT: mul.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fmul(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fmul <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fdiv(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fdiv_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fdiv_param_1];
+; CHECK-DAG:  mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG:  mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG:  cvt.f32.bf16 [[FA0:%f[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FA1:%f[0-9]+]], [[A1]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB0:%f[0-9]+]], [[B0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB1:%f[0-9]+]], [[B1]];
+; CHECK-DAG:  div.rn.f32  [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-DAG:  div.rn.f32  [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R0:%h[0-9]+]], [[FR0]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R1:%h[0-9]+]], [[FR1]];
+; CHECK-NEXT: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NEXT: st.param.b32[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fdiv <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
Index: llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
===
--- llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -204,6 +204,14 @@
   return {Intrinsic::fma, FTZ_MustBeOff, true};
 case Intrinsic::nvvm_fma_rn_ftz_f16x2:
   return {Intrinsic::fma, FTZ_MustBeOn, true};
+case Intri

[PATCH] D144911: adding bf16 support to NVPTX

2023-06-06 Thread Artem Belevich via Phabricator via cfe-commits
tra added inline comments.



Comment at: llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp:615
 setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
-  }
-
-  for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);

kushanam wrote:
> tra wrote:
> > There's still something odd with this patch. The `setBF16OperationAction` 
> > is not in the upstream, but it does not show up in the diff on phabricator. 
> > 
> > Please do rebase on top of the LLVM and make sure that all your changes are 
> > on the git branch you use to send the patch to phabricator. If in doubt how 
> > to get `arc` to do it correctly, you can always create and upload the diff 
> > manually as described here: 
> > https://llvm.org/docs/Phabricator.html#requesting-a-review-via-the-web-interface
> It is in the first commit, isn't it?https://reviews.llvm.org/D144911?id=500896
It *was* in the first revision of the patch, but it's not in the current one.

The phabricator commit history tracks evolution of the single patch, not a 
dependent set of patches. If you need a dependent patch, that's done via 
submitting each patch individually (i.e. with its own phabricator ID) and then 
recording their relationship via  "Edit related revisions -> Edit child/parent 
revisions". After that you will see them arranged under "stack" sub-tab in the 
"Revision contents" section



Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D144911/new/

https://reviews.llvm.org/D144911

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D144911: adding bf16 support to NVPTX

2023-06-06 Thread Kushan Ahmadian via Phabricator via cfe-commits
kushanam added inline comments.



Comment at: llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp:615
 setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
-  }
-
-  for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);

tra wrote:
> There's still something odd with this patch. The `setBF16OperationAction` is 
> not in the upstream, but it does not show up in the diff on phabricator. 
> 
> Please do rebase on top of the LLVM and make sure that all your changes are 
> on the git branch you use to send the patch to phabricator. If in doubt how 
> to get `arc` to do it correctly, you can always create and upload the diff 
> manually as described here: 
> https://llvm.org/docs/Phabricator.html#requesting-a-review-via-the-web-interface
It is in the first commit, isn't it?https://reviews.llvm.org/D144911?id=500896


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D144911/new/

https://reviews.llvm.org/D144911

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D144911: adding bf16 support to NVPTX

2023-06-05 Thread Artem Belevich via Phabricator via cfe-commits
tra added inline comments.



Comment at: llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp:615
 setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
-  }
-
-  for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);

There's still something odd with this patch. The `setBF16OperationAction` is 
not in the upstream, but it does not show up in the diff on phabricator. 

Please do rebase on top of the LLVM and make sure that all your changes are on 
the git branch you use to send the patch to phabricator. If in doubt how to get 
`arc` to do it correctly, you can always create and upload the diff manually as 
described here: 
https://llvm.org/docs/Phabricator.html#requesting-a-review-via-the-web-interface



Comment at: llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp:689
 setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Expand), Expand);
+setFP16OperationAction(Op, MVT::bf16, GetMinMaxAction(Expand), Expand);
 setOperationAction(Op, MVT::f32, GetMinMaxAction(Expand));

Should it be `set*BF*16OperationAction` ?



Comment at: llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp:692
 setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand);
-  }
-  for (const auto &Op : {ISD::FMINNUM, ISD::FMAXNUM}) {
-setBF16OperationAction(Op, MVT::bf16, GetMinMaxAction(Promote), Promote);
-setBF16OperationAction(Op, MVT::v2bf16, GetMinMaxAction(Expand), Expand);
-setBF16OperationAction(Op, MVT::bf16, GetMinMaxAction(Expand), Expand);
-setBF16OperationAction(Op, MVT::v2bf16, GetMinMaxAction(Expand), Expand);
+setFP16OperationAction(Op, MVT::v2bf16, GetMinMaxAction(Expand), Expand);
   }

ditto.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D144911/new/

https://reviews.llvm.org/D144911

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D144911: adding bf16 support to NVPTX

2023-06-05 Thread Artem Belevich via Phabricator via cfe-commits
tra added a comment.

FYI https://reviews.llvm.org/D151601 has landed in 
https://github.com/llvm/llvm-project/commit/dc90f42ea7b4f6d9e643f5ad2ba663eba2f9e421.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D144911/new/

https://reviews.llvm.org/D144911

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D144911: adding bf16 support to NVPTX

2023-06-05 Thread Kushan Ahmadian via Phabricator via cfe-commits
kushanam added a comment.

In D144911#4389187 , @manishucsd 
wrote:

> I tried this patch to resolve lowering `nvvm.mma.sync` on bf16 operands for 
> the PR 13858 
>
>   nvvm.mma.sync A[%293, %295, %297, %299]  B[%392, %394]  C[%484, %485, %487, 
> %488]  {layoutA = #nvvm.mma_layout, layoutB = #nvvm.mma_layout, 
> multiplicandAPtxType = #nvvm.mma_type, multiplicandBPtxType = 
> #nvvm.mma_type, shape = #nvvm.shape} : 
> (vector<2xbf16>, vector<2xbf16>, f32) -> !llvm.struct<(f32, f32, f32, f32)>
>
> I fail to compile this patch. Please find the compilation error below:
>
>   [build] ./llvm-project/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td:1117:40: 
> error: Variable not defined: 'hasPTX70'
>   [build] Requires<[useFP16Math, hasPTX70, hasSM80, Pred]>;
>   [build]^
>
>   

Could you try again, I rebased and refactored the code


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D144911/new/

https://reviews.llvm.org/D144911

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D144911: adding bf16 support to NVPTX

2023-06-05 Thread Kushan Ahmadian via Phabricator via cfe-commits
kushanam updated this revision to Diff 528526.
kushanam added a comment.

adding min and max for bf16 and refactoring the code


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D144911/new/

https://reviews.llvm.org/D144911

Files:
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
  llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp


Index: llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
===
--- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -150,23 +150,11 @@
 }
 
 static bool Isv2f16Orv2bf16Type(MVT VT) {
-  switch (VT.SimpleTy) {
-  default:
-return false;
-  case MVT::v2f16:
-  case MVT::v2bf16:
-return true;
-  }
+  return (VT.SimpleTy == MVT::v2f16 || VT.SimpleTy == MVT::v2bf16);
 }
 
 static bool Isf16Orbf16Type(MVT VT) {
-  switch (VT.SimpleTy) {
-  default:
-return false;
-  case MVT::f16:
-  case MVT::bf16:
-return true;
-  }
+  return (VT.SimpleTy == MVT::f16 || VT.SimpleTy == MVT::bf16);
 }
 
 /// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
@@ -624,9 +612,6 @@
   for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
 setFP16OperationAction(Op, MVT::f16, Legal, Promote);
 setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
-  }
-
-  for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
   }
@@ -693,20 +678,18 @@
   };
   for (const auto &Op : {ISD::FMINNUM, ISD::FMAXNUM}) {
 setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Promote), Promote);
+setBF16OperationAction(Op, MVT::bf16, GetMinMaxAction(Promote), Promote);
 setOperationAction(Op, MVT::f32, Legal);
 setOperationAction(Op, MVT::f64, Legal);
 setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand);
+setBF16OperationAction(Op, MVT::v2bf16, GetMinMaxAction(Expand), Expand);
   }
   for (const auto &Op : {ISD::FMINIMUM, ISD::FMAXIMUM}) {
 setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Expand), Expand);
+setFP16OperationAction(Op, MVT::bf16, GetMinMaxAction(Expand), Expand);
 setOperationAction(Op, MVT::f32, GetMinMaxAction(Expand));
 setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand);
-  }
-  for (const auto &Op : {ISD::FMINNUM, ISD::FMAXNUM}) {
-setBF16OperationAction(Op, MVT::bf16, GetMinMaxAction(Promote), Promote);
-setBF16OperationAction(Op, MVT::v2bf16, GetMinMaxAction(Expand), Expand);
-setBF16OperationAction(Op, MVT::bf16, GetMinMaxAction(Expand), Expand);
-setBF16OperationAction(Op, MVT::v2bf16, GetMinMaxAction(Expand), Expand);
+setFP16OperationAction(Op, MVT::v2bf16, GetMinMaxAction(Expand), Expand);
   }
 
   // No FEXP2, FLOG2.  The PTX ex2 and log2 functions are always approximate.
Index: llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
===
--- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -1294,14 +1294,11 @@
 NumElts = EltVT.getVectorNumElements();
 EltVT = EltVT.getVectorElementType();
 // vectors of f16 are loaded/stored as multiples of v2f16 elements.
-if (EltVT == MVT::f16 && N->getValueType(0) == MVT::v2f16) {
-  assert(NumElts % 2 == 0 && "Vector must have even number of elements");
-  EltVT = MVT::v2f16;
-  NumElts /= 2;
-} else if (EltVT == MVT::bf16 && N->getValueType(0) == MVT::v2bf16) {
-  assert(NumElts % 2 == 0 && "Vector must have even number of elements");
-  EltVT = MVT::v2bf16;
-  NumElts /= 2;
+if ((EltVT == MVT::f16 && N->getValueType(0) == MVT::v2f16) || 
+(EltVT == MVT::bf16 && N->getValueType(0) == MVT::v2bf16)) {
+  assert(NumElts % 2 == 0 && "Vector must have even number of 
elements");
+  EltVT = N->getValueType(0);
+  NumElts /= 2;
 }
   }
 


Index: llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
===
--- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -150,23 +150,11 @@
 }
 
 static bool Isv2f16Orv2bf16Type(MVT VT) {
-  switch (VT.SimpleTy) {
-  default:
-return false;
-  case MVT::v2f16:
-  case MVT::v2bf16:
-return true;
-  }
+  return (VT.SimpleTy == MVT::v2f16 || VT.SimpleTy == MVT::v2bf16);
 }
 
 static bool Isf16Orbf16Type(MVT VT) {
-  switch (VT.SimpleTy) {
-  default:
-return false;
-  case MVT::f16:
-  case MVT::bf16:
-return true;
-  }
+  return (VT.SimpleTy == MVT::f16 || VT.SimpleTy == MVT::bf16);
 }
 
 /// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
@@ -624,9 +612,6 @@
   for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
 setFP16OperationAction(Op, MVT::f16, Legal, Promote)

[PATCH] D144911: adding bf16 support to NVPTX

2023-06-05 Thread Kushan Ahmadian via Phabricator via cfe-commits
kushanam updated this revision to Diff 528475.
kushanam added a comment.

Rebasing the D144911  patch


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D144911/new/

https://reviews.llvm.org/D144911

Files:
  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td


Index: llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
===
--- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -998,17 +998,17 @@
 FMA_TUPLE<"_rn_ftz_relu_f16", int_nvvm_fma_rn_ftz_relu_f16, Float16Regs,
   [hasPTX<70>, hasSM<80>]>,
 
-FMA_TUPLE<"_rn_bf16", int_nvvm_fma_rn_bf16, BFloat16Regs, [hasPTX70, 
hasSM80]>,
+FMA_TUPLE<"_rn_bf16", int_nvvm_fma_rn_bf16, BFloat16Regs, [hasPTX<70>, 
hasSM<80>]>,
 FMA_TUPLE<"_rn_ftz_bf16", int_nvvm_fma_rn_ftz_bf16, BFloat16Regs,
-  [hasPTX70, hasSM80]>,
+  [hasPTX<70>, hasSM<80>]>,
 FMA_TUPLE<"_rn_sat_bf16", int_nvvm_fma_rn_sat_bf16, BFloat16Regs,
-  [hasPTX70, hasSM80]>,
+  [hasPTX<70>, hasSM<80>]>,
 FMA_TUPLE<"_rn_ftz_sat_bf16", int_nvvm_fma_rn_ftz_sat_bf16, BFloat16Regs,
-  [hasPTX70, hasSM80]>,
+  [hasPTX<70>, hasSM<80>]>,
 FMA_TUPLE<"_rn_relu_bf16", int_nvvm_fma_rn_relu_bf16, BFloat16Regs,
-  [hasPTX70, hasSM80]>,
+  [hasPTX<70>, hasSM<80>]>,
 FMA_TUPLE<"_rn_ftz_relu_bf16", int_nvvm_fma_rn_ftz_relu_bf16, BFloat16Regs,
-  [hasPTX70, hasSM80]>,
+  [hasPTX<70>, hasSM<80>]>,
 
 FMA_TUPLE<"_rn_f16x2", int_nvvm_fma_rn_f16x2, Float16x2Regs,
   [hasPTX<42>, hasSM<53>]>,
@@ -1022,10 +1022,10 @@
   [hasPTX<70>, hasSM<80>]>,
 FMA_TUPLE<"_rn_ftz_relu_f16x2", int_nvvm_fma_rn_ftz_relu_f16x2,
   Float16x2Regs, [hasPTX<70>, hasSM<80>]>,
-FMA_TUPLE<"_rn_bf16x2", int_nvvm_fma_rn_bf16x2, Int32Regs,
-  [hasPTX70, hasSM80]>,
-FMA_TUPLE<"_rn_relu_bf16x2", int_nvvm_fma_rn_relu_bf16x2, Int32Regs,
-  [hasPTX70, hasSM80]>
+FMA_TUPLE<"_rn_bf16x2", int_nvvm_fma_rn_bf16x2, BFloat16x2Regs,
+  [hasPTX<70>, hasSM<80>]>,
+FMA_TUPLE<"_rn_relu_bf16x2", int_nvvm_fma_rn_relu_bf16x2, BFloat16x2Regs,
+  [hasPTX<70>, hasSM<80>]>
   ] in {
 def P.Variant :
   F_MATH_3,
-Requires<[useFP16Math, hasPTX70, hasSM80, Pred]>;
+Requires<[useFP16Math, hasPTX<70>, hasSM<80>, Pred]>;
 def BFNEG16_ftz   : FNEG_BF16_F16X2<"neg.ftz.bf16", bf16, BFloat16Regs, 
doF32FTZ>;
 def BFNEG16   : FNEG_BF16_F16X2<"neg.bf16", bf16, BFloat16Regs, True>;
 def BFNEG16x2_ftz : FNEG_BF16_F16X2<"neg.ftz.bf16x2", v2bf16, BFloat16x2Regs, 
doF32FTZ>;
@@ -3337,30 +3337,6 @@
"  mov.b32 \t{%tmp_lo, $dst}, $src; }}",
[(set BFloat16Regs:$dst,
  (extractelt (v2bf16 BFloat16x2Regs:$src), 
1))]>;
-
-  // // Coalesce two bf16 registers into bf16x2
-  // def BuildBF16x2 : NVPTXInst<(outs BFloat16x2Regs:$dst),
-  //(ins BFloat16Regs:$a, BFloat16Regs:$b),
-  //"mov.b32 \t$dst, {{$a, $b}};",
-  //[(set (v2bf16 BFloat16x2Regs:$dst),
-  //  (build_vector (bf16 BFloat16Regs:$a), (bf16 
BFloat16Regs:$b)))]>;
-
-  // // Directly initializing underlying the b32 register is one less SASS
-  // // instruction than than vector-packing move.
-  // def BuildBF16x2i : NVPTXInst<(outs BFloat16x2Regs:$dst), (ins 
i32imm:$src),
-  // "mov.b32 \t$dst, $src;",
-  // []>;
-
-  // // Split f16x2 into two f16 registers.
-  // def SplitBF16x2  : NVPTXInst<(outs BFloat16Regs:$lo, BFloat16Regs:$hi),
-  // (ins BFloat16x2Regs:$src),
-  // "mov.b32 \t{{$lo, $hi}}, $src;",
-  // []>;
-  // // Split an i32 into two f16
-  // def SplitI32toBF16x2  : NVPTXInst<(outs BFloat16Regs:$lo, 
BFloat16Regs:$hi),
-  //  (ins Int32Regs:$src),
-  //  "mov.b32 \t{{$lo, $hi}}, $src;",
-  //  []>;
 }
 
 // Count leading zeros


Index: llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
===
--- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -998,17 +998,17 @@
 FMA_TUPLE<"_rn_ftz_relu_f16", int_nvvm_fma_rn_ftz_relu_f16, Float16Regs,
   [hasPTX<70>, hasSM<80>]>,
 
-FMA_TUPLE<"_rn_bf16", int_nvvm_fma_rn_bf16, BFloat16Regs, [hasPTX70, hasSM80]>,
+FMA_TUPLE<"_rn_bf16", int_nvvm_fma_rn_bf16, BFloat16Regs, [hasPTX<70>, hasSM<80>]>,
 FMA_TUPLE<"_rn_ftz_bf16", int_nvvm_fma_rn_ftz_bf16, BFloat16Regs,
-  [hasPTX70, hasSM80]>,
+  [hasPTX<70>, hasSM<80>]>,
 FMA_TUPLE<"_rn_sat_bf16", int_nvvm_fma_rn_sat_bf16, BFloat16Reg

[PATCH] D144911: adding bf16 support to NVPTX

2023-06-02 Thread Artem Belevich via Phabricator via cfe-commits
tra added a comment.

In D144911#4389187 , @manishucsd 
wrote:

> I fail to compile this patch. Please find the compilation error below:
>
>   [build] ./llvm-project/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td:1117:40: 
> error: Variable not defined: 'hasPTX70'
>   [build] Requires<[useFP16Math, hasPTX70, hasSM80, Pred]>;
>   [build]^

You need to update your patch. Recent LLVM changes have changed `hasPTXab` -> 
`hasPTX`, and similarly `hasSMab` > `hasSM`.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D144911/new/

https://reviews.llvm.org/D144911

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D144911: adding bf16 support to NVPTX

2023-06-01 Thread Manish Gupta via Phabricator via cfe-commits
manishucsd added a comment.

I tried this patch to resolve lowering `nvvm.mma.sync` on bf16 operands for the 
PR 13858 

  nvvm.mma.sync A[%293, %295, %297, %299]  B[%392, %394]  C[%484, %485, %487, 
%488]  {layoutA = #nvvm.mma_layout, layoutB = #nvvm.mma_layout, 
multiplicandAPtxType = #nvvm.mma_type, multiplicandBPtxType = 
#nvvm.mma_type, shape = #nvvm.shape} : 
(vector<2xbf16>, vector<2xbf16>, f32) -> !llvm.struct<(f32, f32, f32, f32)>

I fail to compile this patch. Please find the compilation error below:

  [build] ./llvm-project/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td:1117:40: 
error: Variable not defined: 'hasPTX70'
  [build] Requires<[useFP16Math, hasPTX70, hasSM80, Pred]>;
  [build]^

  


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D144911/new/

https://reviews.llvm.org/D144911

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D144911: adding bf16 support to NVPTX

2023-05-26 Thread Artem Belevich via Phabricator via cfe-commits
tra added a comment.

Here's a rough proof-of-concept patch coalescing i16/f16/bf16 to use the same 
Int16Regs register class: https://reviews.llvm.org/D151601

The changes are largely mechanical, replacing `%h` -> `%rs` in the tests and 
eliminating special cases we previously had for Float16Registers. I'll extend 
the patch to v2f16/Int32Regs next week.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D144911/new/

https://reviews.llvm.org/D144911

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D144911: adding bf16 support to NVPTX

2023-05-26 Thread Artem Belevich via Phabricator via cfe-commits
tra added inline comments.



Comment at: llvm/include/llvm/IR/IntrinsicsNVVM.td:604
   def int_nvvm_f # operation # variant :
 ClangBuiltin,
 DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_i16_ty, llvm_i16_ty],

tra wrote:
> Availability of these new instructions is conditional on specific CUDA 
> version and the GPU variant we're compiling for,
> Such builtins are normally implemented on the clang size as a 
> `TARGET_BUILTIN()` with appropriate constraints.
> 
> Without that `ClangBuiltin` may automatically add enough glue to make them 
> available in clang unconditionally, which would result in compiler crashing 
> if a user tries to use one of those builtins with a wrong GPU or CUDA 
> version. We want to emit a diagnostics, not cause a compiler crash.
> 
> Usually such related LLVM and clang changes should be part of the same patch.
> 
> This applies to the new intrinsic variants added below, too.
I do not think it's is done. 

Can you check what happens if you try to call any of bf16 builtins while 
compiling for sm_60? Ideally we should produce a sensible error that the 
builtin is not available.

I suspect we will fail in LLVM when we'll fail to lower the intrinsic, ot in 
nvptx if we've managed to lower it to an instruction unsupported by sm_60.



Comment at: llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp:1297-1304
 if (EltVT == MVT::f16 && N->getValueType(0) == MVT::v2f16) {
   assert(NumElts % 2 == 0 && "Vector must have even number of elements");
   EltVT = MVT::v2f16;
   NumElts /= 2;
+} else if (EltVT == MVT::bf16 && N->getValueType(0) == MVT::v2bf16) {
+  assert(NumElts % 2 == 0 && "Vector must have even number of elements");
+  EltVT = MVT::v2bf16;

These could be collapsed into 
```
if ((EltVT == MVT::f16 && N->getValueType(0) == MVT::v2f16) || 
 (EltVT == MVT::bf16 && N->getValueType(0) == MVT::v2bf16) ) {
  assert(NumElts % 2 == 0 && "Vector must have even number of elements");
  EltVT = N->getValueType(0);
  NumElts /= 2;
}
```



Comment at: llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp:147-153
+  switch (VT.SimpleTy) {
+  default:
+return false;
+  case MVT::v2f16:
+  case MVT::v2bf16:
+return true;
+  }

It can be simplified to just `return (VT.SimpleTy == MVT::v2f16 || VT.SimpleTy 
== MVT::v2bf16);`




Comment at: llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp:156
+
+static bool Isf16Orbf16Type(MVT VT) {
+  switch (VT.SimpleTy) {

ditto.



Comment at: llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp:623
 
+  for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
+setBF16OperationAction(Op, MVT::bf16, Legal, Promote);

Fold it into the loop above.



Comment at: llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp:699
   }
+  for (const auto &Op : {ISD::FMINNUM, ISD::FMAXNUM}) {
+setBF16OperationAction(Op, MVT::bf16, GetMinMaxAction(Promote), Promote);

Fold into the loop processing `{ISD::FMINNUM, ISD::FMAXNUM}` above.

Also, do we want/need to add bf16 handling for `{ISD::FMINIMUM, ISD::FMAXIMUM}` 
too?

The LLVM's choice of constants `FMINIMUM` vs `FMINNUM` is rather unfortunate -- 
it's so easy to misread one for another.



Comment at: llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp:700-703
+setBF16OperationAction(Op, MVT::bf16, GetMinMaxAction(Promote), Promote);
+setBF16OperationAction(Op, MVT::v2bf16, GetMinMaxAction(Expand), Expand);
+setBF16OperationAction(Op, MVT::bf16, GetMinMaxAction(Expand), Expand);
+setBF16OperationAction(Op, MVT::v2bf16, GetMinMaxAction(Expand), Expand);

I'm not sure what's going on here. Should it be Promote for bf16 and Expand for 
v2bf16? Why do we have two other entries, one of them trying to Expand bf16?



Comment at: llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td:65-66
+def Float16x2Regs : NVPTXRegClass<[v2f16], 32, (add (sequence "HH%u", 0, 4))>;
+def BFloat16Regs : NVPTXRegClass<[bf16], 16, (add (sequence "H%u", 0, 4))>;
+def BFloat16x2Regs : NVPTXRegClass<[v2bf16], 32, (add (sequence "HH%u", 0, 
4))>;
 def Float32Regs : NVPTXRegClass<[f32], 32, (add (sequence "F%u", 0, 4))>;

I suspect this may be a problem.

What PTX do we end up generating if we have a function that needs to use both 
f16 and bf16 registers? I suspect we may end up with defining conflicting sets 
of registers.

I still do not think that we need a spearate register class for bf16 and both 
bf16 and fp16 should be using a generic opaque 16/32 bit register types (or, 
even better, generic Int16/Int32 registers. 

RegClass accepts multiple type values, so it may be as simple as using `def 
Int16Regs : NVPTXRegClass<[i16,f16,bf16], 16, (add (sequence "RS%u", 0, 4))>;` 
and adjusting existing use cases.

That should probably be done as a separ

[PATCH] D144911: adding bf16 support to NVPTX

2023-05-25 Thread Kushan Ahmadian via Phabricator via cfe-commits
kushanam updated this revision to Diff 525896.
kushanam added a comment.

removing commented td entry


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D144911/new/

https://reviews.llvm.org/D144911

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  llvm/include/llvm/IR/IntrinsicsNVVM.td
  llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
  llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
  llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp
  llvm/lib/Target/NVPTX/NVPTXMCExpr.h
  llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
  llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
  llvm/lib/Target/NVPTX/NVPTXSubtarget.h
  llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
  llvm/test/CodeGen/NVPTX/bf16-instructions.ll

Index: llvm/test/CodeGen/NVPTX/bf16-instructions.ll
===
--- /dev/null
+++ llvm/test/CodeGen/NVPTX/bf16-instructions.ll
@@ -0,0 +1,88 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s
+; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
+
+
+; CHECK-LABEL: test_fadd(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fadd_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fadd_param_1];
+; CHECK-NEXT: add.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fadd(bfloat %0, bfloat %1) {
+  %3 = fadd bfloat %0, %1 
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_fsub(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fsub_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fsub_param_1];
+; CHECK-NEXT: sub.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fsub(bfloat %0, bfloat %1) {
+  %3 = fsub bfloat %0, %1 
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_faddx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_faddx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_faddx2_param_1];
+; CHECK-NEXT: add.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fadd <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fsubx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fsubx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fsubx2_param_1];
+; CHECK-NEXT: sub.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fsub <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fmulx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fmulx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fmulx2_param_1];
+; CHECK-NEXT: mul.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fmul(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fmul <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fdiv(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fdiv_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fdiv_param_1];
+; CHECK-DAG:  mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG:  mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG:  cvt.f32.bf16 [[FA0:%f[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FA1:%f[0-9]+]], [[A1]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB0:%f[0-9]+]], [[B0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB1:%f[0-9]+]], [[B1]];
+; CHECK-DAG:  div.rn.f32  [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-DAG:  div.rn.f32  [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R0:%h[0-9]+]], [[FR0]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R1:%h[0-9]+]], [[FR1]];
+; CHECK-NEXT: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NEXT: st.param.b32[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fdiv <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
Index: llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
===
--- llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -204,6 +204,14 @@
   return {Intrinsic::fma, FTZ_MustBeOff, true};
 case Intrinsic::nvvm_fma_rn_ftz_f16x2:
  

[PATCH] D144911: adding bf16 support to NVPTX

2023-05-25 Thread Kushan Ahmadian via Phabricator via cfe-commits
kushanam updated this revision to Diff 525883.
kushanam added a comment.

rebasing the diff based on the branch HEAD


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D144911/new/

https://reviews.llvm.org/D144911

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  llvm/include/llvm/IR/IntrinsicsNVVM.td
  llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
  llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
  llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp
  llvm/lib/Target/NVPTX/NVPTXMCExpr.h
  llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
  llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
  llvm/lib/Target/NVPTX/NVPTXSubtarget.h
  llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
  llvm/test/CodeGen/NVPTX/bf16-instructions.ll

Index: llvm/test/CodeGen/NVPTX/bf16-instructions.ll
===
--- /dev/null
+++ llvm/test/CodeGen/NVPTX/bf16-instructions.ll
@@ -0,0 +1,88 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s
+; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
+
+
+; CHECK-LABEL: test_fadd(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fadd_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fadd_param_1];
+; CHECK-NEXT: add.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fadd(bfloat %0, bfloat %1) {
+  %3 = fadd bfloat %0, %1 
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_fsub(
+; CHECK-DAG:  ld.param.b16[[A:%h[0-9]+]], [test_fsub_param_0];
+; CHECK-DAG:  ld.param.b16[[B:%h[0-9]+]], [test_fsub_param_1];
+; CHECK-NEXT: sub.rn.bf16 [[R:%f[0-9]+]], [[A]], [[B]];
+; CHECK-NEXT: st.param.b16[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define bfloat @test_fsub(bfloat %0, bfloat %1) {
+  %3 = fsub bfloat %0, %1 
+  ret bfloat %3
+}
+
+; CHECK-LABEL: test_faddx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_faddx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_faddx2_param_1];
+; CHECK-NEXT: add.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fadd <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fsubx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fsubx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fsubx2_param_1];
+; CHECK-NEXT: sub.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fsub <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fmulx2(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fmulx2_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fmulx2_param_1];
+; CHECK-NEXT: mul.rn.bf16x2   [[R:%f[0-9]+]], [[A]], [[B]];
+
+; CHECK:  st.param.b32[func_retval0+0], [[R]];
+; CHECK:  ret;
+
+define <2 x bfloat> @test_fmul(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fmul <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
+
+; CHECK-LABEL: test_fdiv(
+; CHECK-DAG:  ld.param.b32[[A:%hh[0-9]+]], [test_fdiv_param_0];
+; CHECK-DAG:  ld.param.b32[[B:%hh[0-9]+]], [test_fdiv_param_1];
+; CHECK-DAG:  mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG:  mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG:  cvt.f32.bf16 [[FA0:%f[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FA1:%f[0-9]+]], [[A1]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB0:%f[0-9]+]], [[B0]];
+; CHECK-DAG:  cvt.f32.bf16 [[FB1:%f[0-9]+]], [[B1]];
+; CHECK-DAG:  div.rn.f32  [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-DAG:  div.rn.f32  [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R0:%h[0-9]+]], [[FR0]];
+; CHECK-DAG:  cvt.rn.bf16.f32  [[R1:%h[0-9]+]], [[FR1]];
+; CHECK-NEXT: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NEXT: st.param.b32[func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+
+define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
+  %r = fdiv <2 x bfloat> %a, %b
+  ret <2 x bfloat> %r
+}
Index: llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
===
--- llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -204,6 +204,14 @@
   return {Intrinsic::fma, FTZ_MustBeOff, true};
 case Intrinsic::nvvm_fma_rn_ft

[PATCH] D144911: adding bf16 support to NVPTX

2023-05-22 Thread Artem Belevich via Phabricator via cfe-commits
tra added inline comments.



Comment at: llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp:315-318
-} else if (RC == &NVPTX::BFloat16RegsRegClass) {
-  Ret = (9 << 28);
-} else if (RC == &NVPTX::BFloat16x2RegsRegClass) {
-  Ret = (10 << 28);

There's still something odd with the patch. It appears that it's a diff vs a 
previous set of the changes which did introduce BFloat16RegsRegClass.

Can you please update the diff to be vs the recent LLVM tree HEAD?


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D144911/new/

https://reviews.llvm.org/D144911

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D144911: adding bf16 support to NVPTX

2023-05-18 Thread Jacques Pienaar via Phabricator via cfe-commits
jpienaar added inline comments.



Comment at: llvm/lib/Target/NVPTX/NVPTXInstrInfo.td:3352
 
-  // Coalesce two bf16 registers into bf16x2
-  def BuildBF16x2 : NVPTXInst<(outs BFloat16x2Regs:$dst),
- (ins BFloat16Regs:$a, BFloat16Regs:$b),
- "mov.b32 \t$dst, {{$a, $b}};",
- [(set (v2bf16 BFloat16x2Regs:$dst),
-   (build_vector (bf16 BFloat16Regs:$a), (bf16 
BFloat16Regs:$b)))]>;
-
-  // Directly initializing underlying the b32 register is one less SASS
-  // instruction than than vector-packing move.
-  def BuildBF16x2i : NVPTXInst<(outs BFloat16x2Regs:$dst), (ins i32imm:$src),
-  "mov.b32 \t$dst, $src;",
-  []>;
-
-  // Split f16x2 into two f16 registers.
-  def SplitBF16x2  : NVPTXInst<(outs BFloat16Regs:$lo, BFloat16Regs:$hi),
-  (ins BFloat16x2Regs:$src),
-  "mov.b32 \t{{$lo, $hi}}, $src;",
-  []>;
-  // Split an i32 into two f16
-  def SplitI32toBF16x2  : NVPTXInst<(outs BFloat16Regs:$lo, BFloat16Regs:$hi),
-   (ins Int32Regs:$src),
-   "mov.b32 \t{{$lo, $hi}}, $src;",
-   []>;
+  // // Coalesce two bf16 registers into bf16x2
+  // def BuildBF16x2 : NVPTXInst<(outs BFloat16x2Regs:$dst),

Could these commented parts be dropped?


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D144911/new/

https://reviews.llvm.org/D144911

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D144911: adding bf16 support to NVPTX

2023-05-18 Thread Jacques Pienaar via Phabricator via cfe-commits
jpienaar added a comment.

In D144911#4347029 , @kushanam wrote:

> In D144911#4340097 , @jpienaar 
> wrote:
>
>> Thanks Artem for explaining,
>
> I think somehow, Phabricator pushes my changes to 
> https://reviews.llvm.org/D149976 instead. The changes are in the patch for 
> now, Is it possible to continue the review there instead?

This seems merged again, is that correct?


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D144911/new/

https://reviews.llvm.org/D144911

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D144911: adding bf16 support to NVPTX

2023-05-17 Thread Kushan Ahmadian via Phabricator via cfe-commits
kushanam updated this revision to Diff 523111.
kushanam added a comment.
Herald added a project: clang.
Herald added a subscriber: cfe-commits.

Adding clang directives and removing bf16 registers


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D144911/new/

https://reviews.llvm.org/D144911

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
  llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
  llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
  llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp

Index: llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
===
--- llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
+++ llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
@@ -29,14 +29,13 @@
 std::string getNVPTXRegClassName(TargetRegisterClass const *RC) {
   if (RC == &NVPTX::Float32RegsRegClass)
 return ".f32";
-  if (RC == &NVPTX::Float16RegsRegClass || RC == &NVPTX::BFloat16RegsRegClass)
+  if (RC == &NVPTX::Float16RegsRegClass)
 // Ideally fp16 registers should be .f16, but this syntax is only
 // supported on sm_53+. On the other hand, .b16 registers are
 // accepted for all supported fp16 instructions on all GPU
 // variants, so we can use them instead.
 return ".b16";
-  if (RC == &NVPTX::Float16x2RegsRegClass ||
-  RC == &NVPTX::BFloat16x2RegsRegClass)
+  if (RC == &NVPTX::Float16x2RegsRegClass)
 return ".b32";
   if (RC == &NVPTX::Float64RegsRegClass)
 return ".f64";
@@ -74,10 +73,9 @@
 std::string getNVPTXRegClassStr(TargetRegisterClass const *RC) {
   if (RC == &NVPTX::Float32RegsRegClass)
 return "%f";
-  if (RC == &NVPTX::Float16RegsRegClass || RC == &NVPTX::BFloat16RegsRegClass)
+  if (RC == &NVPTX::Float16RegsRegClass)
 return "%h";
-  if (RC == &NVPTX::Float16x2RegsRegClass ||
-  RC == &NVPTX::BFloat16x2RegsRegClass)
+  if (RC == &NVPTX::Float16x2RegsRegClass)
 return "%hh";
   if (RC == &NVPTX::Float64RegsRegClass)
 return "%fd";
Index: llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
===
--- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -998,9 +998,6 @@
 FMA_TUPLE<"_rn_ftz_relu_f16x2", int_nvvm_fma_rn_ftz_relu_f16x2,
   Float16x2Regs, [hasPTX70, hasSM80]>,
 
-// FMA_TUPLE<"_rn_relu_bf16", int_nvvm_fma_rn_relu_bf16, BFloat16Regs,
-//   [hasPTX70, hasSM80]>,
-
 FMA_TUPLE<"_rn_bf16x2", int_nvvm_fma_rn_bf16x2, BFloat16x2Regs,
   [hasPTX70, hasSM80]>,
 FMA_TUPLE<"_rn_relu_bf16x2", int_nvvm_fma_rn_relu_bf16x2, BFloat16x2Regs,
@@ -1254,24 +1251,6 @@
 def : Pat<(int_nvvm_ff2bf16x2_rz_relu Float32Regs:$a, Float32Regs:$b),
   (CVT_bf16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRZ_RELU)>;
 
-// def : Pat<(int_nvvm_ff2f16x2_rn Float32Regs:$a, Float32Regs:$b),
-//   (CVT_f16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRN)>;
-// def : Pat<(int_nvvm_ff2f16x2_rn_relu Float32Regs:$a, Float32Regs:$b),
-//   (CVT_f16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRN_RELU)>;
-// def : Pat<(int_nvvm_ff2f16x2_rz Float32Regs:$a, Float32Regs:$b),
-//   (CVT_f16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRZ)>;
-// def : Pat<(int_nvvm_ff2f16x2_rz_relu Float32Regs:$a, Float32Regs:$b),
-//   (CVT_f16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRZ_RELU)>;
-
-// def : Pat<(int_nvvm_f2bf16_rn Float32Regs:$a),
-//   (CVT_bf16_f32 Float32Regs:$a, CvtRN)>;
-// def : Pat<(int_nvvm_f2bf16_rn_relu Float32Regs:$a),
-//   (CVT_bf16_f32 Float32Regs:$a, CvtRN_RELU)>;
-// def : Pat<(int_nvvm_f2bf16_rz Float32Regs:$a),
-//   (CVT_bf16_f32 Float32Regs:$a, CvtRZ)>;
-// def : Pat<(int_nvvm_f2bf16_rz_relu Float32Regs:$a),
-//   (CVT_bf16_f32 Float32Regs:$a, CvtRZ_RELU)>;
-
 def CVT_tf32_f32 :
NVPTXInst<(outs Int32Regs:$dest), (ins Float32Regs:$a),
"cvt.rna.tf32.f32 \t$dest, $a;",
@@ -1387,11 +1366,6 @@
 def : Pat<(int_nvvm_f2h_rn Float32Regs:$a),
   (BITCONVERT_16_F2I (CVT_f16_f32 Float32Regs:$a, CvtRN))>;
 
-// def : Pat<(int_nvvm_bf2h_rn_ftz Float32Regs:$a),
-//   (BITCONVERT_16_BF2I (CVT_bf16_f32 Float32Regs:$a, CvtRN_FTZ))>;
-// def : Pat<(int_nvvm_f2h_rn BFloat16Regs:$a),
-//   (BITCONVERT_16_BF2I (CVT_bf16_f32 BFloat16Regs:$a, CvtRN))>;
-
 //
 // Bitcast
 //
Index: llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
===
--- llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -656,15 +656,6 @@
   def CVT_INREG_s64_s32 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
 "cvt.s64.s32 \t$dst, $src;", []>;
 
-multiclass CVT_FROM_FLOAT_SM80 {
-def _f32 :
-  NVPTXInst<(outs RC:$dst),
-(ins Float32Regs:$src, CvtMod