[PATCH] D126158: [MLIR][GPU] Replace fdiv on fp16 with promoted (fp32) multiplication with reciprocal plus one (conditional) Newton iteration.

2022-06-05 Thread Christian Sigg via Phabricator via cfe-commits
csigg added a comment.

This diff was recommitted in 400fef081adbafc358858709861cdb14303de0e9.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D126158/new/

https://reviews.llvm.org/D126158



[PATCH] D126158: [MLIR][GPU] Replace fdiv on fp16 with promoted (fp32) multiplication with reciprocal plus one (conditional) Newton iteration.

2022-06-04 Thread Christian Sigg via Phabricator via cfe-commits
csigg added a comment.

Thanks Mehdi for reverting.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D126158/new/

https://reviews.llvm.org/D126158



[PATCH] D126158: [MLIR][GPU] Replace fdiv on fp16 with promoted (fp32) multiplication with reciprocal plus one (conditional) Newton iteration.

2022-06-04 Thread Mehdi AMINI via Phabricator via cfe-commits
mehdi_amini added a comment.

The shared library build was broken; I had to revert: 
https://lab.llvm.org/buildbot/#/builders/61/builds/27377


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D126158/new/

https://reviews.llvm.org/D126158



[PATCH] D126158: [MLIR][GPU] Replace fdiv on fp16 with promoted (fp32) multiplication with reciprocal plus one (conditional) Newton iteration.

2022-06-04 Thread Christian Sigg via Phabricator via cfe-commits
This revision was automatically updated to reflect the committed changes.
Closed by commit rGbcfc0a905101: [MLIR][GPU] Replace fdiv on fp16 with promoted 
(fp32) multiplication with… (authored by csigg).

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D126158/new/

https://reviews.llvm.org/D126158

Files:
  mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
  mlir/include/mlir/Dialect/LLVMIR/Transforms/OptimizeForNVVM.h
  mlir/include/mlir/Dialect/LLVMIR/Transforms/Passes.h
  mlir/include/mlir/Dialect/LLVMIR/Transforms/Passes.td
  mlir/lib/Dialect/LLVMIR/Transforms/CMakeLists.txt
  mlir/lib/Dialect/LLVMIR/Transforms/OptimizeForNVVM.cpp
  mlir/test/Dialect/LLVMIR/nvvm.mlir
  mlir/test/Dialect/LLVMIR/optimize-for-nvvm.mlir
  mlir/test/Target/LLVMIR/nvvmir.mlir
  utils/bazel/llvm-project-overlay/mlir/BUILD.bazel

Index: utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
===
--- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -3386,7 +3386,9 @@
 ":IR",
 ":LLVMDialect",
 ":LLVMPassIncGen",
+":NVVMDialect",
 ":Pass",
+":Transforms",
 ],
 )
 
Index: mlir/test/Target/LLVMIR/nvvmir.mlir
===
--- mlir/test/Target/LLVMIR/nvvmir.mlir
+++ mlir/test/Target/LLVMIR/nvvmir.mlir
@@ -33,6 +33,13 @@
   llvm.return %1 : i32
 }
 
+// CHECK-LABEL: @nvvm_rcp
+llvm.func @nvvm_rcp(%0: f32) -> f32 {
+  // CHECK: call float @llvm.nvvm.rcp.approx.ftz.f
+  %1 = nvvm.rcp.approx.ftz.f %0 : f32
+  llvm.return %1 : f32
+}
+
 // CHECK-LABEL: @llvm_nvvm_barrier0
 llvm.func @llvm_nvvm_barrier0() {
   // CHECK: call void @llvm.nvvm.barrier0()
Index: mlir/test/Dialect/LLVMIR/optimize-for-nvvm.mlir
===
--- /dev/null
+++ mlir/test/Dialect/LLVMIR/optimize-for-nvvm.mlir
@@ -0,0 +1,24 @@
+// RUN: mlir-opt %s -llvm-optimize-for-nvvm-target | FileCheck %s
+
+// CHECK-LABEL: llvm.func @fdiv_fp16
+llvm.func @fdiv_fp16(%arg0 : f16, %arg1 : f16) -> f16 {
+  // CHECK-DAG: %[[c0:.*]]  = llvm.mlir.constant(0 : ui32) : i32
+  // CHECK-DAG: %[[mask:.*]]= llvm.mlir.constant(2139095040 : ui32) : i32
+  // CHECK-DAG: %[[lhs:.*]] = llvm.fpext %arg0 : f16 to f32
+  // CHECK-DAG: %[[rhs:.*]] = llvm.fpext %arg1 : f16 to f32
+  // CHECK-DAG: %[[rcp:.*]] = nvvm.rcp.approx.ftz.f %[[rhs]] : f32
+  // CHECK-DAG: %[[approx:.*]]  = llvm.fmul %[[lhs]], %[[rcp]] : f32
+  // CHECK-DAG: %[[neg:.*]] = llvm.fneg %[[rhs]] : f32
+  // CHECK-DAG: %[[err:.*]] = "llvm.intr.fma"(%[[approx]], %[[neg]], %[[lhs]]) : (f32, f32, f32) -> f32
+  // CHECK-DAG: %[[refined:.*]] = "llvm.intr.fma"(%[[err]], %[[rcp]], %[[approx]]) : (f32, f32, f32) -> f32
+  // CHECK-DAG: %[[cast:.*]]= llvm.bitcast %[[approx]] : f32 to i32
+  // CHECK-DAG: %[[exp:.*]] = llvm.and %[[cast]], %[[mask]] : i32
+  // CHECK-DAG: %[[is_zero:.*]] = llvm.icmp "eq" %[[exp]], %[[c0]] : i32
+  // CHECK-DAG: %[[is_mask:.*]] = llvm.icmp "eq" %[[exp]], %[[mask]] : i32
+  // CHECK-DAG: %[[pred:.*]]= llvm.or %[[is_zero]], %[[is_mask]] : i1
+  // CHECK-DAG: %[[select:.*]]  = llvm.select %[[pred]], %[[approx]], %[[refined]] : i1, f32
+  // CHECK-DAG: %[[result:.*]]  = llvm.fptrunc %[[select]] : f32 to f16
+  %result = llvm.fdiv %arg0, %arg1 : f16
+  // CHECK: llvm.return %[[result]] : f16
+  llvm.return %result : f16
+}
Index: mlir/test/Dialect/LLVMIR/nvvm.mlir
===
--- mlir/test/Dialect/LLVMIR/nvvm.mlir
+++ mlir/test/Dialect/LLVMIR/nvvm.mlir
@@ -29,6 +29,13 @@
   llvm.return %0 : i32
 }
 
+// CHECK-LABEL: @nvvm_rcp
+func.func @nvvm_rcp(%arg0: f32) -> f32 {
+  // CHECK: nvvm.rcp.approx.ftz.f %arg0 : f32
+  %0 = nvvm.rcp.approx.ftz.f %arg0 : f32
+  llvm.return %0 : f32
+}
+
 // CHECK-LABEL: @llvm_nvvm_barrier0
 func.func @llvm_nvvm_barrier0() {
   // CHECK: nvvm.barrier0
Index: mlir/lib/Dialect/LLVMIR/Transforms/OptimizeForNVVM.cpp
===
--- /dev/null
+++ mlir/lib/Dialect/LLVMIR/Transforms/OptimizeForNVVM.cpp
@@ -0,0 +1,97 @@
+//===- OptimizeForNVVM.cpp - Optimize LLVM IR for NVVM -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#include "mlir/Dialect/LLVMIR/Transforms/OptimizeForNVVM.h"
+#include "PassDetail.h"
+#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+using namespace mlir;
+
+namespace {
+// Replaces fdiv on fp16 with fp32 multiplication with reciprocal 

[PATCH] D126158: [MLIR][GPU] Replace fdiv on fp16 with promoted (fp32) multiplication with reciprocal plus one (conditional) Newton iteration.

2022-06-03 Thread Christian Sigg via Phabricator via cfe-commits
csigg updated this revision to Diff 434142.
csigg added a comment.

Fix.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D126158/new/

https://reviews.llvm.org/D126158

Files:
  mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
  mlir/include/mlir/Dialect/LLVMIR/Transforms/OptimizeForNVVM.h
  mlir/include/mlir/Dialect/LLVMIR/Transforms/Passes.h
  mlir/include/mlir/Dialect/LLVMIR/Transforms/Passes.td
  mlir/lib/Dialect/LLVMIR/Transforms/CMakeLists.txt
  mlir/lib/Dialect/LLVMIR/Transforms/OptimizeForNVVM.cpp
  mlir/test/Dialect/LLVMIR/nvvm.mlir
  mlir/test/Dialect/LLVMIR/optimize-for-nvvm.mlir
  mlir/test/Target/LLVMIR/nvvmir.mlir
  utils/bazel/llvm-project-overlay/mlir/BUILD.bazel

Index: utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
===
--- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -3379,7 +3379,9 @@
 ":IR",
 ":LLVMDialect",
 ":LLVMPassIncGen",
+":NVVMDialect",
 ":Pass",
+":Transforms",
 ],
 )
 
Index: mlir/test/Target/LLVMIR/nvvmir.mlir
===
--- mlir/test/Target/LLVMIR/nvvmir.mlir
+++ mlir/test/Target/LLVMIR/nvvmir.mlir
@@ -33,6 +33,13 @@
   llvm.return %1 : i32
 }
 
+// CHECK-LABEL: @nvvm_rcp
+llvm.func @nvvm_rcp(%0: f32) -> f32 {
+  // CHECK: call float @llvm.nvvm.rcp.approx.ftz.f
+  %1 = nvvm.rcp.approx.ftz.f %0 : f32
+  llvm.return %1 : f32
+}
+
 // CHECK-LABEL: @llvm_nvvm_barrier0
 llvm.func @llvm_nvvm_barrier0() {
   // CHECK: call void @llvm.nvvm.barrier0()
Index: mlir/test/Dialect/LLVMIR/optimize-for-nvvm.mlir
===
--- /dev/null
+++ mlir/test/Dialect/LLVMIR/optimize-for-nvvm.mlir
@@ -0,0 +1,24 @@
+// RUN: mlir-opt %s -llvm-optimize-for-nvvm-target | FileCheck %s
+
+// CHECK-LABEL: llvm.func @fdiv_fp16
+llvm.func @fdiv_fp16(%arg0 : f16, %arg1 : f16) -> f16 {
+  // CHECK-DAG: %[[c0:.*]]  = llvm.mlir.constant(0 : ui32) : i32
+  // CHECK-DAG: %[[mask:.*]]= llvm.mlir.constant(2139095040 : ui32) : i32
+  // CHECK-DAG: %[[lhs:.*]] = llvm.fpext %arg0 : f16 to f32
+  // CHECK-DAG: %[[rhs:.*]] = llvm.fpext %arg1 : f16 to f32
+  // CHECK-DAG: %[[rcp:.*]] = nvvm.rcp.approx.ftz.f %[[rhs]] : f32
+  // CHECK-DAG: %[[approx:.*]]  = llvm.fmul %[[lhs]], %[[rcp]] : f32
+  // CHECK-DAG: %[[neg:.*]] = llvm.fneg %[[rhs]] : f32
+  // CHECK-DAG: %[[err:.*]] = "llvm.intr.fma"(%[[approx]], %[[neg]], %[[lhs]]) : (f32, f32, f32) -> f32
+  // CHECK-DAG: %[[refined:.*]] = "llvm.intr.fma"(%[[err]], %[[rcp]], %[[approx]]) : (f32, f32, f32) -> f32
+  // CHECK-DAG: %[[cast:.*]]= llvm.bitcast %[[approx]] : f32 to i32
+  // CHECK-DAG: %[[exp:.*]] = llvm.and %[[cast]], %[[mask]] : i32
+  // CHECK-DAG: %[[is_zero:.*]] = llvm.icmp "eq" %[[exp]], %[[c0]] : i32
+  // CHECK-DAG: %[[is_mask:.*]] = llvm.icmp "eq" %[[exp]], %[[mask]] : i32
+  // CHECK-DAG: %[[pred:.*]]= llvm.or %[[is_zero]], %[[is_mask]] : i1
+  // CHECK-DAG: %[[select:.*]]  = llvm.select %[[pred]], %[[approx]], %[[refined]] : i1, f32
+  // CHECK-DAG: %[[result:.*]]  = llvm.fptrunc %[[select]] : f32 to f16
+  %result = llvm.fdiv %arg0, %arg1 : f16
+  // CHECK: llvm.return %[[result]] : f16
+  llvm.return %result : f16
+}
Index: mlir/test/Dialect/LLVMIR/nvvm.mlir
===
--- mlir/test/Dialect/LLVMIR/nvvm.mlir
+++ mlir/test/Dialect/LLVMIR/nvvm.mlir
@@ -29,6 +29,13 @@
   llvm.return %0 : i32
 }
 
+// CHECK-LABEL: @nvvm_rcp
+func.func @nvvm_rcp(%arg0: f32) -> f32 {
+  // CHECK: nvvm.rcp.approx.ftz.f %arg0 : f32
+  %0 = nvvm.rcp.approx.ftz.f %arg0 : f32
+  llvm.return %0 : f32
+}
+
 // CHECK-LABEL: @llvm_nvvm_barrier0
 func.func @llvm_nvvm_barrier0() {
   // CHECK: nvvm.barrier0
Index: mlir/lib/Dialect/LLVMIR/Transforms/OptimizeForNVVM.cpp
===
--- /dev/null
+++ mlir/lib/Dialect/LLVMIR/Transforms/OptimizeForNVVM.cpp
@@ -0,0 +1,97 @@
+//===- OptimizeForNVVM.cpp - Optimize LLVM IR for NVVM -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#include "mlir/Dialect/LLVMIR/Transforms/OptimizeForNVVM.h"
+#include "PassDetail.h"
+#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+using namespace mlir;
+
+namespace {
+// Replaces fdiv on fp16 with fp32 multiplication with reciprocal plus one
+// (conditional) Newton iteration.
+//
+// This is as accurate as promoting the division to fp32 in the NVPTX backend, but

[PATCH] D126158: [MLIR][GPU] Replace fdiv on fp16 with promoted (fp32) multiplication with reciprocal plus one (conditional) Newton iteration.

2022-06-02 Thread Christian Sigg via Phabricator via cfe-commits
csigg updated this revision to Diff 433772.
csigg added a comment.

Rename pass.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D126158/new/

https://reviews.llvm.org/D126158

Files:
  mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
  mlir/include/mlir/Dialect/LLVMIR/Transforms/OptimizeForNVVM.h
  mlir/include/mlir/Dialect/LLVMIR/Transforms/Passes.h
  mlir/include/mlir/Dialect/LLVMIR/Transforms/Passes.td
  mlir/lib/Dialect/LLVMIR/Transforms/CMakeLists.txt
  mlir/lib/Dialect/LLVMIR/Transforms/OptimizeForNVVM.cpp
  mlir/test/Dialect/LLVMIR/nvvm-optimize.mlir
  mlir/test/Dialect/LLVMIR/nvvm.mlir
  mlir/test/Target/LLVMIR/nvvmir.mlir
  utils/bazel/llvm-project-overlay/mlir/BUILD.bazel

Index: utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
===
--- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -3379,7 +3379,9 @@
 ":IR",
 ":LLVMDialect",
 ":LLVMPassIncGen",
+":NVVMDialect",
 ":Pass",
+":Transforms",
 ],
 )
 
Index: mlir/test/Target/LLVMIR/nvvmir.mlir
===
--- mlir/test/Target/LLVMIR/nvvmir.mlir
+++ mlir/test/Target/LLVMIR/nvvmir.mlir
@@ -33,6 +33,13 @@
   llvm.return %1 : i32
 }
 
+// CHECK-LABEL: @nvvm_rcp
+llvm.func @nvvm_rcp(%0: f32) -> f32 {
+  // CHECK: call float @llvm.nvvm.rcp.approx.ftz.f
+  %1 = nvvm.rcp.approx.ftz.f %0 : f32
+  llvm.return %1 : f32
+}
+
 // CHECK-LABEL: @llvm_nvvm_barrier0
 llvm.func @llvm_nvvm_barrier0() {
   // CHECK: call void @llvm.nvvm.barrier0()
Index: mlir/test/Dialect/LLVMIR/nvvm.mlir
===
--- mlir/test/Dialect/LLVMIR/nvvm.mlir
+++ mlir/test/Dialect/LLVMIR/nvvm.mlir
@@ -29,6 +29,13 @@
   llvm.return %0 : i32
 }
 
+// CHECK-LABEL: @nvvm_rcp
+func.func @nvvm_rcp(%arg0: f32) -> f32 {
+  // CHECK: nvvm.rcp.approx.ftz.f %arg0 : f32
+  %0 = nvvm.rcp.approx.ftz.f %arg0 : f32
+  llvm.return %0 : f32
+}
+
 // CHECK-LABEL: @llvm_nvvm_barrier0
 func.func @llvm_nvvm_barrier0() {
   // CHECK: nvvm.barrier0
Index: mlir/test/Dialect/LLVMIR/nvvm-optimize.mlir
===
--- /dev/null
+++ mlir/test/Dialect/LLVMIR/nvvm-optimize.mlir
@@ -0,0 +1,24 @@
+// RUN: mlir-opt %s -llvm-optimize-for-nvvm-target | FileCheck %s
+
+// CHECK-LABEL: llvm.func @fdiv_fp16
+llvm.func @fdiv_fp16(%arg0 : f16, %arg1 : f16) -> f16 {
+  // CHECK-DAG: %[[c0:.*]]  = llvm.mlir.constant(0 : ui32) : i32
+  // CHECK-DAG: %[[mask:.*]]= llvm.mlir.constant(2139095040 : ui32) : i32
+  // CHECK-DAG: %[[lhs:.*]] = llvm.fpext %arg0 : f16 to f32
+  // CHECK-DAG: %[[rhs:.*]] = llvm.fpext %arg1 : f16 to f32
+  // CHECK-DAG: %[[rcp:.*]] = nvvm.rcp.approx.ftz.f %[[rhs]] : f32
+  // CHECK-DAG: %[[approx:.*]]  = llvm.fmul %[[lhs]], %[[rcp]] : f32
+  // CHECK-DAG: %[[neg:.*]] = llvm.fneg %[[rhs]] : f32
+  // CHECK-DAG: %[[err:.*]] = "llvm.intr.fma"(%[[approx]], %[[neg]], %[[lhs]]) : (f32, f32, f32) -> f32
+  // CHECK-DAG: %[[refined:.*]] = "llvm.intr.fma"(%[[err]], %[[rcp]], %[[approx]]) : (f32, f32, f32) -> f32
+  // CHECK-DAG: %[[cast:.*]]= llvm.bitcast %[[approx]] : f32 to i32
+  // CHECK-DAG: %[[exp:.*]] = llvm.and %[[cast]], %[[mask]] : i32
+  // CHECK-DAG: %[[is_zero:.*]] = llvm.icmp "eq" %[[exp]], %[[c0]] : i32
+  // CHECK-DAG: %[[is_mask:.*]] = llvm.icmp "eq" %[[exp]], %[[mask]] : i32
+  // CHECK-DAG: %[[pred:.*]]= llvm.or %[[is_zero]], %[[is_mask]] : i1
+  // CHECK-DAG: %[[select:.*]]  = llvm.select %[[pred]], %[[approx]], %[[refined]] : i1, f32
+  // CHECK-DAG: %[[result:.*]]  = llvm.fptrunc %[[select]] : f32 to f16
+  %result = llvm.fdiv %arg0, %arg1 : f16
+  // CHECK: llvm.return %[[result]] : f16
+  llvm.return %result : f16
+}
Index: mlir/lib/Dialect/LLVMIR/Transforms/OptimizeForNVVM.cpp
===
--- /dev/null
+++ mlir/lib/Dialect/LLVMIR/Transforms/OptimizeForNVVM.cpp
@@ -0,0 +1,100 @@
+//===- OptimizeNVVM.cpp - Optimize NVVM IR -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#include "PassDetail.h"
+#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
+#include "mlir/Dialect/LLVMIR/Transforms/OptimizeNVVM.h"
+#include "mlir/IR/Block.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+using namespace mlir;
+
+namespace {
+// Replaces fdiv on fp16 with fp32 multiplication with reciprocal plus one
+// (conditional) Newton iteration.
+//
+// This is as accurate as promoting the 

[PATCH] D126158: [MLIR][GPU] Replace fdiv on fp16 with promoted (fp32) multiplication with reciprocal plus one (conditional) Newton iteration.

2022-05-31 Thread Stephan Herhut via Phabricator via cfe-commits
herhut accepted this revision.
herhut added a comment.
This revision is now accepted and ready to land.

Separate pass works for me.




Comment at: mlir/include/mlir/Dialect/LLVMIR/Transforms/Passes.td:19
 
+def NVVMOptimize : Pass<"nvvm-optimize"> {
+  let summary = "Optimize NVVM IR";

Maybe `llvm-optimize-for-nvvm`? Or even `llvm-optimize-for-nvvm-target`? 

This does not really optimize `nvvm` but rewrites `llvm` IR. 


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D126158/new/

https://reviews.llvm.org/D126158



[PATCH] D126158: [MLIR][GPU] Replace fdiv on fp16 with promoted (fp32) multiplication with reciprocal plus one (conditional) Newton iteration.

2022-05-30 Thread Christian Sigg via Phabricator via cfe-commits
csigg updated this revision to Diff 432894.
csigg added a comment.

Fix.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D126158/new/

https://reviews.llvm.org/D126158

Files:
  mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
  mlir/include/mlir/Dialect/LLVMIR/Transforms/OptimizeNVVM.h
  mlir/include/mlir/Dialect/LLVMIR/Transforms/Passes.h
  mlir/include/mlir/Dialect/LLVMIR/Transforms/Passes.td
  mlir/lib/Dialect/LLVMIR/Transforms/CMakeLists.txt
  mlir/lib/Dialect/LLVMIR/Transforms/OptimizeNVVM.cpp
  mlir/test/Dialect/LLVMIR/nvvm-optimize.mlir
  mlir/test/Dialect/LLVMIR/nvvm.mlir
  mlir/test/Target/LLVMIR/nvvmir.mlir
  utils/bazel/llvm-project-overlay/mlir/BUILD.bazel

Index: utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
===
--- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -3374,7 +3374,9 @@
 ":IR",
 ":LLVMDialect",
 ":LLVMPassIncGen",
+":NVVMDialect",
 ":Pass",
+":Transforms",
 ],
 )
 
Index: mlir/test/Target/LLVMIR/nvvmir.mlir
===
--- mlir/test/Target/LLVMIR/nvvmir.mlir
+++ mlir/test/Target/LLVMIR/nvvmir.mlir
@@ -33,6 +33,13 @@
   llvm.return %1 : i32
 }
 
+// CHECK-LABEL: @nvvm_rcp
+llvm.func @nvvm_rcp(%0: f32) -> f32 {
+  // CHECK: call float @llvm.nvvm.rcp.approx.ftz.f
+  %1 = nvvm.rcp.approx.ftz.f %0 : f32
+  llvm.return %1 : f32
+}
+
 // CHECK-LABEL: @llvm_nvvm_barrier0
 llvm.func @llvm_nvvm_barrier0() {
   // CHECK: call void @llvm.nvvm.barrier0()
Index: mlir/test/Dialect/LLVMIR/nvvm.mlir
===
--- mlir/test/Dialect/LLVMIR/nvvm.mlir
+++ mlir/test/Dialect/LLVMIR/nvvm.mlir
@@ -29,6 +29,13 @@
   llvm.return %0 : i32
 }
 
+// CHECK-LABEL: @nvvm_rcp
+func.func @nvvm_rcp(%arg0: f32) -> f32 {
+  // CHECK: nvvm.rcp.approx.ftz.f %arg0 : f32
+  %0 = nvvm.rcp.approx.ftz.f %arg0 : f32
+  llvm.return %0 : f32
+}
+
 // CHECK-LABEL: @llvm_nvvm_barrier0
 func.func @llvm_nvvm_barrier0() {
   // CHECK: nvvm.barrier0
Index: mlir/test/Dialect/LLVMIR/nvvm-optimize.mlir
===
--- /dev/null
+++ mlir/test/Dialect/LLVMIR/nvvm-optimize.mlir
@@ -0,0 +1,24 @@
+// RUN: mlir-opt %s -nvvm-optimize | FileCheck %s
+
+// CHECK-LABEL: llvm.func @fdiv_fp16
+llvm.func @fdiv_fp16(%arg0 : f16, %arg1 : f16) -> f16 {
+  // CHECK-DAG: %[[c0:.*]]  = llvm.mlir.constant(0 : ui32) : i32
+  // CHECK-DAG: %[[mask:.*]]= llvm.mlir.constant(2139095040 : ui32) : i32
+  // CHECK-DAG: %[[lhs:.*]] = llvm.fpext %arg0 : f16 to f32
+  // CHECK-DAG: %[[rhs:.*]] = llvm.fpext %arg1 : f16 to f32
+  // CHECK-DAG: %[[rcp:.*]] = nvvm.rcp.approx.ftz.f %[[rhs]] : f32
+  // CHECK-DAG: %[[approx:.*]]  = llvm.fmul %[[lhs]], %[[rcp]] : f32
+  // CHECK-DAG: %[[neg:.*]] = llvm.fneg %[[rhs]] : f32
+  // CHECK-DAG: %[[err:.*]] = "llvm.intr.fma"(%[[approx]], %[[neg]], %[[lhs]]) : (f32, f32, f32) -> f32
+  // CHECK-DAG: %[[refined:.*]] = "llvm.intr.fma"(%[[err]], %[[rcp]], %[[approx]]) : (f32, f32, f32) -> f32
+  // CHECK-DAG: %[[cast:.*]]= llvm.bitcast %[[approx]] : f32 to i32
+  // CHECK-DAG: %[[exp:.*]] = llvm.and %[[cast]], %[[mask]] : i32
+  // CHECK-DAG: %[[is_zero:.*]] = llvm.icmp "eq" %[[exp]], %[[c0]] : i32
+  // CHECK-DAG: %[[is_mask:.*]] = llvm.icmp "eq" %[[exp]], %[[mask]] : i32
+  // CHECK-DAG: %[[pred:.*]]= llvm.or %[[is_zero]], %[[is_mask]] : i1
+  // CHECK-DAG: %[[select:.*]]  = llvm.select %[[pred]], %[[approx]], %[[refined]] : i1, f32
+  // CHECK-DAG: %[[result:.*]]  = llvm.fptrunc %[[select]] : f32 to f16
+  %result = llvm.fdiv %arg0, %arg1 : f16
+  // CHECK: llvm.return %[[result]] : f16
+  llvm.return %result : f16
+}
Index: mlir/lib/Dialect/LLVMIR/Transforms/OptimizeNVVM.cpp
===
--- /dev/null
+++ mlir/lib/Dialect/LLVMIR/Transforms/OptimizeNVVM.cpp
@@ -0,0 +1,99 @@
+//===- OptimizeNVVM.cpp - Optimize NVVM IR -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#include "mlir/Dialect/LLVMIR/Transforms/OptimizeNVVM.h"
+#include "PassDetail.h"
+#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
+#include "mlir/IR/Block.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+using namespace mlir;
+
+namespace {
+// Replaces fdiv on fp16 with fp32 multiplication with reciprocal plus one
+// (conditional) Newton iteration.
+//
+// This is as accurate as promoting the division to fp32 in the NVPTX 

[PATCH] D126158: [MLIR][GPU] Replace fdiv on fp16 with promoted (fp32) multiplication with reciprocal plus one (conditional) Newton iteration.

2022-05-30 Thread Christian Sigg via Phabricator via cfe-commits
csigg updated this revision to Diff 432881.
csigg added a comment.

Rebase


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D126158/new/

https://reviews.llvm.org/D126158

Files:
  mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
  mlir/include/mlir/Dialect/LLVMIR/Transforms/OptimizeNVVM.h
  mlir/include/mlir/Dialect/LLVMIR/Transforms/Passes.h
  mlir/include/mlir/Dialect/LLVMIR/Transforms/Passes.td
  mlir/lib/Dialect/LLVMIR/Transforms/CMakeLists.txt
  mlir/lib/Dialect/LLVMIR/Transforms/OptimizeNVVM.cpp
  mlir/test/Dialect/LLVMIR/nvvm-optimize.mlir
  mlir/test/Dialect/LLVMIR/nvvm.mlir
  mlir/test/Target/LLVMIR/nvvmir.mlir
  utils/bazel/llvm-project-overlay/mlir/BUILD.bazel

Index: utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
===
--- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -3374,7 +3374,9 @@
 ":IR",
 ":LLVMDialect",
 ":LLVMPassIncGen",
+":NVVMDialect",
 ":Pass",
+":Transforms",
 ],
 )
 
Index: mlir/test/Target/LLVMIR/nvvmir.mlir
===
--- mlir/test/Target/LLVMIR/nvvmir.mlir
+++ mlir/test/Target/LLVMIR/nvvmir.mlir
@@ -33,6 +33,13 @@
   llvm.return %1 : i32
 }
 
+// CHECK-LABEL: @nvvm_rcp
+llvm.func @nvvm_rcp(%0: f32) -> f32 {
+  // CHECK: call float @llvm.nvvm.rcp.approx.ftz.f
+  %1 = nvvm.rcp.approx.ftz.f %0 : f32
+  llvm.return %1 : f32
+}
+
 // CHECK-LABEL: @llvm_nvvm_barrier0
 llvm.func @llvm_nvvm_barrier0() {
   // CHECK: call void @llvm.nvvm.barrier0()
Index: mlir/test/Dialect/LLVMIR/nvvm.mlir
===
--- mlir/test/Dialect/LLVMIR/nvvm.mlir
+++ mlir/test/Dialect/LLVMIR/nvvm.mlir
@@ -29,8 +29,15 @@
   llvm.return %0 : i32
 }
 
+// CHECK-LABEL: @nvvm_rcp
+func.func @nvvm_rcp(%arg0: f32) -> f32 {
+  // CHECK: nvvm.rcp.approx.ftz.f %arg0 : f32
+  %0 = nvvm.rcp.approx.ftz.f %arg0 : f32
+  llvm.return %0 : f32
+}
+
 // CHECK-LABEL: @llvm_nvvm_barrier0
-func.func @llvm_nvvm_barrier0() {
+func.func @llvm.nvvm.barrier0() {
   // CHECK: nvvm.barrier0
   nvvm.barrier0
   llvm.return
Index: mlir/test/Dialect/LLVMIR/nvvm-optimize.mlir
===
--- /dev/null
+++ mlir/test/Dialect/LLVMIR/nvvm-optimize.mlir
@@ -0,0 +1,24 @@
+// RUN: mlir-opt %s -nvvm-optimize | FileCheck %s
+
+// CHECK-LABEL: llvm.func @fdiv_fp16
+llvm.func @fdiv_fp16(%arg0 : f16, %arg1 : f16) -> f16 {
+  // CHECK-DAG: %[[c0:.*]]  = llvm.mlir.constant(0 : ui32) : i32
+  // CHECK-DAG: %[[mask:.*]]= llvm.mlir.constant(2139095040 : ui32) : i32
+  // CHECK-DAG: %[[lhs:.*]] = llvm.fpext %arg0 : f16 to f32
+  // CHECK-DAG: %[[rhs:.*]] = llvm.fpext %arg1 : f16 to f32
+  // CHECK-DAG: %[[rcp:.*]] = nvvm.rcp.approx.ftz.f %[[rhs]] : f32
+  // CHECK-DAG: %[[approx:.*]]  = llvm.fmul %[[lhs]], %[[rcp]] : f32
+  // CHECK-DAG: %[[neg:.*]] = llvm.fneg %[[rhs]] : f32
+  // CHECK-DAG: %[[err:.*]] = "llvm.intr.fma"(%[[approx]], %[[neg]], %[[lhs]]) : (f32, f32, f32) -> f32
+  // CHECK-DAG: %[[refined:.*]] = "llvm.intr.fma"(%[[err]], %[[rcp]], %[[approx]]) : (f32, f32, f32) -> f32
+  // CHECK-DAG: %[[cast:.*]]= llvm.bitcast %[[approx]] : f32 to i32
+  // CHECK-DAG: %[[exp:.*]] = llvm.and %[[cast]], %[[mask]] : i32
+  // CHECK-DAG: %[[is_zero:.*]] = llvm.icmp "eq" %[[exp]], %[[c0]] : i32
+  // CHECK-DAG: %[[is_mask:.*]] = llvm.icmp "eq" %[[exp]], %[[mask]] : i32
+  // CHECK-DAG: %[[pred:.*]]= llvm.or %[[is_zero]], %[[is_mask]] : i1
+  // CHECK-DAG: %[[select:.*]]  = llvm.select %[[pred]], %[[approx]], %[[refined]] : i1, f32
+  // CHECK-DAG: %[[result:.*]]  = llvm.fptrunc %[[select]] : f32 to f16
+  %result = llvm.fdiv %arg0, %arg1 : f16
+  // CHECK: llvm.return %[[result]] : f16
+  llvm.return %result : f16
+}
Index: mlir/lib/Dialect/LLVMIR/Transforms/OptimizeNVVM.cpp
===
--- /dev/null
+++ mlir/lib/Dialect/LLVMIR/Transforms/OptimizeNVVM.cpp
@@ -0,0 +1,99 @@
+//===- OptimizeNVVM.cpp - Optimize NVVM IR -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#include "mlir/Dialect/LLVMIR/Transforms/OptimizeNVVM.h"
+#include "PassDetail.h"
+#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
+#include "mlir/IR/Block.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+using namespace mlir;
+
+namespace {
+// Replaces fdiv on fp16 with fp32 multiplication with reciprocal plus one
+// (conditional) Newton iteration.
+//
+// 

[PATCH] D126158: [MLIR][GPU] Replace fdiv on fp16 with promoted (fp32) multiplication with reciprocal plus one (conditional) Newton iteration.

2022-05-30 Thread Christian Sigg via Phabricator via cfe-commits
csigg updated this revision to Diff 432880.
csigg added a comment.
Herald added a subscriber: mgorny.

Make fdiv rewrite an NVVM transform pass instead.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D126158/new/

https://reviews.llvm.org/D126158

Files:
  mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
  mlir/include/mlir/Dialect/LLVMIR/Transforms/OptimizeNVVM.h
  mlir/include/mlir/Dialect/LLVMIR/Transforms/Passes.h
  mlir/include/mlir/Dialect/LLVMIR/Transforms/Passes.td
  mlir/lib/Dialect/LLVMIR/Transforms/CMakeLists.txt
  mlir/lib/Dialect/LLVMIR/Transforms/OptimizeNVVM.cpp
  mlir/test/Dialect/LLVMIR/nvvm-optimize.mlir
  mlir/test/Dialect/LLVMIR/nvvm.mlir
  mlir/test/Target/LLVMIR/nvvmir.mlir
  utils/bazel/llvm-project-overlay/mlir/BUILD.bazel

Index: utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
===
--- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -3373,7 +3373,9 @@
 ":IR",
 ":LLVMDialect",
 ":LLVMPassIncGen",
+":NVVMDialect",
 ":Pass",
+":Transforms",
 ],
 )
 
Index: mlir/test/Target/LLVMIR/nvvmir.mlir
===
--- mlir/test/Target/LLVMIR/nvvmir.mlir
+++ mlir/test/Target/LLVMIR/nvvmir.mlir
@@ -32,6 +32,14 @@
   llvm.return %1 : i32
 }
 
+// CHECK-LABEL: @nvvm_rcp
+llvm.func @nvvm_rcp(%0: f32) -> f32 {
+  // CHECK: call float @llvm.nvvm.rcp.approx.ftz.f
+  %1 = nvvm.rcp.approx.ftz.f %0 : f32
+  llvm.return %1 : f32
+}
+
+// CHECK-LABEL: @llvm_nvvm_barrier0
 llvm.func @llvm_nvvm_barrier0() {
   // CHECK: call void @llvm.nvvm.barrier0()
   nvvm.barrier0
Index: mlir/test/Dialect/LLVMIR/nvvm.mlir
===
--- mlir/test/Dialect/LLVMIR/nvvm.mlir
+++ mlir/test/Dialect/LLVMIR/nvvm.mlir
@@ -28,6 +28,14 @@
   llvm.return %0 : i32
 }
 
+// CHECK-LABEL: @nvvm_rcp
+func.func @nvvm_rcp(%arg0: f32) -> f32 {
+  // CHECK: nvvm.rcp.approx.ftz.f %arg0 : f32
+  %0 = nvvm.rcp.approx.ftz.f %arg0 : f32
+  llvm.return %0 : f32
+}
+
+// CHECK-LABEL: @llvm_nvvm_barrier0
 func.func @llvm.nvvm.barrier0() {
   // CHECK: nvvm.barrier0
   nvvm.barrier0
Index: mlir/test/Dialect/LLVMIR/nvvm-optimize.mlir
===
--- /dev/null
+++ mlir/test/Dialect/LLVMIR/nvvm-optimize.mlir
@@ -0,0 +1,24 @@
+// RUN: mlir-opt %s -nvvm-optimize | FileCheck %s
+
+// CHECK-LABEL: llvm.func @fdiv_fp16
+llvm.func @fdiv_fp16(%arg0 : f16, %arg1 : f16) -> f16 {
+  // CHECK-DAG: %[[c0:.*]]  = llvm.mlir.constant(0 : ui32) : i32
+  // CHECK-DAG: %[[mask:.*]]= llvm.mlir.constant(2139095040 : ui32) : i32
+  // CHECK-DAG: %[[lhs:.*]] = llvm.fpext %arg0 : f16 to f32
+  // CHECK-DAG: %[[rhs:.*]] = llvm.fpext %arg1 : f16 to f32
+  // CHECK-DAG: %[[rcp:.*]] = nvvm.rcp.approx.ftz.f %[[rhs]] : f32
+  // CHECK-DAG: %[[approx:.*]]  = llvm.fmul %[[lhs]], %[[rcp]] : f32
+  // CHECK-DAG: %[[neg:.*]] = llvm.fneg %[[rhs]] : f32
+  // CHECK-DAG: %[[err:.*]] = "llvm.intr.fma"(%[[approx]], %[[neg]], %[[lhs]]) : (f32, f32, f32) -> f32
+  // CHECK-DAG: %[[refined:.*]] = "llvm.intr.fma"(%[[err]], %[[rcp]], %[[approx]]) : (f32, f32, f32) -> f32
+  // CHECK-DAG: %[[cast:.*]]= llvm.bitcast %[[approx]] : f32 to i32
+  // CHECK-DAG: %[[exp:.*]] = llvm.and %[[cast]], %[[mask]] : i32
+  // CHECK-DAG: %[[is_zero:.*]] = llvm.icmp "eq" %[[exp]], %[[c0]] : i32
+  // CHECK-DAG: %[[is_mask:.*]] = llvm.icmp "eq" %[[exp]], %[[mask]] : i32
+  // CHECK-DAG: %[[pred:.*]]= llvm.or %[[is_zero]], %[[is_mask]] : i1
+  // CHECK-DAG: %[[select:.*]]  = llvm.select %[[pred]], %[[approx]], %[[refined]] : i1, f32
+  // CHECK-DAG: %[[result:.*]]  = llvm.fptrunc %[[select]] : f32 to f16
+  %result = llvm.fdiv %arg0, %arg1 : f16
+  // CHECK: llvm.return %[[result]] : f16
+  llvm.return %result : f16
+}
Index: mlir/lib/Dialect/LLVMIR/Transforms/OptimizeNVVM.cpp
===
--- /dev/null
+++ mlir/lib/Dialect/LLVMIR/Transforms/OptimizeNVVM.cpp
@@ -0,0 +1,99 @@
+//===- OptimizeNVVM.cpp - Optimize NVVM IR -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#include "mlir/Dialect/LLVMIR/Transforms/OptimizeNVVM.h"
+#include "PassDetail.h"
+#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
+#include "mlir/IR/Block.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+using namespace mlir;
+
+namespace {
+// Replaces fdiv on fp16 with fp32 multiplication with reciprocal plus one

[PATCH] D126158: [MLIR][GPU] Replace fdiv on fp16 with promoted (fp32) multiplication with reciprocal plus one (conditional) Newton iteration.

2022-05-30 Thread Christian Sigg via Phabricator via cfe-commits
csigg updated this revision to Diff 432871.
csigg added a comment.

Rebase.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D126158/new/

https://reviews.llvm.org/D126158

Files:
  mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
  mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
  mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
  mlir/test/Dialect/LLVMIR/nvvm.mlir
  mlir/test/Target/LLVMIR/nvvmir.mlir

Index: mlir/test/Target/LLVMIR/nvvmir.mlir
===
--- mlir/test/Target/LLVMIR/nvvmir.mlir
+++ mlir/test/Target/LLVMIR/nvvmir.mlir
@@ -33,6 +33,13 @@
   llvm.return %1 : i32
 }
 
+// CHECK-LABEL: @nvvm_rcp
+llvm.func @nvvm_rcp(%0: f32) -> f32 {
+  // CHECK: call float @llvm.nvvm.rcp.approx.ftz.f
+  %1 = nvvm.rcp.approx.ftz.f %0 : f32
+  llvm.return %1 : f32
+}
+
 // CHECK-LABEL: @llvm_nvvm_barrier0
 llvm.func @llvm_nvvm_barrier0() {
   // CHECK: call void @llvm.nvvm.barrier0()
Index: mlir/test/Dialect/LLVMIR/nvvm.mlir
===
--- mlir/test/Dialect/LLVMIR/nvvm.mlir
+++ mlir/test/Dialect/LLVMIR/nvvm.mlir
@@ -29,6 +29,13 @@
   llvm.return %0 : i32
 }
 
+// CHECK-LABEL: @nvvm_rcp
+func.func @nvvm_rcp(%arg0: f32) -> f32 {
+  // CHECK: nvvm.rcp.approx.ftz.f %arg0 : f32
+  %0 = nvvm.rcp.approx.ftz.f %arg0 : f32
+  llvm.return %0 : f32
+}
+
 // CHECK-LABEL: @llvm_nvvm_barrier0
 func.func @llvm_nvvm_barrier0() {
   // CHECK: nvvm.barrier0
Index: mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
===
--- mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
+++ mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
@@ -488,3 +488,30 @@
   }
 }
 
+// -
+
+gpu.module @test_module {
+  // CHECK-LABEL: func @gpu_divf_fp16
+  func.func @gpu_divf_fp16(%arg0 : f16, %arg1 : f16) -> f16 {
+// CHECK: %[[lhs:.*]] = llvm.fpext %arg0 : f16 to f32
+// CHECK: %[[rhs:.*]] = llvm.fpext %arg1 : f16 to f32
+// CHECK: %[[rcp:.*]] = nvvm.rcp.approx.ftz.f %1 : f32
+// CHECK: %[[approx:.*]]  = llvm.fmul %[[lhs]], %[[rcp]] : f32
+// CHECK: %[[neg:.*]] = llvm.fneg %[[rhs]] : f32
+// CHECK: %[[err:.*]] = "llvm.intr.fma"(%[[approx]], %[[neg]], %[[lhs]]) : (f32, f32, f32) -> f32
+// CHECK: %[[refined:.*]] = "llvm.intr.fma"(%[[err]], %[[rcp]], %[[approx]]) : (f32, f32, f32) -> f32
+// CHECK: %[[mask:.*]]= llvm.mlir.constant(2139095040 : ui32) : i32
+// CHECK: %[[cast:.*]]= llvm.bitcast %[[approx]] : f32 to i32
+// CHECK: %[[exp:.*]] = llvm.and %[[cast]], %[[mask]] : i32
+// CHECK: %[[c0:.*]]  = llvm.mlir.constant(0 : ui32) : i32
+// CHECK: %[[is_zero:.*]] = llvm.icmp "eq" %[[exp]], %[[c0]] : i32
+// CHECK: %[[is_mask:.*]] = llvm.icmp "eq" %[[exp]], %[[mask]] : i32
+// CHECK: %[[pred:.*]]= llvm.or %[[is_zero]], %[[is_mask]] : i1
+// CHECK: %[[select:.*]]  = llvm.select %[[pred]], %[[approx]], %[[refined]] : i1, f32
+// CHECK: %[[result:.*]]  = llvm.fptrunc %[[select]] : f32 to f16
+%result = arith.divf %arg0, %arg1 : f16
+// CHECK: llvm.return %[[result]] : f16
+func.return %result : f16
+  }
+}
+
Index: mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
===
--- mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
+++ mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
@@ -148,6 +148,62 @@
   }
 };
 
+// Replaces fdiv on fp16 with fp32 multiplication with reciprocal plus one
+// (conditional) Newton iteration.
+//
+// This is as accurate as promoting the division to fp32 in the NVPTX backend, but
+// faster because it performs fewer Newton iterations, avoids the slow path
+// for e.g. denormals, and allows reuse of the reciprocal for multiple divisions
+// by the same divisor.
+struct ExpandDivF16 : public ConvertOpToLLVMPattern<LLVM::FDivOp> {
+  using ConvertOpToLLVMPattern<LLVM::FDivOp>::ConvertOpToLLVMPattern;
+
+private:
+  LogicalResult
+  matchAndRewrite(LLVM::FDivOp op, LLVM::FDivOp::Adaptor adaptor,
+  ConversionPatternRewriter &rewriter) const override {
+if (!op.getType().isF16())
+  return rewriter.notifyMatchFailure(op, "not f16");
+Location loc = op.getLoc();
+
+Type f32Type = rewriter.getF32Type();
+Type i32Type = rewriter.getI32Type();
+
+// Extend lhs and rhs to fp32.
+Value lhs = rewriter.create<LLVM::FPExtOp>(loc, f32Type, adaptor.getLhs());
+Value rhs = rewriter.create<LLVM::FPExtOp>(loc, f32Type, adaptor.getRhs());
+
+// float rcp = rcp.approx.ftz.f32(rhs), approx = lhs * rcp.
+Value rcp = rewriter.create<NVVM::RcpApproxFtzF32Op>(loc, f32Type, rhs);
+Value approx = rewriter.create<LLVM::FMulOp>(loc, lhs, rcp);
+
+// Refine the approximation with one Newton iteration:
+// float refined = approx + (lhs - approx * rhs) * rcp;
+Value err = rewriter.create<LLVM::FMAOp>(
+loc, approx, rewriter.create<LLVM::FNegOp>(loc, rhs), lhs);
+Value refined = rewriter.create<LLVM::FMAOp>(loc, err, rcp, 

[PATCH] D126158: [MLIR][GPU] Replace fdiv on fp16 with promoted (fp32) multiplication with reciprocal plus one (conditional) Newton iteration.

2022-05-30 Thread Christian Sigg via Phabricator via cfe-commits
csigg added inline comments.



Comment at: mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp:158
+// by the same divisor.
+struct ExpandDivF16 : public ConvertOpToLLVMPattern<LLVM::FDivOp> {
+  using ConvertOpToLLVMPattern<LLVM::FDivOp>::ConvertOpToLLVMPattern;

herhut wrote:
> This pattern is a bit misplaced here, as `LLVM::FDivOp` is not really a GPU 
> dialect operation. Instead, should this be a special lowering of the arith 
> dialect to NVVM (which we do not have yet) or a rewrite at the LLVM dialect 
> level?
> 
> When lowering to LLVM, we already typically configure a different lowering 
> for math dialect, so configuring the lowering of arith dialect differently 
> seems like an OK option. That would mean a specialized pattern for 
> `arith.divf` with higher priority. That would also give users a choice.
Yes, I agree it's a bit misplaced. I considered it the best of all questionable 
options.

Adding it to ArithToLLVM doesn't really work, because we don't want it to 
depend on the NVVM dialect.

How about adding it as a separate pass to 
`mlir/include/mlir/Dialect/LLVMIR/Transforms/Passes.td`?
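
For concreteness, a rough sketch of the C++ side of such a pass (class and pass
names here are placeholders, not necessarily what the final patch will use); it
would simply run the fdiv rewrite pattern greedily over the operation it is
scheduled on:

  #include "mlir/Dialect/LLVMIR/NVVMDialect.h"
  #include "mlir/IR/PatternMatch.h"
  #include "mlir/Pass/Pass.h"
  #include "mlir/Transforms/GreedyPatternRewriteDriver.h"

  namespace {
  // Placeholder pass wrapper; the real pass would be declared in Passes.td and
  // generated through the usual *PassIncGen machinery.
  struct OptimizeForNVVMSketch
      : public mlir::PassWrapper<OptimizeForNVVMSketch, mlir::OperationPass<>> {
    void getDependentDialects(mlir::DialectRegistry &registry) const override {
      registry.insert<mlir::NVVM::NVVMDialect>();
    }
    void runOnOperation() override {
      mlir::RewritePatternSet patterns(&getContext());
      // The fp16 fdiv expansion pattern from this review would be added here,
      // e.g. patterns.add<ExpandDivF16>(&getContext());
      if (mlir::failed(mlir::applyPatternsAndFoldGreedily(getOperation(),
                                                          std::move(patterns))))
        signalPassFailure();
    }
  };
  } // namespace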


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D126158/new/

https://reviews.llvm.org/D126158



[PATCH] D126158: [MLIR][GPU] Replace fdiv on fp16 with promoted (fp32) multiplication with reciprocal plus one (conditional) Newton iteration.

2022-05-25 Thread Christian Sigg via Phabricator via cfe-commits
csigg updated this revision to Diff 432074.
csigg added a comment.

Rebase.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D126158/new/

https://reviews.llvm.org/D126158

Files:
  mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
  mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
  mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
  mlir/test/Dialect/LLVMIR/nvvm.mlir
  mlir/test/Target/LLVMIR/nvvmir.mlir

Index: mlir/test/Target/LLVMIR/nvvmir.mlir
===
--- mlir/test/Target/LLVMIR/nvvmir.mlir
+++ mlir/test/Target/LLVMIR/nvvmir.mlir
@@ -1,5 +1,6 @@
 // RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
 
+// CHECK-LABEL: @nvvm_special_regs
 llvm.func @nvvm_special_regs() -> i32 {
   // CHECK: %1 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
   %1 = nvvm.read.ptx.sreg.tid.x : i32
@@ -32,12 +33,21 @@
   llvm.return %1 : i32
 }
 
+// CHECK-LABEL: @nvvm_rcp
+llvm.func @nvvm_rcp(%0: f32) -> f32 {
+  // CHECK: call float @llvm.nvvm.rcp.approx.ftz.f
+  %1 = nvvm.rcp.approx.ftz.f %0 : f32
+  llvm.return %1 : f32
+}
+
+// CHECK-LABEL: @llvm_nvvm_barrier0
 llvm.func @llvm_nvvm_barrier0() {
   // CHECK: call void @llvm.nvvm.barrier0()
   nvvm.barrier0
   llvm.return
 }
 
+// CHECK-LABEL: @nvvm_shfl
 llvm.func @nvvm_shfl(
 %0 : i32, %1 : i32, %2 : i32,
 %3 : i32, %4 : f32) -> i32 {
@@ -60,6 +70,7 @@
   llvm.return %6 : i32
 }
 
+// CHECK-LABEL: @nvvm_shfl_pred
 llvm.func @nvvm_shfl_pred(
 %0 : i32, %1 : i32, %2 : i32,
 %3 : i32, %4 : f32) -> !llvm.struct<(i32, i1)> {
@@ -82,6 +93,7 @@
   llvm.return %6 : !llvm.struct<(i32, i1)>
 }
 
+// CHECK-LABEL: @nvvm_vote
 llvm.func @nvvm_vote(%0 : i32, %1 : i1) -> i32 {
   // CHECK: call i32 @llvm.nvvm.vote.ballot.sync(i32 %{{.*}}, i1 %{{.*}})
   %3 = nvvm.vote.ballot.sync %0, %1 : i32
@@ -99,6 +111,7 @@
   llvm.return %0 : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
 }
 
+// CHECK-LABEL: @nvvm_mma_m16n8k16_f16_f16
 llvm.func @nvvm_mma_m16n8k16_f16_f16(%a0 : vector<2xf16>, %a1 : vector<2xf16>,
 %a2 : vector<2xf16>, %a3 : vector<2xf16>,
 %b0 : vector<2xf16>, %b1 : vector<2xf16>,
@@ -111,6 +124,7 @@
 }
 
 // f32 return type, f16 accumulate type
+// CHECK-LABEL: @nvvm_mma_m16n8k16_f32_f16
 llvm.func @nvvm_mma_m16n8k16_f32_f16(%a0 : vector<2xf16>, %a1 : vector<2xf16>,
 %a2 : vector<2xf16>, %a3 : vector<2xf16>,
 %b0 : vector<2xf16>, %b1 : vector<2xf16>,
@@ -123,6 +137,7 @@
 }
 
 // f16 return type, f32 accumulate type
+// CHECK-LABEL: @nvvm_mma_m16n8k16_f16_f32
 llvm.func @nvvm_mma_m16n8k16_f16_f32(%a0 : vector<2xf16>, %a1 : vector<2xf16>,
 %a2 : vector<2xf16>, %a3 : vector<2xf16>,
 %b0 : vector<2xf16>, %b1 : vector<2xf16>,
@@ -135,6 +150,7 @@
 }
 
 // f32 return type, f32 accumulate type
+// CHECK-LABEL: @nvvm_mma_m16n8k16_f32_f32
 llvm.func @nvvm_mma_m16n8k16_f32_f32(%a0 : vector<2xf16>, %a1 : vector<2xf16>,
 %a2 : vector<2xf16>, %a3 : vector<2xf16>,
 %b0 : vector<2xf16>, %b1 : vector<2xf16>,
@@ -146,7 +162,8 @@
   llvm.return %0 : !llvm.struct<(f32, f32, f32, f32)>
 }
 
-llvm.func @nvvm_mma_m16n8k16_s8_s8(%a0 : i32, %a1 : i32,
+// CHECK-LABEL: @nvvm_mma_m16n8k16_s8_s8
+llvm.func @nvvm_mma_m16n8k16_s8_s8(%a0 : i32, %a1 : i32,
 %b0 : i32, 
 %c0 : i32, %c1 : i32, %c2 : i32, %c3 : i32) -> !llvm.struct<(i32, i32, i32, i32)> {
   // CHECK: call { i32, i32, i32, i32 } @llvm.nvvm.mma.m16n8k16.row.col.s8
@@ -158,7 +175,8 @@
   llvm.return %0 : !llvm.struct<(i32,i32,i32,i32)>
 }
 
-llvm.func @nvvm_mma_m16n8k16_s8_u8(%a0 : i32, %a1 : i32,
+// CHECK-LABEL: @nvvm_mma_m16n8k16_s8_u8
+llvm.func @nvvm_mma_m16n8k16_s8_u8(%a0 : i32, %a1 : i32,
 %b0 : i32, 
 %c0 : i32, %c1 : i32, %c2 : i32, %c3 : i32) -> !llvm.struct<(i32, i32, i32, i32)> {
   // CHECK: call { i32, i32, i32, i32 } @llvm.nvvm.mma.m16n8k16.row.col.satfinite.s8.u8
@@ -170,7 +188,8 @@
   llvm.return %0 : !llvm.struct<(i32,i32,i32,i32)>
 }
 
-llvm.func @nvvm_mma_m16n8k128_b1_b1(%a0 : i32, %a1 : i32, 
+// CHECK-LABEL: @nvvm_mma_m16n8k128_b1_b1
+llvm.func @nvvm_mma_m16n8k128_b1_b1(%a0 : i32, %a1 : i32,
 %b0 : i32,
 %c0 : i32, %c1 : i32, %c2 : i32, %c3 : i32) -> !llvm.struct<(i32,i32,i32,i32)> {  
   // CHECK: call { i32, i32, i32, i32 } @llvm.nvvm.mma.xor.popc.m16n8k128.row.col.b1
@@ -181,6 +200,7 @@
   llvm.return %0 : !llvm.struct<(i32,i32,i32,i32)>
 }
 
+// CHECK-LABEL: @nvvm_mma_m16n8k32_s4_s4
 llvm.func @nvvm_mma_m16n8k32_s4_s4(%a0 : i32, %a1 : i32,
%b0 : i32,
 

[PATCH] D126158: [MLIR][GPU] Replace fdiv on fp16 with promoted (fp32) multiplication with reciprocal plus one (conditional) Newton iteration.

2022-05-25 Thread Stephan Herhut via Phabricator via cfe-commits
herhut added inline comments.



Comment at: mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp:158
+// by the same divisor.
+struct ExpandDivF16 : public ConvertOpToLLVMPattern<LLVM::FDivOp> {
+  using ConvertOpToLLVMPattern<LLVM::FDivOp>::ConvertOpToLLVMPattern;

This pattern is a bit misplaced here, as `LLVM::FDivOp` is not really a GPU 
dialect operation. Instead, should this be a special lowering of the arith 
dialect to NVVM (which we do not have yet) or a rewrite at the LLVM dialect 
level?

When lowering to LLVM, we already typically configure a different lowering for 
math dialect, so configuring the lowering of arith dialect differently seems 
like an OK option. That would mean a specialized pattern for `arith.divf` with 
higher priority. That would also give users a choice.



Comment at: mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp:304
 
+  patterns.add<ExpandDivF16>(converter);
+

I assume this is to differentiate this pattern somehow but there is no need for 
an extra `patterns.add` here.
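
To illustrate both points with a sketch (hypothetical names; only the shape of
the code, not the actual patch): the pattern can request priority through the
`PatternBenefit` it passes to its base constructor, and it can be registered in
the same `add()` call as the other patterns instead of a separate one.

  #include "mlir/Conversion/LLVMCommon/Pattern.h"
  #include "mlir/Dialect/LLVMIR/LLVMDialect.h"

  namespace {
  // Hypothetical stand-in for the fp16 fdiv pattern discussed here; the larger
  // benefit makes it preferred over a default lowering of the same op.
  struct ExpandDivF16Sketch
      : public mlir::ConvertOpToLLVMPattern<mlir::LLVM::FDivOp> {
    explicit ExpandDivF16Sketch(mlir::LLVMTypeConverter &converter)
        : ConvertOpToLLVMPattern(converter, /*benefit=*/10) {}
    mlir::LogicalResult
    matchAndRewrite(mlir::LLVM::FDivOp op, OpAdaptor adaptor,
                    mlir::ConversionPatternRewriter &rewriter) const override {
      if (!op.getType().isF16())
        return rewriter.notifyMatchFailure(op, "not f16");
      // ... fp32 reciprocal plus conditional Newton iteration as in the diff ...
      return mlir::failure();
    }
  };
  } // namespace

  // A single add() call registers it alongside the other GPU-to-NVVM patterns.
  void populateSketchPatterns(mlir::LLVMTypeConverter &converter,
                              mlir::RewritePatternSet &patterns) {
    patterns.add<ExpandDivF16Sketch>(converter);
  }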


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D126158/new/

https://reviews.llvm.org/D126158



[PATCH] D126158: [MLIR][GPU] Replace fdiv on fp16 with promoted (fp32) multiplication with reciprocal plus one (conditional) Newton iteration.

2022-05-25 Thread Christian Sigg via Phabricator via cfe-commits
csigg added a comment.

In D126158#3534750, @tra wrote:

> I would suggest separating it into separate LLVM and MLIR patches.

Thanks Artem. I separated out the LLVM changes in 
https://reviews.llvm.org/D126369.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D126158/new/

https://reviews.llvm.org/D126158



[PATCH] D126158: [MLIR][GPU] Replace fdiv on fp16 with promoted (fp32) multiplication with reciprocal plus one (conditional) Newton iteration.

2022-05-24 Thread Artem Belevich via Phabricator via cfe-commits
tra added a comment.

I would suggest separating it into separate LLVM and MLIR patches.

LLVM changes look OK to me. No idea about MLIR. We would probably want to lower
fp16 fdiv the same way in LLVM, too, but that would also have to be a separate
patch.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D126158/new/

https://reviews.llvm.org/D126158



[PATCH] D126158: [MLIR][GPU] Replace fdiv on fp16 with promoted (fp32) multiplication with reciprocal plus one (conditional) Newton iteration.

2022-05-22 Thread Christian Sigg via Phabricator via cfe-commits
csigg created this revision.
csigg added a reviewer: bkramer.
Herald added subscribers: bzcheeseman, mattd, gchakrabarti, awarzynski, 
sdasgup3, asavonic, wenzhicui, wrengr, Chia-hungDuan, dcaballe, cota, teijeong, 
rdzhabarov, tatianashp, msifontes, jurahul, Kayjukh, grosul1, Joonsoo, 
liufengdb, aartbik, mgester, arpith-jacob, antiagainst, shauheen, rriddle, 
mehdi_amini, sanjoy.google, hiraditya, jholewinski.
Herald added a reviewer: ftynse.
Herald added a reviewer: bondhugula.
Herald added a reviewer: ThomasRaoux.
Herald added a project: All.
csigg requested review of this revision.
Herald added subscribers: llvm-commits, cfe-commits, stephenneuendorffer, 
nicolasvasilache, jdoerfert.
Herald added a reviewer: herhut.
Herald added projects: clang, MLIR, LLVM.

This is correct for all values, i.e. the same as promoting the division to fp32 
in the NVPTX backend. But it is faster (~10% on average, sometimes more) 
because:

- it performs fewer Newton iterations
- it avoids the slow path for e.g. denormals
- it allows reuse of the reciprocal for multiple divisions by the same divisor
  (see the sketch below)
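
A minimal sketch of that last point (CUDA-flavored C++, hypothetical helper name,
not part of this patch): the approximate reciprocal is computed once per divisor,
and every division by it then costs a multiply plus the conditional refinement.

  #include "cuda_fp16.h"

  // Hypothetical sketch: divide many fp16 numerators by one divisor, reusing the
  // single rcp.approx.ftz.f32 result for every element.
  __device__ void hdiv_many_by(const half *a, half *out, int n, half b) {
    float fb = __half2float(b);
    float rcp;
    asm("{rcp.approx.ftz.f32 %0, %1;\n}" : "=f"(rcp) : "f"(fb));  // one reciprocal
    for (int i = 0; i < n; ++i) {
      float fa = __half2float(a[i]);
      float result = fa * rcp;
      // Same conditional Newton refinement as hdiv_newton below.
      unsigned exponent = __float_as_uint(result) & 0x7f800000u;
      if (exponent != 0 && exponent != 0x7f800000u) {
        float err = __fmaf_rn(-fb, result, fa);
        result = __fmaf_rn(rcp, err, result);
      }
      out[i] = __float2half(result);
    }
  }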

Test program:

  #include <cstdio>
  #include "cuda_fp16.h"
  
  // This is a variant of CUDA's own __hdiv which is faster than hdiv_promote
  // below and doesn't suffer from the perf cliff of div.rn.fp32 with 'special'
  // values.
  __device__ half hdiv_newton(half a, half b) {
float fa = __half2float(a);
float fb = __half2float(b);
  
float rcp;
asm("{rcp.approx.ftz.f32 %0, %1;\n}" : "=f"(rcp) : "f"(fb));
  
float result = fa * rcp;
auto exponent = reinterpret_cast<unsigned&>(result) & 0x7f800000;
if (exponent != 0 && exponent != 0x7f800000) {
  float err = __fmaf_rn(-fb, result, fa);
  result = __fmaf_rn(rcp, err, result);
}
  
return __float2half(result);
  }
  
  // Surprisingly, this is faster than CUDA's own __hdiv.
  __device__ half hdiv_promote(half a, half b) {
return __float2half(__half2float(a) / __half2float(b));
  }
  
  // This is an approximation that is accurate up to 1 ulp.
  __device__ half hdiv_approx(half a, half b) {
float fa = __half2float(a);
float fb = __half2float(b);
  
float result;
asm("{div.approx.ftz.f32 %0, %1, %2;\n}" : "=f"(result) : "f"(fa), "f"(fb));
return __float2half(result);
  }
  
  __global__ void CheckCorrectness() {
int i = threadIdx.x + blockIdx.x * blockDim.x;
half x = reinterpret_cast<half&>(i);
for (int j = 0; j < 65536; ++j) {
  half y = reinterpret_cast<half&>(j);
  half d1 = hdiv_newton(x, y);
  half d2 = hdiv_promote(x, y);
  auto s1 = reinterpret_cast<unsigned short&>(d1);
  auto s2 = reinterpret_cast<unsigned short&>(d2);
  if (s1 != s2) {
printf("%f (%u) / %f (%u), got %f (%hu), expected: %f (%hu)\n",
   __half2float(x), i, __half2float(y), j, __half2float(d1), s1,
   __half2float(d2), s2);
//__trap();
  }
}
  }
  
  __device__ half dst;
  
  __global__ void ProfileBuiltin(half x) {
#pragma unroll 1
for (int i = 0; i < 1000; ++i) {
  x = x / x;
}
dst = x;
  }
  
  __global__ void ProfilePromote(half x) {
#pragma unroll 1
for (int i = 0; i < 1000; ++i) {
  x = hdiv_promote(x, x);
}
dst = x;
  }
  
  __global__ void ProfileNewton(half x) {
#pragma unroll 1
for (int i = 0; i < 1000; ++i) {
  x = hdiv_newton(x, x);
}
dst = x;
  }
  
  __global__ void ProfileApprox(half x) {
#pragma unroll 1
for (int i = 0; i < 1000; ++i) {
  x = hdiv_approx(x, x);
}
dst = x;
  }
  
  int main() {
CheckCorrectness<<<256, 256>>>();
half one = __float2half(1.0f);
ProfileBuiltin<<<1, 1>>>(one);  // 1.001s
ProfilePromote<<<1, 1>>>(one);  // 0.560s
ProfileNewton<<<1, 1>>>(one);   // 0.508s
ProfileApprox<<<1, 1>>>(one);   // 0.304s
auto status = cudaDeviceSynchronize();
printf("%s\n", cudaGetErrorString(status));
  }
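
A short note, not part of the patch, on why a single refinement step is enough:
writing rcp = (1 - eps) / fb for the hardware reciprocal with small relative
error eps, and ignoring the rounding of the two fmas, hdiv_newton computes

  result_0 = fa * rcp
  err      = fma(-fb, result_0, fa)  = fa * (1 - fb * rcp)
  result_1 = fma(rcp, err, result_0) = fa * rcp * (2 - fb * rcp)
           = (fa / fb) * (1 - eps^2)

so the one fused Newton iteration squares the relative error of the reciprocal
approximation. The exponent test only skips the refinement when result_0 is
zero/denormal or inf/NaN, where the refinement would not help (for inf/NaN the
fma would produce NaN).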


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D126158

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  llvm/include/llvm/IR/IntrinsicsNVVM.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
  mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
  mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
  mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
  mlir/test/Dialect/LLVMIR/nvvm.mlir
  mlir/test/Target/LLVMIR/nvvmir.mlir

Index: mlir/test/Target/LLVMIR/nvvmir.mlir
===
--- mlir/test/Target/LLVMIR/nvvmir.mlir
+++ mlir/test/Target/LLVMIR/nvvmir.mlir
@@ -1,5 +1,6 @@
 // RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
 
+// CHECK-LABEL: @nvvm_special_regs
 llvm.func @nvvm_special_regs() -> i32 {
   // CHECK: %1 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
   %1 = nvvm.read.ptx.sreg.tid.x : i32
@@ -32,12 +33,21 @@
   llvm.return %1 : i32
 }
 
+// CHECK-LABEL: @nvvm_rcp
+llvm.func @nvvm_rcp(%0: f32) -> f32 {
+  // CHECK: call float @llvm.nvvm.rcp.approx.ftz.f
+  %1 = nvvm.rcp.approx.ftz.f %0 : f32
+