[PATCH] D106909: [clang] Add clang builtins support for gfx90a

2021-08-05 Thread Anshil Gandhi via Phabricator via cfe-commits
This revision was automatically updated to reflect the committed changes.
Closed by commit rG39dac1f7f656: [clang] Add clang builtins support for gfx90a 
(authored by gandhi21299).

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D106909/new/

https://reviews.llvm.org/D106909

Files:
  clang/include/clang/Basic/BuiltinsAMDGPU.def
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGenOpenCL/builtins-amdgcn-fp-atomics-gfx7-err.cl
  clang/test/CodeGenOpenCL/builtins-amdgcn-fp-atomics-gfx908-err.cl
  clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx1030.cl
  clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx8.cl
  clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl

Index: clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
===
--- /dev/null
+++ clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
@@ -0,0 +1,115 @@
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx90a \
+// RUN:   %s -S -emit-llvm -o - | FileCheck %s -check-prefix=CHECK
+
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx90a \
+// RUN:   -S -o - %s | FileCheck -check-prefix=GFX90A %s
+
+typedef half __attribute__((ext_vector_type(2))) half2;
+
+// CHECK-LABEL: test_global_add_f64
+// CHECK: call double @llvm.amdgcn.global.atomic.fadd.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL:  test_global_add_f64$local:
+// GFX90A:  global_atomic_add_f64
+void test_global_add_f64(__global double *addr, double x) {
+  double *rtn;
+  *rtn = __builtin_amdgcn_global_atomic_fadd_f64(addr, x);
+}
+
+// CHECK-LABEL: test_global_add_half2
+// CHECK: call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %{{.*}}, <2 x half> %{{.*}})
+// GFX90A-LABEL:  test_global_add_half2
+// GFX90A:  global_atomic_pk_add_f16 v2, v[0:1], v2, off glc
+void test_global_add_half2(__global half2 *addr, half2 x) {
+  half2 *rtn;
+  *rtn = __builtin_amdgcn_global_atomic_fadd_v2f16(addr, x);
+}
+
+// CHECK-LABEL: test_global_global_min_f64
+// CHECK: call double @llvm.amdgcn.global.atomic.fmin.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL:  test_global_global_min_f64$local
+// GFX90A:  global_atomic_min_f64
+void test_global_global_min_f64(__global double *addr, double x){
+  double *rtn;
+  *rtn = __builtin_amdgcn_global_atomic_fmin_f64(addr, x);
+}
+
+// CHECK-LABEL: test_global_max_f64
+// CHECK: call double @llvm.amdgcn.global.atomic.fmax.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL:  test_global_max_f64$local
+// GFX90A:  global_atomic_max_f64
+void test_global_max_f64(__global double *addr, double x){
+  double *rtn;
+  *rtn = __builtin_amdgcn_global_atomic_fmax_f64(addr, x);
+}
+
+// CHECK-LABEL: test_flat_add_local_f64
+// CHECK: call double @llvm.amdgcn.flat.atomic.fadd.f64.p3f64.f64(double addrspace(3)* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL:  test_flat_add_local_f64$local
+// GFX90A:  ds_add_rtn_f64
+void test_flat_add_local_f64(__local double *addr, double x){
+  double *rtn;
+  *rtn = __builtin_amdgcn_flat_atomic_fadd_f64(addr, x);
+}
+
+// CHECK-LABEL: test_flat_global_add_f64
+// CHECK: call double @llvm.amdgcn.flat.atomic.fadd.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL:  test_flat_global_add_f64$local
+// GFX90A:  global_atomic_add_f64
+void test_flat_global_add_f64(__global double *addr, double x){
+  double *rtn;
+  *rtn = __builtin_amdgcn_flat_atomic_fadd_f64(addr, x);
+}
+
+// CHECK-LABEL: test_flat_min_flat_f64
+// CHECK: call double @llvm.amdgcn.flat.atomic.fmin.f64.p0f64.f64(double* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL:  test_flat_min_flat_f64$local
+// GFX90A:  flat_atomic_min_f64
+void test_flat_min_flat_f64(__generic double *addr, double x){
+  double *rtn;
+  *rtn = __builtin_amdgcn_flat_atomic_fmin_f64(addr, x);
+}
+
+// CHECK-LABEL: test_flat_global_min_f64
+// CHECK: call double @llvm.amdgcn.flat.atomic.fmin.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A:  test_flat_global_min_f64$local
+// GFX90A:  global_atomic_min_f64
+void test_flat_global_min_f64(__global double *addr, double x){
+  double *rtn;
+  *rtn = __builtin_amdgcn_flat_atomic_fmin_f64(addr, x);
+}
+
+// CHECK-LABEL: test_flat_max_flat_f64
+// CHECK: call double @llvm.amdgcn.flat.atomic.fmax.f64.p0f64.f64(double* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL:  test_flat_max_flat_f64$local
+// GFX90A:  flat_atomic_max_f64
+void test_flat_max_flat_f64(__generic double *addr, double x){
+  double *rtn;
+  *rtn = __builtin_amdgcn_flat_atomic_fmax_f64(addr, x);
+}
+
+// CHECK-LABEL: test_flat_global_max_f64
+// CHECK: call double @llvm.amdgcn.flat.atomic.fmax.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL:  test_flat_global_max_f64$local
+// GFX90A:  global_atomic_max_f64
+void test_flat_global_max_f64(__global d

[PATCH] D106909: [clang] Add clang builtins support for gfx90a

2021-08-05 Thread Anshil Gandhi via Phabricator via cfe-commits
gandhi21299 added a comment.

I will merge this patch in as soon as the builds are successful.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D106909/new/

https://reviews.llvm.org/D106909

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D106909: [clang] Add clang builtins support for gfx90a

2021-08-05 Thread Anshil Gandhi via Phabricator via cfe-commits
gandhi21299 added a comment.

Thanks a lot for the review, I will merge this patch in :)


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D106909/new/

https://reviews.llvm.org/D106909

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D106909: [clang] Add clang builtins support for gfx90a

2021-08-05 Thread Stanislav Mekhanoshin via Phabricator via cfe-commits
rampitec accepted this revision.
rampitec added a comment.
This revision is now accepted and ready to land.

LGTM


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D106909/new/

https://reviews.llvm.org/D106909

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D106909: [clang] Add clang builtins support for gfx90a

2021-08-05 Thread Anshil Gandhi via Phabricator via cfe-commits
gandhi21299 updated this revision to Diff 364266.
gandhi21299 added a comment.

- combined tests into a single file
- renamed tests for consistency


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D106909/new/

https://reviews.llvm.org/D106909

Files:
  clang/include/clang/Basic/BuiltinsAMDGPU.def
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGenOpenCL/builtins-amdgcn-fp-atomics-gfx7-err.cl
  clang/test/CodeGenOpenCL/builtins-amdgcn-fp-atomics-gfx908-err.cl
  clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx1030.cl
  clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx8.cl
  clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl

Index: clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
===
--- /dev/null
+++ clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
@@ -0,0 +1,115 @@
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx90a \
+// RUN:   %s -S -emit-llvm -o - | FileCheck %s -check-prefix=CHECK
+
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx90a \
+// RUN:   -S -o - %s | FileCheck -check-prefix=GFX90A %s
+
+typedef half __attribute__((ext_vector_type(2))) half2;
+
+// CHECK-LABEL: test_global_add_f64
+// CHECK: call double @llvm.amdgcn.global.atomic.fadd.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL:  test_global_add_f64$local:
+// GFX90A:  global_atomic_add_f64
+void test_global_add_f64(__global double *addr, double x) {
+  double *rtn;
+  *rtn = __builtin_amdgcn_global_atomic_fadd_f64(addr, x);
+}
+
+// CHECK-LABEL: test_global_add_half2
+// CHECK: call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %{{.*}}, <2 x half> %{{.*}})
+// GFX90A-LABEL:  test_global_add_half2
+// GFX90A:  global_atomic_pk_add_f16 v2, v[0:1], v2, off glc
+void test_global_add_half2(__global half2 *addr, half2 x) {
+  half2 *rtn;
+  *rtn = __builtin_amdgcn_global_atomic_fadd_v2f16(addr, x);
+}
+
+// CHECK-LABEL: test_global_global_min_f64
+// CHECK: call double @llvm.amdgcn.global.atomic.fmin.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL:  test_global_global_min_f64$local
+// GFX90A:  global_atomic_min_f64
+void test_global_global_min_f64(__global double *addr, double x){
+  double *rtn;
+  *rtn = __builtin_amdgcn_global_atomic_fmin_f64(addr, x);
+}
+
+// CHECK-LABEL: test_global_max_f64
+// CHECK: call double @llvm.amdgcn.global.atomic.fmax.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL:  test_global_max_f64$local
+// GFX90A:  global_atomic_max_f64
+void test_global_max_f64(__global double *addr, double x){
+  double *rtn;
+  *rtn = __builtin_amdgcn_global_atomic_fmax_f64(addr, x);
+}
+
+// CHECK-LABEL: test_flat_add_local_f64
+// CHECK: call double @llvm.amdgcn.flat.atomic.fadd.f64.p3f64.f64(double addrspace(3)* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL:  test_flat_add_local_f64$local
+// GFX90A:  ds_add_rtn_f64
+void test_flat_add_local_f64(__local double *addr, double x){
+  double *rtn;
+  *rtn = __builtin_amdgcn_flat_atomic_fadd_f64(addr, x);
+}
+
+// CHECK-LABEL: test_flat_global_add_f64
+// CHECK: call double @llvm.amdgcn.flat.atomic.fadd.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL:  test_flat_global_add_f64$local
+// GFX90A:  global_atomic_add_f64
+void test_flat_global_add_f64(__global double *addr, double x){
+  double *rtn;
+  *rtn = __builtin_amdgcn_flat_atomic_fadd_f64(addr, x);
+}
+
+// CHECK-LABEL: test_flat_min_flat_f64
+// CHECK: call double @llvm.amdgcn.flat.atomic.fmin.f64.p0f64.f64(double* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL:  test_flat_min_flat_f64$local
+// GFX90A:  flat_atomic_min_f64
+void test_flat_min_flat_f64(__generic double *addr, double x){
+  double *rtn;
+  *rtn = __builtin_amdgcn_flat_atomic_fmin_f64(addr, x);
+}
+
+// CHECK-LABEL: test_flat_global_min_f64
+// CHECK: call double @llvm.amdgcn.flat.atomic.fmin.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A:  test_flat_global_min_f64$local
+// GFX90A:  global_atomic_min_f64
+void test_flat_global_min_f64(__global double *addr, double x){
+  double *rtn;
+  *rtn = __builtin_amdgcn_flat_atomic_fmin_f64(addr, x);
+}
+
+// CHECK-LABEL: test_flat_max_flat_f64
+// CHECK: call double @llvm.amdgcn.flat.atomic.fmax.f64.p0f64.f64(double* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL:  test_flat_max_flat_f64$local
+// GFX90A:  flat_atomic_max_f64
+void test_flat_max_flat_f64(__generic double *addr, double x){
+  double *rtn;
+  *rtn = __builtin_amdgcn_flat_atomic_fmax_f64(addr, x);
+}
+
+// CHECK-LABEL: test_flat_global_max_f64
+// CHECK: call double @llvm.amdgcn.flat.atomic.fmax.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL:  test_flat_global_max_f64$local
+// GFX90A:  global_atomic_max_f64
+void test_flat_global_max_f64(__global double *addr, double x){
+  doubl

[PATCH] D106909: [clang] Add clang builtins support for gfx90a

2021-08-05 Thread Stanislav Mekhanoshin via Phabricator via cfe-commits
rampitec added inline comments.



Comment at: clang/test/CodeGenOpenCL/builtins-fp-atomics-unsupported-gfx7.cl:8
+}
\ No newline at end of file


Add new line.



Comment at: clang/test/CodeGenOpenCL/unsupported-fadd2f16-gfx908.cl:1
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu 
gfx908 \

Combine all of these gfx908 error tests into a single file. For example like in 
the builtins-amdgcn-dl-insts-err.cl. It is also better to rename these test 
filenames to follow the existing pattern: builtins-amdgcn-*-err.cl


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D106909/new/

https://reviews.llvm.org/D106909

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D106909: [clang] Add clang builtins support for gfx90a

2021-08-05 Thread Anshil Gandhi via Phabricator via cfe-commits
gandhi21299 updated this revision to Diff 364242.
gandhi21299 added a comment.

- added more negative tests
- fixed some tests


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D106909/new/

https://reviews.llvm.org/D106909

Files:
  clang/include/clang/Basic/BuiltinsAMDGPU.def
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx1030.cl
  clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx8.cl
  clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
  clang/test/CodeGenOpenCL/builtins-fp-atomics-unsupported-gfx7.cl
  clang/test/CodeGenOpenCL/unsupported-fadd2f16-gfx908.cl
  clang/test/CodeGenOpenCL/unsupported-fadd32-gfx908.cl
  clang/test/CodeGenOpenCL/unsupported-fadd64-flat-gfx908.cl
  clang/test/CodeGenOpenCL/unsupported-fadd64-gfx908.cl
  clang/test/CodeGenOpenCL/unsupported-fmax64-flat-gfx908.cl
  clang/test/CodeGenOpenCL/unsupported-fmax64-gfx908.cl
  clang/test/CodeGenOpenCL/unsupported-fmin64-flat-gfx908.cl
  clang/test/CodeGenOpenCL/unsupported-fmin64-gfx908.cl

Index: clang/test/CodeGenOpenCL/unsupported-fmin64-gfx908.cl
===
--- /dev/null
+++ clang/test/CodeGenOpenCL/unsupported-fmin64-gfx908.cl
@@ -0,0 +1,8 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx908 \
+// RUN:   -verify -S -o - %s
+
+void test_global_min_f64(__global double *addr, double x) {
+  double *rtn;
+  *rtn = __builtin_amdgcn_global_atomic_fmin_f64(addr, x); // expected-error{{'__builtin_amdgcn_global_atomic_fmin_f64' needs target feature gfx90a-insts}}
+}
Index: clang/test/CodeGenOpenCL/unsupported-fmin64-flat-gfx908.cl
===
--- /dev/null
+++ clang/test/CodeGenOpenCL/unsupported-fmin64-flat-gfx908.cl
@@ -0,0 +1,8 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx908 \
+// RUN:   -verify -S -o - %s
+
+void test_flat_min_f64(__global double *addr, double x) {
+  double *rtn;
+  *rtn = __builtin_amdgcn_flat_atomic_fmin_f64(addr, x); // expected-error{{'__builtin_amdgcn_flat_atomic_fmin_f64' needs target feature gfx90a-insts}}
+}
Index: clang/test/CodeGenOpenCL/unsupported-fmax64-gfx908.cl
===
--- /dev/null
+++ clang/test/CodeGenOpenCL/unsupported-fmax64-gfx908.cl
@@ -0,0 +1,8 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx908 \
+// RUN:   -verify -S -o - %s
+
+void test_global_max_f64(__global double *addr, double x) {
+  double *rtn;
+  *rtn = __builtin_amdgcn_global_atomic_fmax_f64(addr, x); // expected-error{{'__builtin_amdgcn_global_atomic_fmax_f64' needs target feature gfx90a-insts}}
+}
Index: clang/test/CodeGenOpenCL/unsupported-fmax64-flat-gfx908.cl
===
--- /dev/null
+++ clang/test/CodeGenOpenCL/unsupported-fmax64-flat-gfx908.cl
@@ -0,0 +1,8 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx908 \
+// RUN:   -verify -S -o - %s
+
+void test_flat_max_f64(__global double *addr, double x) {
+  double *rtn;
+  *rtn = __builtin_amdgcn_flat_atomic_fmax_f64(addr, x); // expected-error{{'__builtin_amdgcn_flat_atomic_fmax_f64' needs target feature gfx90a-insts}}
+}
Index: clang/test/CodeGenOpenCL/unsupported-fadd64-gfx908.cl
===
--- /dev/null
+++ clang/test/CodeGenOpenCL/unsupported-fadd64-gfx908.cl
@@ -0,0 +1,8 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx908 \
+// RUN:   -verify -S -o - %s
+
+void test_global_add_f64(__global double *addr, double x) {
+  double *rtn;
+  *rtn = __builtin_amdgcn_global_atomic_fadd_f64(addr, x); // expected-error{{'__builtin_amdgcn_global_atomic_fadd_f64' needs target feature gfx90a-insts}}
+}
Index: clang/test/CodeGenOpenCL/unsupported-fadd64-flat-gfx908.cl
===
--- /dev/null
+++ clang/test/CodeGenOpenCL/unsupported-fadd64-flat-gfx908.cl
@@ -0,0 +1,8 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx908 \
+// RUN:   -verify -S -o - %s
+
+void test_flat_add_f64(__global double *addr, double x) {
+  double *rtn;
+  *rtn = __builtin_amdgcn_flat_atomic_fadd_f64(addr, x); // expected-error{{'__builtin_amdgcn_flat_atomic_fadd_f64' needs target feature gfx90a-insts}}
+}
Index: clang/test/CodeGenOpenCL/unsupported-fadd32-gfx908.cl
===
--- /dev/null
+++ clang/test/CodeGenOpenCL/unsupported-fadd32-gfx908.cl
@@ -0,0 +1,8 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %c

[PATCH] D106909: [clang] Add clang builtins support for gfx90a

2021-08-04 Thread Stanislav Mekhanoshin via Phabricator via cfe-commits
rampitec added inline comments.



Comment at: clang/include/clang/Basic/BuiltinsAMDGPU.def:201
+TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_f32, "ff*1f", "t", 
"gfx90a-insts")
+TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_v2f16, "V2hV2h*1V2h", "t", 
"gfx90a-insts")
+TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fmin_f64, "dd*1d", "t", 
"gfx90a-insts")

gandhi21299 wrote:
> I tried add target feature gfx908-insts for this builtin but the frontend 
> complains that it should have target feature gfx90a-insts.
That was for global_atomic_fadd_f32, but as per discussion we are going to use 
builtin only starting from gfx90a because of the noret problem. Comments in the 
review are off their positions after multiple patch updates.



Comment at: clang/include/clang/Basic/BuiltinsAMDGPU.def:210
+TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_f64, "dd*3d", "t", 
"gfx90a-insts")
+TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_f32, "ff*3f", "t", "gfx8-insts")
+

This needs tests with a gfx8 target and a negative test with gfx7.



Comment at: clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl:13
+// GFX90A:  global_atomic_add_f64
+void test_global_add(__global double *addr, double x) {
+  double *rtn;

Use _f64 or _double in the test name.



Comment at: clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl:31
+// GFX90A:  global_atomic_min_f64
+void test_global_global_min(__global double *addr, double x){
+  double *rtn;

Same here and in other double tests, use a suffix f64 or double.



Comment at: clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl:67
+// GFX90A:  flat_atomic_min_f64
+void test_flat_min_constant(__generic double *addr, double x){
+  double *rtn;

'constant' is wrong. It is flat. Here and everywhere.



Comment at: clang/test/CodeGenOpenCL/builtins-fp-atomics-unsupported-gfx908.cl:7
+  double *rtn;
+  *rtn = __builtin_amdgcn_global_atomic_fadd_f64(addr, x); // 
expected-error{{'__builtin_amdgcn_global_atomic_fadd_f64' needs target feature 
gfx90a-insts}}
+}

Need to check all other buintins too.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D106909/new/

https://reviews.llvm.org/D106909

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D106909: [clang] Add clang builtins support for gfx90a

2021-08-04 Thread Anshil Gandhi via Phabricator via cfe-commits
gandhi21299 added inline comments.



Comment at: clang/include/clang/Basic/BuiltinsAMDGPU.def:201
+TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_f32, "ff*1f", "t", 
"gfx90a-insts")
+TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_v2f16, "V2hV2h*1V2h", "t", 
"gfx90a-insts")
+TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fmin_f64, "dd*1d", "t", 
"gfx90a-insts")

I tried add target feature gfx908-insts for this builtin but the frontend 
complains that it should have target feature gfx90a-insts.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D106909/new/

https://reviews.llvm.org/D106909

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D106909: [clang] Add clang builtins support for gfx90a

2021-08-04 Thread Anshil Gandhi via Phabricator via cfe-commits
gandhi21299 updated this revision to Diff 364152.
gandhi21299 added a comment.

updating test


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D106909/new/

https://reviews.llvm.org/D106909

Files:
  clang/include/clang/Basic/BuiltinsAMDGPU.def
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx1030.cl
  clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
  clang/test/CodeGenOpenCL/builtins-fp-atomics-unsupported-gfx908.cl

Index: clang/test/CodeGenOpenCL/builtins-fp-atomics-unsupported-gfx908.cl
===
--- /dev/null
+++ clang/test/CodeGenOpenCL/builtins-fp-atomics-unsupported-gfx908.cl
@@ -0,0 +1,8 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx908 \
+// RUN:   -verify -S -o - %s
+
+void test_global_add(__global double *addr, double x) {
+  double *rtn;
+  *rtn = __builtin_amdgcn_global_atomic_fadd_f64(addr, x); // expected-error{{'__builtin_amdgcn_global_atomic_fadd_f64' needs target feature gfx90a-insts}}
+}
Index: clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
===
--- /dev/null
+++ clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
@@ -0,0 +1,115 @@
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx90a \
+// RUN:   %s -S -emit-llvm -o - | FileCheck %s -check-prefix=CHECK
+
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx90a \
+// RUN:   -S -o - %s | FileCheck -check-prefix=GFX90A %s
+
+typedef half __attribute__((ext_vector_type(2))) half2;
+
+// CHECK-LABEL: test_global_add
+// CHECK: call double @llvm.amdgcn.global.atomic.fadd.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL:  test_global_add$local:
+// GFX90A:  global_atomic_add_f64
+void test_global_add(__global double *addr, double x) {
+  double *rtn;
+  *rtn = __builtin_amdgcn_global_atomic_fadd_f64(addr, x);
+}
+
+// CHECK-LABEL: test_global_add_half2
+// CHECK: call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %{{.*}}, <2 x half> %{{.*}})
+// GFX90A-LABEL:  test_global_add_half2
+// GFX90A:  global_atomic_pk_add_f16 v2, v[0:1], v2, off glc
+void test_global_add_half2(__global half2 *addr, half2 x) {
+  half2 *rtn;
+  *rtn = __builtin_amdgcn_global_atomic_fadd_v2f16(addr, x);
+}
+
+// CHECK-LABEL: test_global_global_min
+// CHECK: call double @llvm.amdgcn.global.atomic.fmin.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL:  test_global_global_min$local
+// GFX90A:  global_atomic_min_f64
+void test_global_global_min(__global double *addr, double x){
+  double *rtn;
+  *rtn = __builtin_amdgcn_global_atomic_fmin_f64(addr, x);
+}
+
+// CHECK-LABEL: test_global_max
+// CHECK: call double @llvm.amdgcn.global.atomic.fmax.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL:  test_global_max$local
+// GFX90A:  global_atomic_max_f64
+void test_global_max(__global double *addr, double x){
+  double *rtn;
+  *rtn = __builtin_amdgcn_global_atomic_fmax_f64(addr, x);
+}
+
+// CHECK-LABEL: test_flat_add_local
+// CHECK: call double @llvm.amdgcn.flat.atomic.fadd.f64.p3f64.f64(double addrspace(3)* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL:  test_flat_add_local$local
+// GFX90A:  ds_add_rtn_f64
+void test_flat_add_local(__local double *addr, double x){
+  double *rtn;
+  *rtn = __builtin_amdgcn_flat_atomic_fadd_f64(addr, x);
+}
+
+// CHECK-LABEL: test_flat_global_add
+// CHECK: call double @llvm.amdgcn.flat.atomic.fadd.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL:  test_flat_global_add$local
+// GFX90A:  global_atomic_add_f64
+void test_flat_global_add(__global double *addr, double x){
+  double *rtn;
+  *rtn = __builtin_amdgcn_flat_atomic_fadd_f64(addr, x);
+}
+
+// CHECK-LABEL: test_flat_min_constant
+// CHECK: call double @llvm.amdgcn.flat.atomic.fmin.f64.p0f64.f64(double* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL:  test_flat_min_constant$local
+// GFX90A:  flat_atomic_min_f64
+void test_flat_min_constant(__generic double *addr, double x){
+  double *rtn;
+  *rtn = __builtin_amdgcn_flat_atomic_fmin_f64(addr, x);
+}
+
+// CHECK-LABEL: test_flat_global_min
+// CHECK: call double @llvm.amdgcn.flat.atomic.fmin.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A:  test_flat_global_min$local
+// GFX90A:  global_atomic_min_f64
+void test_flat_global_min(__global double *addr, double x){
+  double *rtn;
+  *rtn = __builtin_amdgcn_flat_atomic_fmin_f64(addr, x);
+}
+
+// CHECK-LABEL: test_flat_max_constant
+// CHECK: call double @llvm.amdgcn.flat.atomic.fmax.f64.p0f64.f64(double* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL:  test_flat_max_constant$local
+// GFX90A:  flat_atomic_max_f64
+void test_flat_max_constant(__generic double *addr, double x){
+  dou

[PATCH] D106909: [clang] Add clang builtins support for gfx90a

2021-08-04 Thread Stanislav Mekhanoshin via Phabricator via cfe-commits
rampitec added inline comments.



Comment at: clang/lib/CodeGen/CGBuiltin.cpp:16270
+llvm::Function *F = CGM.getIntrinsic(IID, {ArgTy});
+return Builder.CreateCall(F, {Addr, Val, ZeroI32, ZeroI32, ZeroI1});
+  }

gandhi21299 wrote:
> rampitec wrote:
> > gandhi21299 wrote:
> > > rampitec wrote:
> > > > Should we map flags since we already have them?
> > > Do you mean the memory order flag?
> > All 3: ordering, scope and volatile.
> Following the discussion, what change is required here?
Keep zeroes, drop immediate argument of the builtins.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D106909/new/

https://reviews.llvm.org/D106909

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D106909: [clang] Add clang builtins support for gfx90a

2021-08-04 Thread Anshil Gandhi via Phabricator via cfe-commits
gandhi21299 added inline comments.



Comment at: clang/lib/CodeGen/CGBuiltin.cpp:16270
+llvm::Function *F = CGM.getIntrinsic(IID, {ArgTy});
+return Builder.CreateCall(F, {Addr, Val, ZeroI32, ZeroI32, ZeroI1});
+  }

rampitec wrote:
> gandhi21299 wrote:
> > rampitec wrote:
> > > Should we map flags since we already have them?
> > Do you mean the memory order flag?
> All 3: ordering, scope and volatile.
Following the discussion, what change is required here?


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D106909/new/

https://reviews.llvm.org/D106909

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D106909: [clang] Add clang builtins support for gfx90a

2021-08-04 Thread Stanislav Mekhanoshin via Phabricator via cfe-commits
rampitec added a comment.

In D106909#2923724 , @gandhi21299 
wrote:

> @rampitec what should I be testing exactly in the IR test?

Produced call to the intrinsic. All of these tests there doing that.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D106909/new/

https://reviews.llvm.org/D106909

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D106909: [clang] Add clang builtins support for gfx90a

2021-08-04 Thread Anshil Gandhi via Phabricator via cfe-commits
gandhi21299 added a comment.

@rampitec what should I be testing exactly in the IR test?


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D106909/new/

https://reviews.llvm.org/D106909

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D106909: [clang] Add clang builtins support for gfx90a

2021-08-03 Thread Stanislav Mekhanoshin via Phabricator via cfe-commits
rampitec added a comment.

In D106909#2922567 , @gandhi21299 
wrote:

> @rampitec how do I handle the following?
>
>   builtins-fp-atomics.cl:38:10: error: 
> '__builtin_amdgcn_global_atomic_fadd_f64' needs target feature 
> atomic-fadd-insts
> *rtn = __builtin_amdgcn_global_atomic_fadd_f64(addr, x, 
> memory_order_relaxed);
>^

It is f64, it needs gfx90a-insts. atomic-fadd-insts is for global f32.




Comment at: clang/lib/CodeGen/CGBuiltin.cpp:16270
+llvm::Function *F = CGM.getIntrinsic(IID, {ArgTy});
+return Builder.CreateCall(F, {Addr, Val, ZeroI32, ZeroI32, ZeroI1});
+  }

gandhi21299 wrote:
> rampitec wrote:
> > Should we map flags since we already have them?
> Do you mean the memory order flag?
All 3: ordering, scope and volatile.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D106909/new/

https://reviews.llvm.org/D106909

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D106909: [clang] Add clang builtins support for gfx90a

2021-08-03 Thread Anshil Gandhi via Phabricator via cfe-commits
gandhi21299 added a comment.

@rampitec how do I handle the following?

  builtins-fp-atomics.cl:38:10: error: 
'__builtin_amdgcn_global_atomic_fadd_f64' needs target feature atomic-fadd-insts
*rtn = __builtin_amdgcn_global_atomic_fadd_f64(addr, x, 
memory_order_relaxed);
   ^


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D106909/new/

https://reviews.llvm.org/D106909

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D106909: [clang] Add clang builtins support for gfx90a

2021-08-03 Thread Anshil Gandhi via Phabricator via cfe-commits
gandhi21299 marked 5 inline comments as done.
gandhi21299 added inline comments.



Comment at: clang/lib/CodeGen/CGBuiltin.cpp:16270
+llvm::Function *F = CGM.getIntrinsic(IID, {ArgTy});
+return Builder.CreateCall(F, {Addr, Val, ZeroI32, ZeroI32, ZeroI1});
+  }

rampitec wrote:
> Should we map flags since we already have them?
Do you mean the memory order flag?


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D106909/new/

https://reviews.llvm.org/D106909

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D106909: [clang] Add clang builtins support for gfx90a

2021-08-03 Thread Stanislav Mekhanoshin via Phabricator via cfe-commits
rampitec added inline comments.



Comment at: clang/lib/CodeGen/CGBuiltin.cpp:16270
+llvm::Function *F = CGM.getIntrinsic(IID, {ArgTy});
+return Builder.CreateCall(F, {Addr, Val, ZeroI32, ZeroI32, ZeroI1});
+  }

Should we map flags since we already have them?


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D106909/new/

https://reviews.llvm.org/D106909

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D106909: [clang] Add clang builtins support for gfx90a

2021-08-03 Thread Stanislav Mekhanoshin via Phabricator via cfe-commits
rampitec added inline comments.



Comment at: clang/lib/CodeGen/CGBuiltin.cpp:16212
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64: {
+Intrinsic::ID IID;
+llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());

arsenm wrote:
> rampitec wrote:
> > arsenm wrote:
> > > rampitec wrote:
> > > > gandhi21299 wrote:
> > > > > rampitec wrote:
> > > > > > You do not need any of that code. You can directly map a builtin to 
> > > > > > intrinsic in the IntrinsicsAMDGPU.td.
> > > > > Sorry, I looked around for several days but I could not figure this 
> > > > > out. Is there a concrete example?
> > > > Every instantiation of `GCCBuiltin` in the `IntrinsicsAMDGPU.td`.
> > > This is not true if the intrinsic requires type mangling. GCCBuiltin is 
> > > too simple to handle it
> > Yes, but these do not need it. All of these builtins are specific.
> These intrinsics are all mangled based on the FP type
Ah, right. Intrinsics are mangled, builtins not. True. OK, this shall be code 
then.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D106909/new/

https://reviews.llvm.org/D106909

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D106909: [clang] Add clang builtins support for gfx90a

2021-08-03 Thread Matt Arsenault via Phabricator via cfe-commits
arsenm added inline comments.



Comment at: clang/lib/CodeGen/CGBuiltin.cpp:16212
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64: {
+Intrinsic::ID IID;
+llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());

rampitec wrote:
> arsenm wrote:
> > rampitec wrote:
> > > gandhi21299 wrote:
> > > > rampitec wrote:
> > > > > You do not need any of that code. You can directly map a builtin to 
> > > > > intrinsic in the IntrinsicsAMDGPU.td.
> > > > Sorry, I looked around for several days but I could not figure this 
> > > > out. Is there a concrete example?
> > > Every instantiation of `GCCBuiltin` in the `IntrinsicsAMDGPU.td`.
> > This is not true if the intrinsic requires type mangling. GCCBuiltin is too 
> > simple to handle it
> Yes, but these do not need it. All of these builtins are specific.
These intrinsics are all mangled based on the FP type


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D106909/new/

https://reviews.llvm.org/D106909

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D106909: [clang] Add clang builtins support for gfx90a

2021-08-03 Thread Stanislav Mekhanoshin via Phabricator via cfe-commits
rampitec added inline comments.



Comment at: clang/lib/CodeGen/CGBuiltin.cpp:16212
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64: {
+Intrinsic::ID IID;
+llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());

arsenm wrote:
> rampitec wrote:
> > gandhi21299 wrote:
> > > rampitec wrote:
> > > > You do not need any of that code. You can directly map a builtin to 
> > > > intrinsic in the IntrinsicsAMDGPU.td.
> > > Sorry, I looked around for several days but I could not figure this out. 
> > > Is there a concrete example?
> > Every instantiation of `GCCBuiltin` in the `IntrinsicsAMDGPU.td`.
> This is not true if the intrinsic requires type mangling. GCCBuiltin is too 
> simple to handle it
Yes, but these do not need it. All of these builtins are specific.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D106909/new/

https://reviews.llvm.org/D106909

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D106909: [clang] Add clang builtins support for gfx90a

2021-08-03 Thread Matt Arsenault via Phabricator via cfe-commits
arsenm added inline comments.



Comment at: clang/lib/CodeGen/CGBuiltin.cpp:16212
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64: {
+Intrinsic::ID IID;
+llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());

rampitec wrote:
> gandhi21299 wrote:
> > rampitec wrote:
> > > You do not need any of that code. You can directly map a builtin to 
> > > intrinsic in the IntrinsicsAMDGPU.td.
> > Sorry, I looked around for several days but I could not figure this out. Is 
> > there a concrete example?
> Every instantiation of `GCCBuiltin` in the `IntrinsicsAMDGPU.td`.
This is not true if the intrinsic requires type mangling. GCCBuiltin is too 
simple to handle it


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D106909/new/

https://reviews.llvm.org/D106909

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D106909: [clang] Add clang builtins support for gfx90a

2021-08-03 Thread Stanislav Mekhanoshin via Phabricator via cfe-commits
rampitec added inline comments.



Comment at: clang/lib/CodeGen/CGBuiltin.cpp:16212
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64: {
+Intrinsic::ID IID;
+llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());

gandhi21299 wrote:
> rampitec wrote:
> > You do not need any of that code. You can directly map a builtin to 
> > intrinsic in the IntrinsicsAMDGPU.td.
> Sorry, I looked around for several days but I could not figure this out. Is 
> there a concrete example?
Every instantiation of `GCCBuiltin` in the `IntrinsicsAMDGPU.td`.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D106909/new/

https://reviews.llvm.org/D106909

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D106909: [clang] Add clang builtins support for gfx90a

2021-08-03 Thread Anshil Gandhi via Phabricator via cfe-commits
gandhi21299 marked 3 inline comments as done.
gandhi21299 added inline comments.



Comment at: clang/lib/CodeGen/CGBuiltin.cpp:16212
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64: {
+Intrinsic::ID IID;
+llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());

rampitec wrote:
> You do not need any of that code. You can directly map a builtin to intrinsic 
> in the IntrinsicsAMDGPU.td.
Sorry, I looked around for several days but I could not figure this out. Is 
there a concrete example?


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D106909/new/

https://reviews.llvm.org/D106909

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D106909: [clang] Add clang builtins support for gfx90a

2021-08-03 Thread Stanislav Mekhanoshin via Phabricator via cfe-commits
rampitec requested changes to this revision.
rampitec added a comment.
This revision now requires changes to proceed.

Needs an IR test, a test for different supported targets, and a negative test 
for unsupported features.




Comment at: clang/include/clang/Basic/BuiltinsAMDGPU.def:199
 
+TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_f64, "dd*1di", "t", 
"gfx90a-insts")
+TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_f32, "ff*1fi", "t", 
"gfx90a-insts")

Correct attribute for this one in atomic-fadd-insts. In particular it was first 
added in gfx908 and you would need to test it too.



Comment at: clang/include/clang/Basic/BuiltinsAMDGPU.def:205
+
+TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_f64, "dd*1di", "t", 
"gfx90a-insts")
+TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fmin_f64, "dd*1di", "t", 
"gfx90a-insts")

Flat address space is 0.



Comment at: clang/include/clang/Basic/BuiltinsAMDGPU.def:210
+TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_f64, "dd*3di", "t", 
"gfx90a-insts")
+TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_f32, "ff*3fi", "t", 
"gfx90a-insts")
+

This is available since gfx8. Attribute gfx8-insts.



Comment at: clang/lib/CodeGen/CGBuiltin.cpp:16212
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64: {
+Intrinsic::ID IID;
+llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());

You do not need any of that code. You can directly map a builtin to intrinsic 
in the IntrinsicsAMDGPU.td.



Comment at: clang/test/CodeGenOpenCL/builtins-fp-atomics.cl:112
+kernel void test_flat_global_max(__global double *addr, double x){
+  __builtin_amdgcn_flat_atomic_fmax_f64(addr, x, memory_order_relaxed);
+}

arsenm wrote:
> gandhi21299 wrote:
> > arsenm wrote:
> > > If you're going to bother testing the ISA, is it worth testing rtn and no 
> > > rtn versions?
> > Sorry, what do you mean by rtn version?
> Most atomics can be optimized if they don't return the in memory value if the 
> value is unused
Certainly yes, because global_atomic_add_f32 did not have return version on 
gfx908.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D106909/new/

https://reviews.llvm.org/D106909

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D106909: [clang] Add clang builtins support for gfx90a

2021-07-31 Thread Anshil Gandhi via Phabricator via cfe-commits
gandhi21299 updated this revision to Diff 363196.
gandhi21299 added a comment.

refreshing patch


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D106909/new/

https://reviews.llvm.org/D106909

Files:
  clang/include/clang/Basic/BuiltinsAMDGPU.def
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGenOpenCL/builtins-fp-atomics.cl

Index: clang/test/CodeGenOpenCL/builtins-fp-atomics.cl
===
--- /dev/null
+++ clang/test/CodeGenOpenCL/builtins-fp-atomics.cl
@@ -0,0 +1,132 @@
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx90a \
+// RUN:   %s -S -emit-llvm -o - | FileCheck %s -check-prefix=CHECK
+
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx90a \
+// RUN:   -S -o - %s | FileCheck -check-prefix=GFX90A %s
+
+
+typedef enum memory_order {
+  memory_order_relaxed = __ATOMIC_RELAXED,
+  memory_order_acquire = __ATOMIC_ACQUIRE,
+  memory_order_release = __ATOMIC_RELEASE,
+  memory_order_acq_rel = __ATOMIC_ACQ_REL,
+  memory_order_seq_cst = __ATOMIC_SEQ_CST
+} memory_order;
+
+typedef half __attribute__((ext_vector_type(2))) half2;
+
+// CHECK-LABEL: test_global_add
+// CHECK: call double @llvm.amdgcn.global.atomic.fadd.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL:  test_global_add$local:
+// GFX90A:  global_atomic_add_f64
+void test_global_add(__global double *addr, double x) {
+  double *rtn;
+  *rtn = __builtin_amdgcn_global_atomic_fadd_f64(addr, x, memory_order_relaxed);
+}
+
+// CHECK-LABEL: test_global_addf
+// CHECK: call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %{{.*}}, float %{{.*}})
+// GFX90A-LABEL:  test_global_addf$local:
+// GFX90A: global_atomic_add_f32
+void test_global_addf(__global float *addr, float x) {
+  float *rtn;
+  *rtn = __builtin_amdgcn_global_atomic_fadd_f32(addr, x, memory_order_relaxed);
+}
+
+// CHECK-LABEL: test_global_add2h
+// CHECK: call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %{{.*}}, <2 x half> %{{.*}})
+// GFX90A-LABEL:  test_global_add2h$local
+// GFX90A: global_atomic_pk_add_f16
+void test_global_add2h(__global half2 *addr, half2 x, __global half2 *out){
+  *out = __builtin_amdgcn_global_atomic_fadd_v2f16(addr, x, memory_order_relaxed);
+}
+
+// CHECK-LABEL: test_global_global_min
+// CHECK: call double @llvm.amdgcn.global.atomic.fmin.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL:  test_global_global_min$local
+// GFX90A:  global_atomic_min_f64
+void test_global_global_min(__global double *addr, double x){
+  double *rtn;
+  *rtn = __builtin_amdgcn_global_atomic_fmin_f64(addr, x, memory_order_relaxed);
+}
+
+// CHECK-LABEL: test_global_max
+// CHECK: call double @llvm.amdgcn.global.atomic.fmax.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL:  test_global_max$local
+// GFX90A:  global_atomic_max_f64
+void test_global_max(__global double *addr, double x){
+  double *rtn;
+  *rtn = __builtin_amdgcn_global_atomic_fmax_f64(addr, x, memory_order_relaxed);
+}
+
+// CHECK-LABEL: test_flat_add_local
+// CHECK: call double @llvm.amdgcn.flat.atomic.fadd.f64.p3f64.f64(double addrspace(3)* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL:  test_flat_add_local$local
+// GFX90A:  ds_add_rtn_f64
+void test_flat_add_local(__local double *addr, double x){
+  double *rtn;
+  *rtn = __builtin_amdgcn_flat_atomic_fadd_f64(addr, x, memory_order_relaxed);
+}
+
+// CHECK-LABEL: test_flat_global_add
+// CHECK: call double @llvm.amdgcn.flat.atomic.fadd.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL:  test_flat_global_add$local
+// GFX90A:  global_atomic_add_f64
+void test_flat_global_add(__global double *addr, double x){
+  double *rtn;
+  *rtn = __builtin_amdgcn_flat_atomic_fadd_f64(addr, x, memory_order_relaxed);
+}
+
+// CHECK-LABEL: test_flat_min_constant
+// CHECK: call double @llvm.amdgcn.flat.atomic.fmin.f64.p0f64.f64(double* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL:  test_flat_min_constant$local
+// GFX90A:  flat_atomic_min_f64
+void test_flat_min_constant(__generic double *addr, double x){
+  double *rtn;
+  *rtn = __builtin_amdgcn_flat_atomic_fmin_f64(addr, x, memory_order_relaxed);
+}
+
+// CHECK-LABEL: test_flat_global_min
+// CHECK: call double @llvm.amdgcn.flat.atomic.fmin.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A:  test_flat_global_min$local
+// GFX90A:  global_atomic_min_f64
+void test_flat_global_min(__global double *addr, double x){
+  double *rtn;
+  *rtn = __builtin_amdgcn_flat_atomic_fmin_f64(addr, x, memory_order_relaxed);
+}
+
+// CHECK-LABEL: test_flat_max_constant
+// CHECK: call double @llvm.amdgcn.flat.atomic.fmax.f64.p0f64.f64(double* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL:  test_flat_max_constant$local
+// GFX90A:  flat_atomic_max_f64
+void test_flat_max_constant(__generic double *

[PATCH] D106909: [clang] Add clang builtins support for gfx90a

2021-07-31 Thread Anshil Gandhi via Phabricator via cfe-commits
gandhi21299 updated this revision to Diff 363159.
gandhi21299 marked an inline comment as done.
gandhi21299 added a comment.

addressed reviewers' feedback:

- changed a builtin name,
- corrected tests,
- minor formatting nits


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D106909/new/

https://reviews.llvm.org/D106909

Files:
  clang/include/clang/Basic/BuiltinsAMDGPU.def
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGenOpenCL/builtins-fp-atomics.cl

Index: clang/test/CodeGenOpenCL/builtins-fp-atomics.cl
===
--- /dev/null
+++ clang/test/CodeGenOpenCL/builtins-fp-atomics.cl
@@ -0,0 +1,132 @@
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx90a \
+// RUN:   %s -S -emit-llvm -o - | FileCheck %s -check-prefix=CHECK
+
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx90a \
+// RUN:   -S -o - %s | FileCheck -check-prefix=GFX90A %s
+
+
+typedef enum memory_order {
+  memory_order_relaxed = __ATOMIC_RELAXED,
+  memory_order_acquire = __ATOMIC_ACQUIRE,
+  memory_order_release = __ATOMIC_RELEASE,
+  memory_order_acq_rel = __ATOMIC_ACQ_REL,
+  memory_order_seq_cst = __ATOMIC_SEQ_CST
+} memory_order;
+
+typedef half __attribute__((ext_vector_type(2))) half2;
+
+// CHECK-LABEL: test_global_add
+// CHECK: call double @llvm.amdgcn.global.atomic.fadd.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL:  test_global_add$local:
+// GFX90A:  global_atomic_add_f64
+void test_global_add(__global double *addr, double x) {
+  double *rtn;
+  *rtn = __builtin_amdgcn_global_atomic_fadd_f64(addr, x, memory_order_relaxed);
+}
+
+// CHECK-LABEL: test_global_addf
+// CHECK: call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %{{.*}}, float %{{.*}})
+// GFX90A-LABEL:  test_global_addf$local:
+// GFX90A: global_atomic_add_f32
+void test_global_addf(__global float *addr, float x) {
+  float *rtn;
+  *rtn = __builtin_amdgcn_global_atomic_fadd_f32(addr, x, memory_order_relaxed);
+}
+
+// CHECK-LABEL: test_global_add2h
+// CHECK: call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %{{.*}}, <2 x half> %{{.*}})
+// GFX90A-LABEL:  test_global_add2h$local
+// GFX90A: global_atomic_pk_add_f16
+void test_global_add2h(__global half2 *addr, half2 x, __global half2 *out){
+  *out = __builtin_amdgcn_global_atomic_fadd_v2f16(addr, x, memory_order_relaxed);
+}
+
+// CHECK-LABEL: test_global_global_min
+// CHECK: call double @llvm.amdgcn.global.atomic.fmin.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL:  test_global_global_min$local
+// GFX90A:  global_atomic_min_f64
+void test_global_global_min(__global double *addr, double x){
+  double *rtn;
+  *rtn = __builtin_amdgcn_global_atomic_fmin_f64(addr, x, memory_order_relaxed);
+}
+
+// CHECK-LABEL: test_global_max
+// CHECK: call double @llvm.amdgcn.global.atomic.fmax.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL:  test_global_max$local
+// GFX90A:  global_atomic_max_f64
+void test_global_max(__global double *addr, double x){
+  double *rtn;
+  *rtn = __builtin_amdgcn_global_atomic_fmax_f64(addr, x, memory_order_relaxed);
+}
+
+// CHECK-LABEL: test_flat_add_local
+// CHECK: call double @llvm.amdgcn.flat.atomic.fadd.f64.p3f64.f64(double addrspace(3)* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL:  test_flat_add_local$local
+// GFX90A:  ds_add_rtn_f64
+void test_flat_add_local(__local double *addr, double x){
+  double *rtn;
+  *rtn = __builtin_amdgcn_flat_atomic_fadd_f64(addr, x, memory_order_relaxed);
+}
+
+// CHECK-LABEL: test_flat_global_add
+// CHECK: call double @llvm.amdgcn.flat.atomic.fadd.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL:  test_flat_global_add$local
+// GFX90A:  global_atomic_add_f64
+void test_flat_global_add(__global double *addr, double x){
+  double *rtn;
+  *rtn = __builtin_amdgcn_flat_atomic_fadd_f64(addr, x, memory_order_relaxed);
+}
+
+// CHECK-LABEL: test_flat_min_constant
+// CHECK: call double @llvm.amdgcn.flat.atomic.fmin.f64.p0f64.f64(double* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL:  test_flat_min_constant$local
+// GFX90A:  flat_atomic_min_f64
+void test_flat_min_constant(__generic double *addr, double x){
+  double *rtn;
+  *rtn = __builtin_amdgcn_flat_atomic_fmin_f64(addr, x, memory_order_relaxed);
+}
+
+// CHECK-LABEL: test_flat_global_min
+// CHECK: call double @llvm.amdgcn.flat.atomic.fmin.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A:  test_flat_global_min$local
+// GFX90A:  global_atomic_min_f64
+void test_flat_global_min(__global double *addr, double x){
+  double *rtn;
+  *rtn = __builtin_amdgcn_flat_atomic_fmin_f64(addr, x, memory_order_relaxed);
+}
+
+// CHECK-LABEL: test_flat_max_constant
+// CHECK: call double @llvm.amdgcn.flat.atomic.fmax.f64.p0f64.f64(double* %{{.*}}, double %{{.*}}

[PATCH] D106909: [clang] Add clang builtins support for gfx90a

2021-07-30 Thread Anshil Gandhi via Phabricator via cfe-commits
gandhi21299 marked 7 inline comments as done.
gandhi21299 added inline comments.



Comment at: clang/include/clang/Basic/BuiltinsAMDGPU.def:201
+TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_f32, "ff*1fi", "t", 
"gfx90a-insts")
+TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_2f16, "hh*1hi", "t", 
"gfx90a-insts")
+TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fmin_f64, "dd*1di", "t", 
"gfx90a-insts")

arsenm wrote:
> yaxunl wrote:
> > arsenm wrote:
> > > "_2f16" looks weird to me. The instruction names call it "pk"
> > This is to have a consistent postfix naming convention, since the stem part 
> > here are the same. the postfix is for the argument type of the builtin 
> > function.
> Just a plain 2 isn't consistent either. The llvm type naming convention would 
> add a v prefix, but the builtins should probably follow the instructions
Yea, v2f16 looks reasonable.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D106909/new/

https://reviews.llvm.org/D106909

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D106909: [clang] Add clang builtins support for gfx90a

2021-07-30 Thread Matt Arsenault via Phabricator via cfe-commits
arsenm added inline comments.



Comment at: clang/include/clang/Basic/BuiltinsAMDGPU.def:201
+TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_f32, "ff*1fi", "t", 
"gfx90a-insts")
+TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_2f16, "hh*1hi", "t", 
"gfx90a-insts")
+TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fmin_f64, "dd*1di", "t", 
"gfx90a-insts")

yaxunl wrote:
> arsenm wrote:
> > "_2f16" looks weird to me. The instruction names call it "pk"
> This is to have a consistent postfix naming convention, since the stem part 
> here are the same. the postfix is for the argument type of the builtin 
> function.
Just a plain 2 isn't consistent either. The llvm type naming convention would 
add a v prefix, but the builtins should probably follow the instructions



Comment at: clang/lib/CodeGen/CGBuiltin.cpp:16114
+Intrinsic::ID IID;
+llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
+switch (BuiltinID) {

gandhi21299 wrote:
> arsenm wrote:
> > Initializing this here is strange, sink down to the double case
> You mean push it down after line 16119?
Yes



Comment at: clang/test/CodeGenOpenCL/builtins-fp-atomics.cl:112
+kernel void test_flat_global_max(__global double *addr, double x){
+  __builtin_amdgcn_flat_atomic_fmax_f64(addr, x, memory_order_relaxed);
+}

gandhi21299 wrote:
> arsenm wrote:
> > If you're going to bother testing the ISA, is it worth testing rtn and no 
> > rtn versions?
> Sorry, what do you mean by rtn version?
Most atomics can be optimized if they don't return the in memory value if the 
value is unused


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D106909/new/

https://reviews.llvm.org/D106909

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D106909: [clang] Add clang builtins support for gfx90a

2021-07-30 Thread Anshil Gandhi via Phabricator via cfe-commits
gandhi21299 added inline comments.



Comment at: clang/lib/CodeGen/CGBuiltin.cpp:16114
+Intrinsic::ID IID;
+llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
+switch (BuiltinID) {

arsenm wrote:
> Initializing this here is strange, sink down to the double case
You mean push it down after line 16119?



Comment at: clang/test/CodeGenOpenCL/builtins-fp-atomics.cl:112
+kernel void test_flat_global_max(__global double *addr, double x){
+  __builtin_amdgcn_flat_atomic_fmax_f64(addr, x, memory_order_relaxed);
+}

arsenm wrote:
> If you're going to bother testing the ISA, is it worth testing rtn and no rtn 
> versions?
Sorry, what do you mean by rtn version?


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D106909/new/

https://reviews.llvm.org/D106909

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D106909: [clang] Add clang builtins support for gfx90a

2021-07-30 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl added inline comments.



Comment at: clang/include/clang/Basic/BuiltinsAMDGPU.def:201
+TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_f32, "ff*1fi", "t", 
"gfx90a-insts")
+TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_2f16, "hh*1hi", "t", 
"gfx90a-insts")
+TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fmin_f64, "dd*1di", "t", 
"gfx90a-insts")

arsenm wrote:
> "_2f16" looks weird to me. The instruction names call it "pk"
This is to have a consistent postfix naming convention, since the stem part 
here are the same. the postfix is for the argument type of the builtin function.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D106909/new/

https://reviews.llvm.org/D106909

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D106909: [clang] Add clang builtins support for gfx90a

2021-07-30 Thread Matt Arsenault via Phabricator via cfe-commits
arsenm added a comment.

Missing sema test for rejecting the builtins on targets that don't support this




Comment at: clang/include/clang/Basic/BuiltinsAMDGPU.def:201
+TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_f32, "ff*1fi", "t", 
"gfx90a-insts")
+TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_2f16, "hh*1hi", "t", 
"gfx90a-insts")
+TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fmin_f64, "dd*1di", "t", 
"gfx90a-insts")

"_2f16" looks weird to me. The instruction names call it "pk"



Comment at: clang/lib/CodeGen/CGBuiltin.cpp:16114
+Intrinsic::ID IID;
+llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
+switch (BuiltinID) {

Initializing this here is strange, sink down to the double case



Comment at: clang/test/CodeGenOpenCL/builtins-fp-atomics.cl:112
+kernel void test_flat_global_max(__global double *addr, double x){
+  __builtin_amdgcn_flat_atomic_fmax_f64(addr, x, memory_order_relaxed);
+}

If you're going to bother testing the ISA, is it worth testing rtn and no rtn 
versions?


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D106909/new/

https://reviews.llvm.org/D106909

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D106909: [clang] Add clang builtins support for gfx90a

2021-07-30 Thread Anshil Gandhi via Phabricator via cfe-commits
gandhi21299 added a comment.

Passed PSDB


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D106909/new/

https://reviews.llvm.org/D106909

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D106909: [clang] Add clang builtins support for gfx90a

2021-07-29 Thread Anshil Gandhi via Phabricator via cfe-commits
gandhi21299 updated this revision to Diff 362804.
gandhi21299 added a comment.

fixed tests by replacing checks for the functions with generic-qualified 
arguments.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D106909/new/

https://reviews.llvm.org/D106909

Files:
  clang/include/clang/Basic/BuiltinsAMDGPU.def
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGenOpenCL/builtins-fp-atomics.cl

Index: clang/test/CodeGenOpenCL/builtins-fp-atomics.cl
===
--- /dev/null
+++ clang/test/CodeGenOpenCL/builtins-fp-atomics.cl
@@ -0,0 +1,131 @@
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx90a \
+// RUN:   %s -S -emit-llvm -o - | FileCheck %s -check-prefix=CHECK
+
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx90a \
+// RUN:   -S -o - %s | FileCheck -check-prefix=GFX90A %s
+
+
+typedef enum memory_order {
+  memory_order_relaxed = __ATOMIC_RELAXED,
+  memory_order_acquire = __ATOMIC_ACQUIRE,
+  memory_order_release = __ATOMIC_RELEASE,
+  memory_order_acq_rel = __ATOMIC_ACQ_REL,
+  memory_order_seq_cst = __ATOMIC_SEQ_CST
+} memory_order;
+
+typedef half __attribute__((ext_vector_type(2))) half2;
+
+// CHECK-LABEL: test_global_add
+// CHECK: tail call double @llvm.amdgcn.global.atomic.fadd.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A:  test_global_add
+// GFX90A:  global_atomic_add_f64 v2, v[0:1], s[0:1]
+// GFX90A:  s_endpgm
+kernel void test_global_add(__global double *addr, double x) {
+  __builtin_amdgcn_global_atomic_fadd_f64(addr, x, memory_order_relaxed);
+}
+
+// CHECK-LABEL: test_global_addf
+// CHECK: tail call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %{{.*}}, float %{{.*}})
+// GFX90A-LABEL: test_global_addf
+// GFX90A: global_atomic_add_f32 v0, v1, s[0:1]
+// GFX90A: s_endpgm
+kernel void test_global_addf(__global float *addr, float x) {
+  __builtin_amdgcn_global_atomic_fadd_f32(addr, x, memory_order_relaxed);
+}
+
+// CHECK-LABEL: test_global_add2h
+// CHECK: tail call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %{{.*}}, <2 x half> %{{.*}})
+// GFX90A-LABEL: test_global_add2h
+// GFX90A: global_atomic_pk_add_f16 v0, v1, s[0:1]
+// GFX90A: s_endpgm
+kernel void test_global_add2h(__global half2 *addr, half2 x){
+  __builtin_amdgcn_global_atomic_fadd_2f16(addr, x, memory_order_relaxed);
+}
+
+// CHECK-LABEL: test_global_global_min
+// CHECK: tail call double @llvm.amdgcn.global.atomic.fmin.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL:  test_global_global_min
+// GFX90A:  global_atomic_min_f64 v2, v[0:1], s[0:1]
+// GFX90A:  s_endpgm
+kernel void test_global_global_min(__global double *addr, double x){
+  __builtin_amdgcn_global_atomic_fmin_f64(addr, x, memory_order_relaxed);
+}
+
+// CHECK-LABEL: test_global_max
+// CHECK: tail call double @llvm.amdgcn.global.atomic.fmax.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A:  test_global_max
+// GFX90A:  global_atomic_max_f64 v2, v[0:1], s[0:1]
+// GFX90A:  s_endpgm
+kernel void test_global_max(__global double *addr, double x){
+  __builtin_amdgcn_global_atomic_fmax_f64(addr, x, memory_order_relaxed);
+}
+
+// CHECK-LABEL: test_flat_add_local
+// CHECK: tail call double @llvm.amdgcn.flat.atomic.fadd.f64.p3f64.f64(double addrspace(3)* %{{.*}}, double %{{.*}})
+// GFX90A:  test_flat_add_local
+// GFX90A:  ds_add_f64 v2, v[0:1]
+// GFX90A:  s_endpgm
+kernel void test_flat_add_local(__local double *addr, double x){
+  __builtin_amdgcn_flat_atomic_fadd_f64(addr, x, memory_order_relaxed);
+}
+
+// CHECK-LABEL: test_flat_global_add
+// CHECK: tail call double @llvm.amdgcn.flat.atomic.fadd.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A:  test_flat_global_add
+// GFX90A:  global_atomic_add_f64
+// GFX90A:  s_endpgm
+kernel void test_flat_global_add(__global double *addr, double x){
+  __builtin_amdgcn_flat_atomic_fadd_f64(addr, x, memory_order_relaxed);
+}
+
+// CHECK-LABEL: test_flat_min_constant
+// CHECK: tail call double @llvm.amdgcn.flat.atomic.fmin.f64.p0f64.f64(double* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL:  test_flat_min_constant
+// GFX90A:  flat_atomic_min_f64
+void test_flat_min_constant(__generic double *addr, double x){
+  __builtin_amdgcn_flat_atomic_fmin_f64(addr, x, memory_order_relaxed);
+}
+
+// CHECK-LABEL: test_flat_global_min
+// CHECK: tail call double @llvm.amdgcn.flat.atomic.fmin.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A:  test_flat_global_min
+// GFX90A:  global_atomic_min_f64
+// GFX90A:  s_endpgm
+kernel void test_flat_global_min(__global double *addr, double x){
+  __builtin_amdgcn_flat_atomic_fmin_f64(addr, x, memory_order_relaxed);
+}
+
+// CHECK-LABEL: test_flat_max_constant
+// CHECK: tail call double @llvm.amdgcn.flat.atomic.fmax.f64.p0f64.f64(double* %{{.*}}, doub

[PATCH] D106909: [clang] Add clang builtins support for gfx90a

2021-07-29 Thread Anshil Gandhi via Phabricator via cfe-commits
gandhi21299 updated this revision to Diff 362555.
gandhi21299 added a comment.

- eliminated scope argument, seemed irrelevant


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D106909/new/

https://reviews.llvm.org/D106909

Files:
  clang/include/clang/Basic/BuiltinsAMDGPU.def
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGenOpenCL/builtins-fp-atomics.cl

Index: clang/test/CodeGenOpenCL/builtins-fp-atomics.cl
===
--- /dev/null
+++ clang/test/CodeGenOpenCL/builtins-fp-atomics.cl
@@ -0,0 +1,133 @@
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx90a \
+// RUN:   %s -S -emit-llvm -o - | FileCheck %s -check-prefix=CHECK
+
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx90a \
+// RUN:   -S -o - %s | FileCheck -check-prefix=GFX90A %s
+
+
+typedef enum memory_order {
+  memory_order_relaxed = __ATOMIC_RELAXED,
+  memory_order_acquire = __ATOMIC_ACQUIRE,
+  memory_order_release = __ATOMIC_RELEASE,
+  memory_order_acq_rel = __ATOMIC_ACQ_REL,
+  memory_order_seq_cst = __ATOMIC_SEQ_CST
+} memory_order;
+
+typedef half __attribute__((ext_vector_type(2))) half2;
+
+// CHECK-LABEL: test_global_add
+// CHECK: tail call double @llvm.amdgcn.global.atomic.fadd.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A:  test_global_add
+// GFX90A:  global_atomic_add_f64 v2, v[0:1], s[0:1]
+// GFX90A:  s_endpgm
+kernel void test_global_add(__global double *addr, double x) {
+  __builtin_amdgcn_global_atomic_fadd_f64(addr, x, memory_order_relaxed);
+}
+
+// CHECK-LABEL: test_global_addf
+// CHECK: tail call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %{{.*}}, float %{{.*}})
+// GFX90A-LABEL: test_global_addf
+// GFX90A: global_atomic_add_f32 v0, v1, s[0:1]
+// GFX90A: s_endpgm
+kernel void test_global_addf(__global float *addr, float x) {
+  __builtin_amdgcn_global_atomic_fadd_f32(addr, x, memory_order_relaxed);
+}
+
+// CHECK-LABEL: test_global_add2h
+// CHECK: tail call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %{{.*}}, <2 x half> %{{.*}})
+// GFX90A-LABEL: test_global_add2h
+// GFX90A: global_atomic_pk_add_f16 v0, v1, s[0:1]
+// GFX90A: s_endpgm
+kernel void test_global_add2h(__global half2 *addr, half2 x){
+  __builtin_amdgcn_global_atomic_fadd_2f16(addr, x, memory_order_relaxed);
+}
+
+// CHECK-LABEL: test_global_global_min
+// CHECK: tail call double @llvm.amdgcn.global.atomic.fmin.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL:  test_global_global_min
+// GFX90A:  global_atomic_min_f64 v2, v[0:1], s[0:1]
+// GFX90A:  s_endpgm
+kernel void test_global_global_min(__global double *addr, double x){
+  __builtin_amdgcn_global_atomic_fmin_f64(addr, x, memory_order_relaxed);
+}
+
+// CHECK-LABEL: test_global_max
+// CHECK: tail call double @llvm.amdgcn.global.atomic.fmax.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A:  test_global_max
+// GFX90A:  global_atomic_max_f64 v2, v[0:1], s[0:1]
+// GFX90A:  s_endpgm
+kernel void test_global_max(__global double *addr, double x){
+  __builtin_amdgcn_global_atomic_fmax_f64(addr, x, memory_order_relaxed);
+}
+
+// CHECK-LABEL: test_flat_add_local
+// CHECK: tail call double @llvm.amdgcn.flat.atomic.fadd.f64.p3f64.f64(double addrspace(3)* %{{.*}}, double %{{.*}})
+// GFX90A:  test_flat_add_local
+// GFX90A:  ds_add_f64 v2, v[0:1]
+// GFX90A:  s_endpgm
+kernel void test_flat_add_local(__local double *addr, double x){
+  __builtin_amdgcn_flat_atomic_fadd_f64(addr, x, memory_order_relaxed);
+}
+
+// CHECK-LABEL: test_flat_global_add
+// CHECK: tail call double @llvm.amdgcn.flat.atomic.fadd.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A:  test_flat_global_add
+// GFX90A:  global_atomic_add_f64
+// GFX90A:  s_endpgm
+kernel void test_flat_global_add(__global double *addr, double x){
+  __builtin_amdgcn_flat_atomic_fadd_f64(addr, x, memory_order_relaxed);
+}
+
+// CHECK-LABEL: test_flat_min_constant
+// CHECK: tail call double @llvm.amdgcn.flat.atomic.fmin.f64.p4f64.f64(double addrspace(4)* %{{.*}}, double %{{.*}})
+// GFX90A:  test_flat_min_constant
+// GFX90A:  global_atomic_min_f64
+// GFX90A:  s_endpgm
+void test_flat_min_constant(__generic double *addr, double x){
+  __builtin_amdgcn_flat_atomic_fmin_f64(addr, x, memory_order_relaxed);
+}
+
+// CHECK-LABEL: test_flat_global_min
+// CHECK: tail call double @llvm.amdgcn.flat.atomic.fmin.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A:  test_flat_global_min
+// GFX90A:  global_atomic_min_f64
+// GFX90A:  s_endpgm
+kernel void test_flat_global_min(__global double *addr, double x){
+  __builtin_amdgcn_flat_atomic_fmin_f64(addr, x, memory_order_relaxed);
+}
+
+// CHECK-LABEL: test_flat_max_constant
+// CHECK: tail call double @llvm.amdgcn.flat.atomic.fmax.f64.p4f64.f64(double addrspace(4)* %{{.*}}

[PATCH] D106909: [clang] Add clang builtins support for gfx90a

2021-07-29 Thread Anshil Gandhi via Phabricator via cfe-commits
gandhi21299 updated this revision to Diff 362522.
gandhi21299 added a comment.
Herald added subscribers: llvm-commits, foad, hiraditya.
Herald added a project: LLVM.

- removed `kernel` from functions taking in `__generic` qualified `addr`


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D106909/new/

https://reviews.llvm.org/D106909

Files:
  clang/test/CodeGenCUDA/fp-atomics-optremarks.cu
  clang/test/CodeGenOpenCL/builtins-fp-atomics.cl
  clang/test/CodeGenOpenCL/fp-atomics-optremarks-gfx90a.cl
  llvm/lib/Target/AMDGPU/SIISelLowering.cpp
  llvm/lib/Target/AMDGPU/SIISelLowering.h

Index: llvm/lib/Target/AMDGPU/SIISelLowering.h
===
--- llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -30,6 +30,7 @@
 class SITargetLowering final : public AMDGPUTargetLowering {
 private:
   const GCNSubtarget *Subtarget;
+  OptimizationRemarkEmitter *ORE;
 
 public:
   MVT getRegisterTypeForCallingConv(LLVMContext &Context,
Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -19,6 +19,7 @@
 #include "SIRegisterInfo.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/LegacyDivergenceAnalysis.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/FunctionLoweringInfo.h"
@@ -12117,6 +12118,27 @@
   return DenormMode == DenormalMode::getIEEE();
 }
 
+static TargetLowering::AtomicExpansionKind
+atomicExpandReturn(OptimizationRemarkEmitter *ORE, AtomicRMWInst *RMW,
+   TargetLowering::AtomicExpansionKind Kind, bool UnsafeFlag) {
+  ORE = new OptimizationRemarkEmitter(RMW->getFunction());
+  if (Kind == TargetLowering::AtomicExpansionKind::CmpXChg) {
+ORE->emit([&]() {
+  OptimizationRemark Remark(DEBUG_TYPE, "Passed", RMW->getFunction());
+  Remark << "An FP atomic instruction was expanded into a CAS loop.";
+  return Remark;
+});
+  } else if (Kind == TargetLowering::AtomicExpansionKind::None && UnsafeFlag) {
+ORE->emit([&]() {
+  OptimizationRemark Remark(DEBUG_TYPE, "Passed", RMW->getFunction());
+  Remark << "An unsafe hardware instruction was generated.";
+  return Remark;
+});
+  }
+  delete ORE;
+  return Kind;
+}
+
 TargetLowering::AtomicExpansionKind
 SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
   switch (RMW->getOperation()) {
@@ -12132,35 +12154,43 @@
   return AtomicExpansionKind::CmpXChg;
 
 unsigned AS = RMW->getPointerAddressSpace();
-
+bool UnsafeFPAtomicFlag = RMW->getFunction()
+  ->getFnAttribute("amdgpu-unsafe-fp-atomics")
+  .getValueAsBool();
 if ((AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) &&
  Subtarget->hasAtomicFaddInsts()) {
   // The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe
   // floating point atomic instructions. May generate more efficient code,
   // but may not respect rounding and denormal modes, and may give incorrect
   // results for certain memory destinations.
-  if (RMW->getFunction()
-  ->getFnAttribute("amdgpu-unsafe-fp-atomics")
-  .getValueAsString() != "true")
-return AtomicExpansionKind::CmpXChg;
+  if (!UnsafeFPAtomicFlag)
+return atomicExpandReturn(ORE, RMW, AtomicExpansionKind::CmpXChg,
+  UnsafeFPAtomicFlag);
+  atomicExpandReturn(ORE, RMW, AtomicExpansionKind::None, 
+UnsafeFPAtomicFlag);
 
   if (Subtarget->hasGFX90AInsts()) {
 if (Ty->isFloatTy() && AS == AMDGPUAS::FLAT_ADDRESS)
-  return AtomicExpansionKind::CmpXChg;
+  return atomicExpandReturn(ORE, RMW, AtomicExpansionKind::CmpXChg,
+UnsafeFPAtomicFlag);
 
 auto SSID = RMW->getSyncScopeID();
 if (SSID == SyncScope::System ||
 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as"))
-  return AtomicExpansionKind::CmpXChg;
+  return atomicExpandReturn(ORE, RMW, AtomicExpansionKind::CmpXChg,
+UnsafeFPAtomicFlag);
 
-return AtomicExpansionKind::None;
+return atomicExpandReturn(ORE, RMW, AtomicExpansionKind::None,
+  UnsafeFPAtomicFlag);
   }
 
   if (AS == AMDGPUAS::FLAT_ADDRESS)
-return AtomicExpansionKind::CmpXChg;
+return atomicExpandReturn(ORE, RMW, AtomicExpansionKind::CmpXChg,
+  UnsafeFPAtomicFlag);
 
-  return RMW->use_empty() ? AtomicExpansionKind::None
-  : AtomicExpansionKind::CmpXChg;
+  auto Kind = RMW->use_emp

[PATCH] D106909: [clang] Add clang builtins support for gfx90a

2021-07-29 Thread Anshil Gandhi via Phabricator via cfe-commits
gandhi21299 updated this revision to Diff 362487.
gandhi21299 added a comment.

- added scope argument
- removed irrelevant tests and updated them
- replaced __constant by __generic qualifier for two of the test functions


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D106909/new/

https://reviews.llvm.org/D106909

Files:
  clang/include/clang/Basic/BuiltinsAMDGPU.def
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGenOpenCL/builtins-fp-atomics.cl

Index: clang/test/CodeGenOpenCL/builtins-fp-atomics.cl
===
--- /dev/null
+++ clang/test/CodeGenOpenCL/builtins-fp-atomics.cl
@@ -0,0 +1,133 @@
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx90a \
+// RUN:   %s -S -emit-llvm -o - | FileCheck %s -check-prefix=CHECK
+
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx90a \
+// RUN:   -S -o - %s | FileCheck -check-prefix=GFX90A %s
+
+
+typedef enum memory_order {
+  memory_order_relaxed = __ATOMIC_RELAXED,
+  memory_order_acquire = __ATOMIC_ACQUIRE,
+  memory_order_release = __ATOMIC_RELEASE,
+  memory_order_acq_rel = __ATOMIC_ACQ_REL,
+  memory_order_seq_cst = __ATOMIC_SEQ_CST
+} memory_order;
+
+typedef half __attribute__((ext_vector_type(2))) half2;
+
+// CHECK-LABEL: test_global_add
+// CHECK: tail call double @llvm.amdgcn.global.atomic.fadd.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A:  test_global_add
+// GFX90A:  global_atomic_add_f64 v2, v[0:1], s[0:1]
+// GFX90A:  s_endpgm
+kernel void test_global_add(__global double *addr, double x) {
+  __builtin_amdgcn_global_atomic_fadd_f64(addr, x, memory_order_relaxed, "workgroup");
+}
+
+// CHECK-LABEL: test_global_addf
+// CHECK: tail call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %{{.*}}, float %{{.*}})
+// GFX90A-LABEL: test_global_addf
+// GFX90A: global_atomic_add_f32 v0, v1, s[0:1]
+// GFX90A: s_endpgm
+kernel void test_global_addf(__global float *addr, float x) {
+  __builtin_amdgcn_global_atomic_fadd_f32(addr, x, memory_order_relaxed, "workgroup");
+}
+
+// CHECK-LABEL: test_global_add2h
+// CHECK: tail call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %{{.*}}, <2 x half> %{{.*}})
+// GFX90A-LABEL: test_global_add2h
+// GFX90A: global_atomic_pk_add_f16 v0, v1, s[0:1]
+// GFX90A: s_endpgm
+kernel void test_global_add2h(__global half2 *addr, half2 x){
+  __builtin_amdgcn_global_atomic_fadd_2f16(addr, x, memory_order_relaxed, "workgroup");
+}
+
+// CHECK-LABEL: test_global_global_min
+// CHECK: tail call double @llvm.amdgcn.global.atomic.fmin.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL:  test_global_global_min
+// GFX90A:  global_atomic_min_f64 v2, v[0:1], s[0:1]
+// GFX90A:  s_endpgm
+kernel void test_global_global_min(__global double *addr, double x){
+  __builtin_amdgcn_global_atomic_fmin_f64(addr, x, memory_order_relaxed, "workgroup");
+}
+
+// CHECK-LABEL: test_global_max
+// CHECK: tail call double @llvm.amdgcn.global.atomic.fmax.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A:  test_global_max
+// GFX90A:  global_atomic_max_f64 v2, v[0:1], s[0:1]
+// GFX90A:  s_endpgm
+kernel void test_global_max(__global double *addr, double x){
+  __builtin_amdgcn_global_atomic_fmax_f64(addr, x, memory_order_relaxed, "workgroup");
+}
+
+// CHECK-LABEL: test_flat_add_local
+// CHECK: tail call double @llvm.amdgcn.flat.atomic.fadd.f64.p3f64.f64(double addrspace(3)* %{{.*}}, double %{{.*}})
+// GFX90A:  test_flat_add_local
+// GFX90A:  ds_add_f64 v2, v[0:1]
+// GFX90A:  s_endpgm
+kernel void test_flat_add_local(__local double *addr, double x){
+  __builtin_amdgcn_flat_atomic_fadd_f64(addr, x, memory_order_relaxed, "workgroup");
+}
+
+// CHECK-LABEL: test_flat_global_add
+// CHECK: tail call double @llvm.amdgcn.flat.atomic.fadd.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A:  test_flat_global_add
+// GFX90A:  global_atomic_add_f64
+// GFX90A:  s_endpgm
+kernel void test_flat_global_add(__global double *addr, double x){
+  __builtin_amdgcn_flat_atomic_fadd_f64(addr, x, memory_order_relaxed, "workgroup");
+}
+
+// CHECK-LABEL: test_flat_min_constant
+// CHECK: tail call double @llvm.amdgcn.flat.atomic.fmin.f64.p4f64.f64(double addrspace(4)* %{{.*}}, double %{{.*}})
+// GFX90A:  test_flat_min_constant
+// GFX90A:  global_atomic_min_f64
+// GFX90A:  s_endpgm
+kernel void test_flat_min_constant(__generic double *addr, double x){
+  __builtin_amdgcn_flat_atomic_fmin_f64(addr, x, memory_order_relaxed, "workgroup");
+}
+
+// CHECK-LABEL: test_flat_global_min
+// CHECK: tail call double @llvm.amdgcn.flat.atomic.fmin.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A:  test_flat_global_min
+// GFX90A:  global_atomic_min_f64
+// GFX90A:  s_endpgm
+kernel void test_flat_global_min(__global double *addr, double x){
+  __builtin

[PATCH] D106909: [clang] Add clang builtins support for gfx90a

2021-07-28 Thread Anshil Gandhi via Phabricator via cfe-commits
gandhi21299 updated this revision to Diff 362433.
gandhi21299 added a comment.

- added more tests with supported combinations of address spaces
- minor nits


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D106909/new/

https://reviews.llvm.org/D106909

Files:
  clang/include/clang/Basic/BuiltinsAMDGPU.def
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGenOpenCL/builtins-fp-atomics.cl

Index: clang/test/CodeGenOpenCL/builtins-fp-atomics.cl
===
--- /dev/null
+++ clang/test/CodeGenOpenCL/builtins-fp-atomics.cl
@@ -0,0 +1,210 @@
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx90a \
+// RUN:   %s -S -emit-llvm -o - | FileCheck %s -check-prefix=CHECK
+
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx90a \
+// RUN:   -S -o - %s | FileCheck -check-prefix=GFX90A %s
+
+
+typedef enum memory_order {
+  memory_order_relaxed = __ATOMIC_RELAXED,
+  memory_order_acquire = __ATOMIC_ACQUIRE,
+  memory_order_release = __ATOMIC_RELEASE,
+  memory_order_acq_rel = __ATOMIC_ACQ_REL,
+  memory_order_seq_cst = __ATOMIC_SEQ_CST
+} memory_order;
+
+typedef half __attribute__((ext_vector_type(2))) half2;
+
+// CHECK-LABEL: test_global_add
+// CHECK: tail call double @llvm.amdgcn.global.atomic.fadd.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A:  test_global_add
+// GFX90A:  global_atomic_add_f64 v2, v[0:1], s[0:1]
+// GFX90A:  s_endpgm
+kernel void test_global_add(__global double *addr, double x) {
+  __builtin_amdgcn_global_atomic_fadd_f64(addr, x, memory_order_relaxed);
+}
+
+// CHECK-LABEL: test_global_constant_add
+// CHECK: tail call double @llvm.amdgcn.global.atomic.fadd.f64.p4f64.f64(double addrspace(4)* %{{.*}}, double %{{.*}})
+// GFX90A:  test_global_constant_add
+// GFX90A:  global_atomic_add_f64 v2, v[0:1], s[0:1]
+// GFX90A:  s_endpgm
+kernel void test_global_constant_add(__constant double *addr, double x) {
+  __builtin_amdgcn_global_atomic_fadd_f64(addr, x, memory_order_relaxed);
+}
+
+// CHECK-LABEL: test_global_local_add
+// CHECK: tail call double @llvm.amdgcn.global.atomic.fadd.f64.p3f64.f64(double addrspace(3)* %{{.*}}, double %{{.*}})
+// GFX90A:  test_global_local_add
+// GFX90A:  ds_add_f64
+// GFX90A:  s_endpgm
+kernel void test_global_local_add(__local double *addr, double x) {
+  __builtin_amdgcn_global_atomic_fadd_f64(addr, x, memory_order_relaxed);
+}
+
+// CHECK-LABEL: test_global_addf
+// CHECK: tail call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %{{.*}}, float %{{.*}})
+// GFX90A-LABEL: test_global_addf
+// GFX90A: global_atomic_add_f32 v0, v1, s[0:1]
+// GFX90A: s_endpgm
+kernel void test_global_addf(__global float *addr, float x) {
+  __builtin_amdgcn_global_atomic_fadd_f32(addr, x, memory_order_relaxed);
+}
+
+// CHECK-LABEL: test_global_constant_addf
+// CHECK: tail call float @llvm.amdgcn.global.atomic.fadd.f32.p4f32.f32(float addrspace(4)* %{{.*}}, float %{{.*}})
+// GFX90A-LABEL: test_global_constant_addf
+// GFX90A: global_atomic_add_f32 v0, v1, s[0:1]
+// GFX90A: s_endpgm
+kernel void test_global_constant_addf(__constant float *addr, float x) {
+  __builtin_amdgcn_global_atomic_fadd_f32(addr, x, memory_order_relaxed);
+}
+
+// CHECK-LABEL: test_global_local_addf
+// CHECK: tail call float @llvm.amdgcn.global.atomic.fadd.f32.p3f32.f32(float addrspace(3)* %{{.*}}, float %{{.*}})
+// GFX90A-LABEL: test_global_local_addf
+// GFX90A: ds_add_f32
+// GFX90A: s_endpgm
+kernel void test_global_local_addf(__local float *addr, float x) {
+  __builtin_amdgcn_global_atomic_fadd_f32(addr, x, memory_order_relaxed);
+}
+
+// CHECK-LABEL: test_global_add2h
+// CHECK: tail call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %{{.*}}, <2 x half> %{{.*}})
+// GFX90A-LABEL: test_global_add2h
+// GFX90A: global_atomic_pk_add_f16 v0, v1, s[0:1]
+// GFX90A: s_endpgm
+kernel void test_global_add2h(__global half2 *addr, half2 x){
+  __builtin_amdgcn_global_atomic_fadd_2f16(addr, x, memory_order_relaxed);
+}
+
+// CHECK-LABEL: test_global_constant_add2h
+// CHECK: tail call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p4v2f16.v2f16(<2 x half> addrspace(4)* %{{.*}}, <2 x half> %{{.*}})
+// GFX90A-LABEL: test_global_constant_add2h
+// GFX90A: global_atomic_pk_add_f16
+kernel void test_global_constant_add2h(__constant half2 *addr, half2 x){
+  __builtin_amdgcn_global_atomic_fadd_2f16(addr, x, memory_order_relaxed);
+}
+
+// CaHECK-LABEL: test_global_local_add2h
+// CHaECK: tail call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p3v2f16.v2f16(<2 x half> addrspace(3)* %{{.*}}, <2 x half> %{{.*}})
+// kernel void test_global_local_add2h(__local half2 *addr, half2 x){
+//   __builtin_amdgcn_global_atomic_fadd_2f16(addr, x, memory_order_relaxed); // expected-error{{fatal error: error in backend: Cannot select: t22: v2f16,ch = AtomicLoadFAdd<(volatile dereferencea

[PATCH] D106909: [clang] Add clang builtins support for gfx90a

2021-07-27 Thread Anshil Gandhi via Phabricator via cfe-commits
gandhi21299 created this revision.
gandhi21299 added reviewers: arsenm, yaxunl, rampitec.
Herald added subscribers: kerbowa, jfb, tpr, nhaehnle, jvesely.
gandhi21299 requested review of this revision.
Herald added subscribers: cfe-commits, wdng.
Herald added a project: clang.

Implement target builtins for gfx90a including fadd64, fadd32, add2h, max and 
min on various global, flat and ds address spaces for which intrinsics are 
already implemented.

@rampitec Compiler recommended me to add `global-noret` target feature after 
setting it in BuiltinsAMDGPU.def. I am not sure what that means outside of the 
BuiltinsAMDGPU.def so I have changed it back to `gfx90a-insts`.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D106909

Files:
  clang/include/clang/Basic/BuiltinsAMDGPU.def
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGenOpenCL/builtins-fp-atomics.cl

Index: clang/test/CodeGenOpenCL/builtins-fp-atomics.cl
===
--- /dev/null
+++ clang/test/CodeGenOpenCL/builtins-fp-atomics.cl
@@ -0,0 +1,98 @@
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx90a \
+// RUN:   %s -S -emit-llvm -o - | FileCheck %s -check-prefix=CHECK
+
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx90a \
+// RUN:   -S -o - %s | FileCheck -check-prefix=GFX90A %s
+
+
+typedef enum memory_order {
+  memory_order_relaxed = __ATOMIC_RELAXED,
+  memory_order_acquire = __ATOMIC_ACQUIRE,
+  memory_order_release = __ATOMIC_RELEASE,
+  memory_order_acq_rel = __ATOMIC_ACQ_REL,
+  memory_order_seq_cst = __ATOMIC_SEQ_CST
+} memory_order;
+
+typedef half __attribute__((ext_vector_type(2))) half2;
+
+// CHECK-LABEL: test_global_add
+// CHECK: tail call double @llvm.amdgcn.global.atomic.fadd.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A:  test_global_add
+// GFX90A:  global_atomic_add_f64 v2, v[0:1], s[0:1]
+// GFX90A:  s_endpgm
+kernel void test_global_add(__global double *addr, double x) {
+  __builtin_amdgcn_atomic_fadd64(addr, x, memory_order_relaxed);
+}
+
+// CHECK-LABEL: test_global_addf
+// CHECK: tail call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %{{.*}}, float %{{.*}})
+// GFX90A-LABEL: test_global_addf
+// GFX90A: global_atomic_add_f32 v0, v1, s[0:1]
+// GFX90A: s_endpgm
+kernel void test_global_addf(__global float *addr, float x) {
+  __builtin_amdgcn_atomic_fadd32(addr, x, memory_order_relaxed);
+}
+
+// CHECK-LABEL: test_global_add2h
+// CHECK: tail call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %{{.*}}, <2 x half> %{{.*}})
+// GFX90A-LABEL: test_global_add2h
+// GFX90A: global_atomic_pk_add_f16 v0, v1, s[0:1]
+// GFX90A: s_endpgm
+kernel void test_global_add2h(__global half2 *addr, half2 x){
+  __builtin_amdgcn_atomic_add2h(addr, x, memory_order_relaxed);
+}
+
+// CHECK-LABEL: test_global_min
+// CHECK: tail call double @llvm.amdgcn.global.atomic.fmin.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A:  test_global_min
+// GFX90A:  global_atomic_min_f64 v2, v[0:1], s[0:1]
+// GFX90A:  s_endpgm
+kernel void test_global_min(__global double *addr, double x){
+  __builtin_amdgcn_atomic_min(addr, x, memory_order_relaxed);
+}
+
+// CHECK-LABEL: test_global_max
+// CHECK: tail call double @llvm.amdgcn.global.atomic.fmax.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A:  test_global_max
+// GFX90A:  global_atomic_max_f64 v2, v[0:1], s[0:1]
+// GFX90A:  s_endpgm
+kernel void test_global_max(__global double *addr, double x){
+  __builtin_amdgcn_atomic_max(addr, x, memory_order_relaxed);
+}
+
+// CHECK-LABEL: test_flat_add_local
+// CHECK: tail call double @llvm.amdgcn.flat.atomic.fadd.f64.p3f64.f64(double addrspace(3)* %{{.*}}, double %{{.*}})
+// GFX90A:  test_flat_add_local
+// GFX90A:  ds_add_f64 v2, v[0:1]
+// GFX90A:  s_endpgm
+kernel void test_flat_add_local(__local double *addr, double x){
+  __builtin_amdgcn_flat_atomic_fadd64(addr, x, memory_order_relaxed);
+}
+
+
+// CHECK-LABEL: test_flat_min_constant
+// CHECK: tail call double @llvm.amdgcn.flat.atomic.fmin.f64.p4f64.f64(double addrspace(4)* %{{.*}}, double %{{.*}})
+// GFX90A:  test_flat_min_constant
+// GFX90A:  global_atomic_min_f64 v2, v[0:1], s[0:1]
+// GFX90A:  s_endpgm
+kernel void test_flat_min_constant(__constant double *addr, double x){
+  __builtin_amdgcn_flat_atomic_min(addr, x, memory_order_relaxed);
+}
+
+// CHECK-LABEL: test_ds_add_local
+// CHECK: tail call double @llvm.amdgcn.ds.fadd.f64(double addrspace(3)* %{{.*}}, double %{{.*}},
+// GFX90A:  test_ds_add_local
+// GFX90A:  ds_add_f64 v2, v[0:1]
+// GFX90A:  s_endpgm
+kernel void test_ds_add_local(__local double *addr, double x){
+  __builtin_amdgcn_ds_atomic_fadd64(addr, x, memory_order_relaxed);
+}
+
+// CHECK-LABEL: test_ds_addf_local
+// CHECK: tail call float @llvm.amdgcn.ds.fadd.f32(float addrspace(3)* %{{.*}}, float %{{.*}},
+// GFX90A