[PATCH] D82087: AMDGPU/clang: Add builtins for llvm.amdgcn.ballot

2022-12-29 Thread Matt Arsenault via Phabricator via cfe-commits
arsenm closed this revision.
arsenm added a comment.

f4bcd7f598331457cfe74e459b489d4098369511


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D82087/new/

https://reviews.llvm.org/D82087

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D82087: AMDGPU/clang: Add builtins for llvm.amdgcn.ballot

2022-12-26 Thread Sameer Sahasrabuddhe via Phabricator via cfe-commits
sameerds added a comment.

And note that the change description is written in a first-person train of 
thought. Please do rewrite it!


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D82087/new/

https://reviews.llvm.org/D82087

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D82087: AMDGPU/clang: Add builtins for llvm.amdgcn.ballot

2022-12-26 Thread Sameer Sahasrabuddhe via Phabricator via cfe-commits
sameerds accepted this revision.
sameerds added inline comments.
This revision is now accepted and ready to land.



Comment at: clang/lib/Basic/Targets/AMDGPU.cpp:353
+  if (HaveWave32 && HaveWave64) {
+Diags.Report(diag::err_invalid_feature_combination)
+<< "'wavefrontsize32' and 'wavefrontsize64' are mutually exclusive";

I would have preferred this to be a separate change, just like the FIXME for 
diagnosing wavefrontsize32 on targets that don't support it. But not feeling 
strongly enough to block this change!


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D82087/new/

https://reviews.llvm.org/D82087

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D82087: AMDGPU/clang: Add builtins for llvm.amdgcn.ballot

2022-12-23 Thread Matt Arsenault via Phabricator via cfe-commits
arsenm updated this revision to Diff 485110.
arsenm added a comment.

Fix unknown target handling, diagnose some more of the errors


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D82087/new/

https://reviews.llvm.org/D82087

Files:
  clang/include/clang/Basic/BuiltinsAMDGPU.def
  clang/lib/Basic/Targets/AMDGPU.cpp
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGenOpenCL/amdgpu-features-illegal.cl
  clang/test/CodeGenOpenCL/amdgpu-features.cl
  clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl
  clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl
  clang/test/CodeGenOpenCL/builtins-amdgcn-wave64.cl
  clang/test/OpenMP/amdgcn-attributes.cpp
  clang/test/SemaOpenCL/builtins-amdgcn-error-wave32.cl
  clang/test/SemaOpenCL/builtins-amdgcn-error-wave64.cl

Index: clang/test/SemaOpenCL/builtins-amdgcn-error-wave64.cl
===
--- /dev/null
+++ clang/test/SemaOpenCL/builtins-amdgcn-error-wave64.cl
@@ -0,0 +1,16 @@
+// RUN: %clang_cc1 -triple amdgcn-- -verify -S -o - %s
+// RUN: %clang_cc1 -triple amdgcn-- -target-feature +wavefrontsize32 -verify -S -o - %s
+// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx1010 -target-feature +wavefrontsize32 -verify -S -o - %s
+// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx1010 -target-feature -wavefrontsize64 -verify -S -o - %s
+// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx1010 -verify -S -o - %s
+
+typedef unsigned long ulong;
+
+void test_ballot_wave64(global ulong* out, int a, int b) {
+  *out = __builtin_amdgcn_ballot_w64(a == b);  // expected-error {{'__builtin_amdgcn_ballot_w64' needs target feature wavefrontsize64}}
+}
+
+__attribute__((target("wavefrontsize64")))
+void test_ballot_wave64_target_attr(global ulong* out, int a, int b) {
+  *out = __builtin_amdgcn_ballot_w64(a == b);
+}
Index: clang/test/SemaOpenCL/builtins-amdgcn-error-wave32.cl
===
--- /dev/null
+++ clang/test/SemaOpenCL/builtins-amdgcn-error-wave32.cl
@@ -0,0 +1,17 @@
+// RUN: %clang_cc1 -triple amdgcn-- -verify -S -o - %s
+// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx900 -verify -S -o - %s
+// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx900 -target-feature +wavefrontsize64 -verify -S -o - %s
+// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx1010 -target-feature +wavefrontsize64 -verify -S -o - %s
+// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx1010 -target-feature -wavefrontsize32 -verify -S -o - %s
+
+typedef unsigned int uint;
+
+void test_ballot_wave32(global uint* out, int a, int b) {
+  *out = __builtin_amdgcn_ballot_w32(a == b);  // expected-error {{'__builtin_amdgcn_ballot_w32' needs target feature wavefrontsize32}}
+}
+
+// FIXME: Should error for subtargets that don't support wave32
+__attribute__((target("wavefrontsize32")))
+void test_ballot_wave32_target_attr(global uint* out, int a, int b) {
+  *out = __builtin_amdgcn_ballot_w32(a == b);
+}
Index: clang/test/OpenMP/amdgcn-attributes.cpp
===
--- clang/test/OpenMP/amdgcn-attributes.cpp
+++ clang/test/OpenMP/amdgcn-attributes.cpp
@@ -33,11 +33,11 @@
 }
 
 // DEFAULT: attributes #0 = { convergent noinline norecurse nounwind optnone "frame-pointer"="none" "kernel" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
-// CPU: attributes #0 = { convergent noinline norecurse nounwind optnone "frame-pointer"="none" "kernel" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst" "uniform-work-group-size"="true" }
+// CPU: attributes #0 = { convergent noinline norecurse nounwind optnone "frame-pointer"="none" "kernel" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" "uniform-work-group-size"="true" }
 // NOIEEE: attributes #0 = { convergent noinline norecurse nounwind optnone "amdgpu-ieee"="false" "frame-pointer"="none" "kernel" "no-nans-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
 // UNSAFEATOMIC: attributes #0 = { convergent noinline norecurse nounwind optnone "amdgpu-unsafe-fp-atomics"="true" "frame-pointer"="none" "kernel" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
 
 // DEFAULT: attributes #1 = { convergent mustprogress noinline nounwind optnone "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
-// CPU: attributes #1 = { convergent mustprogress noinline nounwind optnone "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" 

[PATCH] D82087: AMDGPU/clang: Add builtins for llvm.amdgcn.ballot

2022-12-23 Thread Matt Arsenault via Phabricator via cfe-commits
arsenm planned changes to this revision.
arsenm added a comment.

This doesn't work correctly for unspecified wavesize for non-wave32 targets


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D82087/new/

https://reviews.llvm.org/D82087

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D82087: AMDGPU/clang: Add builtins for llvm.amdgcn.ballot

2022-12-23 Thread Matt Arsenault via Phabricator via cfe-commits
arsenm added inline comments.



Comment at: clang/test/CodeGenOpenCL/amdgpu-features.cl:7
+// RUN: %clang_cc1 -triple amdgcn -S -emit-llvm -o - %s | FileCheck 
--check-prefix=NOCPU %s
+// RUN: %clang_cc1 -triple amdgcn -target-feature +wavefrontsize32 -S 
-emit-llvm -o - %s | FileCheck --check-prefix=NOCPU-WAVE32 %s
+// RUN: %clang_cc1 -triple amdgcn -target-feature +wavefrontsize64 -S 
-emit-llvm -o - %s | FileCheck --check-prefix=NOCPU-WAVE64 %s

sameerds wrote:
> yaxunl wrote:
> > what happens if both +wavefrontsize32 and +wavefrontsize64 are specified?
> Shouldn't this be separately an error in itself? Is it tested elsewhere?
It looks like you end up with both features set by clang, and wave64 wins in 
codegen


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D82087/new/

https://reviews.llvm.org/D82087

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D82087: AMDGPU/clang: Add builtins for llvm.amdgcn.ballot

2022-12-23 Thread Sameer Sahasrabuddhe via Phabricator via cfe-commits
sameerds added inline comments.



Comment at: clang/test/CodeGenOpenCL/amdgpu-features.cl:7
+// RUN: %clang_cc1 -triple amdgcn -S -emit-llvm -o - %s | FileCheck 
--check-prefix=NOCPU %s
+// RUN: %clang_cc1 -triple amdgcn -target-feature +wavefrontsize32 -S 
-emit-llvm -o - %s | FileCheck --check-prefix=NOCPU-WAVE32 %s
+// RUN: %clang_cc1 -triple amdgcn -target-feature +wavefrontsize64 -S 
-emit-llvm -o - %s | FileCheck --check-prefix=NOCPU-WAVE64 %s

yaxunl wrote:
> what happens if both +wavefrontsize32 and +wavefrontsize64 are specified?
Shouldn't this be separately an error in itself? Is it tested elsewhere?


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D82087/new/

https://reviews.llvm.org/D82087

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D82087: AMDGPU/clang: Add builtins for llvm.amdgcn.ballot

2022-12-22 Thread Matt Arsenault via Phabricator via cfe-commits
arsenm updated this revision to Diff 484980.
arsenm added a comment.

Rebase. Use _w32/_w64 suffixes since some other wave specific builtins seem to 
have gone with that convention


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D82087/new/

https://reviews.llvm.org/D82087

Files:
  clang/include/clang/Basic/BuiltinsAMDGPU.def
  clang/lib/Basic/Targets/AMDGPU.cpp
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGenOpenCL/amdgpu-features.cl
  clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl
  clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl
  clang/test/CodeGenOpenCL/builtins-amdgcn-wave64.cl
  clang/test/SemaOpenCL/builtins-amdgcn-error-wave32.cl
  clang/test/SemaOpenCL/builtins-amdgcn-error-wave64.cl

Index: clang/test/SemaOpenCL/builtins-amdgcn-error-wave64.cl
===
--- /dev/null
+++ clang/test/SemaOpenCL/builtins-amdgcn-error-wave64.cl
@@ -0,0 +1,10 @@
+// RUN: %clang_cc1 -triple amdgcn-- -verify -S -o - %s
+// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx1010 -target-feature +wavefrontsize32 -verify -S -o - %s
+// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx1010 -target-feature -wavefrontsize64 -verify -S -o - %s
+// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx1010 -verify -S -o - %s
+
+typedef unsigned long ulong;
+
+void test_ballot_wave64(global ulong* out, int a, int b) {
+  *out = __builtin_amdgcn_ballot_w64(a == b);  // expected-error {{'__builtin_amdgcn_ballot_w64' needs target feature wavefrontsize64}}
+}
Index: clang/test/SemaOpenCL/builtins-amdgcn-error-wave32.cl
===
--- /dev/null
+++ clang/test/SemaOpenCL/builtins-amdgcn-error-wave32.cl
@@ -0,0 +1,10 @@
+// RUN: %clang_cc1 -triple amdgcn-- -verify -S -o - %s
+// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx900 -verify -S -o - %s
+// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx1010 -target-feature +wavefrontsize64 -verify -S -o - %s
+// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx1010 -target-feature -wavefrontsize32 -verify -S -o - %s
+
+typedef unsigned int uint;
+
+void test_ballot_wave32(global uint* out, int a, int b) {
+  *out = __builtin_amdgcn_ballot_w32(a == b);  // expected-error {{'__builtin_amdgcn_ballot_w32' needs target feature wavefrontsize32}}
+}
Index: clang/test/CodeGenOpenCL/builtins-amdgcn-wave64.cl
===
--- /dev/null
+++ clang/test/CodeGenOpenCL/builtins-amdgcn-wave64.cl
@@ -0,0 +1,12 @@
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-feature +wavefrontsize64 -S -emit-llvm -o - %s | FileCheck -enable-var-scope %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -target-feature +wavefrontsize64 -S -emit-llvm -o - %s | FileCheck -enable-var-scope %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -target-feature +wavefrontsize64 -S -emit-llvm -o - %s | FileCheck -enable-var-scope %s
+
+typedef unsigned long ulong;
+
+// CHECK-LABEL: @test_ballot_wave64(
+// CHECK: call i64 @llvm.amdgcn.ballot.i64(i1 %{{.+}})
+void test_ballot_wave64(global ulong* out, int a, int b)
+{
+  *out = __builtin_amdgcn_ballot_w64(a == b);
+}
Index: clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl
===
--- /dev/null
+++ clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl
@@ -0,0 +1,13 @@
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -S -emit-llvm -o - %s | FileCheck -enable-var-scope %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -target-feature +wavefrontsize32 -S -emit-llvm -o - %s | FileCheck -enable-var-scope %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -target-feature +wavefrontsize32 -S -emit-llvm -o - %s | FileCheck -enable-var-scope %s
+
+typedef unsigned int uint;
+
+// CHECK-LABEL: @test_ballot_wave32(
+// CHECK: call i32 @llvm.amdgcn.ballot.i32(i1 %{{.+}})
+void test_ballot_wave32(global uint* out, int a, int b)
+{
+  *out = __builtin_amdgcn_ballot_w32(a == b);
+}
+
Index: clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl
===
--- clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl
+++ clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl
@@ -37,3 +37,10 @@
 {
   *out = __builtin_amdgcn_groupstaticsize();
 }
+
+// CHECK-LABEL: @test_ballot_wave32(
+// CHECK: call i32 @llvm.amdgcn.ballot.i32(i1 %{{.+}})
+void test_ballot_wave32(global uint* out, int a, int b)
+{
+  *out = __builtin_amdgcn_ballot_w32(a == b);
+}
Index: clang/test/CodeGenOpenCL/amdgpu-features.cl
===
--- clang/test/CodeGenOpenCL/amdgpu-features.cl
+++ clang/test/CodeGenOpenCL/amdgpu-features.cl
@@ -3,6 +3,10 @@
 // Check that appropriate features are 

[PATCH] D82087: AMDGPU/clang: Add builtins for llvm.amdgcn.ballot

2022-09-20 Thread Jon Chesterfield via Phabricator via cfe-commits
JonChesterfield added a comment.

In D82087#3803464 , @arsenm wrote:

> In D82087#3797883 , @jdoerfert wrote:
>
>> Can we land this? I'd like to use the new intrinsics as I don't understand 
>> the old ones.
>
> What do you think about using the two separate builtins, vs. one magic 
> builtin that auto-changes the wavesize?

The magic one would also change its return type, or always be i64 with high 
bits (zext or maybe sext or maybe copy of low), so less magic seems clearer


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D82087/new/

https://reviews.llvm.org/D82087

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D82087: AMDGPU/clang: Add builtins for llvm.amdgcn.ballot

2022-09-20 Thread Matt Arsenault via Phabricator via cfe-commits
arsenm added a comment.

In D82087#3797883 , @jdoerfert wrote:

> Can we land this? I'd like to use the new intrinsics as I don't understand 
> the old ones.

What do you think about using the two separate builtins, vs. one magic builtin 
that auto-changes the wavesize?


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D82087/new/

https://reviews.llvm.org/D82087

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D82087: AMDGPU/clang: Add builtins for llvm.amdgcn.ballot

2022-09-17 Thread Johannes Doerfert via Phabricator via cfe-commits
jdoerfert added a comment.
Herald added a subscriber: kosarev.
Herald added a project: All.

Can we land this? I'd like to use the new intrinsics as I don't understand the 
old ones.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D82087/new/

https://reviews.llvm.org/D82087

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D82087: AMDGPU/clang: Add builtins for llvm.amdgcn.ballot

2020-07-26 Thread Jon Chesterfield via Phabricator via cfe-commits
JonChesterfield added a comment.

A macro for wavefront size would make targeting gfx10 for openmp easier.

We currently use an int32_t for nvptx and an int64_t for amdgcn in various 
runtime function interfaces. I'd like to be able to set the latter based on 
said macro.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D82087/new/

https://reviews.llvm.org/D82087



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D82087: AMDGPU/clang: Add builtins for llvm.amdgcn.ballot

2020-07-12 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl added a comment.

In D82087#2146170 , @sameerds wrote:

> >> https://github.com/ROCm-Developer-Tools/HIP/blob/master/docs/markdown/hip_kernel_language.md#warp-vote-and-ballot-functions
> > 
> > I think if the language interface insists on fixing the wave size, then I 
> > think the correct solution is to implement this in the header based on a 
> > wave size macro (which we're desperately missing). The library 
> > implementation should be responsible for inserting the extension to 64-bit 
> > for wave32


Agree that FE should have a predefined macro for wave front size. Since it is 
per amdgpu target and not per language, it should be named as 
__amdgpu_wavefront_size__ or something similar, then it could be used by all 
languages.

Then we need to initialize warpSize in HIP header by this macro 
https://github.com/ROCm-Developer-Tools/HIP/blob/386a0e0123d67b95b4c0ebb3ebcf1d1615758146/include/hip/hcc_detail/device_functions.h#L300

I tends to think we should define __ballot in HIP header conditionally by the 
wavefront size so that the return type is correct for both wave32 and wave64 
mode. We should assume normal HIP compilation always have -mcpu specified so 
that wavefront size is known at compile time. If -mcpu is not specified 
probably we should not define warpSize or __ballot.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D82087/new/

https://reviews.llvm.org/D82087



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D82087: AMDGPU/clang: Add builtins for llvm.amdgcn.ballot

2020-07-12 Thread Sameer Sahasrabuddhe via Phabricator via cfe-commits
sameerds added a comment.

>> https://github.com/ROCm-Developer-Tools/HIP/blob/master/docs/markdown/hip_kernel_language.md#warp-vote-and-ballot-functions
> 
> I think if the language interface insists on fixing the wave size, then I 
> think the correct solution is to implement this in the header based on a wave 
> size macro (which we're desperately missing). The library implementation 
> should be responsible for inserting the extension to 64-bit for wave32

Not sure if the frontend should try to infer warpsize and the mask size, or 
even whether it can in all cases. But this can result in wrong behaviour when 
the program passes 32-bit mask but then gets compiled for a 64-bit mask. It's 
easy to say that the programmer must not assume a warp-size, but it would be 
useful if the language can

In D82087#2144712 , @arsenm wrote:

> In D82087#2140778 , @sameerds wrote:
>
> > The documentation for HIP __ballot seems to indicate that the user does not 
> > have to explicitly specify the warp size. How is that achieved with these 
> > new builtins? Can this  be captured in a lit test here?
>
>
> This seems like a defect in the design to me
>
> > https://github.com/ROCm-Developer-Tools/HIP/blob/master/docs/markdown/hip_kernel_language.md#warp-vote-and-ballot-functions


I tend to agree. The HIP vote/ballot builtins are also missing a mask 
parameter, whose type needs to match the wavesize.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D82087/new/

https://reviews.llvm.org/D82087



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D82087: AMDGPU/clang: Add builtins for llvm.amdgcn.ballot

2020-07-10 Thread Matt Arsenault via Phabricator via cfe-commits
arsenm added a comment.

In D82087#2140778 , @sameerds wrote:

> The documentation for HIP __ballot seems to indicate that the user does not 
> have to explicitly specify the warp size. How is that achieved with these new 
> builtins? Can this  be captured in a lit test here?
>
> https://github.com/ROCm-Developer-Tools/HIP/blob/master/docs/markdown/hip_kernel_language.md#warp-vote-and-ballot-functions


I think if the language interface insists on fixing the wave size, then I think 
the correct solution is to implement this in the header based on a wave size 
macro (which we're desperately missing). The library implementation should be 
responsible for inserting the extension to 64-bit for wave32


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D82087/new/

https://reviews.llvm.org/D82087



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D82087: AMDGPU/clang: Add builtins for llvm.amdgcn.ballot

2020-07-10 Thread Matt Arsenault via Phabricator via cfe-commits
arsenm marked 2 inline comments as done.
arsenm added inline comments.



Comment at: clang/lib/Basic/Targets/AMDGPU.cpp:288
+  if (!IsNullCPU) {
+// Default to wave32 if available, or wave64 if not
+if (Features.count("wavefrontsize32") == 0 &&

sameerds wrote:
> So the implication here is that wave32 is the preferred choice on newer 
> architectures, and hence the default when available?
Yes, this has always been the case



Comment at: clang/lib/Basic/Targets/AMDGPU.cpp:293
+"wavefrontsize32" : "wavefrontsize64";
+  Features.insert(std::make_pair(DefaultWaveSizeFeature, true));
+}

yaxunl wrote:
> what's the default wave front size in backend for gfx10* before this change?
It's always been wave32


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D82087/new/

https://reviews.llvm.org/D82087



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D82087: AMDGPU/clang: Add builtins for llvm.amdgcn.ballot

2020-07-10 Thread Matt Arsenault via Phabricator via cfe-commits
arsenm added a comment.

In D82087#2140778 , @sameerds wrote:

> The documentation for HIP __ballot seems to indicate that the user does not 
> have to explicitly specify the warp size. How is that achieved with these new 
> builtins? Can this  be captured in a lit test here?


This seems like a defect in the design to me

> https://github.com/ROCm-Developer-Tools/HIP/blob/master/docs/markdown/hip_kernel_language.md#warp-vote-and-ballot-functions




CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D82087/new/

https://reviews.llvm.org/D82087



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D82087: AMDGPU/clang: Add builtins for llvm.amdgcn.ballot

2020-07-09 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl added inline comments.



Comment at: clang/lib/Basic/Targets/AMDGPU.cpp:293
+"wavefrontsize32" : "wavefrontsize64";
+  Features.insert(std::make_pair(DefaultWaveSizeFeature, true));
+}

what's the default wave front size in backend for gfx10* before this change?



Comment at: clang/test/CodeGenOpenCL/amdgpu-features.cl:7
+// RUN: %clang_cc1 -triple amdgcn -S -emit-llvm -o - %s | FileCheck 
--check-prefix=NOCPU %s
+// RUN: %clang_cc1 -triple amdgcn -target-feature +wavefrontsize32 -S 
-emit-llvm -o - %s | FileCheck --check-prefix=NOCPU-WAVE32 %s
+// RUN: %clang_cc1 -triple amdgcn -target-feature +wavefrontsize64 -S 
-emit-llvm -o - %s | FileCheck --check-prefix=NOCPU-WAVE64 %s

what happens if both +wavefrontsize32 and +wavefrontsize64 are specified?


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D82087/new/

https://reviews.llvm.org/D82087



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D82087: AMDGPU/clang: Add builtins for llvm.amdgcn.ballot

2020-07-08 Thread Sameer Sahasrabuddhe via Phabricator via cfe-commits
sameerds added a comment.

The documentation for HIP __ballot seems to indicate that the user does not 
have to explicitly specify the warp size. How is that achieved with these new 
builtins? Can this  be captured in a lit test here?

https://github.com/ROCm-Developer-Tools/HIP/blob/master/docs/markdown/hip_kernel_language.md#warp-vote-and-ballot-functions




Comment at: clang/lib/Basic/Targets/AMDGPU.cpp:288
+  if (!IsNullCPU) {
+// Default to wave32 if available, or wave64 if not
+if (Features.count("wavefrontsize32") == 0 &&

So the implication here is that wave32 is the preferred choice on newer 
architectures, and hence the default when available?


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D82087/new/

https://reviews.llvm.org/D82087



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D82087: AMDGPU/clang: Add builtins for llvm.amdgcn.ballot

2020-07-06 Thread Matt Arsenault via Phabricator via cfe-commits
arsenm added a comment.

ping


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D82087/new/

https://reviews.llvm.org/D82087



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D82087: AMDGPU/clang: Add builtins for llvm.amdgcn.ballot

2020-06-23 Thread Matt Arsenault via Phabricator via cfe-commits
arsenm added a comment.

ping


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D82087/new/

https://reviews.llvm.org/D82087



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D82087: AMDGPU/clang: Add builtins for llvm.amdgcn.ballot

2020-06-18 Thread Matt Arsenault via Phabricator via cfe-commits
arsenm created this revision.
arsenm added reviewers: yaxunl, rampitec, b-sumner, foad, nhaehnle.
Herald added subscribers: kerbowa, t-tye, tpr, dstuttard, wdng, jvesely, 
kzhuravl.

I wasn't sure what the best strategy was for the wave size
difference. I went for an explicit, enforced builtin for each. The
other option would be to just assume wave64, and IRGen the different
mangling + zext. I didn't see an obvious way to check the wave size
where builtins are emitted, and it might be beneficial to force you to
acknowledge the wave size difference? Or it might be an unnecessary
complication.

The behavior is also slightly odd when directly specifying
-target-feature to cc1 for +/- the size (since we have both positive
and negative forms of both sizes), but this is probably unimportant.

We're also still missing a predefined macro for the wave size, which
we probably need.


https://reviews.llvm.org/D82087

Files:
  clang/include/clang/Basic/BuiltinsAMDGPU.def
  clang/lib/Basic/Targets/AMDGPU.cpp
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGenOpenCL/amdgpu-features.cl
  clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl
  clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl
  clang/test/CodeGenOpenCL/builtins-amdgcn-wave64.cl
  clang/test/SemaOpenCL/builtins-amdgcn-error-wave32.cl
  clang/test/SemaOpenCL/builtins-amdgcn-error-wave64.cl

Index: clang/test/SemaOpenCL/builtins-amdgcn-error-wave64.cl
===
--- /dev/null
+++ clang/test/SemaOpenCL/builtins-amdgcn-error-wave64.cl
@@ -0,0 +1,10 @@
+// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx1010 -verify -S -o - %s
+// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx1010 -target-feature +wavefrontsize32 -verify -S -o - %s
+// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx1010 -target-feature -wavefrontsize64 -verify -S -o - %s
+// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx1010 -verify -S -o - %s
+
+typedef unsigned long ulong;
+
+void test_ballot_wave64(global ulong* out, int a, int b) {
+  *out = __builtin_amdgcn_ballot_wave64(a == b);  // expected-error {{'__builtin_amdgcn_ballot_wave64' needs target feature wavefrontsize64}}
+}
Index: clang/test/SemaOpenCL/builtins-amdgcn-error-wave32.cl
===
--- /dev/null
+++ clang/test/SemaOpenCL/builtins-amdgcn-error-wave32.cl
@@ -0,0 +1,9 @@
+// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx900 -verify -S -o - %s
+// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx1010 -target-feature +wavefrontsize64 -verify -S -o - %s
+// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx1010 -target-feature -wavefrontsize32 -verify -S -o - %s
+
+typedef unsigned int uint;
+
+void test_ballot_wave32(global uint* out, int a, int b) {
+  *out = __builtin_amdgcn_ballot_wave32(a == b);  // expected-error {{'__builtin_amdgcn_ballot_wave32' needs target feature wavefrontsize32}}
+}
Index: clang/test/CodeGenOpenCL/builtins-amdgcn-wave64.cl
===
--- /dev/null
+++ clang/test/CodeGenOpenCL/builtins-amdgcn-wave64.cl
@@ -0,0 +1,13 @@
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -S -emit-llvm -o - %s | FileCheck -enable-var-scope %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -target-feature +wavefrontsize64 -S -emit-llvm -o - %s | FileCheck -enable-var-scope %s
+// XUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -target-feature -wavefrontsize32 -S -emit-llvm -o - %s | FileCheck -enable-var-scope %s
+
+typedef unsigned long ulong;
+
+// CHECK-LABEL: @test_ballot_wave64(
+// CHECK: call i64 @llvm.amdgcn.ballot.i64(i1 %{{.+}})
+void test_ballot_wave64(global ulong* out, int a, int b)
+{
+  *out = __builtin_amdgcn_ballot_wave64(a == b);
+}
+
Index: clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl
===
--- /dev/null
+++ clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl
@@ -0,0 +1,12 @@
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -S -emit-llvm -o - %s | FileCheck -enable-var-scope %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -target-feature +wavefrontsize32 -S -emit-llvm -o - %s | FileCheck -enable-var-scope %s
+
+typedef unsigned int uint;
+
+// CHECK-LABEL: @test_ballot_wave32(
+// CHECK: call i32 @llvm.amdgcn.ballot.i32(i1 %{{.+}})
+void test_ballot_wave32(global uint* out, int a, int b)
+{
+  *out = __builtin_amdgcn_ballot_wave32(a == b);
+}
+
Index: clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl
===
--- clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl
+++ clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl
@@ -22,3 +22,10 @@
 void test_mov_dpp8(global uint* out, uint a) {
   *out = __builtin_amdgcn_mov_dpp8(a, 1);
 }
+
+// CHECK-LABEL: