[clang] [clang][CodeGen] Add query for a target's flat address space (PR #95728)

2024-06-17 Thread Matt Arsenault via cfe-commits


@@ -1764,6 +1764,13 @@ class TargetInfo : public TransferrableTargetInfo,
 return 0;
   }
 
+  /// \returns Target specific flat ptr address space; a flat ptr is a ptr that
+  /// can be casted to / from all other target address spaces. If the target
+  /// exposes no such address space / does not care, we return 0.

arsenm wrote:

> here's no such thing as LangAS::Generic, there's LangAS::Default.

Right, so query the target address space map for that. 

The IR does not have the concept of a generic, flat, or no address space. 0 is 
the "default address space", which has its own specific properties. 

> this merely maintains the status quo, wherein everybody grabs an AS 0 pointer 
> and hopes for the best.

For the most part this isn't true, at least in llvm proper. 

>From the comment here, I'm not even sure what this is returning (either an IR 
>address space or a clang address space) 



https://github.com/llvm/llvm-project/pull/95728
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang] [WIP] Added builtin_alloca support for OpenCL1.2 and below (PR #95750)

2024-06-17 Thread Matt Arsenault via cfe-commits


@@ -0,0 +1,86 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py 
UTC_ARGS: --version 5
+// RUN: %clang_cc1 %s -O0 -triple amdgcn-amd-amdhsa -cl-std=CL1.2 -emit-llvm 
-o - | FileCheck --check-prefix=OPENCL12 %s
+// RUN: %clang_cc1 %s -O0 -triple amdgcn-amd-amdhsa -cl-std=CL2.0 -emit-llvm 
-o - | FileCheck --check-prefix=OPENCL20 %s
+// RUN: %clang_cc1 %s -O0 -triple amdgcn-amd-amdhsa -cl-std=CL3.0 -emit-llvm 
-o - | FileCheck --check-prefix=OPENCL30 %s
+// RUN: %clang_cc1 %s -O0 -triple amdgcn-amd-amdhsa -cl-std=CL3.0 
-cl-ext=+__opencl_c_generic_address_space -emit-llvm -o - | FileCheck 
--check-prefix=OPENCL30-EXT %s
+
+// OPENCL12-LABEL: define dso_local ptr addrspace(5) @test1(
+// OPENCL12-SAME: ) #[[ATTR0:[0-9]+]] {
+// OPENCL12-NEXT:  [[ENTRY:.*:]]
+// OPENCL12-NEXT:[[ALLOC_PTR:%.*]] = alloca ptr addrspace(5), align 4, 
addrspace(5)
+// OPENCL12-NEXT:[[TMP0:%.*]] = alloca i8, i64 128, align 8, addrspace(5)
+// OPENCL12-NEXT:store ptr addrspace(5) [[TMP0]], ptr addrspace(5) 
[[ALLOC_PTR]], align 4
+// OPENCL12-NEXT:[[TMP1:%.*]] = load ptr addrspace(5), ptr addrspace(5) 
[[ALLOC_PTR]], align 4
+// OPENCL12-NEXT:ret ptr addrspace(5) [[TMP1]]
+//
+// OPENCL20-LABEL: define dso_local ptr @test1(
+// OPENCL20-SAME: ) #[[ATTR0:[0-9]+]] {
+// OPENCL20-NEXT:  [[ENTRY:.*:]]
+// OPENCL20-NEXT:[[ALLOC_PTR:%.*]] = alloca ptr, align 8, addrspace(5)
+// OPENCL20-NEXT:[[TMP0:%.*]] = alloca i8, i64 128, align 8, addrspace(5)
+// OPENCL20-NEXT:[[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[TMP0]] to 
ptr
+// OPENCL20-NEXT:store ptr [[TMP1]], ptr addrspace(5) [[ALLOC_PTR]], align 
8
+// OPENCL20-NEXT:[[TMP2:%.*]] = load ptr, ptr addrspace(5) [[ALLOC_PTR]], 
align 8
+// OPENCL20-NEXT:ret ptr [[TMP2]]
+//
+// OPENCL30-LABEL: define dso_local ptr addrspace(5) @test1(
+// OPENCL30-SAME: ) #[[ATTR0:[0-9]+]] {
+// OPENCL30-NEXT:  [[ENTRY:.*:]]
+// OPENCL30-NEXT:[[ALLOC_PTR:%.*]] = alloca ptr addrspace(5), align 4, 
addrspace(5)
+// OPENCL30-NEXT:[[TMP0:%.*]] = alloca i8, i64 128, align 8, addrspace(5)
+// OPENCL30-NEXT:store ptr addrspace(5) [[TMP0]], ptr addrspace(5) 
[[ALLOC_PTR]], align 4
+// OPENCL30-NEXT:[[TMP1:%.*]] = load ptr addrspace(5), ptr addrspace(5) 
[[ALLOC_PTR]], align 4
+// OPENCL30-NEXT:ret ptr addrspace(5) [[TMP1]]
+//
+// OPENCL30-EXT-LABEL: define dso_local ptr @test1(
+// OPENCL30-EXT-SAME: ) #[[ATTR0:[0-9]+]] {
+// OPENCL30-EXT-NEXT:  [[ENTRY:.*:]]
+// OPENCL30-EXT-NEXT:[[ALLOC_PTR:%.*]] = alloca ptr, align 8, addrspace(5)
+// OPENCL30-EXT-NEXT:[[TMP0:%.*]] = alloca i8, i64 128, align 8, 
addrspace(5)
+// OPENCL30-EXT-NEXT:[[TMP1:%.*]] = addrspacecast ptr addrspace(5) 
[[TMP0]] to ptr
+// OPENCL30-EXT-NEXT:store ptr [[TMP1]], ptr addrspace(5) [[ALLOC_PTR]], 
align 8
+// OPENCL30-EXT-NEXT:[[TMP2:%.*]] = load ptr, ptr addrspace(5) 
[[ALLOC_PTR]], align 8
+// OPENCL30-EXT-NEXT:ret ptr [[TMP2]]
+//
+float* test1() {
+float* alloc_ptr = (float*)__builtin_alloca(32 * sizeof(int));

arsenm wrote:

Yes, this is fundamentally a stack / private pointer 

https://github.com/llvm/llvm-project/pull/95750
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][AMDGPU] Add a new builtin type for buffer rsrc (PR #94830)

2024-06-17 Thread Matt Arsenault via cfe-commits


@@ -0,0 +1,84 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py 
UTC_ARGS: --function-signature
+ // REQUIRES: amdgpu-registered-target
+ // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu verde 
-emit-llvm -o - %s | FileCheck %s
+ // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu tonga 
-emit-llvm -o - %s | FileCheck %s
+ // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 
-emit-llvm -o - %s | FileCheck %s

arsenm wrote:

Not much point in testing all the subtargets here

https://github.com/llvm/llvm-project/pull/94830
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][AMDGPU] Add a new builtin type for buffer rsrc (PR #94830)

2024-06-17 Thread Matt Arsenault via cfe-commits


@@ -0,0 +1,84 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py 
UTC_ARGS: --function-signature
+ // REQUIRES: amdgpu-registered-target
+ // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu verde 
-emit-llvm -o - %s | FileCheck %s
+ // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu tonga 
-emit-llvm -o - %s | FileCheck %s
+ // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 
-emit-llvm -o - %s | FileCheck %s
+
+typedef struct AA_ty {
+  int x;
+  __amdgcn_buffer_rsrc_t r;
+} AA;
+
+AA getAA(void *p);
+__amdgcn_buffer_rsrc_t getBufferImpl(void *p);
+void consumeBuffer(__amdgcn_buffer_rsrc_t);
+
+// CHECK-LABEL: define {{[^@]+}}@getBuffer

arsenm wrote:

I guess that covers it, but we don't get to see the return attributes now 

https://github.com/llvm/llvm-project/pull/94830
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][AMDGPU] Add a new builtin type for buffer rsrc (PR #94830)

2024-06-17 Thread Matt Arsenault via cfe-commits


@@ -0,0 +1,9 @@
+

arsenm wrote:

Extra blank line 

https://github.com/llvm/llvm-project/pull/94830
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][AMDGPU] Add a new builtin type for buffer rsrc (PR #94830)

2024-06-17 Thread Matt Arsenault via cfe-commits


@@ -0,0 +1,17 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -fsyntax-only -verify -std=gnu++11 -triple amdgcn 
-Wno-unused-value %s
+

arsenm wrote:

We probably want another similar sema test for OpenCL/HIP/OpenMP 

https://github.com/llvm/llvm-project/pull/94830
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [clang][CodeGen] Add query for a target's flat address space (PR #95728)

2024-06-17 Thread Matt Arsenault via cfe-commits


@@ -1764,6 +1764,13 @@ class TargetInfo : public TransferrableTargetInfo,
 return 0;
   }
 
+  /// \returns Target specific flat ptr address space; a flat ptr is a ptr that
+  /// can be casted to / from all other target address spaces. If the target
+  /// exposes no such address space / does not care, we return 0.

arsenm wrote:

This further spreads the notion that 0 is a lack of address space, which it is 
not. You shouldn't need a new thing for this; it should be equivalent to 
querying the target address space map for the LangAS Generic thing 

https://github.com/llvm/llvm-project/pull/95728
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang] [WIP] Added builtin_alloca support for OpenCL1.2 and below (PR #95750)

2024-06-17 Thread Matt Arsenault via cfe-commits


@@ -0,0 +1,86 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py 
UTC_ARGS: --version 5
+// RUN: %clang_cc1 %s -O0 -triple amdgcn-amd-amdhsa -cl-std=CL1.2 -emit-llvm 
-o - | FileCheck --check-prefix=OPENCL12 %s
+// RUN: %clang_cc1 %s -O0 -triple amdgcn-amd-amdhsa -cl-std=CL2.0 -emit-llvm 
-o - | FileCheck --check-prefix=OPENCL20 %s
+// RUN: %clang_cc1 %s -O0 -triple amdgcn-amd-amdhsa -cl-std=CL3.0 -emit-llvm 
-o - | FileCheck --check-prefix=OPENCL30 %s
+// RUN: %clang_cc1 %s -O0 -triple amdgcn-amd-amdhsa -cl-std=CL3.0 
-cl-ext=+__opencl_c_generic_address_space -emit-llvm -o - | FileCheck 
--check-prefix=OPENCL30-EXT %s
+
+// OPENCL12-LABEL: define dso_local ptr addrspace(5) @test1(
+// OPENCL12-SAME: ) #[[ATTR0:[0-9]+]] {
+// OPENCL12-NEXT:  [[ENTRY:.*:]]
+// OPENCL12-NEXT:[[ALLOC_PTR:%.*]] = alloca ptr addrspace(5), align 4, 
addrspace(5)
+// OPENCL12-NEXT:[[TMP0:%.*]] = alloca i8, i64 128, align 8, addrspace(5)
+// OPENCL12-NEXT:store ptr addrspace(5) [[TMP0]], ptr addrspace(5) 
[[ALLOC_PTR]], align 4
+// OPENCL12-NEXT:[[TMP1:%.*]] = load ptr addrspace(5), ptr addrspace(5) 
[[ALLOC_PTR]], align 4
+// OPENCL12-NEXT:ret ptr addrspace(5) [[TMP1]]
+//
+// OPENCL20-LABEL: define dso_local ptr @test1(
+// OPENCL20-SAME: ) #[[ATTR0:[0-9]+]] {
+// OPENCL20-NEXT:  [[ENTRY:.*:]]
+// OPENCL20-NEXT:[[ALLOC_PTR:%.*]] = alloca ptr, align 8, addrspace(5)
+// OPENCL20-NEXT:[[TMP0:%.*]] = alloca i8, i64 128, align 8, addrspace(5)
+// OPENCL20-NEXT:[[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[TMP0]] to 
ptr
+// OPENCL20-NEXT:store ptr [[TMP1]], ptr addrspace(5) [[ALLOC_PTR]], align 
8
+// OPENCL20-NEXT:[[TMP2:%.*]] = load ptr, ptr addrspace(5) [[ALLOC_PTR]], 
align 8
+// OPENCL20-NEXT:ret ptr [[TMP2]]
+//
+// OPENCL30-LABEL: define dso_local ptr addrspace(5) @test1(
+// OPENCL30-SAME: ) #[[ATTR0:[0-9]+]] {
+// OPENCL30-NEXT:  [[ENTRY:.*:]]
+// OPENCL30-NEXT:[[ALLOC_PTR:%.*]] = alloca ptr addrspace(5), align 4, 
addrspace(5)
+// OPENCL30-NEXT:[[TMP0:%.*]] = alloca i8, i64 128, align 8, addrspace(5)
+// OPENCL30-NEXT:store ptr addrspace(5) [[TMP0]], ptr addrspace(5) 
[[ALLOC_PTR]], align 4
+// OPENCL30-NEXT:[[TMP1:%.*]] = load ptr addrspace(5), ptr addrspace(5) 
[[ALLOC_PTR]], align 4
+// OPENCL30-NEXT:ret ptr addrspace(5) [[TMP1]]
+//
+// OPENCL30-EXT-LABEL: define dso_local ptr @test1(
+// OPENCL30-EXT-SAME: ) #[[ATTR0:[0-9]+]] {
+// OPENCL30-EXT-NEXT:  [[ENTRY:.*:]]
+// OPENCL30-EXT-NEXT:[[ALLOC_PTR:%.*]] = alloca ptr, align 8, addrspace(5)
+// OPENCL30-EXT-NEXT:[[TMP0:%.*]] = alloca i8, i64 128, align 8, 
addrspace(5)
+// OPENCL30-EXT-NEXT:[[TMP1:%.*]] = addrspacecast ptr addrspace(5) 
[[TMP0]] to ptr
+// OPENCL30-EXT-NEXT:store ptr [[TMP1]], ptr addrspace(5) [[ALLOC_PTR]], 
align 8
+// OPENCL30-EXT-NEXT:[[TMP2:%.*]] = load ptr, ptr addrspace(5) 
[[ALLOC_PTR]], align 8
+// OPENCL30-EXT-NEXT:ret ptr [[TMP2]]
+//
+float* test1() {
+float* alloc_ptr = (float*)__builtin_alloca(32 * sizeof(int));
+return alloc_ptr;
+}
+
+// OPENCL12-LABEL: define dso_local void @test2(
+// OPENCL12-SAME: ) #[[ATTR0]] {
+// OPENCL12-NEXT:  [[ENTRY:.*:]]
+// OPENCL12-NEXT:[[ALLOC_PTR:%.*]] = alloca ptr addrspace(5), align 4, 
addrspace(5)
+// OPENCL12-NEXT:[[TMP0:%.*]] = alloca i8, i64 28, align 8, addrspace(5)
+// OPENCL12-NEXT:store ptr addrspace(5) [[TMP0]], ptr addrspace(5) 
[[ALLOC_PTR]], align 4
+// OPENCL12-NEXT:ret void
+//
+// OPENCL20-LABEL: define dso_local void @test2(
+// OPENCL20-SAME: ) #[[ATTR0]] {
+// OPENCL20-NEXT:  [[ENTRY:.*:]]
+// OPENCL20-NEXT:[[ALLOC_PTR:%.*]] = alloca ptr, align 8, addrspace(5)
+// OPENCL20-NEXT:[[TMP0:%.*]] = alloca i8, i64 28, align 8, addrspace(5)
+// OPENCL20-NEXT:[[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[TMP0]] to 
ptr
+// OPENCL20-NEXT:store ptr [[TMP1]], ptr addrspace(5) [[ALLOC_PTR]], align 
8
+// OPENCL20-NEXT:ret void
+//
+// OPENCL30-LABEL: define dso_local void @test2(
+// OPENCL30-SAME: ) #[[ATTR0]] {
+// OPENCL30-NEXT:  [[ENTRY:.*:]]
+// OPENCL30-NEXT:[[ALLOC_PTR:%.*]] = alloca ptr addrspace(5), align 4, 
addrspace(5)
+// OPENCL30-NEXT:[[TMP0:%.*]] = alloca i8, i64 28, align 8, addrspace(5)
+// OPENCL30-NEXT:store ptr addrspace(5) [[TMP0]], ptr addrspace(5) 
[[ALLOC_PTR]], align 4
+// OPENCL30-NEXT:ret void
+//
+// OPENCL30-EXT-LABEL: define dso_local void @test2(
+// OPENCL30-EXT-SAME: ) #[[ATTR0]] {
+// OPENCL30-EXT-NEXT:  [[ENTRY:.*:]]
+// OPENCL30-EXT-NEXT:[[ALLOC_PTR:%.*]] = alloca ptr, align 8, addrspace(5)
+// OPENCL30-EXT-NEXT:[[TMP0:%.*]] = alloca i8, i64 28, align 8, 
addrspace(5)
+// OPENCL30-EXT-NEXT:[[TMP1:%.*]] = addrspacecast ptr addrspace(5) 
[[TMP0]] to ptr
+// OPENCL30-EXT-NEXT:store ptr [[TMP1]], ptr addrspace(5) [[ALLOC_PTR]], 
align 8
+// OPENCL30-EXT-NEXT:ret void
+//
+void test2() {
+void *alloc_ptr = __builtin_alloca(28);


[clang] [Clang] [WIP] Added builtin_alloca support for OpenCL1.2 and below (PR #95750)

2024-06-17 Thread Matt Arsenault via cfe-commits


@@ -0,0 +1,86 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py 
UTC_ARGS: --version 5
+// RUN: %clang_cc1 %s -O0 -triple amdgcn-amd-amdhsa -cl-std=CL1.2 -emit-llvm 
-o - | FileCheck --check-prefix=OPENCL12 %s
+// RUN: %clang_cc1 %s -O0 -triple amdgcn-amd-amdhsa -cl-std=CL2.0 -emit-llvm 
-o - | FileCheck --check-prefix=OPENCL20 %s
+// RUN: %clang_cc1 %s -O0 -triple amdgcn-amd-amdhsa -cl-std=CL3.0 -emit-llvm 
-o - | FileCheck --check-prefix=OPENCL30 %s
+// RUN: %clang_cc1 %s -O0 -triple amdgcn-amd-amdhsa -cl-std=CL3.0 
-cl-ext=+__opencl_c_generic_address_space -emit-llvm -o - | FileCheck 
--check-prefix=OPENCL30-EXT %s
+
+// OPENCL12-LABEL: define dso_local ptr addrspace(5) @test1(
+// OPENCL12-SAME: ) #[[ATTR0:[0-9]+]] {
+// OPENCL12-NEXT:  [[ENTRY:.*:]]
+// OPENCL12-NEXT:[[ALLOC_PTR:%.*]] = alloca ptr addrspace(5), align 4, 
addrspace(5)
+// OPENCL12-NEXT:[[TMP0:%.*]] = alloca i8, i64 128, align 8, addrspace(5)
+// OPENCL12-NEXT:store ptr addrspace(5) [[TMP0]], ptr addrspace(5) 
[[ALLOC_PTR]], align 4
+// OPENCL12-NEXT:[[TMP1:%.*]] = load ptr addrspace(5), ptr addrspace(5) 
[[ALLOC_PTR]], align 4
+// OPENCL12-NEXT:ret ptr addrspace(5) [[TMP1]]
+//
+// OPENCL20-LABEL: define dso_local ptr @test1(
+// OPENCL20-SAME: ) #[[ATTR0:[0-9]+]] {
+// OPENCL20-NEXT:  [[ENTRY:.*:]]
+// OPENCL20-NEXT:[[ALLOC_PTR:%.*]] = alloca ptr, align 8, addrspace(5)
+// OPENCL20-NEXT:[[TMP0:%.*]] = alloca i8, i64 128, align 8, addrspace(5)
+// OPENCL20-NEXT:[[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[TMP0]] to 
ptr
+// OPENCL20-NEXT:store ptr [[TMP1]], ptr addrspace(5) [[ALLOC_PTR]], align 
8
+// OPENCL20-NEXT:[[TMP2:%.*]] = load ptr, ptr addrspace(5) [[ALLOC_PTR]], 
align 8
+// OPENCL20-NEXT:ret ptr [[TMP2]]
+//
+// OPENCL30-LABEL: define dso_local ptr addrspace(5) @test1(
+// OPENCL30-SAME: ) #[[ATTR0:[0-9]+]] {
+// OPENCL30-NEXT:  [[ENTRY:.*:]]
+// OPENCL30-NEXT:[[ALLOC_PTR:%.*]] = alloca ptr addrspace(5), align 4, 
addrspace(5)
+// OPENCL30-NEXT:[[TMP0:%.*]] = alloca i8, i64 128, align 8, addrspace(5)
+// OPENCL30-NEXT:store ptr addrspace(5) [[TMP0]], ptr addrspace(5) 
[[ALLOC_PTR]], align 4
+// OPENCL30-NEXT:[[TMP1:%.*]] = load ptr addrspace(5), ptr addrspace(5) 
[[ALLOC_PTR]], align 4
+// OPENCL30-NEXT:ret ptr addrspace(5) [[TMP1]]
+//
+// OPENCL30-EXT-LABEL: define dso_local ptr @test1(
+// OPENCL30-EXT-SAME: ) #[[ATTR0:[0-9]+]] {
+// OPENCL30-EXT-NEXT:  [[ENTRY:.*:]]
+// OPENCL30-EXT-NEXT:[[ALLOC_PTR:%.*]] = alloca ptr, align 8, addrspace(5)
+// OPENCL30-EXT-NEXT:[[TMP0:%.*]] = alloca i8, i64 128, align 8, 
addrspace(5)
+// OPENCL30-EXT-NEXT:[[TMP1:%.*]] = addrspacecast ptr addrspace(5) 
[[TMP0]] to ptr
+// OPENCL30-EXT-NEXT:store ptr [[TMP1]], ptr addrspace(5) [[ALLOC_PTR]], 
align 8
+// OPENCL30-EXT-NEXT:[[TMP2:%.*]] = load ptr, ptr addrspace(5) 
[[ALLOC_PTR]], align 8
+// OPENCL30-EXT-NEXT:ret ptr [[TMP2]]
+//
+float* test1() {
+float* alloc_ptr = (float*)__builtin_alloca(32 * sizeof(int));

arsenm wrote:

Should test with __private qualified pointer, we want to see that there is no 
cast 

https://github.com/llvm/llvm-project/pull/95750
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] clang/AMDGPU: Emit atomicrmw from ds_fadd builtins (PR #95395)

2024-06-15 Thread Matt Arsenault via cfe-commits

https://github.com/arsenm edited https://github.com/llvm/llvm-project/pull/95395
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Extend readlane, writelane and readfirstlane intrinsic lowering for generic types (PR #89217)

2024-06-14 Thread Matt Arsenault via cfe-commits

https://github.com/arsenm edited https://github.com/llvm/llvm-project/pull/89217
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Extend readlane, writelane and readfirstlane intrinsic lowering for generic types (PR #89217)

2024-06-14 Thread Matt Arsenault via cfe-commits


@@ -0,0 +1,65 @@
+; RUN: llc -stop-after=amdgpu-isel -mtriple=amdgcn-- -mcpu=gfx1100 
-verify-machineinstrs -o - %s | FileCheck --check-prefixes=CHECK,ISEL %s
+
+; CHECK-LABEL: name:basic_readfirstlane_i64
+;   CHECK:[[TOKEN:%[0-9]+]]{{[^ ]*}} = CONVERGENCECTRL_ANCHOR

arsenm wrote:

My worry is this test won't get updated whenever that's fixed. How about you 
leave the existing checks, but add an additional "not --crash" run line that 
checks the verifier error? 

https://github.com/llvm/llvm-project/pull/89217
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][AMDGPU] Add a new builtin type for buffer rsrc (PR #94830)

2024-06-14 Thread Matt Arsenault via cfe-commits


@@ -0,0 +1,21 @@
+//===-- AMDGPUTypes.def - Metadata about AMDGPU types ---*- C++ 
-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+//
+//  This file defines various AMDGPU builtin types.
+//
+//===--===//
+
+#ifndef AMDGPU_OPAQUE_PTR_TYPE
+#define AMDGPU_OPAQUE_PTR_TYPE(Name, MangledName, AS, Width, Align, Id, 
SingletonId) \
+  AMDGPU_TYPE(Name, Id, SingletonId)
+#endif
+
+AMDGPU_OPAQUE_PTR_TYPE("__amdgcn_buffer_rsrc_t", "__amdgcn_buffer_rsrc_t", 8, 
128, 128, AMDGPUBufferRsrc, AMDGPUBufferRsrcTy)

arsenm wrote:

Things are gradually moving to use amdgpu 

https://github.com/llvm/llvm-project/pull/94830
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][AMDGPU] Add a new builtin type for buffer rsrc (PR #94830)

2024-06-14 Thread Matt Arsenault via cfe-commits


@@ -0,0 +1,84 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py 
UTC_ARGS: --function-signature
+ // REQUIRES: amdgpu-registered-target
+ // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu verde 
-emit-llvm -o - %s | FileCheck %s
+ // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu tonga 
-emit-llvm -o - %s | FileCheck %s
+ // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 
-emit-llvm -o - %s | FileCheck %s
+
+typedef struct AA_ty {
+  int x;
+  __amdgcn_buffer_rsrc_t r;
+} AA;
+
+AA getAA(void *p);
+__amdgcn_buffer_rsrc_t getBufferImpl(void *p);
+void consumeBuffer(__amdgcn_buffer_rsrc_t);
+
+// CHECK-LABEL: define {{[^@]+}}@getBuffer

arsenm wrote:

This is not showing the return type 

https://github.com/llvm/llvm-project/pull/94830
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][AMDGPU] Add a new builtin type for buffer rsrc (PR #94830)

2024-06-14 Thread Matt Arsenault via cfe-commits


@@ -0,0 +1,21 @@
+//===-- AMDGPUTypes.def - Metadata about AMDGPU types ---*- C++ 
-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+//
+//  This file defines various AMDGPU builtin types.
+//
+//===--===//
+
+#ifndef AMDGPU_OPAQUE_PTR_TYPE
+#define AMDGPU_OPAQUE_PTR_TYPE(Name, MangledName, AS, Width, Align, Id, 
SingletonId) \
+  AMDGPU_TYPE(Name, Id, SingletonId)
+#endif
+
+AMDGPU_OPAQUE_PTR_TYPE("__amdgcn_buffer_rsrc_t", "__amdgcn_buffer_rsrc_t", 8, 
128, 128, AMDGPUBufferRsrc, AMDGPUBufferRsrcTy)

arsenm wrote:

Should use amdgpu not amdgcn 

https://github.com/llvm/llvm-project/pull/94830
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Extend readlane, writelane and readfirstlane intrinsic lowering for generic types (PR #89217)

2024-06-14 Thread Matt Arsenault via cfe-commits


@@ -0,0 +1,65 @@
+; RUN: llc -stop-after=amdgpu-isel -mtriple=amdgcn-- -mcpu=gfx1100 
-verify-machineinstrs -o - %s | FileCheck --check-prefixes=CHECK,ISEL %s
+
+; CHECK-LABEL: name:basic_readfirstlane_i64
+;   CHECK:[[TOKEN:%[0-9]+]]{{[^ ]*}} = CONVERGENCECTRL_ANCHOR

arsenm wrote:

I thought you switched to using the intrinsics such that this would avoid the 
verifier error. This test is going to fail if you are still hitting the error 

https://github.com/llvm/llvm-project/pull/89217
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Extend readlane, writelane and readfirstlane intrinsic lowering for generic types (PR #89217)

2024-06-14 Thread Matt Arsenault via cfe-commits


@@ -6129,13 +6150,55 @@ static SDValue lowerLaneOp(const SITargetLowering , 
SDNode *N,
   if (ValSize % 32 != 0)
 return SDValue();
 
+  auto unrollLaneOp = [, ](SDNode *N) -> SDValue {
+EVT VT = N->getValueType(0);
+unsigned NE = VT.getVectorNumElements();
+EVT EltVT = VT.getVectorElementType();
+SmallVector Scalars;
+unsigned NumOperands = N->getNumOperands();
+SmallVector Operands(NumOperands);
+SDNode *GL = N->getGluedNode();
+
+if (GL) {
+  // only handle convegrencectrl_glue
+  assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
+}

arsenm wrote:

Typo convegrencectrl_glue. Also assert(!GL || ...)

https://github.com/llvm/llvm-project/pull/89217
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Extend readlane, writelane and readfirstlane intrinsic lowering for generic types (PR #89217)

2024-06-14 Thread Matt Arsenault via cfe-commits


@@ -0,0 +1,65 @@
+; RUN: llc -stop-after=amdgpu-isel -mtriple=amdgcn-- -mcpu=gfx1100 
-verify-machineinstrs -o - %s | FileCheck --check-prefixes=CHECK,ISEL %s
+
+; CHECK-LABEL: name:basic_readfirstlane_i64
+;   CHECK:[[TOKEN:%[0-9]+]]{{[^ ]*}} = CONVERGENCECTRL_ANCHOR

arsenm wrote:

We don't need these isel checks for the tokens if the IR verifier is going to 
catch the breakage. This test can jus be merged with the existing intrinsic 
tests 

https://github.com/llvm/llvm-project/pull/89217
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][AMDGPU] Add a new builtin type for buffer rsrc (PR #94830)

2024-06-13 Thread Matt Arsenault via cfe-commits


@@ -0,0 +1,69 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+ // REQUIRES: amdgpu-registered-target
+ // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu verde 
-emit-llvm -o - %s | FileCheck %s
+ // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu tonga 
-emit-llvm -o - %s | FileCheck %s
+ // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 
-emit-llvm -o - %s | FileCheck %s
+
+typedef struct AA_ty {
+  int x;
+  __amdgcn_buffer_rsrc_t r;
+} AA;
+
+AA getAA(void *p);
+__amdgcn_buffer_rsrc_t getBuffer(void *p);
+void consumeBuffer(__amdgcn_buffer_rsrc_t);
+
+// CHECK-LABEL: @consumeBufferPtr(

arsenm wrote:

The interesting check here is the IR type in the function signature which are 
missing. Also should test it as a return value (and return value in a struct)

https://github.com/llvm/llvm-project/pull/94830
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [clang-tools-extra] [compiler-rt] [flang] [libc] [lld] [lldb] [llvm] [mlir] [openmp] [llvm-project] Fix typo "seperate" (PR #95373)

2024-06-13 Thread Matt Arsenault via cfe-commits

https://github.com/arsenm approved this pull request.


https://github.com/llvm/llvm-project/pull/95373
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [Offload] Introduce the concept of "default streams" (PR #95371)

2024-06-13 Thread Matt Arsenault via cfe-commits


@@ -1125,6 +1125,22 @@ void Clang::AddPreprocessingOptions(Compilation , 
const JobAction ,
 CmdArgs.push_back("__clang_openmp_device_functions.h");
   }
 
+  if (Args.hasArg(options::OPT_foffload_via_llvm)) {
+// Add llvm_wrappers/* to our system include path.  This lets us wrap
+// standard library headers and other headers.
+SmallString<128> P(D.ResourceDir);
+llvm::sys::path::append(P, "include");
+llvm::sys::path::append(P, "llvm_offload_wrappers");
+CmdArgs.push_back("-internal-isystem");
+CmdArgs.push_back(Args.MakeArgString(P));
+
+CmdArgs.push_back("-include");
+if (JA.isDeviceOffloading(Action::OFK_OpenMP))
+  CmdArgs.push_back("__llvm_offload_device.h");
+else
+  CmdArgs.push_back("__llvm_offload_host.h");

arsenm wrote:

Push pack select of string name? 

https://github.com/llvm/llvm-project/pull/95371
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [Offload] Introduce the concept of "default streams" (PR #95371)

2024-06-13 Thread Matt Arsenault via cfe-commits


@@ -1125,6 +1125,22 @@ void Clang::AddPreprocessingOptions(Compilation , 
const JobAction ,
 CmdArgs.push_back("__clang_openmp_device_functions.h");
   }
 
+  if (Args.hasArg(options::OPT_foffload_via_llvm)) {
+// Add llvm_wrappers/* to our system include path.  This lets us wrap
+// standard library headers and other headers.
+SmallString<128> P(D.ResourceDir);
+llvm::sys::path::append(P, "include");
+llvm::sys::path::append(P, "llvm_offload_wrappers");

arsenm wrote:

I think append is variadic and you can do both pieces in one call 

https://github.com/llvm/llvm-project/pull/95371
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][AMDGPU] Add a new builtin type for buffer rsrc (PR #94830)

2024-06-13 Thread Matt Arsenault via cfe-commits

arsenm wrote:

> Just a note - and maybe this was already discussed above - is there good 
> reason not to explicitly make this type a 128-bit scalar? The LLVM data 
> layout already does this

I thought this was the 160 bit version? 

Can we have an opaque-but-sized type? The concern is exposing this as a raw 
pointer right away 

https://github.com/llvm/llvm-project/pull/94830
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][AMDGPU] Add a builtin for llvm.amdgcn.make.buffer.rsrc intrinsic (PR #95276)

2024-06-13 Thread Matt Arsenault via cfe-commits


@@ -0,0 +1,95 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -cl-std=CL2.0 -target-cpu 
verde -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -cl-std=CL2.0 -target-cpu 
tonga -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -cl-std=CL2.0 -target-cpu 
gfx1100 -emit-llvm -o - %s | FileCheck %s
+
+// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p0(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:[[TMP0:%.*]] = tail call ptr addrspace(8) 
@llvm.amdgcn.make.buffer.rsrc.p0(ptr [[P:%.*]], i16 [[STRIDE:%.*]], i32 
[[NUM:%.*]], i32 [[FLAGS:%.*]])
+// CHECK-NEXT:ret ptr addrspace(8) [[TMP0]]
+//
+__buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0(void *p, short stride, int 
num, int flags) {
+  return __builtin_amdgcn_make_buffer_rsrc(p, stride, num, flags);

arsenm wrote:

The context there is slightly different. They are using the raw descriptor, 
which is addrspace(7), not 8. But the same stands, it probably should be 
passable around as a value 

https://github.com/llvm/llvm-project/pull/95276
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][AMDGPU] Add a builtin for llvm.amdgcn.make.buffer.rsrc intrinsic (PR #95276)

2024-06-13 Thread Matt Arsenault via cfe-commits

arsenm wrote:

> I understand the chance of conflict is low. It may be like the chance of 
> hitting by a meteor. However, if we prefix with `__amdgcn_`, there is no such 
> risk. And we have the benefit to clearly indicate it is a amdgcn 
> target-specific type.

Should use amdgpu 

https://github.com/llvm/llvm-project/pull/95276
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][AMDGPU] Add a builtin for llvm.amdgcn.make.buffer.rsrc intrinsic (PR #95276)

2024-06-12 Thread Matt Arsenault via cfe-commits


@@ -0,0 +1,95 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -cl-std=CL2.0 -target-cpu 
verde -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -cl-std=CL2.0 -target-cpu 
tonga -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -cl-std=CL2.0 -target-cpu 
gfx1100 -emit-llvm -o - %s | FileCheck %s
+
+// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p0(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:[[TMP0:%.*]] = tail call ptr addrspace(8) 
@llvm.amdgcn.make.buffer.rsrc.p0(ptr [[P:%.*]], i16 [[STRIDE:%.*]], i32 
[[NUM:%.*]], i32 [[FLAGS:%.*]])
+// CHECK-NEXT:ret ptr addrspace(8) [[TMP0]]
+//
+__buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0(void *p, short stride, int 
num, int flags) {
+  return __builtin_amdgcn_make_buffer_rsrc(p, stride, num, flags);

arsenm wrote:

The alternative would be to have a builtin address space fat pointer, but I 
assume that will be substantially more implementation work. 

https://github.com/llvm/llvm-project/pull/95276
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][AMDGPU] Add a builtin for llvm.amdgcn.make.buffer.rsrc intrinsic (PR #95276)

2024-06-12 Thread Matt Arsenault via cfe-commits


@@ -0,0 +1,14 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu verde -emit-llvm 
-o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu tonga -emit-llvm 
-o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 
-emit-llvm -o - %s | FileCheck %s
+
+// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:[[TMP0:%.*]] = tail call ptr addrspace(8) 
@llvm.amdgcn.make.buffer.rsrc.p5(ptr addrspace(5) [[P:%.*]], i16 4, i32 
[[NUM:%.*]], i32 [[FLAGS:%.*]])
+// CHECK-NEXT:ret ptr addrspace(8) [[TMP0]]
+//
+ __buffer_rsrc_t test_amdgcn_make_buffer_rsrc(void *p, int num, int flags) {
+   return __builtin_amdgcn_make_buffer_rsrc(p, 4, num, flags);

arsenm wrote:

Test more combinations of arguments. All arguments can be variable, but you 
have a constant here 

Also test with different address space pointers. We also probably should have a 
HIP test. Ideally we would error for private/local inputs.

Also test null and literal pointer inputs? 

https://github.com/llvm/llvm-project/pull/95276
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [clang][Driver] Add HIPAMD Driver support for AMDGCN flavoured SPIR-V (PR #95061)

2024-06-12 Thread Matt Arsenault via cfe-commits


@@ -128,12 +128,13 @@ enum class CudaArch {
   GFX12_GENERIC,
   GFX1200,
   GFX1201,
+  AMDGCNSPIRV,
   Generic, // A processor model named 'generic' if the target backend defines a
// public one.
   LAST,
 
   CudaDefault = CudaArch::SM_52,
-  HIPDefault = CudaArch::GFX906,
+  HIPDefault = CudaArch::AMDGCNSPIRV,

arsenm wrote:

Behavior change should definitely be done separately 

https://github.com/llvm/llvm-project/pull/95061
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Extend readlane, writelane and readfirstlane intrinsic lowering for generic types (PR #89217)

2024-06-12 Thread Matt Arsenault via cfe-commits

arsenm wrote:

> Or drop the new nodes altogether and legelaize to intrinsics directly ?

That's another option. The only real plus to the intermediate is it's slightly 
less annoying to write combines for. But there are limited combining 
opportunities for these 



https://github.com/llvm/llvm-project/pull/89217
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Extend readlane, writelane and readfirstlane intrinsic lowering for generic types (PR #89217)

2024-06-12 Thread Matt Arsenault via cfe-commits


@@ -0,0 +1,46 @@
+# RUN: not --crash llc -mtriple=amdgcn -run-pass=none -verify-machineinstrs -o 
/dev/null %s 2>&1 | FileCheck %s

arsenm wrote:

I'd still test all 3, but yes an IR test 

https://github.com/llvm/llvm-project/pull/89217
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Extend readlane, writelane and readfirstlane intrinsic lowering for generic types (PR #89217)

2024-06-12 Thread Matt Arsenault via cfe-commits


@@ -0,0 +1,46 @@
+# RUN: not --crash llc -mtriple=amdgcn -run-pass=none -verify-machineinstrs -o 
/dev/null %s 2>&1 | FileCheck %s

arsenm wrote:

You should not need to introduce any new machine verifier tests, they are not 
useful. The useful test would be the IR that produces the verifier error, which 
ideally would be fixed 

https://github.com/llvm/llvm-project/pull/89217
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][AMDGPU] Add a new builtin type for buffer rsrc (PR #94830)

2024-06-10 Thread Matt Arsenault via cfe-commits


@@ -2201,6 +2207,9 @@ TypeInfo ASTContext::getTypeInfoImpl(const Type *T) const 
{
 Align = 8; 
\
 break;
 #include "clang/Basic/WebAssemblyReferenceTypes.def"
+case BuiltinType::AMDGPUBufferRsrc:
+  Width = 0;
+  Align = 256;

arsenm wrote:

If we expose this as a pointer there are a lot of sharp edges we need to fix. 
If we expose it as an opaque type for builtins, the broken cases are hidden 
from end users. 

https://github.com/llvm/llvm-project/pull/94830
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][AMDGPU] Add a new builtin type for buffer rsrc (PR #94830)

2024-06-10 Thread Matt Arsenault via cfe-commits


@@ -0,0 +1,11 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -fsyntax-only -verify -triple amdgcn -Wno-unused-value %s
+
+void foo() {
+  int n = 100;
+  __buffer_rsrc_t v = 0; // expected-error {{cannot initialize a variable of 
type '__buffer_rsrc_t' with an rvalue of type 'int'}}
+  static_cast<__buffer_rsrc_t>(n); // expected-error {{static_cast from 'int' 
to '__buffer_rsrc_t' is not allowed}}
+  dynamic_cast<__buffer_rsrc_t>(n); // expected-error {{invalid target type 
'__buffer_rsrc_t' for dynamic_cast; target type must be a reference or pointer 
type to a defined class}}
+  reinterpret_cast<__buffer_rsrc_t>(n); // expected-error {{reinterpret_cast 
from 'int' to '__buffer_rsrc_t' is not allowed}}
+  int c(v); // expected-error {{cannot initialize a variable of type 'int' 
with an lvalue of type '__buffer_rsrc_t'}}
+}

arsenm wrote:

Also test some c style casts to other pointer types 

https://github.com/llvm/llvm-project/pull/94830
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][AMDGPU] Add a new builtin type for buffer rsrc (PR #94830)

2024-06-10 Thread Matt Arsenault via cfe-commits


@@ -0,0 +1,11 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -fsyntax-only -verify -triple amdgcn -Wno-unused-value %s
+
+void foo() {
+  int n = 100;
+  __buffer_rsrc_t v = 0; // expected-error {{cannot initialize a variable of 
type '__buffer_rsrc_t' with an rvalue of type 'int'}}
+  static_cast<__buffer_rsrc_t>(n); // expected-error {{static_cast from 'int' 
to '__buffer_rsrc_t' is not allowed}}
+  dynamic_cast<__buffer_rsrc_t>(n); // expected-error {{invalid target type 
'__buffer_rsrc_t' for dynamic_cast; target type must be a reference or pointer 
type to a defined class}}
+  reinterpret_cast<__buffer_rsrc_t>(n); // expected-error {{reinterpret_cast 
from 'int' to '__buffer_rsrc_t' is not allowed}}
+  int c(v); // expected-error {{cannot initialize a variable of type 'int' 
with an lvalue of type '__buffer_rsrc_t'}}
+}

arsenm wrote:

What about sizeof / alignof? 

https://github.com/llvm/llvm-project/pull/94830
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][AMDGPU] Add a new builtin type for buffer rsrc (PR #94830)

2024-06-10 Thread Matt Arsenault via cfe-commits


@@ -2200,6 +2206,9 @@ TypeInfo ASTContext::getTypeInfoImpl(const Type *T) const 
{
 Align = 8; 
\
 break;
 #include "clang/Basic/WebAssemblyReferenceTypes.def"
+case BuiltinType::AMDGPUBufferRsrc:
+  Width = 128;
+  Align = 128;

arsenm wrote:

Sizeless object is probably the safest option for now. It shouldn't be a 
vector. I foresee more difficulty introducing a wide pointer, and if we change 
our minds on how fat pointers should work it's harder to change the size later 

https://github.com/llvm/llvm-project/pull/94830
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [Offload][CUDA] Add initial cuda_runtime.h overlay (PR #94821)

2024-06-07 Thread Matt Arsenault via cfe-commits


@@ -0,0 +1,30 @@
+// RUN: %clang++ -foffload-via-llvm --offload-arch=native %s -o %t
+// RUN: %t | %fcheck-generic
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO

arsenm wrote:

Better to enable supported cases? 

https://github.com/llvm/llvm-project/pull/94821
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][AMDGPU] Add a new builtin type for buffer rsrc (PR #94830)

2024-06-07 Thread Matt Arsenault via cfe-commits


@@ -0,0 +1,9 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple amdgcn %s -emit-llvm -o - 
| FileCheck %s

arsenm wrote:

Why do you need -fclang-abi-compat=latest

https://github.com/llvm/llvm-project/pull/94830
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][AMDGPU] Add a new builtin type for buffer rsrc (PR #94830)

2024-06-07 Thread Matt Arsenault via cfe-commits


@@ -0,0 +1,21 @@
+//===-- AMDGPUTypes.def - Metadata about AMDGPU types ---*- C++ 
-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+//
+//  This file defines various AMDGPU builtin types.
+//
+//===--===//
+
+#ifndef AMDGPU_OPAQUE_TYPE
+#define AMDGPU_OPAQUE_TYPE(Name, MangledName, Id, SingletonId) 
\
+  AMDGPU_TYPE(Name, Id, SingletonId)
+#endif
+
+AMDGPU_OPAQUE_TYPE("__buffer_rsrc_t", "__buffer_rsrc_t", AMDGPUBufferRsrc, 
AMDGPUBufferRsrcTy)

arsenm wrote:

Should it include an amdgpu prefix? 

https://github.com/llvm/llvm-project/pull/94830
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][AMDGPU] Add a new builtin type for buffer rsrc (PR #94830)

2024-06-07 Thread Matt Arsenault via cfe-commits

https://github.com/arsenm edited https://github.com/llvm/llvm-project/pull/94830
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][AMDGPU] Add a new builtin type for buffer rsrc (PR #94830)

2024-06-07 Thread Matt Arsenault via cfe-commits


@@ -2200,6 +2206,9 @@ TypeInfo ASTContext::getTypeInfoImpl(const Type *T) const 
{
 Align = 8; 
\
 break;
 #include "clang/Basic/WebAssemblyReferenceTypes.def"
+case BuiltinType::AMDGPUBufferRsrc:
+  Width = 128;
+  Align = 128;

arsenm wrote:

If we were exposing the pointer, it would be 160/192 

https://github.com/llvm/llvm-project/pull/94830
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][AMDGPU] Add a new builtin type for buffer rsrc (PR #94830)

2024-06-07 Thread Matt Arsenault via cfe-commits

https://github.com/arsenm commented:

Need stacked PR that adds the make_buffer_rsrc builtin that shows its use 

https://github.com/llvm/llvm-project/pull/94830
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][AMDGPU] Add a new builtin type for buffer rsrc (PR #94830)

2024-06-07 Thread Matt Arsenault via cfe-commits


@@ -1091,6 +1091,9 @@ enum PredefinedTypeIDs {
 // \brief WebAssembly reference types with auto numeration
 #define WASM_TYPE(Name, Id, SingletonId) PREDEF_TYPE_##Id##_ID,
 #include "clang/Basic/WebAssemblyReferenceTypes.def"
+// \breif AMDGPU types with auto numeration

arsenm wrote:

Typo breif

https://github.com/llvm/llvm-project/pull/94830
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] Intrinsic: introduce minimumnum and maximumnum (PR #93841)

2024-06-07 Thread Matt Arsenault via cfe-commits


@@ -16055,6 +16145,90 @@ of the two arguments. -0.0 is considered to be less 
than +0.0 for this
 intrinsic. Note that these are the semantics specified in the draft of
 IEEE 754-2019.
 
+.. _i_minimumnum:
+
+'``llvm.minimumnum.*``' Intrinsic
+^
+
+Syntax:
+"""
+
+This is an overloaded intrinsic. You can use ``llvm.minimumnum`` on any
+floating-point or vector of floating-point type. Not all targets support
+all types however.
+
+::
+
+  declare float @llvm.minimumnum.f32(float %Val0, float %Val1)
+  declare double@llvm.minimumnum.f64(double %Val0, double %Val1)
+  declare x86_fp80  @llvm.minimumnum.f80(x86_fp80 %Val0, x86_fp80 %Val1)
+  declare fp128 @llvm.minimumnum.f128(fp128 %Val0, fp128 %Val1)
+  declare ppc_fp128 @llvm.minimumnum.ppcf128(ppc_fp128 %Val0, ppc_fp128 
%Val1)
+
+Overview:
+"
+
+The '``llvm.minimumnum.*``' intrinsics return the minimum of the two
+arguments, not propagating NaNs and treating -0.0 as less than +0.0.
+
+
+Arguments:
+""
+
+The arguments and return value are floating-point numbers of the same
+type.
+
+Semantics:
+""
+If both operands are NaNs (including sNaN), returns qNaN. If one operand
+is NaN (including sNaN) and another operand is a number, return the number.
+Otherwise returns the lesser of the two arguments. -0.0 is considered to
+be less than +0.0 for this intrinsic.
+
+Note that these are the semantics of minimumNumber specified in IEEE 754-2019.
+
+.. _i_maximumnum:
+
+'``llvm.maximumnum.*``' Intrinsic
+^
+
+Syntax:
+"""
+
+This is an overloaded intrinsic. You can use ``llvm.maximumnum`` on any
+floating-point or vector of floating-point type. Not all targets support
+all types however.
+
+::
+
+  declare float @llvm.maximumnum.f32(float %Val0, float %Val1)
+  declare double@llvm.maximumnum.f64(double %Val0, double %Val1)
+  declare x86_fp80  @llvm.maximumnum.f80(x86_fp80 %Val0, x86_fp80 %Val1)
+  declare fp128 @llvm.maximumnum.f128(fp128 %Val0, fp128 %Val1)
+  declare ppc_fp128 @llvm.maximumnum.ppcf128(ppc_fp128 %Val0, ppc_fp128 
%Val1)
+
+Overview:
+"
+
+The '``llvm.maximumnum.*``' intrinsics return the maximum of the two
+arguments, not propagating NaNs and treating -0.0 as less than +0.0.
+
+
+Arguments:
+""
+
+The arguments and return value are floating-point numbers of the same
+type.
+
+Semantics:
+""
+If both operands are NaNs (including sNaN), returns qNaN. If one operand
+is NaN (including sNaN) and another operand is a number, return the number.
+Otherwise returns the greater of the two arguments. -0.0 is considered to
+be less than +0.0 for this intrinsic.
+
+Note that these are the semantics of minimumNumber specified in IEEE 754-2019.

arsenm wrote:

Copy paste error minimumNumber. Also like above, this is not the signaling nan 
behavior (where the behavior inverts from quiet nan) 

https://github.com/llvm/llvm-project/pull/93841
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] Intrinsic: introduce minimumnum and maximumnum (PR #93841)

2024-06-07 Thread Matt Arsenault via cfe-commits


@@ -16055,6 +16145,90 @@ of the two arguments. -0.0 is considered to be less 
than +0.0 for this
 intrinsic. Note that these are the semantics specified in the draft of
 IEEE 754-2019.
 
+.. _i_minimumnum:
+
+'``llvm.minimumnum.*``' Intrinsic
+^
+
+Syntax:
+"""
+
+This is an overloaded intrinsic. You can use ``llvm.minimumnum`` on any
+floating-point or vector of floating-point type. Not all targets support
+all types however.
+
+::
+
+  declare float @llvm.minimumnum.f32(float %Val0, float %Val1)
+  declare double@llvm.minimumnum.f64(double %Val0, double %Val1)
+  declare x86_fp80  @llvm.minimumnum.f80(x86_fp80 %Val0, x86_fp80 %Val1)
+  declare fp128 @llvm.minimumnum.f128(fp128 %Val0, fp128 %Val1)
+  declare ppc_fp128 @llvm.minimumnum.ppcf128(ppc_fp128 %Val0, ppc_fp128 
%Val1)
+
+Overview:
+"
+
+The '``llvm.minimumnum.*``' intrinsics return the minimum of the two
+arguments, not propagating NaNs and treating -0.0 as less than +0.0.
+
+
+Arguments:
+""
+
+The arguments and return value are floating-point numbers of the same
+type.
+
+Semantics:
+""
+If both operands are NaNs (including sNaN), returns qNaN. If one operand
+is NaN (including sNaN) and another operand is a number, return the number.
+Otherwise returns the lesser of the two arguments. -0.0 is considered to
+be less than +0.0 for this intrinsic.
+
+Note that these are the semantics of minimumNumber specified in IEEE 754-2019.
+
+.. _i_maximumnum:
+
+'``llvm.maximumnum.*``' Intrinsic
+^
+
+Syntax:
+"""
+
+This is an overloaded intrinsic. You can use ``llvm.maximumnum`` on any
+floating-point or vector of floating-point type. Not all targets support
+all types however.
+
+::
+
+  declare float @llvm.maximumnum.f32(float %Val0, float %Val1)
+  declare double@llvm.maximumnum.f64(double %Val0, double %Val1)
+  declare x86_fp80  @llvm.maximumnum.f80(x86_fp80 %Val0, x86_fp80 %Val1)
+  declare fp128 @llvm.maximumnum.f128(fp128 %Val0, fp128 %Val1)
+  declare ppc_fp128 @llvm.maximumnum.ppcf128(ppc_fp128 %Val0, ppc_fp128 
%Val1)
+
+Overview:
+"
+
+The '``llvm.maximumnum.*``' intrinsics return the maximum of the two
+arguments, not propagating NaNs and treating -0.0 as less than +0.0.
+
+
+Arguments:
+""
+
+The arguments and return value are floating-point numbers of the same
+type.
+
+Semantics:
+""
+If both operands are NaNs (including sNaN), returns qNaN. If one operand
+is NaN (including sNaN) and another operand is a number, return the number.
+Otherwise returns the greater of the two arguments. -0.0 is considered to
+be less than +0.0 for this intrinsic.
+
+Note that these are the semantics of minimumNumber specified in IEEE 754-2019.

arsenm wrote:

Should also state the explicit difference here, on the intrinsic, the 
comparison to minnum 

https://github.com/llvm/llvm-project/pull/93841
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] Intrinsic: introduce minimumnum and maximumnum (PR #93841)

2024-06-07 Thread Matt Arsenault via cfe-commits


@@ -15874,6 +15874,96 @@ The returned value is completely identical to the 
input except for the sign bit;
 in particular, if the input is a NaN, then the quiet/signaling bit and payload
 are perfectly preserved.
 
+.. _i_fminmax_family:
+
+'``llvm.min.*``' Intrinsics Comparation
+^^^
+
+Standard:
+"
+
+IEEE754 and ISO C define some min/max operations, and they have some 
differences
+on working with qNaN/sNaN and +0.0/-0.0. Here is the list:
+
+.. list-table::
+   :header-rows: 2
+
+   * - ``ISO C``
+ - fmin/fmax
+ - none
+ - fmininum/fmaximum
+ - fminimum_num/fmaximum_num
+
+   * - ``IEEE754``
+ - none
+ - nimNUM/maxNUM (2008)
+ - minimum/maximum (2019)
+ - minimumNumber/maximumNumber (2019)
+
+   * - ``+0.0 vs -0.0``
+ - either one
+ - +0.0 > -0.0
+ - +0.0 > -0.0
+ - +0.0 > -0.0
+
+   * - ``NUM vs sNaN``
+ - qNaN, invalid excpetion
+ - qNaN, invalid excpetion
+ - qNaN, invalid excpetion
+ - NUM, invalid excpetion

arsenm wrote:

Typo exception throughout 

https://github.com/llvm/llvm-project/pull/93841
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] Intrinsic: introduce minimumnum and maximumnum (PR #93841)

2024-06-07 Thread Matt Arsenault via cfe-commits


@@ -15874,6 +15874,96 @@ The returned value is completely identical to the 
input except for the sign bit;
 in particular, if the input is a NaN, then the quiet/signaling bit and payload
 are perfectly preserved.
 
+.. _i_fminmax_family:
+
+'``llvm.min.*``' Intrinsics Comparation
+^^^
+
+Standard:
+"
+
+IEEE754 and ISO C define some min/max operations, and they have some 
differences
+on working with qNaN/sNaN and +0.0/-0.0. Here is the list:
+
+.. list-table::
+   :header-rows: 2
+
+   * - ``ISO C``
+ - fmin/fmax
+ - none
+ - fmininum/fmaximum
+ - fminimum_num/fmaximum_num
+
+   * - ``IEEE754``
+ - none
+ - nimNUM/maxNUM (2008)
+ - minimum/maximum (2019)
+ - minimumNumber/maximumNumber (2019)
+
+   * - ``+0.0 vs -0.0``
+ - either one
+ - +0.0 > -0.0
+ - +0.0 > -0.0
+ - +0.0 > -0.0
+
+   * - ``NUM vs sNaN``
+ - qNaN, invalid excpetion
+ - qNaN, invalid excpetion
+ - qNaN, invalid excpetion
+ - NUM, invalid excpetion
+
+   * - ``NUM vs qNaN``
+ - NUM, no excpetion
+ - NUM, no excpetion
+ - qNaN, no excpetion
+ - NUM, no excpetion
+

arsenm wrote:

cover nan vs. nan case 

https://github.com/llvm/llvm-project/pull/93841
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] Intrinsic: introduce minimumnum and maximumnum (PR #93841)

2024-06-07 Thread Matt Arsenault via cfe-commits


@@ -16055,6 +16145,90 @@ of the two arguments. -0.0 is considered to be less 
than +0.0 for this
 intrinsic. Note that these are the semantics specified in the draft of
 IEEE 754-2019.
 
+.. _i_minimumnum:
+
+'``llvm.minimumnum.*``' Intrinsic
+^
+
+Syntax:
+"""
+
+This is an overloaded intrinsic. You can use ``llvm.minimumnum`` on any
+floating-point or vector of floating-point type. Not all targets support
+all types however.
+
+::
+
+  declare float @llvm.minimumnum.f32(float %Val0, float %Val1)
+  declare double@llvm.minimumnum.f64(double %Val0, double %Val1)
+  declare x86_fp80  @llvm.minimumnum.f80(x86_fp80 %Val0, x86_fp80 %Val1)
+  declare fp128 @llvm.minimumnum.f128(fp128 %Val0, fp128 %Val1)
+  declare ppc_fp128 @llvm.minimumnum.ppcf128(ppc_fp128 %Val0, ppc_fp128 
%Val1)
+
+Overview:
+"
+
+The '``llvm.minimumnum.*``' intrinsics return the minimum of the two
+arguments, not propagating NaNs and treating -0.0 as less than +0.0.
+
+
+Arguments:
+""
+
+The arguments and return value are floating-point numbers of the same
+type.
+
+Semantics:
+""
+If both operands are NaNs (including sNaN), returns qNaN. If one operand
+is NaN (including sNaN) and another operand is a number, return the number.
+Otherwise returns the lesser of the two arguments. -0.0 is considered to
+be less than +0.0 for this intrinsic.
+
+Note that these are the semantics of minimumNumber specified in IEEE 754-2019.

arsenm wrote:

This is not the IEEE semantics for signaling nan. For a signaling nan, returns 
a quiet nan, not the non-non operand 

https://github.com/llvm/llvm-project/pull/93841
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][AMDGPU] Add builtins for instrinsic `llvm.amdgcn.raw.buffer.store` (PR #94576)

2024-06-07 Thread Matt Arsenault via cfe-commits

arsenm wrote:

> "aggregates" here might even be unusual cases like `<4 x i8>`

Vectors aren't aggregates and are more reasonable 

https://github.com/llvm/llvm-project/pull/94576
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][AMDGPU] Add builtins for instrinsic `llvm.amdgcn.raw.buffer.store` (PR #94576)

2024-06-07 Thread Matt Arsenault via cfe-commits

arsenm wrote:

> `voffset` and `soffset` are "offset that goes in VGPRs" and "offset that goes 
> in SGPRs", with the latter having some different bounds-checking semantics on 
> ... at least some of the gfx9's, IIRC.
> 

Right, that's the problem. We need to know the parameters of the SRD in order 
to make use of the scalar offset. Ideally we would have one pointer operand and 
be able to addressing mode match into soffset/voffset/imm 

https://github.com/llvm/llvm-project/pull/94576
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][AMDGPU] Add builtins for instrinsic `llvm.amdgcn.raw.buffer.store` (PR #94576)

2024-06-07 Thread Matt Arsenault via cfe-commits

arsenm wrote:

> 2. What I mean is that "types that work" isn't the right framing: any type 
> can be legalized to one or more types that work. That is, down in the isel 
> legalizer, if I call for, for example
>```llvm
>%0 = call {i64, i64, i8} @llvm.amdgcn.raw.buffer.ptr.load(ptr addrspace(8) 
> %rsrc, i32 %off, ...)
>```

Handling arbitrary aggregates here isn't really reasonable or necessary. We can 
restrict this to a reasonable set of legal-ish types 


https://github.com/llvm/llvm-project/pull/94576
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][AMDGPU] Add builtins for instrinsic `llvm.amdgcn.raw.buffer.store` (PR #94576)

2024-06-07 Thread Matt Arsenault via cfe-commits

arsenm wrote:

> 1. For the swizzled case, that's `struct.ptr.buffer.*`, and yeah, those will 
> always need builtins because LLVM can't deal in 2D addressing schemes

But the raw buffer intrinsics have both the soffset and voffset parameters 
though? Not just the struct 



https://github.com/llvm/llvm-project/pull/94576
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][AMDGPU] Add builtins for instrinsic `llvm.amdgcn.raw.buffer.store` (PR #94576)

2024-06-07 Thread Matt Arsenault via cfe-commits

arsenm wrote:

> Actually, even ignoring address space 7, it feels like these builtins if you 
> could `raw.ptr.buffer.store` any type you liked, and then they could be 
> type-varying in Clang?

We could either have a builtin for all the types that would work, or if we want 
to treat them more like a normal pointer, clang could verify you only use them 
with types that will work 

https://github.com/llvm/llvm-project/pull/94576
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [APFloat] Add APFloat support for FP6 data types (PR #94735)

2024-06-07 Thread Matt Arsenault via cfe-commits


@@ -68,6 +68,10 @@ enum class fltNonfiniteBehavior {
   // `fltNanEncoding` enum. We treat all NaNs as quiet, as the available
   // encodings do not distinguish between signalling and quiet NaN.
   NanOnly,
+
+  // This behavior is present in Float6E3M2FN and Float6E2M3FN types.
+  // There is no representation for Inf or NaN.
+  NoNanInf,

arsenm wrote:

Invert and call SupportsNonFinite? 

https://github.com/llvm/llvm-project/pull/94735
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [APFloat] Add APFloat support for FP6 data types (PR #94735)

2024-06-07 Thread Matt Arsenault via cfe-commits


@@ -878,6 +896,10 @@ void IEEEFloat::copySignificand(const IEEEFloat ) {
for the significand.  If double or longer, this is a signalling NaN,
which may not be ideal.  If float, this is QNaN(0).  */
 void IEEEFloat::makeNaN(bool SNaN, bool Negative, const APInt *fill) {
+  if (semantics->nonFiniteBehavior == fltNonfiniteBehavior::NoNanInf) {
+assert(false && "This floating point format does not support NaN\n");
+return;

arsenm wrote:

llvm_unreachable, also no dead return 

https://github.com/llvm/llvm-project/pull/94735
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang] Add timeout for GPU detection utilities (PR #94751)

2024-06-07 Thread Matt Arsenault via cfe-commits

https://github.com/arsenm approved this pull request.


https://github.com/llvm/llvm-project/pull/94751
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang] Add timeout for GPU detection utilities (PR #94751)

2024-06-07 Thread Matt Arsenault via cfe-commits


@@ -205,7 +205,7 @@ class ToolChain {
 
   /// Executes the given \p Executable and returns the stdout.
   llvm::Expected>
-  executeToolChainProgram(StringRef Executable) const;
+  executeToolChainProgram(StringRef Executable, unsigned Timeout = 0) const;

arsenm wrote:

Name this SecondsToWait to match ExecuteAndWait? 

https://github.com/llvm/llvm-project/pull/94751
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [APFloat] Add APFloat support for FP6 data types (PR #94735)

2024-06-07 Thread Matt Arsenault via cfe-commits


@@ -1881,6 +1890,20 @@ TEST(APFloatTest, getSmallest) {
   EXPECT_TRUE(test.isFiniteNonZero());
   EXPECT_TRUE(test.isDenormal());
   EXPECT_TRUE(test.bitwiseIsEqual(expected));
+
+  test = APFloat::getSmallest(APFloat::Float6E3M2FN(), false);
+  expected = APFloat(APFloat::Float6E3M2FN(), "0x0.1p0");
+  EXPECT_FALSE(test.isNegative());
+  EXPECT_TRUE(test.isFiniteNonZero());
+  EXPECT_TRUE(test.isDenormal());
+  EXPECT_TRUE(test.bitwiseIsEqual(expected));
+
+  test = APFloat::getSmallest(APFloat::Float6E2M3FN(), false);
+  expected = APFloat(APFloat::Float6E2M3FN(), "0x0.2p0");
+  EXPECT_FALSE(test.isNegative());
+  EXPECT_TRUE(test.isFiniteNonZero());
+  EXPECT_TRUE(test.isDenormal());
+  EXPECT_TRUE(test.bitwiseIsEqual(expected));
 }
 

arsenm wrote:

Should also test getZero and the other special case constructors 

https://github.com/llvm/llvm-project/pull/94735
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [APFloat] Add APFloat support for FP6 data types (PR #94735)

2024-06-07 Thread Matt Arsenault via cfe-commits


@@ -47,6 +47,10 @@ static std::string convertToString(double d, unsigned Prec, 
unsigned Pad,
   return std::string(Buffer.data(), Buffer.size());
 }
 
+static bool hasNanOrInf(APFloat::Semantics S) {
+  return (S != APFloat::S_Float6E3M2FN) && (S != APFloat::S_Float6E2M3FN);
+}

arsenm wrote:

Probably would be useful as a helper somewhere in APFloat or Semantics 

https://github.com/llvm/llvm-project/pull/94735
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [clang][CodeGen] `used` globals are fake (PR #93601)

2024-06-07 Thread Matt Arsenault via cfe-commits


@@ -8642,8 +8642,11 @@ The '``llvm.used``' Global Variable
 The ``@llvm.used`` global is an array which has
 :ref:`appending linkage `. This array contains a list of
 pointers to named global variables, functions and aliases which may optionally
-have a pointer cast formed of bitcast or getelementptr. For example, a legal
-use of it is:
+have a pointer cast formed of bitcast or getelementptr. The address space of 
the
+pointers is always unspecified, rather than the globals address space, since

arsenm wrote:

Do not say unspecified. Say it is the default address space. 0 is not an 
unspecified address space 

https://github.com/llvm/llvm-project/pull/93601
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [clang][CodeGen] Global constructors/destructors are globals (PR #93914)

2024-06-06 Thread Matt Arsenault via cfe-commits

https://github.com/arsenm commented:

Is this redundant with #93601?

https://github.com/llvm/llvm-project/pull/93914
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [clang][CodeGen] `used` globals && the payloads for global ctors & dtors are globals (PR #93601)

2024-06-06 Thread Matt Arsenault via cfe-commits

arsenm wrote:

Commit message also needs to be updated 

https://github.com/llvm/llvm-project/pull/93601
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [clang][CodeGen] `used` globals && the payloads for global ctors & dtors are globals (PR #93601)

2024-06-06 Thread Matt Arsenault via cfe-commits


@@ -2922,18 +2922,19 @@ static void emitUsed(CodeGenModule , StringRef Name,
   if (List.empty())
 return;
 
+  llvm::Type *UsedPtrTy = llvm::PointerType::getUnqual(CGM.getLLVMContext());

arsenm wrote:

Best to just use get(Ctx, 0) 

https://github.com/llvm/llvm-project/pull/93601
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [clang][CodeGen] `used` globals && the payloads for global ctors & dtors are globals (PR #93601)

2024-06-06 Thread Matt Arsenault via cfe-commits


@@ -2922,18 +2922,19 @@ static void emitUsed(CodeGenModule , StringRef Name,
   if (List.empty())
 return;
 
+  llvm::Type *UsedPtrTy = llvm::PointerType::getUnqual(CGM.getLLVMContext());
+
   // Convert List to what ConstantArray needs.
   SmallVector UsedArray;
   UsedArray.resize(List.size());
   for (unsigned i = 0, e = List.size(); i != e; ++i) {
-UsedArray[i] = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
-cast(&*List[i]), CGM.GlobalsInt8PtrTy);
+UsedArray[i] = llvm::ConstantExpr::getPointerCast(

arsenm wrote:

Although we should get rid of it, getPointerBitCastOrAddrSpaceCast is the 
correct helper for now. We should have a getAddrSpaceCast that doesn't throw a 
fit on same typed pointers but getPointerCast will try to insert ptrtoint etc. 

https://github.com/llvm/llvm-project/pull/93601
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [clang][CodeGen] `used` globals && the payloads for global ctors & dtors are globals (PR #93601)

2024-06-06 Thread Matt Arsenault via cfe-commits

https://github.com/arsenm approved this pull request.

lgtm with nit 

https://github.com/llvm/llvm-project/pull/93601
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [clang][CodeGen] `used` globals && the payloads for global ctors & dtors are globals (PR #93601)

2024-06-06 Thread Matt Arsenault via cfe-commits

https://github.com/arsenm edited https://github.com/llvm/llvm-project/pull/93601
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][AMDGPU] Add builtins for instrinsic `llvm.amdgcn.raw.buffer.store` (PR #94576)

2024-06-06 Thread Matt Arsenault via cfe-commits

arsenm wrote:

> If we do want addrspace(7), we'll need to expose `make.buffer.rsrc` and give 
> it a `p7` variant probably.

Yes.

We probably should expose some kind of custom type instead of directly using a 
C address_space(7) attribute 

https://github.com/llvm/llvm-project/pull/94576
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Extend readlane, writelane and readfirstlane intrinsic lowering for generic types (PR #89217)

2024-06-06 Thread Matt Arsenault via cfe-commits


@@ -0,0 +1,19 @@
+; RUN: not --crash llc -stop-after=amdgpu-isel -mtriple=amdgcn-- -mcpu=gfx900 
-verify-machineinstrs -o - %s 2>&1 | FileCheck %s

arsenm wrote:

This should also be repeated for all 3 intrinsics 

https://github.com/llvm/llvm-project/pull/89217
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Extend readlane, writelane and readfirstlane intrinsic lowering for generic types (PR #89217)

2024-06-06 Thread Matt Arsenault via cfe-commits

https://github.com/arsenm requested changes to this pull request.

@jayfoad's testcase fails and the same test should be repeated for all 3 
intrinsics 

https://github.com/llvm/llvm-project/pull/89217
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Extend readlane, writelane and readfirstlane intrinsic lowering for generic types (PR #89217)

2024-06-06 Thread Matt Arsenault via cfe-commits

https://github.com/arsenm edited https://github.com/llvm/llvm-project/pull/89217
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Extend readlane, writelane and readfirstlane intrinsic lowering for generic types (PR #89217)

2024-06-06 Thread Matt Arsenault via cfe-commits


@@ -0,0 +1,19 @@
+; RUN: not --crash llc -stop-after=amdgpu-isel -mtriple=amdgcn-- -mcpu=gfx900 
-verify-machineinstrs -o - %s 2>&1 | FileCheck %s

arsenm wrote:

This is not an IR verifier test, it is a codegen test that fails the machine 
verifier. A machine verifier would go in test/MachineVerifier, and preferably 
would be written in MIR and not rely on codegen.


I thought the point of this test was to show that the selection did not produce 
a machine verifier error after selection, so this is broken 

https://github.com/llvm/llvm-project/pull/89217
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][AMDGPU] Use `I` to decorate imm argument for `__builtin_amdgcn_global_load_lds` (PR #94376)

2024-06-06 Thread Matt Arsenault via cfe-commits

https://github.com/arsenm approved this pull request.


https://github.com/llvm/llvm-project/pull/94376
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][AMDGPU] Add builtins for instrinsic `llvm.amdgcn.raw.buffer.store` (PR #94576)

2024-06-06 Thread Matt Arsenault via cfe-commits


@@ -0,0 +1,264 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu verde -emit-llvm 
-o - %s | FileCheck %s --check-prefixes=VERDE
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu tonga -emit-llvm 
-o - %s | FileCheck %s --check-prefixes=GFX8
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 
-emit-llvm -o - %s | FileCheck %s --check-prefixes=GFX11
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+typedef short v2i16 __attribute__((ext_vector_type(2)));
+typedef int v2i32 __attribute__((ext_vector_type(2)));
+typedef half v2f16 __attribute__((ext_vector_type(2)));
+typedef float v2f32 __attribute__((ext_vector_type(2)));
+typedef short v4i16 __attribute__((ext_vector_type(4)));
+typedef int v4i32 __attribute__((ext_vector_type(4)));
+typedef half v4f16 __attribute__((ext_vector_type(4)));
+typedef float v4f32 __attribute__((ext_vector_type(4)));
+
+// VERDE-LABEL: @test_amdgcn_raw_buffer_store_i8(
+// VERDE-NEXT:  entry:
+// VERDE-NEXT:tail call void @llvm.amdgcn.raw.buffer.store.i8(i8 
[[VDATA:%.*]], <4 x i32> [[RSRC:%.*]], i32 0, i32 0, i32 0)
+// VERDE-NEXT:ret void
+//
+// GFX8-LABEL: @test_amdgcn_raw_buffer_store_i8(
+// GFX8-NEXT:  entry:
+// GFX8-NEXT:tail call void @llvm.amdgcn.raw.buffer.store.i8(i8 
[[VDATA:%.*]], <4 x i32> [[RSRC:%.*]], i32 0, i32 0, i32 0)
+// GFX8-NEXT:ret void
+//
+// GFX11-LABEL: @test_amdgcn_raw_buffer_store_i8(
+// GFX11-NEXT:  entry:
+// GFX11-NEXT:tail call void @llvm.amdgcn.raw.buffer.store.i8(i8 
[[VDATA:%.*]], <4 x i32> [[RSRC:%.*]], i32 0, i32 0, i32 0)
+// GFX11-NEXT:ret void
+//
+void test_amdgcn_raw_buffer_store_i8(char vdata, v4i32 rsrc) {

arsenm wrote:

Look at the actual ImmArg> values. It's only the cachepolicy 
argument 

https://github.com/llvm/llvm-project/pull/94576
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [amdgpu] Pass variadic arguments without splitting (PR #94083)

2024-06-06 Thread Matt Arsenault via cfe-commits


@@ -0,0 +1,293 @@
+// REQUIRES: amdgpu-registered-target
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py 
UTC_ARGS: --function-signature
+// RUN: %clang_cc1 -cc1 -std=c23 -triple amdgcn-amd-amdhsa -emit-llvm -O1 %s 
-o - | FileCheck %s
+
+void sink_0(...);
+void sink_1(int, ...);
+void sink_2(double, int, ...);
+
+// Simple scalar values
+
+// CHECK-LABEL: define {{[^@]+}}@zero_varargs
+// CHECK-SAME: (i32 noundef [[F0:%.*]], double noundef [[F1:%.*]]) 
local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:tail call void (...) @sink_0() #[[ATTR2:[0-9]+]]
+// CHECK-NEXT:tail call void (i32, ...) @sink_1(i32 noundef [[F0]]) 
#[[ATTR2]]
+// CHECK-NEXT:tail call void (double, i32, ...) @sink_2(double noundef 
[[F1]], i32 noundef [[F0]]) #[[ATTR2]]
+// CHECK-NEXT:ret void
+//
+void zero_varargs(int f0, double f1)
+{
+  sink_0();
+  sink_1(f0);
+  sink_2(f1, f0);
+}
+
+// CHECK-LABEL: define {{[^@]+}}@one_i32
+// CHECK-SAME: (i32 noundef [[F0:%.*]], double noundef [[F1:%.*]], i32 noundef 
[[V0:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:tail call void (...) @sink_0(i32 noundef [[V0]]) #[[ATTR2]]
+// CHECK-NEXT:tail call void (i32, ...) @sink_1(i32 noundef [[F0]], i32 
noundef [[V0]]) #[[ATTR2]]
+// CHECK-NEXT:tail call void (double, i32, ...) @sink_2(double noundef 
[[F1]], i32 noundef [[F0]], i32 noundef [[V0]]) #[[ATTR2]]
+// CHECK-NEXT:ret void
+//
+void one_i32(int f0, double f1, int v0)
+{
+  sink_0(v0);
+  sink_1(f0, v0);
+  sink_2(f1, f0, v0);
+}
+
+// CHECK-LABEL: define {{[^@]+}}@one_ptr
+// CHECK-SAME: (i32 noundef [[F0:%.*]], double noundef [[F1:%.*]], ptr noundef 
[[V0:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:tail call void (...) @sink_0(ptr noundef [[V0]]) #[[ATTR2]]
+// CHECK-NEXT:tail call void (i32, ...) @sink_1(i32 noundef [[F0]], ptr 
noundef [[V0]]) #[[ATTR2]]
+// CHECK-NEXT:tail call void (double, i32, ...) @sink_2(double noundef 
[[F1]], i32 noundef [[F0]], ptr noundef [[V0]]) #[[ATTR2]]
+// CHECK-NEXT:ret void
+//
+void one_ptr(int f0, double f1, void* v0)
+{
+  sink_0(v0);
+  sink_1(f0, v0);
+  sink_2(f1, f0, v0);
+}
+
+// CHECK-LABEL: define {{[^@]+}}@one_f64
+// CHECK-SAME: (i32 noundef [[F0:%.*]], double noundef [[F1:%.*]], double 
noundef [[V0:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:tail call void (...) @sink_0(double noundef [[V0]]) 
#[[ATTR2]]
+// CHECK-NEXT:tail call void (i32, ...) @sink_1(i32 noundef [[F0]], double 
noundef [[V0]]) #[[ATTR2]]
+// CHECK-NEXT:tail call void (double, i32, ...) @sink_2(double noundef 
[[F1]], i32 noundef [[F0]], double noundef [[V0]]) #[[ATTR2]]
+// CHECK-NEXT:ret void
+//
+void one_f64(int f0, double f1, double v0)
+{
+  sink_0(v0);
+  sink_1(f0, v0);
+  sink_2(f1, f0, v0);
+}
+
+
+// C has various type promotion rules for variadics
+
+// CHECK-LABEL: define {{[^@]+}}@one_i8
+// CHECK-SAME: (i32 noundef [[F0:%.*]], double noundef [[F1:%.*]], i8 noundef 
signext [[V0:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:[[CONV:%.*]] = sext i8 [[V0]] to i32
+// CHECK-NEXT:tail call void (...) @sink_0(i32 noundef [[CONV]]) #[[ATTR2]]
+// CHECK-NEXT:tail call void (i32, ...) @sink_1(i32 noundef [[F0]], i32 
noundef [[CONV]]) #[[ATTR2]]
+// CHECK-NEXT:tail call void (double, i32, ...) @sink_2(double noundef 
[[F1]], i32 noundef [[F0]], i32 noundef [[CONV]]) #[[ATTR2]]
+// CHECK-NEXT:ret void
+//
+void one_i8(int f0, double f1, char v0)
+{
+  sink_0(v0);
+  sink_1(f0, v0);
+  sink_2(f1, f0, v0);
+}
+
+// CHECK-LABEL: define {{[^@]+}}@one_i16
+// CHECK-SAME: (i32 noundef [[F0:%.*]], double noundef [[F1:%.*]], i16 noundef 
signext [[V0:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:[[CONV:%.*]] = sext i16 [[V0]] to i32
+// CHECK-NEXT:tail call void (...) @sink_0(i32 noundef [[CONV]]) #[[ATTR2]]
+// CHECK-NEXT:tail call void (i32, ...) @sink_1(i32 noundef [[F0]], i32 
noundef [[CONV]]) #[[ATTR2]]
+// CHECK-NEXT:tail call void (double, i32, ...) @sink_2(double noundef 
[[F1]], i32 noundef [[F0]], i32 noundef [[CONV]]) #[[ATTR2]]
+// CHECK-NEXT:ret void
+//
+void one_i16(int f0, double f1, short v0)
+{
+  sink_0(v0);
+  sink_1(f0, v0);
+  sink_2(f1, f0, v0);
+}
+
+// CHECK-LABEL: define {{[^@]+}}@one_f32
+// CHECK-SAME: (i32 noundef [[F0:%.*]], double noundef [[F1:%.*]], float 
noundef [[V0:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:[[CONV:%.*]] = fpext float [[V0]] to double
+// CHECK-NEXT:tail call void (...) @sink_0(double noundef [[CONV]]) 
#[[ATTR2]]
+// CHECK-NEXT:tail call void (i32, ...) @sink_1(i32 noundef [[F0]], double 
noundef [[CONV]]) #[[ATTR2]]
+// CHECK-NEXT:tail call void (double, i32, ...) @sink_2(double noundef 
[[F1]], i32 noundef [[F0]], double noundef 

[clang] [amdgpu] Pass variadic arguments without splitting (PR #94083)

2024-06-06 Thread Matt Arsenault via cfe-commits

arsenm wrote:

> @arsenm You're right about passing larger things indirectly. I'm intending to 
> land this as-is, with the types inlined, as that unblocks #93362. I'm nervous 
> that the extra pointer indirection will hit the same memory error that 
> tweaking codegen in that patch hits (it's a similar sort of pattern to the 
> top level argument passing) and wish to postpone that until there is a 
> working baseline.

I do think we need to revisit some threshold at some point. We should use byref 
for anything that's most likely to hit the stack anyway 

https://github.com/llvm/llvm-project/pull/94083
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [llvm] [AMDGPU] Implement variadic functions by IR lowering (PR #93362)

2024-06-06 Thread Matt Arsenault via cfe-commits


@@ -0,0 +1,1037 @@
+//===-- ExpandVariadicsPass.cpp *- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+//
+// This is an optimization pass for variadic functions. If called from codegen,
+// it can serve as the implementation of variadic functions for a given target.
+//
+// The strategy is to turn the ... part of a variadic function into a va_list
+// and fix up the call sites. The majority of the pass is target independent.
+// The exceptions are the va_list type itself and the rules for where to store
+// variables in memory such that va_arg can iterate over them given a va_list.
+//
+// The majority of the plumbing is splitting the variadic function into a
+// single basic block that packs the variadic arguments into a va_list and
+// a second function that does the work of the original. That packing is
+// exactly what is done by va_start. Further, the transform from ... to va_list
+// replaced va_start with an operation to copy a va_list from the new argument,
+// which is exactly a va_copy. This is useful for reducing target-dependence.
+//
+// A va_list instance is a forward iterator, where the primary operation va_arg
+// is dereference-then-increment. This interface forces significant convergent
+// evolution between target specific implementations. The variation in runtime
+// data layout is limited to that representable by the iterator, parameterised
+// by the type passed to the va_arg instruction.
+//
+// Therefore the majority of the target specific subtlety is packing arguments
+// into a stack allocated buffer such that a va_list can be initialised with it
+// and the va_arg expansion for the target will find the arguments at runtime.
+//
+// The aggregate effect is to unblock other transforms, most critically the
+// general purpose inliner. Known calls to variadic functions become zero cost.
+//
+// Consistency with clang is primarily tested by emitting va_arg using clang
+// then expanding the variadic functions using this pass, followed by trying
+// to constant fold the functions to no-ops.
+//
+// Target specific behaviour is tested in IR - mainly checking that values are
+// put into positions in call frames that make sense for that particular 
target.
+//
+// There is one "clever" invariant in use. va_start intrinsics that are not
+// within a varidic functions are an error in the IR verifier. When this
+// transform moves blocks from a variadic function into a fixed arity one, it
+// moves va_start intrinsics along with everything else. That means that the
+// va_start intrinsics that need to be rewritten to use the trailing argument
+// are exactly those that are in non-variadic functions so no further state
+// is needed to distinguish those that need to be rewritten.
+//
+//===--===//
+
+#include "llvm/Transforms/IPO/ExpandVariadics.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Passes/OptimizationLevel.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/TargetParser/Triple.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+#define DEBUG_TYPE "expand-variadics"
+
+using namespace llvm;
+
+namespace {
+
+cl::opt ExpandVariadicsModeOption(
+DEBUG_TYPE "-override", cl::desc("Override the behaviour of " DEBUG_TYPE),
+cl::init(ExpandVariadicsMode::Unspecified),
+cl::values(clEnumValN(ExpandVariadicsMode::Unspecified, "unspecified",
+  "Use the implementation defaults"),
+   clEnumValN(ExpandVariadicsMode::Disable, "disable",
+  "Disable the pass entirely"),
+   clEnumValN(ExpandVariadicsMode::Optimize, "optimize",
+  "Optimise without changing ABI"),
+   clEnumValN(ExpandVariadicsMode::Lowering, "lowering",
+  "Change variadic calling convention")));
+
+bool commandLineOverride() {
+  return ExpandVariadicsModeOption != ExpandVariadicsMode::Unspecified;
+}
+
+// Instances of this class encapsulate the target-dependant behaviour as a
+// function of triple. Implementing a new ABI is adding a case to the switch
+// in create(llvm::Triple) at the end of this file.
+class VariadicABIInfo {
+protected:
+  VariadicABIInfo() {}
+
+public:
+  static std::unique_ptr create(llvm::Triple const );
+
+  // Allow overriding whether the pass runs on a per-target basis
+  virtual bool enableForTarget() = 0;
+
+  // 

[clang] [libc] [llvm] [AMDGPU] Implement variadic functions by IR lowering (PR #93362)

2024-06-06 Thread Matt Arsenault via cfe-commits


@@ -0,0 +1,1037 @@
+//===-- ExpandVariadicsPass.cpp *- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+//
+// This is an optimization pass for variadic functions. If called from codegen,
+// it can serve as the implementation of variadic functions for a given target.
+//
+// The strategy is to turn the ... part of a variadic function into a va_list
+// and fix up the call sites. The majority of the pass is target independent.
+// The exceptions are the va_list type itself and the rules for where to store
+// variables in memory such that va_arg can iterate over them given a va_list.
+//
+// The majority of the plumbing is splitting the variadic function into a
+// single basic block that packs the variadic arguments into a va_list and
+// a second function that does the work of the original. That packing is
+// exactly what is done by va_start. Further, the transform from ... to va_list
+// replaced va_start with an operation to copy a va_list from the new argument,
+// which is exactly a va_copy. This is useful for reducing target-dependence.
+//
+// A va_list instance is a forward iterator, where the primary operation va_arg
+// is dereference-then-increment. This interface forces significant convergent
+// evolution between target specific implementations. The variation in runtime
+// data layout is limited to that representable by the iterator, parameterised
+// by the type passed to the va_arg instruction.
+//
+// Therefore the majority of the target specific subtlety is packing arguments
+// into a stack allocated buffer such that a va_list can be initialised with it
+// and the va_arg expansion for the target will find the arguments at runtime.
+//
+// The aggregate effect is to unblock other transforms, most critically the
+// general purpose inliner. Known calls to variadic functions become zero cost.
+//
+// Consistency with clang is primarily tested by emitting va_arg using clang
+// then expanding the variadic functions using this pass, followed by trying
+// to constant fold the functions to no-ops.
+//
+// Target specific behaviour is tested in IR - mainly checking that values are
+// put into positions in call frames that make sense for that particular 
target.
+//
+// There is one "clever" invariant in use. va_start intrinsics that are not
+// within a varidic functions are an error in the IR verifier. When this
+// transform moves blocks from a variadic function into a fixed arity one, it
+// moves va_start intrinsics along with everything else. That means that the
+// va_start intrinsics that need to be rewritten to use the trailing argument
+// are exactly those that are in non-variadic functions so no further state
+// is needed to distinguish those that need to be rewritten.
+//
+//===--===//
+
+#include "llvm/Transforms/IPO/ExpandVariadics.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Passes/OptimizationLevel.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/TargetParser/Triple.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+#define DEBUG_TYPE "expand-variadics"
+
+using namespace llvm;
+
+namespace {
+
+cl::opt ExpandVariadicsModeOption(
+DEBUG_TYPE "-override", cl::desc("Override the behaviour of " DEBUG_TYPE),
+cl::init(ExpandVariadicsMode::Unspecified),
+cl::values(clEnumValN(ExpandVariadicsMode::Unspecified, "unspecified",
+  "Use the implementation defaults"),
+   clEnumValN(ExpandVariadicsMode::Disable, "disable",
+  "Disable the pass entirely"),
+   clEnumValN(ExpandVariadicsMode::Optimize, "optimize",
+  "Optimise without changing ABI"),
+   clEnumValN(ExpandVariadicsMode::Lowering, "lowering",
+  "Change variadic calling convention")));
+
+bool commandLineOverride() {
+  return ExpandVariadicsModeOption != ExpandVariadicsMode::Unspecified;
+}
+
+// Instances of this class encapsulate the target-dependant behaviour as a
+// function of triple. Implementing a new ABI is adding a case to the switch
+// in create(llvm::Triple) at the end of this file.
+class VariadicABIInfo {
+protected:
+  VariadicABIInfo() {}
+
+public:
+  static std::unique_ptr create(llvm::Triple const );

arsenm wrote:

const always to the left. East const is weird 


[clang] [Clang][AMDGPU] Add builtins for instrinsic `llvm.amdgcn.raw.buffer.store` (PR #94576)

2024-06-06 Thread Matt Arsenault via cfe-commits


@@ -0,0 +1,264 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu verde -emit-llvm 
-o - %s | FileCheck %s --check-prefixes=VERDE
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu tonga -emit-llvm 
-o - %s | FileCheck %s --check-prefixes=GFX8
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 
-emit-llvm -o - %s | FileCheck %s --check-prefixes=GFX11
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+typedef short v2i16 __attribute__((ext_vector_type(2)));
+typedef int v2i32 __attribute__((ext_vector_type(2)));
+typedef half v2f16 __attribute__((ext_vector_type(2)));
+typedef float v2f32 __attribute__((ext_vector_type(2)));
+typedef short v4i16 __attribute__((ext_vector_type(4)));
+typedef int v4i32 __attribute__((ext_vector_type(4)));
+typedef half v4f16 __attribute__((ext_vector_type(4)));
+typedef float v4f32 __attribute__((ext_vector_type(4)));
+
+// VERDE-LABEL: @test_amdgcn_raw_buffer_store_i8(
+// VERDE-NEXT:  entry:
+// VERDE-NEXT:tail call void @llvm.amdgcn.raw.buffer.store.i8(i8 
[[VDATA:%.*]], <4 x i32> [[RSRC:%.*]], i32 0, i32 0, i32 0)
+// VERDE-NEXT:ret void
+//
+// GFX8-LABEL: @test_amdgcn_raw_buffer_store_i8(
+// GFX8-NEXT:  entry:
+// GFX8-NEXT:tail call void @llvm.amdgcn.raw.buffer.store.i8(i8 
[[VDATA:%.*]], <4 x i32> [[RSRC:%.*]], i32 0, i32 0, i32 0)
+// GFX8-NEXT:ret void
+//
+// GFX11-LABEL: @test_amdgcn_raw_buffer_store_i8(
+// GFX11-NEXT:  entry:
+// GFX11-NEXT:tail call void @llvm.amdgcn.raw.buffer.store.i8(i8 
[[VDATA:%.*]], <4 x i32> [[RSRC:%.*]], i32 0, i32 0, i32 0)
+// GFX11-NEXT:ret void
+//
+void test_amdgcn_raw_buffer_store_i8(char vdata, v4i32 rsrc) {

arsenm wrote:

We are trying to move to using real pointers (i.e. use 
llvm.amdgcn.raw.ptr.buffer.store instead of llvm.amdgcn.raw.buffer.store) 

https://github.com/llvm/llvm-project/pull/94576
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][AMDGPU] Add builtins for instrinsic `llvm.amdgcn.raw.buffer.store` (PR #94576)

2024-06-06 Thread Matt Arsenault via cfe-commits

arsenm wrote:

> Is there really a good use case for this? Can you use regular stores to 
> addrspace(7) instead? @krzysz00

I see these regularly used via inline asm in various ML code. We need to expose 
these in some way to stop people from doing that 

> 
> Also, do you really need a separate builtin for every legal type, or is there 
> some way they can be type-overloaded?

Yes, I imagined we would handle images similar to the elementwise intrinsics. 
However, I don't think that approach works for loads. If we have to have 
overloads for loads, we probably should mirror it for stores.

I think it makes more sense to solve the issue for the load case before the 
stores. They're a bit more complicated because you have the sign vs. zero 
extended cases to consider, and the overload would be on the return type 

https://github.com/llvm/llvm-project/pull/94576
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang][AMDGPU] Use `I` to decorate imm argument for `__builtin_amdgcn_global_load_lds` (PR #94376)

2024-06-04 Thread Matt Arsenault via cfe-commits

https://github.com/arsenm commented:

Missing non-constant tests for each parameter?

https://github.com/llvm/llvm-project/pull/94376
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [amdgpu] Pass variadic arguments without splitting (PR #94083)

2024-06-01 Thread Matt Arsenault via cfe-commits


@@ -197,12 +202,20 @@ ABIArgInfo 
AMDGPUABIInfo::classifyKernelArgumentType(QualType Ty) const {
   return ABIArgInfo::getDirect(LTy, 0, nullptr, false);
 }
 
-ABIArgInfo AMDGPUABIInfo::classifyArgumentType(QualType Ty,
+ABIArgInfo AMDGPUABIInfo::classifyArgumentType(QualType Ty, bool Variadic,
unsigned ) const {
   assert(NumRegsLeft <= MaxNumRegsForArgsRet && "register estimate underflow");
 
   Ty = useFirstFieldIfTransparentUnion(Ty);
 
+  if (Variadic) {

arsenm wrote:

The bigger concern is using giant aggregate types is not great IR, and not all 
that well supported. a 65k element array will crash in SelectionDAG for example 

https://github.com/llvm/llvm-project/pull/94083
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [amdgpu] Pass variadic arguments without splitting (PR #94083)

2024-06-01 Thread Matt Arsenault via cfe-commits


@@ -0,0 +1,293 @@
+// REQUIRES: amdgpu-registered-target
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py 
UTC_ARGS: --function-signature
+// RUN: %clang_cc1 -cc1 -std=c23 -triple amdgcn-amd-amdhsa -emit-llvm -O1 %s 
-o - | FileCheck %s
+
+void sink_0(...);
+void sink_1(int, ...);
+void sink_2(double, int, ...);
+
+// Simple scalar values
+
+// CHECK-LABEL: define {{[^@]+}}@zero_varargs
+// CHECK-SAME: (i32 noundef [[F0:%.*]], double noundef [[F1:%.*]]) 
local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:tail call void (...) @sink_0() #[[ATTR2:[0-9]+]]
+// CHECK-NEXT:tail call void (i32, ...) @sink_1(i32 noundef [[F0]]) 
#[[ATTR2]]
+// CHECK-NEXT:tail call void (double, i32, ...) @sink_2(double noundef 
[[F1]], i32 noundef [[F0]]) #[[ATTR2]]
+// CHECK-NEXT:ret void
+//
+void zero_varargs(int f0, double f1)
+{
+  sink_0();
+  sink_1(f0);
+  sink_2(f1, f0);
+}
+
+// CHECK-LABEL: define {{[^@]+}}@one_i32
+// CHECK-SAME: (i32 noundef [[F0:%.*]], double noundef [[F1:%.*]], i32 noundef 
[[V0:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:tail call void (...) @sink_0(i32 noundef [[V0]]) #[[ATTR2]]
+// CHECK-NEXT:tail call void (i32, ...) @sink_1(i32 noundef [[F0]], i32 
noundef [[V0]]) #[[ATTR2]]
+// CHECK-NEXT:tail call void (double, i32, ...) @sink_2(double noundef 
[[F1]], i32 noundef [[F0]], i32 noundef [[V0]]) #[[ATTR2]]
+// CHECK-NEXT:ret void
+//
+void one_i32(int f0, double f1, int v0)
+{
+  sink_0(v0);
+  sink_1(f0, v0);
+  sink_2(f1, f0, v0);
+}
+
+// CHECK-LABEL: define {{[^@]+}}@one_ptr
+// CHECK-SAME: (i32 noundef [[F0:%.*]], double noundef [[F1:%.*]], ptr noundef 
[[V0:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:tail call void (...) @sink_0(ptr noundef [[V0]]) #[[ATTR2]]
+// CHECK-NEXT:tail call void (i32, ...) @sink_1(i32 noundef [[F0]], ptr 
noundef [[V0]]) #[[ATTR2]]
+// CHECK-NEXT:tail call void (double, i32, ...) @sink_2(double noundef 
[[F1]], i32 noundef [[F0]], ptr noundef [[V0]]) #[[ATTR2]]
+// CHECK-NEXT:ret void
+//
+void one_ptr(int f0, double f1, void* v0)
+{
+  sink_0(v0);
+  sink_1(f0, v0);
+  sink_2(f1, f0, v0);
+}
+
+// CHECK-LABEL: define {{[^@]+}}@one_f64
+// CHECK-SAME: (i32 noundef [[F0:%.*]], double noundef [[F1:%.*]], double 
noundef [[V0:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:tail call void (...) @sink_0(double noundef [[V0]]) 
#[[ATTR2]]
+// CHECK-NEXT:tail call void (i32, ...) @sink_1(i32 noundef [[F0]], double 
noundef [[V0]]) #[[ATTR2]]
+// CHECK-NEXT:tail call void (double, i32, ...) @sink_2(double noundef 
[[F1]], i32 noundef [[F0]], double noundef [[V0]]) #[[ATTR2]]
+// CHECK-NEXT:ret void
+//
+void one_f64(int f0, double f1, double v0)
+{
+  sink_0(v0);
+  sink_1(f0, v0);
+  sink_2(f1, f0, v0);
+}
+
+
+// C has various type promotion rules for variadics
+
+// CHECK-LABEL: define {{[^@]+}}@one_i8
+// CHECK-SAME: (i32 noundef [[F0:%.*]], double noundef [[F1:%.*]], i8 noundef 
signext [[V0:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:[[CONV:%.*]] = sext i8 [[V0]] to i32
+// CHECK-NEXT:tail call void (...) @sink_0(i32 noundef [[CONV]]) #[[ATTR2]]
+// CHECK-NEXT:tail call void (i32, ...) @sink_1(i32 noundef [[F0]], i32 
noundef [[CONV]]) #[[ATTR2]]
+// CHECK-NEXT:tail call void (double, i32, ...) @sink_2(double noundef 
[[F1]], i32 noundef [[F0]], i32 noundef [[CONV]]) #[[ATTR2]]
+// CHECK-NEXT:ret void
+//
+void one_i8(int f0, double f1, char v0)
+{
+  sink_0(v0);
+  sink_1(f0, v0);
+  sink_2(f1, f0, v0);
+}
+
+// CHECK-LABEL: define {{[^@]+}}@one_i16
+// CHECK-SAME: (i32 noundef [[F0:%.*]], double noundef [[F1:%.*]], i16 noundef 
signext [[V0:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:[[CONV:%.*]] = sext i16 [[V0]] to i32
+// CHECK-NEXT:tail call void (...) @sink_0(i32 noundef [[CONV]]) #[[ATTR2]]
+// CHECK-NEXT:tail call void (i32, ...) @sink_1(i32 noundef [[F0]], i32 
noundef [[CONV]]) #[[ATTR2]]
+// CHECK-NEXT:tail call void (double, i32, ...) @sink_2(double noundef 
[[F1]], i32 noundef [[F0]], i32 noundef [[CONV]]) #[[ATTR2]]
+// CHECK-NEXT:ret void
+//
+void one_i16(int f0, double f1, short v0)
+{
+  sink_0(v0);
+  sink_1(f0, v0);
+  sink_2(f1, f0, v0);
+}
+
+// CHECK-LABEL: define {{[^@]+}}@one_f32
+// CHECK-SAME: (i32 noundef [[F0:%.*]], double noundef [[F1:%.*]], float 
noundef [[V0:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:[[CONV:%.*]] = fpext float [[V0]] to double
+// CHECK-NEXT:tail call void (...) @sink_0(double noundef [[CONV]]) 
#[[ATTR2]]
+// CHECK-NEXT:tail call void (i32, ...) @sink_1(i32 noundef [[F0]], double 
noundef [[CONV]]) #[[ATTR2]]
+// CHECK-NEXT:tail call void (double, i32, ...) @sink_2(double noundef 
[[F1]], i32 noundef [[F0]], double noundef 

[clang] [amdgpu] Pass variadic arguments without splitting (PR #94083)

2024-06-01 Thread Matt Arsenault via cfe-commits


@@ -197,12 +202,20 @@ ABIArgInfo 
AMDGPUABIInfo::classifyKernelArgumentType(QualType Ty) const {
   return ABIArgInfo::getDirect(LTy, 0, nullptr, false);
 }
 
-ABIArgInfo AMDGPUABIInfo::classifyArgumentType(QualType Ty,
+ABIArgInfo AMDGPUABIInfo::classifyArgumentType(QualType Ty, bool Variadic,
unsigned ) const {
   assert(NumRegsLeft <= MaxNumRegsForArgsRet && "register estimate underflow");
 
   Ty = useFirstFieldIfTransparentUnion(Ty);
 
+  if (Variadic) {

arsenm wrote:

Wouldn't we still want to follow the isAggregateTypeForABI rules? Large structs 
should still go through byref? 

https://github.com/llvm/llvm-project/pull/94083
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] Intrinsic: introduce minimumnum and maximumnum (PR #93841)

2024-05-31 Thread Matt Arsenault via cfe-commits


@@ -32,27 +32,29 @@ class StoreInst;
 
 /// These are the kinds of recurrences that we support.
 enum class RecurKind {
-  None, ///< Not a recurrence.
-  Add,  ///< Sum of integers.
-  Mul,  ///< Product of integers.
-  Or,   ///< Bitwise or logical OR of integers.
-  And,  ///< Bitwise or logical AND of integers.
-  Xor,  ///< Bitwise or logical XOR of integers.
-  SMin, ///< Signed integer min implemented in terms of select(cmp()).
-  SMax, ///< Signed integer max implemented in terms of select(cmp()).
-  UMin, ///< Unsigned integer min implemented in terms of select(cmp()).
-  UMax, ///< Unsigned integer max implemented in terms of select(cmp()).
-  FAdd, ///< Sum of floats.
-  FMul, ///< Product of floats.
-  FMin, ///< FP min implemented in terms of select(cmp()).
-  FMax, ///< FP max implemented in terms of select(cmp()).
-  FMinimum, ///< FP min with llvm.minimum semantics
-  FMaximum, ///< FP max with llvm.maximum semantics
-  FMulAdd,  ///< Sum of float products with llvm.fmuladd(a * b + sum).
-  IAnyOf,   ///< Any_of reduction with select(icmp(),x,y) where one of (x,y) is
-///< loop invariant, and both x and y are integer type.
-  FAnyOf///< Any_of reduction with select(fcmp(),x,y) where one of (x,y) is
-///< loop invariant, and both x and y are integer type.
+  None,///< Not a recurrence.
+  Add, ///< Sum of integers.
+  Mul, ///< Product of integers.
+  Or,  ///< Bitwise or logical OR of integers.
+  And, ///< Bitwise or logical AND of integers.
+  Xor, ///< Bitwise or logical XOR of integers.
+  SMin,///< Signed integer min implemented in terms of select(cmp()).
+  SMax,///< Signed integer max implemented in terms of select(cmp()).
+  UMin,///< Unsigned integer min implemented in terms of select(cmp()).
+  UMax,///< Unsigned integer max implemented in terms of select(cmp()).
+  FAdd,///< Sum of floats.
+  FMul,///< Product of floats.
+  FMin,///< FP min implemented in terms of select(cmp()).
+  FMax,///< FP max implemented in terms of select(cmp()).
+  FMinimum,///< FP min with llvm.minimum semantics
+  FMaximum,///< FP max with llvm.maximum semantics
+  FMinimumnum, ///< FP min with llvm.minimumnum semantics
+  FMaximumnum, ///< FP max with llvm.maximumnum semantics

arsenm wrote:

Not sure this is tested, but updating each optimization should be split into a 
separate change 

https://github.com/llvm/llvm-project/pull/93841
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Extend readlane, writelane and readfirstlane intrinsic lowering for generic types (PR #89217)

2024-05-31 Thread Matt Arsenault via cfe-commits

arsenm wrote:

You should add the mentioned convergence-tokens.ll test function 

https://github.com/llvm/llvm-project/pull/89217
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [clang][CodeGen] Global constructors/destructors are globals (PR #93914)

2024-05-31 Thread Matt Arsenault via cfe-commits

arsenm wrote:

> Perhaps an alternative is to tweak LangRef wording to say that that these are 
> always emitted as unqualified ptrs, and that their ephemeral nature implies 
> that their AS is meaningless?

I think this is the correct way to handle it. Also we'll need a few 
stripPointerCasts added somewhere 




https://github.com/llvm/llvm-project/pull/93914
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Implement variadic functions by IR lowering (PR #93362)

2024-05-31 Thread Matt Arsenault via cfe-commits


@@ -0,0 +1,1023 @@
+//===-- ExpandVariadicsPass.cpp *- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+//
+// This is an optimization pass for variadic functions. If called from codegen,
+// it can serve as the implementation of variadic functions for a given target.
+//
+// The strategy is to turn the ... part of a variadic function into a va_list
+// and fix up the call sites. The majority of the pass is target independent.
+// The exceptions are the va_list type itself and the rules for where to store
+// variables in memory such that va_arg can iterate over them given a va_list.
+//
+// The majority of the plumbing is splitting the variadic function into a
+// single basic block that packs the variadic arguments into a va_list and
+// a second function that does the work of the original. That packing is
+// exactly what is done by va_start. Further, the transform from ... to va_list
+// replaced va_start with an operation to copy a va_list from the new argument,
+// which is exactly a va_copy. This is useful for reducing target-dependence.
+//
+// A va_list instance is a forward iterator, where the primary operation va_arg
+// is dereference-then-increment. This interface forces significant convergent
+// evolution between target specific implementations. The variation in runtime
+// data layout is limited to that representable by the iterator, parameterised
+// by the type passed to the va_arg instruction.
+//
+// Therefore the majority of the target specific subtlety is packing arguments
+// into a stack allocated buffer such that a va_list can be initialised with it
+// and the va_arg expansion for the target will find the arguments at runtime.
+//
+// The aggregate effect is to unblock other transforms, most critically the
+// general purpose inliner. Known calls to variadic functions become zero cost.
+//
+// Consistency with clang is primarily tested by emitting va_arg using clang
+// then expanding the variadic functions using this pass, followed by trying
+// to constant fold the functions to no-ops.
+//
+// Target specific behaviour is tested in IR - mainly checking that values are
+// put into positions in call frames that make sense for that particular 
target.
+//
+// There is one "clever" invariant in use. va_start intrinsics that are not
+// within a varidic functions are an error in the IR verifier. When this
+// transform moves blocks from a variadic function into a fixed arity one, it
+// moves va_start intrinsics along with everything else. That means that the
+// va_start intrinsics that need to be rewritten to use the trailing argument
+// are exactly those that are in non-variadic functions so no further state
+// is needed to distinguish those that need to be rewritten.
+//
+//===--===//
+
+#include "llvm/Transforms/IPO/ExpandVariadics.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Passes/OptimizationLevel.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/TargetParser/Triple.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+#define DEBUG_TYPE "expand-variadics"
+
+using namespace llvm;
+
+namespace {
+
+cl::opt ExpandVariadicsModeOption(
+DEBUG_TYPE "-override", cl::desc("Override the behaviour of " DEBUG_TYPE),
+cl::init(ExpandVariadicsMode::Unspecified),
+cl::values(clEnumValN(ExpandVariadicsMode::Unspecified, "unspecified",
+  "Use the implementation defaults"),
+   clEnumValN(ExpandVariadicsMode::Disable, "disable",
+  "Disable the pass entirely"),
+   clEnumValN(ExpandVariadicsMode::Optimize, "optimize",
+  "Optimise without changing ABI"),
+   clEnumValN(ExpandVariadicsMode::Lowering, "lowering",
+  "Change variadic calling convention")));
+
+bool commandLineOverride() {
+  return ExpandVariadicsModeOption != ExpandVariadicsMode::Unspecified;
+}
+
+// Instances of this class encapsulate the target-dependant behaviour as a
+// function of triple. Implementing a new ABI is adding a case to the switch
+// in create(llvm::Triple) at the end of this file.
+class VariadicABIInfo {
+protected:
+  VariadicABIInfo() {}
+
+public:
+  static std::unique_ptr create(llvm::Triple const );
+
+  // Allow overriding whether the pass runs on a per-target basis
+  virtual bool enableForTarget() = 0;
+
+  // 

[clang] [llvm] [AMDGPU] Implement variadic functions by IR lowering (PR #93362)

2024-05-31 Thread Matt Arsenault via cfe-commits


@@ -0,0 +1,1023 @@
+//===-- ExpandVariadicsPass.cpp *- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+//
+// This is an optimization pass for variadic functions. If called from codegen,
+// it can serve as the implementation of variadic functions for a given target.
+//
+// The strategy is to turn the ... part of a variadic function into a va_list
+// and fix up the call sites. The majority of the pass is target independent.
+// The exceptions are the va_list type itself and the rules for where to store
+// variables in memory such that va_arg can iterate over them given a va_list.
+//
+// The majority of the plumbing is splitting the variadic function into a
+// single basic block that packs the variadic arguments into a va_list and
+// a second function that does the work of the original. That packing is
+// exactly what is done by va_start. Further, the transform from ... to va_list
+// replaced va_start with an operation to copy a va_list from the new argument,
+// which is exactly a va_copy. This is useful for reducing target-dependence.
+//
+// A va_list instance is a forward iterator, where the primary operation va_arg
+// is dereference-then-increment. This interface forces significant convergent
+// evolution between target specific implementations. The variation in runtime
+// data layout is limited to that representable by the iterator, parameterised
+// by the type passed to the va_arg instruction.
+//
+// Therefore the majority of the target specific subtlety is packing arguments
+// into a stack allocated buffer such that a va_list can be initialised with it
+// and the va_arg expansion for the target will find the arguments at runtime.
+//
+// The aggregate effect is to unblock other transforms, most critically the
+// general purpose inliner. Known calls to variadic functions become zero cost.
+//
+// Consistency with clang is primarily tested by emitting va_arg using clang
+// then expanding the variadic functions using this pass, followed by trying
+// to constant fold the functions to no-ops.
+//
+// Target specific behaviour is tested in IR - mainly checking that values are
+// put into positions in call frames that make sense for that particular 
target.
+//
+// There is one "clever" invariant in use. va_start intrinsics that are not
+// within a varidic functions are an error in the IR verifier. When this
+// transform moves blocks from a variadic function into a fixed arity one, it
+// moves va_start intrinsics along with everything else. That means that the
+// va_start intrinsics that need to be rewritten to use the trailing argument
+// are exactly those that are in non-variadic functions so no further state
+// is needed to distinguish those that need to be rewritten.
+//
+//===--===//
+
+#include "llvm/Transforms/IPO/ExpandVariadics.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Passes/OptimizationLevel.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/TargetParser/Triple.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+#define DEBUG_TYPE "expand-variadics"
+
+using namespace llvm;
+
+namespace {
+
+cl::opt ExpandVariadicsModeOption(
+DEBUG_TYPE "-override", cl::desc("Override the behaviour of " DEBUG_TYPE),
+cl::init(ExpandVariadicsMode::Unspecified),
+cl::values(clEnumValN(ExpandVariadicsMode::Unspecified, "unspecified",
+  "Use the implementation defaults"),
+   clEnumValN(ExpandVariadicsMode::Disable, "disable",
+  "Disable the pass entirely"),
+   clEnumValN(ExpandVariadicsMode::Optimize, "optimize",
+  "Optimise without changing ABI"),
+   clEnumValN(ExpandVariadicsMode::Lowering, "lowering",
+  "Change variadic calling convention")));
+
+bool commandLineOverride() {
+  return ExpandVariadicsModeOption != ExpandVariadicsMode::Unspecified;
+}
+
+// Instances of this class encapsulate the target-dependant behaviour as a
+// function of triple. Implementing a new ABI is adding a case to the switch
+// in create(llvm::Triple) at the end of this file.
+class VariadicABIInfo {
+protected:
+  VariadicABIInfo() {}
+
+public:
+  static std::unique_ptr create(llvm::Triple const );
+
+  // Allow overriding whether the pass runs on a per-target basis
+  virtual bool enableForTarget() = 0;
+
+  // 

[clang] [llvm] [clang][CodeGen] Global constructors/destructors are globals (PR #93914)

2024-05-31 Thread Matt Arsenault via cfe-commits

arsenm wrote:

> The third argument here is like for llvm.used, it's a way to associate the 
> entry with a global or function. If the corresponding global or function is 
> omitted from the output then the entry will be removed. It isn't used for 
> anything at run time. So I think there should be a consistent story between 
> llvm.used and llvm.global_[cd]tors.

I briefly skimmed the codegen and as far as I could tell an addrspacecast 
constant expression in global_ctors won't do the right thing, we probably 
should fix that too 

https://github.com/llvm/llvm-project/pull/93914
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [clang][CodeGen] `used` globals && the payloads for global ctors & dtors are globals (PR #93601)

2024-05-31 Thread Matt Arsenault via cfe-commits


@@ -2928,12 +2928,13 @@ static void emitUsed(CodeGenModule , StringRef Name,
   for (unsigned i = 0, e = List.size(); i != e; ++i) {
 UsedArray[i] =
 llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
-cast(&*List[i]), CGM.Int8PtrTy);

arsenm wrote:

You aren't emitting a real global, it's this fake special case IR construct. It 
doesn't fit into the boxes provided by the target global/program/alloca address 
spaces 

https://github.com/llvm/llvm-project/pull/93601
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [WIP] Expand variadic functions in IR (PR #89007)

2024-05-31 Thread Matt Arsenault via cfe-commits

arsenm wrote:

> I think the comments here are fed into #93362 successfully, will go through 
> the list again to check.

So #93362 is the replacement, and not the sequential next piece? Can we close 
this one then? 

https://github.com/llvm/llvm-project/pull/89007
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] Intrinsic: introduce minimumnum and maximumnum (PR #93841)

2024-05-31 Thread Matt Arsenault via cfe-commits


@@ -5005,8 +5007,11 @@ void computeKnownFPClass(const Value *V, const APInt 
,
   // If either operand is not NaN, the result is not NaN.
   if (NeverNaN && (IID == Intrinsic::minnum || IID == Intrinsic::maxnum))
 Known.knownNot(fcNan);
+  if (NeverNaN &&
+  (IID == Intrinsic::minimumnum || IID == Intrinsic::maximumnum))
+Known.knownNot(fcNan);
 
-  if (IID == Intrinsic::maxnum) {
+  if (IID == Intrinsic::maxnum || IID == Intrinsic::maximumnum) {

arsenm wrote:

Best to keep the value tracking handling in a separate PR. This is missing test 
coverage 

https://github.com/llvm/llvm-project/pull/93841
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] Intrinsic: introduce minimumnum and maximumnum (PR #93841)

2024-05-31 Thread Matt Arsenault via cfe-commits


@@ -16049,6 +16094,84 @@ of the two arguments. -0.0 is considered to be less 
than +0.0 for this
 intrinsic. Note that these are the semantics specified in the draft of
 IEEE 754-2019.
 
+.. _i_minimumnum:
+
+'``llvm.minimumnum.*``' Intrinsic
+^
+
+Syntax:
+"""
+
+This is an overloaded intrinsic. You can use ``llvm.minimumnum`` on any
+floating-point or vector of floating-point type. Not all targets support
+all types however.
+
+::
+
+  declare float @llvm.minimumnum.f32(float %Val0, float %Val1)
+  declare double@llvm.minimumnum.f64(double %Val0, double %Val1)
+  declare x86_fp80  @llvm.minimumnum.f80(x86_fp80 %Val0, x86_fp80 %Val1)
+  declare fp128 @llvm.minimumnum.f128(fp128 %Val0, fp128 %Val1)
+  declare ppc_fp128 @llvm.minimumnum.ppcf128(ppc_fp128 %Val0, ppc_fp128 
%Val1)
+
+Overview:
+"
+
+The '``llvm.minimumnum.*``' intrinsics return the minimum of the two
+arguments, not propagating NaNs and treating -0.0 as less than +0.0.
+
+
+Arguments:
+""
+
+The arguments and return value are floating-point numbers of the same
+type.
+
+Semantics:
+""
+If both operands are NaNs, returns qNaN. Otherwise returns the lesser
+of the two arguments. -0.0 is considered to be less than +0.0 for this
+intrinsic. Note that these are the semantics specified in IEEE 754-2019.

arsenm wrote:

Needs to spell out the signaling nan behavior. If we're fixing minnum's snan 
behavior to match IEEE, this is identical except with the stronger guarantee 
for signed zero ordering. The documentation should also explicitly state this 
is the only difference, to help reduce confusion. Alternatively, we could add 
an immediate bool parameter to minnum/maxnum for whether the ordering of 0 is 
guaranteed 

I hate the naming mess we've ended up with here, but I guess C23 has damned us. 
If you're going to match the C23 names, this should be `llvm.minimum.num` with 
an extra _ 

https://github.com/llvm/llvm-project/pull/93841
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] Intrinsic: introduce minimumnum and maximumnum (PR #93841)

2024-05-31 Thread Matt Arsenault via cfe-commits

https://github.com/arsenm edited https://github.com/llvm/llvm-project/pull/93841
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] Intrinsic: introduce minimumnum and maximumnum (PR #93841)

2024-05-31 Thread Matt Arsenault via cfe-commits


@@ -3636,6 +3648,22 @@ def Fmin : FPMathTemplate, LibBuiltin<"math.h"> {
   let OnlyBuiltinPrefixedAliasIsConstexpr = 1;
 }
 
+def FmaximumNum : FPMathTemplate, LibBuiltin<"math.h"> {

arsenm wrote:

I'd prefer to split the clang changes into a separate change 

https://github.com/llvm/llvm-project/pull/93841
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] Intrinsic: introduce minimumnum and maximumnum (PR #93841)

2024-05-31 Thread Matt Arsenault via cfe-commits

https://github.com/arsenm commented:

> 3. PowerPC: has some interaction with the behavior of `minnum/maxnum`: need 
> define `fcanonicalize`.

AMDGPU has the same handling. This is to break the signaling nan handling from 
IEEE to the broken old glibc libm behavior. If we fix the definition to match 
IEEE, this is no longer necessary and the operation is directly legal 

https://github.com/llvm/llvm-project/pull/93841
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Extend readlane, writelane and readfirstlane intrinsic lowering for generic types (PR #89217)

2024-05-31 Thread Matt Arsenault via cfe-commits

arsenm wrote:

> Does this need IR autoupgrade?

This type of auto upgrade is free, it just happens 

https://github.com/llvm/llvm-project/pull/89217
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Extend readlane, writelane and readfirstlane intrinsic lowering for generic types (PR #89217)

2024-05-31 Thread Matt Arsenault via cfe-commits

https://github.com/arsenm approved this pull request.


https://github.com/llvm/llvm-project/pull/89217
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Extend readlane, writelane and readfirstlane intrinsic lowering for generic types (PR #89217)

2024-05-30 Thread Matt Arsenault via cfe-commits


@@ -5461,8 +5461,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper 
,
 
   SmallVector PartialRes;
   unsigned NumParts = Size / 32;
-  MachineInstrBuilder Src0Parts, Src2Parts;
-  Src0Parts = B.buildUnmerge(PartialResTy, Src0);
+  MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0), 
Src2Parts;

arsenm wrote:

Don't also declare Src2Parts on the same line 

https://github.com/llvm/llvm-project/pull/89217
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Extend readlane, writelane and readfirstlane intrinsic lowering for generic types (PR #89217)

2024-05-30 Thread Matt Arsenault via cfe-commits


@@ -1208,7 +1225,7 @@ The AMDGPU backend implements the following LLVM IR 
intrinsics.
the output.
 
   llvm.amdgcn.sdot2Provides direct access to 
v_dot2_i32_i16 across targets which
-   support such instructions. 
This performs signed dot product
+   upport such instructions. 
This performs signed dot product

arsenm wrote:

Stray typo introduced 

https://github.com/llvm/llvm-project/pull/89217
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Extend readlane, writelane and readfirstlane intrinsic lowering for generic types (PR #89217)

2024-05-30 Thread Matt Arsenault via cfe-commits


@@ -5387,6 +5387,98 @@ bool 
AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper ,
   return true;
 }
 
+// TODO: Fix pointer type handling
+bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper ,
+ MachineInstr ,
+ Intrinsic::ID IID) const {
+
+  MachineIRBuilder  = Helper.MIRBuilder;
+  MachineRegisterInfo  = *B.getMRI();
+
+  auto createLaneOp = [, ](Register Src0, Register Src1, Register Src2,
+ LLT VT) -> Register {
+auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
+switch (IID) {
+case Intrinsic::amdgcn_readfirstlane:
+  return LaneOp.getReg(0);
+case Intrinsic::amdgcn_readlane:
+  return LaneOp.addUse(Src1).getReg(0);
+case Intrinsic::amdgcn_writelane:
+  return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
+default:
+  llvm_unreachable("unhandled lane op");
+}
+  };
+
+  Register DstReg = MI.getOperand(0).getReg();
+  Register Src0 = MI.getOperand(2).getReg();
+  Register Src1, Src2;
+  if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane) 
{
+Src1 = MI.getOperand(3).getReg();
+if (IID == Intrinsic::amdgcn_writelane) {
+  Src2 = MI.getOperand(4).getReg();
+}
+  }
+
+  LLT Ty = MRI.getType(DstReg);
+  unsigned Size = Ty.getSizeInBits();
+
+  if (Size == 32) {
+// Already legal
+return true;
+  }
+
+  if (Size < 32) {
+Src0 = B.buildAnyExt(S32, Src0).getReg(0);
+if (Src2.isValid())
+  Src2 = B.buildAnyExt(LLT::scalar(32), Src2).getReg(0);
+
+Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
+B.buildTrunc(DstReg, LaneOpDst);
+
+MI.eraseFromParent();
+return true;
+  }
+
+  if (Size % 32 != 0)
+return false;
+
+  LLT PartialResTy = S32;
+  if (Ty.isVector()) {
+LLT EltTy = Ty.getElementType();
+switch (EltTy.getSizeInBits()) {
+case 16:
+  PartialResTy = Ty.changeElementCount(ElementCount::getFixed(2));
+  break;
+case 32:
+  PartialResTy = EltTy;
+  break;
+default:
+  // Handle all other cases via S32 pieces;
+  break;
+}
+  }
+
+  SmallVector PartialRes;
+  unsigned NumParts = Size / 32;
+  MachineInstrBuilder Src0Parts, Src2Parts;
+  Src0Parts = B.buildUnmerge(PartialResTy, Src0);

arsenm wrote:

Fold declare + initialize 

https://github.com/llvm/llvm-project/pull/89217
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


  1   2   3   4   5   6   7   8   9   10   >