[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-11-11 Thread LLVM Continuous Integration via cfe-commits

llvm-ci wrote:

LLVM Buildbot has detected a new failure on builder `clang-solaris11-sparcv9` 
running on `solaris11-sparcv9` while building `clang` at step 5 "ninja check 1".

Full details are available at: 
https://lab.llvm.org/buildbot/#/builders/13/builds/3473


Here is the relevant piece of the build log for the reference

```
Step 5 (ninja check 1) failure: stage 1 checked (failure)
 TEST 'Clang :: Headers/gpuintrin_lang.c' FAILED 

Exit Code: 1

Command Output (stderr):
--
RUN: at line 2: 
/opt/llvm-buildbot/home/solaris11-sparcv9/clang-solaris11-sparcv9/stage1/bin/clang
 -cc1 -internal-isystem 
/opt/llvm-buildbot/home/solaris11-sparcv9/clang-solaris11-sparcv9/stage1/lib/clang/20/include
 -nostdsysteminc -internal-isystem 
/opt/llvm-buildbot/home/solaris11-sparcv9/clang-solaris11-sparcv9/llvm/clang/test/Headers/Inputs/include
-internal-isystem 
/opt/llvm-buildbot/home/solaris11-sparcv9/clang-solaris11-sparcv9/llvm/clang/test/Headers/../../lib/Headers/cuda_wrappers
-internal-isystem 
/opt/llvm-buildbot/home/solaris11-sparcv9/clang-solaris11-sparcv9/llvm/clang/test/Headers/../../lib/Headers/
-fcuda-is-device -triple nvptx64 -emit-llvm 
/opt/llvm-buildbot/home/solaris11-sparcv9/clang-solaris11-sparcv9/llvm/clang/test/Headers/gpuintrin_lang.c
 -o -  | 
/opt/llvm-buildbot/home/solaris11-sparcv9/clang-solaris11-sparcv9/stage1/bin/FileCheck
 
/opt/llvm-buildbot/home/solaris11-sparcv9/clang-solaris11-sparcv9/llvm/clang/test/Headers/gpuintrin_lang.c
 --check-prefix=CUDA
+ 
/opt/llvm-buildbot/home/solaris11-sparcv9/clang-solaris11-sparcv9/stage1/bin/clang
 -cc1 -internal-isystem 
/opt/llvm-buildbot/home/solaris11-sparcv9/clang-solaris11-sparcv9/stage1/lib/clang/20/include
 -nostdsysteminc -internal-isystem 
/opt/llvm-buildbot/home/solaris11-sparcv9/clang-solaris11-sparcv9/llvm/clang/test/Headers/Inputs/include
 -internal-isystem 
/opt/llvm-buildbot/home/solaris11-sparcv9/clang-solaris11-sparcv9/llvm/clang/test/Headers/../../lib/Headers/cuda_wrappers
 -internal-isystem 
/opt/llvm-buildbot/home/solaris11-sparcv9/clang-solaris11-sparcv9/llvm/clang/test/Headers/../../lib/Headers/
 -fcuda-is-device -triple nvptx64 -emit-llvm 
/opt/llvm-buildbot/home/solaris11-sparcv9/clang-solaris11-sparcv9/llvm/clang/test/Headers/gpuintrin_lang.c
 -o -
+ 
/opt/llvm-buildbot/home/solaris11-sparcv9/clang-solaris11-sparcv9/stage1/bin/FileCheck
 
/opt/llvm-buildbot/home/solaris11-sparcv9/clang-solaris11-sparcv9/llvm/clang/test/Headers/gpuintrin_lang.c
 --check-prefix=CUDA
/opt/llvm-buildbot/home/solaris11-sparcv9/clang-solaris11-sparcv9/llvm/clang/test/Headers/gpuintrin_lang.c:39:15:
 error: CUDA-NEXT: expected string not found in input
// CUDA-NEXT: [[TMP0:%.*]] = call range(i32 0, 1024) i32 
@llvm.nvvm.read.ptx.sreg.tid.x()
  ^
:8:7: note: scanning from here
entry:
  ^
:9:2: note: possible intended match here
 %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 ^

Input file: 
Check file: 
/opt/llvm-buildbot/home/solaris11-sparcv9/clang-solaris11-sparcv9/llvm/clang/test/Headers/gpuintrin_lang.c

-dump-input=help explains the following input dump.

Input was:
<<
   1: ; ModuleID = 
'/opt/llvm-buildbot/home/solaris11-sparcv9/clang-solaris11-sparcv9/llvm/clang/test/Headers/gpuintrin_lang.c'
 
   2: source_filename = 
"/opt/llvm-buildbot/home/solaris11-sparcv9/clang-solaris11-sparcv9/llvm/clang/test/Headers/gpuintrin_lang.c"
 
   3: target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" 
   4: target triple = "nvptx64" 
   5:  
   6: ; Function Attrs: convergent noinline nounwind optnone 
   7: define dso_local i32 @foo() #0 { 
   8: entry: 
next:39'0   X error: no match found
   9:  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 
next:39'0 
next:39'1  ?   possible 
intended match
  10:  ret i32 %0 
next:39'0 
  11: } 
next:39'0 ~~
  12:  
next:39'0 ~
  13: ; Function Attrs: nocallback nofree nosync nounwind speculatable 
willreturn memory(none) 
next:39'0 
~
  14: declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 
next:39'0 
   .
   .
   .
...

```



https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-11-11 Thread LLVM Continuous Integration via cfe-commits

llvm-ci wrote:

LLVM Buildbot has detected a new failure on builder `clang-hexagon-elf` running 
on `hexagon-build-03` while building `clang` at step 5 "ninja check 1".

Full details are available at: 
https://lab.llvm.org/buildbot/#/builders/40/builds/2750


Here is the relevant piece of the build log for the reference

```
Step 5 (ninja check 1) failure: stage 1 checked (failure)
 TEST 'Clang :: Headers/gpuintrin_lang.c' FAILED 

Exit Code: 1

Command Output (stderr):
--
RUN: at line 2: 
/local/mnt/workspace/bots/hexagon-build-03/clang-hexagon-elf/stage1/bin/clang 
-cc1 -internal-isystem 
/local/mnt/workspace/bots/hexagon-build-03/clang-hexagon-elf/stage1/lib/clang/20/include
 -nostdsysteminc -internal-isystem 
/local/mnt/workspace/bots/hexagon-build-03/clang-hexagon-elf/llvm/clang/test/Headers/Inputs/include
-internal-isystem 
/local/mnt/workspace/bots/hexagon-build-03/clang-hexagon-elf/llvm/clang/test/Headers/../../lib/Headers/cuda_wrappers
-internal-isystem 
/local/mnt/workspace/bots/hexagon-build-03/clang-hexagon-elf/llvm/clang/test/Headers/../../lib/Headers/
-fcuda-is-device -triple nvptx64 -emit-llvm 
/local/mnt/workspace/bots/hexagon-build-03/clang-hexagon-elf/llvm/clang/test/Headers/gpuintrin_lang.c
 -o -  | 
/local/mnt/workspace/bots/hexagon-build-03/clang-hexagon-elf/stage1/bin/FileCheck
 
/local/mnt/workspace/bots/hexagon-build-03/clang-hexagon-elf/llvm/clang/test/Headers/gpuintrin_lang.c
 --check-prefix=CUDA
+ /local/mnt/workspace/bots/hexagon-build-03/clang-hexagon-elf/stage1/bin/clang 
-cc1 -internal-isystem 
/local/mnt/workspace/bots/hexagon-build-03/clang-hexagon-elf/stage1/lib/clang/20/include
 -nostdsysteminc -internal-isystem 
/local/mnt/workspace/bots/hexagon-build-03/clang-hexagon-elf/llvm/clang/test/Headers/Inputs/include
 -internal-isystem 
/local/mnt/workspace/bots/hexagon-build-03/clang-hexagon-elf/llvm/clang/test/Headers/../../lib/Headers/cuda_wrappers
 -internal-isystem 
/local/mnt/workspace/bots/hexagon-build-03/clang-hexagon-elf/llvm/clang/test/Headers/../../lib/Headers/
 -fcuda-is-device -triple nvptx64 -emit-llvm 
/local/mnt/workspace/bots/hexagon-build-03/clang-hexagon-elf/llvm/clang/test/Headers/gpuintrin_lang.c
 -o -
+ 
/local/mnt/workspace/bots/hexagon-build-03/clang-hexagon-elf/stage1/bin/FileCheck
 
/local/mnt/workspace/bots/hexagon-build-03/clang-hexagon-elf/llvm/clang/test/Headers/gpuintrin_lang.c
 --check-prefix=CUDA
/local/mnt/workspace/bots/hexagon-build-03/clang-hexagon-elf/llvm/clang/test/Headers/gpuintrin_lang.c:39:15:
 error: CUDA-NEXT: expected string not found in input
// CUDA-NEXT: [[TMP0:%.*]] = call range(i32 0, 1024) i32 
@llvm.nvvm.read.ptx.sreg.tid.x()
  ^
:8:7: note: scanning from here
entry:
  ^
:9:2: note: possible intended match here
 %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 ^

Input file: 
Check file: 
/local/mnt/workspace/bots/hexagon-build-03/clang-hexagon-elf/llvm/clang/test/Headers/gpuintrin_lang.c

-dump-input=help explains the following input dump.

Input was:
<<
   1: ; ModuleID = 
'/local/mnt/workspace/bots/hexagon-build-03/clang-hexagon-elf/llvm/clang/test/Headers/gpuintrin_lang.c'
 
   2: source_filename = 
"/local/mnt/workspace/bots/hexagon-build-03/clang-hexagon-elf/llvm/clang/test/Headers/gpuintrin_lang.c"
 
   3: target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" 
   4: target triple = "nvptx64" 
   5:  
   6: ; Function Attrs: convergent noinline nounwind optnone 
   7: define dso_local i32 @foo() #0 { 
   8: entry: 
next:39'0   X error: no match found
   9:  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 
next:39'0 
next:39'1  ?   possible 
intended match
  10:  ret i32 %0 
next:39'0 
  11: } 
next:39'0 ~~
  12:  
next:39'0 ~
  13: ; Function Attrs: nocallback nofree nosync nounwind speculatable 
willreturn memory(none) 
next:39'0 
~
  14: declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 
next:39'0 
   .
   .
   .
...

```



https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-11-11 Thread LLVM Continuous Integration via cfe-commits

llvm-ci wrote:

LLVM Buildbot has detected a new failure on builder `arc-builder` running on 
`arc-worker` while building `clang` at step 6 
"test-build-unified-tree-check-all".

Full details are available at: 
https://lab.llvm.org/buildbot/#/builders/3/builds/7462


Here is the relevant piece of the build log for the reference

```
Step 6 (test-build-unified-tree-check-all) failure: test (failure)
 TEST 'Clang :: Headers/gpuintrin_lang.c' FAILED 

Exit Code: 1

Command Output (stderr):
--
RUN: at line 2: /buildbot/worker/arc-folder/build/bin/clang -cc1 
-internal-isystem /buildbot/worker/arc-folder/build/lib/clang/20/include 
-nostdsysteminc -internal-isystem 
/buildbot/worker/arc-folder/llvm-project/clang/test/Headers/Inputs/include
-internal-isystem 
/buildbot/worker/arc-folder/llvm-project/clang/test/Headers/../../lib/Headers/cuda_wrappers
-internal-isystem 
/buildbot/worker/arc-folder/llvm-project/clang/test/Headers/../../lib/Headers/  
  -fcuda-is-device -triple nvptx64 -emit-llvm 
/buildbot/worker/arc-folder/llvm-project/clang/test/Headers/gpuintrin_lang.c -o 
-  | /buildbot/worker/arc-folder/build/bin/FileCheck 
/buildbot/worker/arc-folder/llvm-project/clang/test/Headers/gpuintrin_lang.c 
--check-prefix=CUDA
+ /buildbot/worker/arc-folder/build/bin/FileCheck 
/buildbot/worker/arc-folder/llvm-project/clang/test/Headers/gpuintrin_lang.c 
--check-prefix=CUDA
+ /buildbot/worker/arc-folder/build/bin/clang -cc1 -internal-isystem 
/buildbot/worker/arc-folder/build/lib/clang/20/include -nostdsysteminc 
-internal-isystem 
/buildbot/worker/arc-folder/llvm-project/clang/test/Headers/Inputs/include 
-internal-isystem 
/buildbot/worker/arc-folder/llvm-project/clang/test/Headers/../../lib/Headers/cuda_wrappers
 -internal-isystem 
/buildbot/worker/arc-folder/llvm-project/clang/test/Headers/../../lib/Headers/ 
-fcuda-is-device -triple nvptx64 -emit-llvm 
/buildbot/worker/arc-folder/llvm-project/clang/test/Headers/gpuintrin_lang.c -o 
-
/buildbot/worker/arc-folder/llvm-project/clang/test/Headers/gpuintrin_lang.c:39:15:
 error: CUDA-NEXT: expected string not found in input
// CUDA-NEXT: [[TMP0:%.*]] = call range(i32 0, 1024) i32 
@llvm.nvvm.read.ptx.sreg.tid.x()
  ^
:8:7: note: scanning from here
entry:
  ^
:9:2: note: possible intended match here
 %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 ^

Input file: 
Check file: 
/buildbot/worker/arc-folder/llvm-project/clang/test/Headers/gpuintrin_lang.c

-dump-input=help explains the following input dump.

Input was:
<<
   1: ; ModuleID = 
'/buildbot/worker/arc-folder/llvm-project/clang/test/Headers/gpuintrin_lang.c' 
   2: source_filename = 
"/buildbot/worker/arc-folder/llvm-project/clang/test/Headers/gpuintrin_lang.c" 
   3: target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" 
   4: target triple = "nvptx64" 
   5:  
   6: ; Function Attrs: convergent noinline nounwind optnone 
   7: define dso_local i32 @foo() #0 { 
   8: entry: 
next:39'0   X error: no match found
   9:  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 
next:39'0 
next:39'1  ?   possible 
intended match
  10:  ret i32 %0 
next:39'0 
  11: } 
next:39'0 ~~
  12:  
next:39'0 ~
  13: ; Function Attrs: nocallback nofree nosync nounwind speculatable 
willreturn memory(none) 
next:39'0 
~
  14: declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 
next:39'0 
   .
   .
   .
...

```



https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-11-11 Thread LLVM Continuous Integration via cfe-commits

llvm-ci wrote:

LLVM Buildbot has detected a new failure on builder `llvm-clang-x86_64-darwin` 
running on `doug-worker-3` while building `clang` at step 6 
"test-build-unified-tree-check-all".

Full details are available at: 
https://lab.llvm.org/buildbot/#/builders/23/builds/4705


Here is the relevant piece of the build log for the reference

```
Step 6 (test-build-unified-tree-check-all) failure: test (failure)
 TEST 'Clang :: Headers/gpuintrin_lang.c' FAILED 

Exit Code: 1

Command Output (stderr):
--
RUN: at line 2: /Volumes/RAMDisk/buildbot-root/x86_64-darwin/build/bin/clang 
-cc1 -internal-isystem 
/Volumes/RAMDisk/buildbot-root/x86_64-darwin/build/lib/clang/20/include 
-nostdsysteminc -internal-isystem 
/Volumes/RAMDisk/buildbot-root/x86_64-darwin/llvm-project/clang/test/Headers/Inputs/include
-internal-isystem 
/Volumes/RAMDisk/buildbot-root/x86_64-darwin/llvm-project/clang/test/Headers/../../lib/Headers/cuda_wrappers
-internal-isystem 
/Volumes/RAMDisk/buildbot-root/x86_64-darwin/llvm-project/clang/test/Headers/../../lib/Headers/
-fcuda-is-device -triple nvptx64 -emit-llvm 
/Volumes/RAMDisk/buildbot-root/x86_64-darwin/llvm-project/clang/test/Headers/gpuintrin_lang.c
 -o -  | /Volumes/RAMDisk/buildbot-root/x86_64-darwin/build/bin/FileCheck 
/Volumes/RAMDisk/buildbot-root/x86_64-darwin/llvm-project/clang/test/Headers/gpuintrin_lang.c
 --check-prefix=CUDA
+ /Volumes/RAMDisk/buildbot-root/x86_64-darwin/build/bin/clang -cc1 
-internal-isystem 
/Volumes/RAMDisk/buildbot-root/x86_64-darwin/build/lib/clang/20/include 
-nostdsysteminc -internal-isystem 
/Volumes/RAMDisk/buildbot-root/x86_64-darwin/llvm-project/clang/test/Headers/Inputs/include
 -internal-isystem 
/Volumes/RAMDisk/buildbot-root/x86_64-darwin/llvm-project/clang/test/Headers/../../lib/Headers/cuda_wrappers
 -internal-isystem 
/Volumes/RAMDisk/buildbot-root/x86_64-darwin/llvm-project/clang/test/Headers/../../lib/Headers/
 -fcuda-is-device -triple nvptx64 -emit-llvm 
/Volumes/RAMDisk/buildbot-root/x86_64-darwin/llvm-project/clang/test/Headers/gpuintrin_lang.c
 -o -
+ /Volumes/RAMDisk/buildbot-root/x86_64-darwin/build/bin/FileCheck 
/Volumes/RAMDisk/buildbot-root/x86_64-darwin/llvm-project/clang/test/Headers/gpuintrin_lang.c
 --check-prefix=CUDA
/Volumes/RAMDisk/buildbot-root/x86_64-darwin/llvm-project/clang/test/Headers/gpuintrin_lang.c:39:15:
 error: CUDA-NEXT: expected string not found in input
// CUDA-NEXT: [[TMP0:%.*]] = call range(i32 0, 1024) i32 
@llvm.nvvm.read.ptx.sreg.tid.x()
  ^
:8:7: note: scanning from here
entry:
  ^
:9:2: note: possible intended match here
 %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 ^

Input file: 
Check file: 
/Volumes/RAMDisk/buildbot-root/x86_64-darwin/llvm-project/clang/test/Headers/gpuintrin_lang.c

-dump-input=help explains the following input dump.

Input was:
<<
   1: ; ModuleID = 
'/Volumes/RAMDisk/buildbot-root/x86_64-darwin/llvm-project/clang/test/Headers/gpuintrin_lang.c'
 
   2: source_filename = 
"/Volumes/RAMDisk/buildbot-root/x86_64-darwin/llvm-project/clang/test/Headers/gpuintrin_lang.c"
 
   3: target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" 
   4: target triple = "nvptx64" 
   5:  
   6: ; Function Attrs: convergent noinline nounwind optnone 
   7: define dso_local i32 @foo() #0 { 
   8: entry: 
next:39'0   X error: no match found
   9:  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 
next:39'0 
next:39'1  ?   possible 
intended match
  10:  ret i32 %0 
next:39'0 
  11: } 
next:39'0 ~~
  12:  
next:39'0 ~
  13: ; Function Attrs: nocallback nofree nosync nounwind speculatable 
willreturn memory(none) 
next:39'0 
~
  14: declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 
next:39'0 
   .
   .
   .
...

```



https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-11-11 Thread LLVM Continuous Integration via cfe-commits

llvm-ci wrote:

LLVM Buildbot has detected a new failure on builder `clang-m68k-linux-cross` 
running on `suse-gary-m68k-cross` while building `clang` at step 5 "ninja check 
1".

Full details are available at: 
https://lab.llvm.org/buildbot/#/builders/27/builds/1868


Here is the relevant piece of the build log for the reference

```
Step 5 (ninja check 1) failure: stage 1 checked (failure)
 TEST 'Clang :: Headers/gpuintrin_lang.c' FAILED 

Exit Code: 1

Command Output (stderr):
--
RUN: at line 2: 
/var/lib/buildbot/workers/suse-gary-m68k-cross/clang-m68k-linux-cross/stage1/bin/clang
 -cc1 -internal-isystem 
/var/lib/buildbot/workers/suse-gary-m68k-cross/clang-m68k-linux-cross/stage1/lib/clang/20/include
 -nostdsysteminc -internal-isystem 
/var/lib/buildbot/workers/suse-gary-m68k-cross/clang-m68k-linux-cross/llvm/clang/test/Headers/Inputs/include
-internal-isystem 
/var/lib/buildbot/workers/suse-gary-m68k-cross/clang-m68k-linux-cross/llvm/clang/test/Headers/../../lib/Headers/cuda_wrappers
-internal-isystem 
/var/lib/buildbot/workers/suse-gary-m68k-cross/clang-m68k-linux-cross/llvm/clang/test/Headers/../../lib/Headers/
-fcuda-is-device -triple nvptx64 -emit-llvm 
/var/lib/buildbot/workers/suse-gary-m68k-cross/clang-m68k-linux-cross/llvm/clang/test/Headers/gpuintrin_lang.c
 -o -  | 
/var/lib/buildbot/workers/suse-gary-m68k-cross/clang-m68k-linux-cross/stage1/bin/FileCheck
 
/var/lib/buildbot/workers/suse-gary-m68k-cross/clang-m68k-linux-cross/llvm/clang/test/Headers/gpuintrin_lang.c
 --check-prefix=CUDA
+ 
/var/lib/buildbot/workers/suse-gary-m68k-cross/clang-m68k-linux-cross/stage1/bin/clang
 -cc1 -internal-isystem 
/var/lib/buildbot/workers/suse-gary-m68k-cross/clang-m68k-linux-cross/stage1/lib/clang/20/include
 -nostdsysteminc -internal-isystem 
/var/lib/buildbot/workers/suse-gary-m68k-cross/clang-m68k-linux-cross/llvm/clang/test/Headers/Inputs/include
 -internal-isystem 
/var/lib/buildbot/workers/suse-gary-m68k-cross/clang-m68k-linux-cross/llvm/clang/test/Headers/../../lib/Headers/cuda_wrappers
 -internal-isystem 
/var/lib/buildbot/workers/suse-gary-m68k-cross/clang-m68k-linux-cross/llvm/clang/test/Headers/../../lib/Headers/
 -fcuda-is-device -triple nvptx64 -emit-llvm 
/var/lib/buildbot/workers/suse-gary-m68k-cross/clang-m68k-linux-cross/llvm/clang/test/Headers/gpuintrin_lang.c
 -o -
+ 
/var/lib/buildbot/workers/suse-gary-m68k-cross/clang-m68k-linux-cross/stage1/bin/FileCheck
 
/var/lib/buildbot/workers/suse-gary-m68k-cross/clang-m68k-linux-cross/llvm/clang/test/Headers/gpuintrin_lang.c
 --check-prefix=CUDA
/var/lib/buildbot/workers/suse-gary-m68k-cross/clang-m68k-linux-cross/llvm/clang/test/Headers/gpuintrin_lang.c:39:15:
 error: CUDA-NEXT: expected string not found in input
// CUDA-NEXT: [[TMP0:%.*]] = call range(i32 0, 1024) i32 
@llvm.nvvm.read.ptx.sreg.tid.x()
  ^
:8:7: note: scanning from here
entry:
  ^
:9:2: note: possible intended match here
 %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 ^

Input file: 
Check file: 
/var/lib/buildbot/workers/suse-gary-m68k-cross/clang-m68k-linux-cross/llvm/clang/test/Headers/gpuintrin_lang.c

-dump-input=help explains the following input dump.

Input was:
<<
   1: ; ModuleID = 
'/var/lib/buildbot/workers/suse-gary-m68k-cross/clang-m68k-linux-cross/llvm/clang/test/Headers/gpuintrin_lang.c'
 
   2: source_filename = 
"/var/lib/buildbot/workers/suse-gary-m68k-cross/clang-m68k-linux-cross/llvm/clang/test/Headers/gpuintrin_lang.c"
 
   3: target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" 
   4: target triple = "nvptx64" 
   5:  
   6: ; Function Attrs: convergent noinline nounwind optnone 
   7: define dso_local i32 @foo() #0 { 
   8: entry: 
next:39'0   X error: no match found
   9:  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 
next:39'0 
next:39'1  ?   possible 
intended match
  10:  ret i32 %0 
next:39'0 
  11: } 
next:39'0 ~~
  12:  
next:39'0 ~
  13: ; Function Attrs: nocallback nofree nosync nounwind speculatable 
willreturn memory(none) 
next:39'0 
~
  14: declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 
next:39'0 
   .
   .
   .
...

```



https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-11-11 Thread LLVM Continuous Integration via cfe-commits

llvm-ci wrote:

LLVM Buildbot has detected a new failure on builder 
`clang-cmake-x86_64-avx512-linux` running on `avx512-intel64` while building 
`clang` at step 7 "ninja check 1".

Full details are available at: 
https://lab.llvm.org/buildbot/#/builders/133/builds/6621


Here is the relevant piece of the build log for the reference

```
Step 7 (ninja check 1) failure: stage 1 checked (failure)
 TEST 'Clang :: Headers/gpuintrin_lang.c' FAILED 

Exit Code: 1

Command Output (stderr):
--
RUN: at line 2: 
/localdisk2/buildbot/llvm-worker/clang-cmake-x86_64-avx512-linux/stage1/bin/clang
 -cc1 -internal-isystem 
/localdisk2/buildbot/llvm-worker/clang-cmake-x86_64-avx512-linux/stage1/lib/clang/20/include
 -nostdsysteminc -internal-isystem 
/localdisk2/buildbot/llvm-worker/clang-cmake-x86_64-avx512-linux/llvm/clang/test/Headers/Inputs/include
-internal-isystem 
/localdisk2/buildbot/llvm-worker/clang-cmake-x86_64-avx512-linux/llvm/clang/test/Headers/../../lib/Headers/cuda_wrappers
-internal-isystem 
/localdisk2/buildbot/llvm-worker/clang-cmake-x86_64-avx512-linux/llvm/clang/test/Headers/../../lib/Headers/
-fcuda-is-device -triple nvptx64 -emit-llvm 
/localdisk2/buildbot/llvm-worker/clang-cmake-x86_64-avx512-linux/llvm/clang/test/Headers/gpuintrin_lang.c
 -o -  | 
/localdisk2/buildbot/llvm-worker/clang-cmake-x86_64-avx512-linux/stage1/bin/FileCheck
 
/localdisk2/buildbot/llvm-worker/clang-cmake-x86_64-avx512-linux/llvm/clang/test/Headers/gpuintrin_lang.c
 --check-prefix=CUDA
+ 
/localdisk2/buildbot/llvm-worker/clang-cmake-x86_64-avx512-linux/stage1/bin/clang
 -cc1 -internal-isystem 
/localdisk2/buildbot/llvm-worker/clang-cmake-x86_64-avx512-linux/stage1/lib/clang/20/include
 -nostdsysteminc -internal-isystem 
/localdisk2/buildbot/llvm-worker/clang-cmake-x86_64-avx512-linux/llvm/clang/test/Headers/Inputs/include
 -internal-isystem 
/localdisk2/buildbot/llvm-worker/clang-cmake-x86_64-avx512-linux/llvm/clang/test/Headers/../../lib/Headers/cuda_wrappers
 -internal-isystem 
/localdisk2/buildbot/llvm-worker/clang-cmake-x86_64-avx512-linux/llvm/clang/test/Headers/../../lib/Headers/
 -fcuda-is-device -triple nvptx64 -emit-llvm 
/localdisk2/buildbot/llvm-worker/clang-cmake-x86_64-avx512-linux/llvm/clang/test/Headers/gpuintrin_lang.c
 -o -
+ 
/localdisk2/buildbot/llvm-worker/clang-cmake-x86_64-avx512-linux/stage1/bin/FileCheck
 
/localdisk2/buildbot/llvm-worker/clang-cmake-x86_64-avx512-linux/llvm/clang/test/Headers/gpuintrin_lang.c
 --check-prefix=CUDA
/localdisk2/buildbot/llvm-worker/clang-cmake-x86_64-avx512-linux/llvm/clang/test/Headers/gpuintrin_lang.c:39:15:
 error: CUDA-NEXT: expected string not found in input
// CUDA-NEXT: [[TMP0:%.*]] = call range(i32 0, 1024) i32 
@llvm.nvvm.read.ptx.sreg.tid.x()
  ^
:8:7: note: scanning from here
entry:
  ^
:9:2: note: possible intended match here
 %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 ^

Input file: 
Check file: 
/localdisk2/buildbot/llvm-worker/clang-cmake-x86_64-avx512-linux/llvm/clang/test/Headers/gpuintrin_lang.c

-dump-input=help explains the following input dump.

Input was:
<<
   1: ; ModuleID = 
'/localdisk2/buildbot/llvm-worker/clang-cmake-x86_64-avx512-linux/llvm/clang/test/Headers/gpuintrin_lang.c'
 
   2: source_filename = 
"/localdisk2/buildbot/llvm-worker/clang-cmake-x86_64-avx512-linux/llvm/clang/test/Headers/gpuintrin_lang.c"
 
   3: target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" 
   4: target triple = "nvptx64" 
   5:  
   6: ; Function Attrs: convergent noinline nounwind optnone 
   7: define dso_local i32 @foo() #0 { 
   8: entry: 
next:39'0   X error: no match found
   9:  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 
next:39'0 
next:39'1  ?   possible 
intended match
  10:  ret i32 %0 
next:39'0 
  11: } 
next:39'0 ~~
  12:  
next:39'0 ~
  13: ; Function Attrs: nocallback nofree nosync nounwind speculatable 
willreturn memory(none) 
next:39'0 
~
  14: declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 
next:39'0 
   .
   .
   .
...

```



https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-11-11 Thread LLVM Continuous Integration via cfe-commits

llvm-ci wrote:

LLVM Buildbot has detected a new failure on builder `clang-aarch64-quick` 
running on `linaro-clang-aarch64-quick` while building `clang` at step 5 "ninja 
check 1".

Full details are available at: 
https://lab.llvm.org/buildbot/#/builders/65/builds/7589


Here is the relevant piece of the build log for the reference

```
Step 5 (ninja check 1) failure: stage 1 checked (failure)
 TEST 'Clang :: Headers/gpuintrin_lang.c' FAILED 

Exit Code: 1

Command Output (stderr):
--
RUN: at line 2: /home/tcwg-buildbot/worker/clang-aarch64-quick/stage1/bin/clang 
-cc1 -internal-isystem 
/home/tcwg-buildbot/worker/clang-aarch64-quick/stage1/lib/clang/20/include 
-nostdsysteminc -internal-isystem 
/home/tcwg-buildbot/worker/clang-aarch64-quick/llvm/clang/test/Headers/Inputs/include
-internal-isystem 
/home/tcwg-buildbot/worker/clang-aarch64-quick/llvm/clang/test/Headers/../../lib/Headers/cuda_wrappers
-internal-isystem 
/home/tcwg-buildbot/worker/clang-aarch64-quick/llvm/clang/test/Headers/../../lib/Headers/
-fcuda-is-device -triple nvptx64 -emit-llvm 
/home/tcwg-buildbot/worker/clang-aarch64-quick/llvm/clang/test/Headers/gpuintrin_lang.c
 -o -  | /home/tcwg-buildbot/worker/clang-aarch64-quick/stage1/bin/FileCheck 
/home/tcwg-buildbot/worker/clang-aarch64-quick/llvm/clang/test/Headers/gpuintrin_lang.c
 --check-prefix=CUDA
+ /home/tcwg-buildbot/worker/clang-aarch64-quick/stage1/bin/clang -cc1 
-internal-isystem 
/home/tcwg-buildbot/worker/clang-aarch64-quick/stage1/lib/clang/20/include 
-nostdsysteminc -internal-isystem 
/home/tcwg-buildbot/worker/clang-aarch64-quick/llvm/clang/test/Headers/Inputs/include
 -internal-isystem 
/home/tcwg-buildbot/worker/clang-aarch64-quick/llvm/clang/test/Headers/../../lib/Headers/cuda_wrappers
 -internal-isystem 
/home/tcwg-buildbot/worker/clang-aarch64-quick/llvm/clang/test/Headers/../../lib/Headers/
 -fcuda-is-device -triple nvptx64 -emit-llvm 
/home/tcwg-buildbot/worker/clang-aarch64-quick/llvm/clang/test/Headers/gpuintrin_lang.c
 -o -
+ /home/tcwg-buildbot/worker/clang-aarch64-quick/stage1/bin/FileCheck 
/home/tcwg-buildbot/worker/clang-aarch64-quick/llvm/clang/test/Headers/gpuintrin_lang.c
 --check-prefix=CUDA
/home/tcwg-buildbot/worker/clang-aarch64-quick/llvm/clang/test/Headers/gpuintrin_lang.c:39:15:
 error: CUDA-NEXT: expected string not found in input
// CUDA-NEXT: [[TMP0:%.*]] = call range(i32 0, 1024) i32 
@llvm.nvvm.read.ptx.sreg.tid.x()
  ^
:8:7: note: scanning from here
entry:
  ^
:9:2: note: possible intended match here
 %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 ^

Input file: 
Check file: 
/home/tcwg-buildbot/worker/clang-aarch64-quick/llvm/clang/test/Headers/gpuintrin_lang.c

-dump-input=help explains the following input dump.

Input was:
<<
   1: ; ModuleID = 
'/home/tcwg-buildbot/worker/clang-aarch64-quick/llvm/clang/test/Headers/gpuintrin_lang.c'
 
   2: source_filename = 
"/home/tcwg-buildbot/worker/clang-aarch64-quick/llvm/clang/test/Headers/gpuintrin_lang.c"
 
   3: target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" 
   4: target triple = "nvptx64" 
   5:  
   6: ; Function Attrs: convergent noinline nounwind optnone 
   7: define dso_local i32 @foo() #0 { 
   8: entry: 
next:39'0   X error: no match found
   9:  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 
next:39'0 
next:39'1  ?   possible 
intended match
  10:  ret i32 %0 
next:39'0 
  11: } 
next:39'0 ~~
  12:  
next:39'0 ~
  13: ; Function Attrs: nocallback nofree nosync nounwind speculatable 
willreturn memory(none) 
next:39'0 
~
  14: declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 
next:39'0 
   .
   .
   .
...

```



https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-11-11 Thread LLVM Continuous Integration via cfe-commits

llvm-ci wrote:

LLVM Buildbot has detected a new failure on builder `clang-armv8-quick` running 
on `linaro-clang-armv8-quick` while building `clang` at step 5 "ninja check 1".

Full details are available at: 
https://lab.llvm.org/buildbot/#/builders/154/builds/7298


Here is the relevant piece of the build log for the reference

```
Step 5 (ninja check 1) failure: stage 1 checked (failure)
 TEST 'Clang :: Headers/gpuintrin_lang.c' FAILED 

Exit Code: 1

Command Output (stderr):
--
RUN: at line 2: /home/tcwg-buildbot/worker/clang-armv8-quick/stage1/bin/clang 
-cc1 -internal-isystem 
/home/tcwg-buildbot/worker/clang-armv8-quick/stage1/lib/clang/20/include 
-nostdsysteminc -internal-isystem 
/home/tcwg-buildbot/worker/clang-armv8-quick/llvm/clang/test/Headers/Inputs/include
-internal-isystem 
/home/tcwg-buildbot/worker/clang-armv8-quick/llvm/clang/test/Headers/../../lib/Headers/cuda_wrappers
-internal-isystem 
/home/tcwg-buildbot/worker/clang-armv8-quick/llvm/clang/test/Headers/../../lib/Headers/
-fcuda-is-device -triple nvptx64 -emit-llvm 
/home/tcwg-buildbot/worker/clang-armv8-quick/llvm/clang/test/Headers/gpuintrin_lang.c
 -o -  | /home/tcwg-buildbot/worker/clang-armv8-quick/stage1/bin/FileCheck 
/home/tcwg-buildbot/worker/clang-armv8-quick/llvm/clang/test/Headers/gpuintrin_lang.c
 --check-prefix=CUDA
+ /home/tcwg-buildbot/worker/clang-armv8-quick/stage1/bin/clang -cc1 
-internal-isystem 
/home/tcwg-buildbot/worker/clang-armv8-quick/stage1/lib/clang/20/include 
-nostdsysteminc -internal-isystem 
/home/tcwg-buildbot/worker/clang-armv8-quick/llvm/clang/test/Headers/Inputs/include
 -internal-isystem 
/home/tcwg-buildbot/worker/clang-armv8-quick/llvm/clang/test/Headers/../../lib/Headers/cuda_wrappers
 -internal-isystem 
/home/tcwg-buildbot/worker/clang-armv8-quick/llvm/clang/test/Headers/../../lib/Headers/
 -fcuda-is-device -triple nvptx64 -emit-llvm 
/home/tcwg-buildbot/worker/clang-armv8-quick/llvm/clang/test/Headers/gpuintrin_lang.c
 -o -
+ /home/tcwg-buildbot/worker/clang-armv8-quick/stage1/bin/FileCheck 
/home/tcwg-buildbot/worker/clang-armv8-quick/llvm/clang/test/Headers/gpuintrin_lang.c
 --check-prefix=CUDA
/home/tcwg-buildbot/worker/clang-armv8-quick/llvm/clang/test/Headers/gpuintrin_lang.c:39:15:
 error: CUDA-NEXT: expected string not found in input
// CUDA-NEXT: [[TMP0:%.*]] = call range(i32 0, 1024) i32 
@llvm.nvvm.read.ptx.sreg.tid.x()
  ^
:8:7: note: scanning from here
entry:
  ^
:9:2: note: possible intended match here
 %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 ^

Input file: 
Check file: 
/home/tcwg-buildbot/worker/clang-armv8-quick/llvm/clang/test/Headers/gpuintrin_lang.c

-dump-input=help explains the following input dump.

Input was:
<<
   1: ; ModuleID = 
'/home/tcwg-buildbot/worker/clang-armv8-quick/llvm/clang/test/Headers/gpuintrin_lang.c'
 
   2: source_filename = 
"/home/tcwg-buildbot/worker/clang-armv8-quick/llvm/clang/test/Headers/gpuintrin_lang.c"
 
   3: target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" 
   4: target triple = "nvptx64" 
   5:  
   6: ; Function Attrs: convergent noinline nounwind optnone 
   7: define dso_local i32 @foo() #0 { 
   8: entry: 
next:39'0   X error: no match found
   9:  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 
next:39'0 
next:39'1  ?   possible 
intended match
  10:  ret i32 %0 
next:39'0 
  11: } 
next:39'0 ~~
  12:  
next:39'0 ~
  13: ; Function Attrs: nocallback nofree nosync nounwind speculatable 
willreturn memory(none) 
next:39'0 
~
  14: declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 
next:39'0 
   .
   .
   .
...

```



https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-11-11 Thread Joseph Huber via cfe-commits

jhuber6 wrote:

Seems there's something *slightly* different from the autogenerated IR for the 
language test. I'll see if I can fix it.

https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-11-11 Thread LLVM Continuous Integration via cfe-commits

llvm-ci wrote:

LLVM Buildbot has detected a new failure on builder `llvm-clang-x86_64-sie-win` 
running on `sie-win-worker` while building `clang` at step 7 
"test-build-unified-tree-check-all".

Full details are available at: 
https://lab.llvm.org/buildbot/#/builders/46/builds/7704


Here is the relevant piece of the build log for the reference

```
Step 7 (test-build-unified-tree-check-all) failure: test (failure)
 TEST 'Clang :: Headers/gpuintrin_lang.c' FAILED 

Exit Code: 1

Command Output (stdout):
--
# RUN: at line 2
z:\b\llvm-clang-x86_64-sie-win\build\bin\clang.exe -cc1 -internal-isystem 
Z:\b\llvm-clang-x86_64-sie-win\build\lib\clang\20\include -nostdsysteminc 
-internal-isystem 
Z:\b\llvm-clang-x86_64-sie-win\llvm-project\clang\test\Headers/Inputs/include   
 -internal-isystem 
Z:\b\llvm-clang-x86_64-sie-win\llvm-project\clang\test\Headers/../../lib/Headers/cuda_wrappers
-internal-isystem 
Z:\b\llvm-clang-x86_64-sie-win\llvm-project\clang\test\Headers/../../lib/Headers/
-fcuda-is-device -triple nvptx64 -emit-llvm 
Z:\b\llvm-clang-x86_64-sie-win\llvm-project\clang\test\Headers\gpuintrin_lang.c 
-o -  | z:\b\llvm-clang-x86_64-sie-win\build\bin\filecheck.exe 
Z:\b\llvm-clang-x86_64-sie-win\llvm-project\clang\test\Headers\gpuintrin_lang.c 
--check-prefix=CUDA
# executed command: 'z:\b\llvm-clang-x86_64-sie-win\build\bin\clang.exe' -cc1 
-internal-isystem 'Z:\b\llvm-clang-x86_64-sie-win\build\lib\clang\20\include' 
-nostdsysteminc -internal-isystem 
'Z:\b\llvm-clang-x86_64-sie-win\llvm-project\clang\test\Headers/Inputs/include' 
-internal-isystem 
'Z:\b\llvm-clang-x86_64-sie-win\llvm-project\clang\test\Headers/../../lib/Headers/cuda_wrappers'
 -internal-isystem 
'Z:\b\llvm-clang-x86_64-sie-win\llvm-project\clang\test\Headers/../../lib/Headers/'
 -fcuda-is-device -triple nvptx64 -emit-llvm 
'Z:\b\llvm-clang-x86_64-sie-win\llvm-project\clang\test\Headers\gpuintrin_lang.c'
 -o -
# executed command: 'z:\b\llvm-clang-x86_64-sie-win\build\bin\filecheck.exe' 
'Z:\b\llvm-clang-x86_64-sie-win\llvm-project\clang\test\Headers\gpuintrin_lang.c'
 --check-prefix=CUDA
# .---command stderr
# | 
Z:\b\llvm-clang-x86_64-sie-win\llvm-project\clang\test\Headers\gpuintrin_lang.c:39:15:
 error: CUDA-NEXT: expected string not found in input
# | // CUDA-NEXT: [[TMP0:%.*]] = call range(i32 0, 1024) i32 
@llvm.nvvm.read.ptx.sreg.tid.x()
# |   ^
# | :8:7: note: scanning from 
here
# | entry:
# |   ^
# | :9:2: note: possible 
intended match here
# |  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
# |  ^
# | 
# | Input file: 
# | Check file: 
Z:\b\llvm-clang-x86_64-sie-win\llvm-project\clang\test\Headers\gpuintrin_lang.c
# | 
# | -dump-input=help explains the following input dump.
# | 
# | Input was:
# | <<
# | 1: ; ModuleID = 
'Z:\b\llvm-clang-x86_64-sie-win\llvm-project\clang\test\Headers\gpuintrin_lang.c'
 
# | 2: source_filename = 
"Z:\\b\\llvm-clang-x86_64-sie-win\\llvm-project\\clang\\test\\Headers\\gpuintrin_lang.c"
 
# | 3: target datalayout = 
"e-i64:64-i128:128-v16:16-v32:32-n16:32:64" 
# | 4: target triple = "nvptx64" 
# | 5:  
# | 6: ; Function Attrs: convergent 
noinline nounwind optnone 
# | 7: define dso_local i32 @foo() #0 
{ 
# | label:36'0 ^~
# | label:36'1 ^~
# | same:37'0^~
# | same:37'1   ^
captured var "ATTR0"
# | 8: 
entry: 
# | next:38'0  ^~
# | next:38'1  ^~  captured var "ENTRY"
# | next:39'0X error: no match found
# | 9:  %0 = call i32 
@llvm.nvvm.read.ptx.sreg.tid.x() 
# | next:39'0  
# | next:39'1   ? 
  possible intended match
# |    10:  ret i32 %0 
# | next:39'0  
# |    11: } 
# | next:39'0  ~~
# |    12:  
...

```



https://github.com/llvm/llvm-project/pull/1101

[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-11-11 Thread LLVM Continuous Integration via cfe-commits

llvm-ci wrote:

LLVM Buildbot has detected a new failure on builder `llvm-clang-aarch64-darwin` 
running on `doug-worker-5` while building `clang` at step 6 
"test-build-unified-tree-check-all".

Full details are available at: 
https://lab.llvm.org/buildbot/#/builders/190/builds/9263


Here is the relevant piece of the build log for the reference

```
Step 6 (test-build-unified-tree-check-all) failure: test (failure)
 TEST 'Clang :: Headers/gpuintrin_lang.c' FAILED 

Exit Code: 1

Command Output (stderr):
--
RUN: at line 2: /Users/buildbot/buildbot-root/aarch64-darwin/build/bin/clang 
-cc1 -internal-isystem 
/Users/buildbot/buildbot-root/aarch64-darwin/build/lib/clang/20/include 
-nostdsysteminc -internal-isystem 
/Users/buildbot/buildbot-root/aarch64-darwin/llvm-project/clang/test/Headers/Inputs/include
-internal-isystem 
/Users/buildbot/buildbot-root/aarch64-darwin/llvm-project/clang/test/Headers/../../lib/Headers/cuda_wrappers
-internal-isystem 
/Users/buildbot/buildbot-root/aarch64-darwin/llvm-project/clang/test/Headers/../../lib/Headers/
-fcuda-is-device -triple nvptx64 -emit-llvm 
/Users/buildbot/buildbot-root/aarch64-darwin/llvm-project/clang/test/Headers/gpuintrin_lang.c
 -o -  | /Users/buildbot/buildbot-root/aarch64-darwin/build/bin/FileCheck 
/Users/buildbot/buildbot-root/aarch64-darwin/llvm-project/clang/test/Headers/gpuintrin_lang.c
 --check-prefix=CUDA
+ /Users/buildbot/buildbot-root/aarch64-darwin/build/bin/clang -cc1 
-internal-isystem 
/Users/buildbot/buildbot-root/aarch64-darwin/build/lib/clang/20/include 
-nostdsysteminc -internal-isystem 
/Users/buildbot/buildbot-root/aarch64-darwin/llvm-project/clang/test/Headers/Inputs/include
 -internal-isystem 
/Users/buildbot/buildbot-root/aarch64-darwin/llvm-project/clang/test/Headers/../../lib/Headers/cuda_wrappers
 -internal-isystem 
/Users/buildbot/buildbot-root/aarch64-darwin/llvm-project/clang/test/Headers/../../lib/Headers/
 -fcuda-is-device -triple nvptx64 -emit-llvm 
/Users/buildbot/buildbot-root/aarch64-darwin/llvm-project/clang/test/Headers/gpuintrin_lang.c
 -o -
+ /Users/buildbot/buildbot-root/aarch64-darwin/build/bin/FileCheck 
/Users/buildbot/buildbot-root/aarch64-darwin/llvm-project/clang/test/Headers/gpuintrin_lang.c
 --check-prefix=CUDA
/Users/buildbot/buildbot-root/aarch64-darwin/llvm-project/clang/test/Headers/gpuintrin_lang.c:39:15:
 error: CUDA-NEXT: expected string not found in input
// CUDA-NEXT: [[TMP0:%.*]] = call range(i32 0, 1024) i32 
@llvm.nvvm.read.ptx.sreg.tid.x()
  ^
:8:7: note: scanning from here
entry:
  ^
:9:2: note: possible intended match here
 %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 ^

Input file: 
Check file: 
/Users/buildbot/buildbot-root/aarch64-darwin/llvm-project/clang/test/Headers/gpuintrin_lang.c

-dump-input=help explains the following input dump.

Input was:
<<
1: ; ModuleID = 
'/Users/buildbot/buildbot-root/aarch64-darwin/llvm-project/clang/test/Headers/gpuintrin_lang.c'
 
2: source_filename = 
"/Users/buildbot/buildbot-root/aarch64-darwin/llvm-project/clang/test/Headers/gpuintrin_lang.c"
 
3: target datalayout = 
"e-i64:64-i128:128-v16:16-v32:32-n16:32:64" 
4: target triple = "nvptx64" 
5:  
6: ; Function Attrs: convergent noinline 
nounwind optnone 
7: define dso_local i32 @foo() #0 
{ 
label:36'0 ^~
label:36'1 ^~
same:37'0^~
same:37'1   ^captured var 
"ATTR0"
8: entry: 
next:38'0  ^~
next:38'1  ^~  captured var "ENTRY"
next:39'0X error: no match found
9:  %0 = call i32 
@llvm.nvvm.read.ptx.sreg.tid.x() 
next:39'0  
next:39'1   ?   
possible intended match
   10:  ret i32 %0 
next:39'0  
   11: } 
next:39'0  ~~
   12:  
next:39'0  ~
   13: ; Function Attrs: nocallback 
nofree nosync nounwind speculatable willreturn memory(none) 
...

```



https://github.com/llvm/llvm-project/pull/110179

[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-11-11 Thread LLVM Continuous Integration via cfe-commits

llvm-ci wrote:

LLVM Buildbot has detected a new failure on builder 
`openmp-offload-sles-build-only` running on `rocm-worker-hw-04-sles` while 
building `clang` at step 6 "Add check check-clang".

Full details are available at: 
https://lab.llvm.org/buildbot/#/builders/140/builds/10631


Here is the relevant piece of the build log for the reference

```
Step 6 (Add check check-clang) failure: test (failure)
 TEST 'Clang :: Headers/gpuintrin_lang.c' FAILED 

Exit Code: 1

Command Output (stderr):
--
RUN: at line 2: 
/home/botworker/bbot/builds/openmp-offload-sles-build/llvm.build/bin/clang -cc1 
-internal-isystem 
/home/botworker/bbot/builds/openmp-offload-sles-build/llvm.build/lib/clang/20/include
 -nostdsysteminc -internal-isystem 
/home/botworker/bbot/builds/openmp-offload-sles-build/llvm.src/clang/test/Headers/Inputs/include
-internal-isystem 
/home/botworker/bbot/builds/openmp-offload-sles-build/llvm.src/clang/test/Headers/../../lib/Headers/cuda_wrappers
-internal-isystem 
/home/botworker/bbot/builds/openmp-offload-sles-build/llvm.src/clang/test/Headers/../../lib/Headers/
-fcuda-is-device -triple nvptx64 -emit-llvm 
/home/botworker/bbot/builds/openmp-offload-sles-build/llvm.src/clang/test/Headers/gpuintrin_lang.c
 -o -  | 
/home/botworker/bbot/builds/openmp-offload-sles-build/llvm.build/bin/FileCheck 
/home/botworker/bbot/builds/openmp-offload-sles-build/llvm.src/clang/test/Headers/gpuintrin_lang.c
 --check-prefix=CUDA
+ 
/home/botworker/bbot/builds/openmp-offload-sles-build/llvm.build/bin/FileCheck 
/home/botworker/bbot/builds/openmp-offload-sles-build/llvm.src/clang/test/Headers/gpuintrin_lang.c
 --check-prefix=CUDA
+ /home/botworker/bbot/builds/openmp-offload-sles-build/llvm.build/bin/clang 
-cc1 -internal-isystem 
/home/botworker/bbot/builds/openmp-offload-sles-build/llvm.build/lib/clang/20/include
 -nostdsysteminc -internal-isystem 
/home/botworker/bbot/builds/openmp-offload-sles-build/llvm.src/clang/test/Headers/Inputs/include
 -internal-isystem 
/home/botworker/bbot/builds/openmp-offload-sles-build/llvm.src/clang/test/Headers/../../lib/Headers/cuda_wrappers
 -internal-isystem 
/home/botworker/bbot/builds/openmp-offload-sles-build/llvm.src/clang/test/Headers/../../lib/Headers/
 -fcuda-is-device -triple nvptx64 -emit-llvm 
/home/botworker/bbot/builds/openmp-offload-sles-build/llvm.src/clang/test/Headers/gpuintrin_lang.c
 -o -
/home/botworker/bbot/builds/openmp-offload-sles-build/llvm.src/clang/test/Headers/gpuintrin_lang.c:39:15:
 error: CUDA-NEXT: expected string not found in input
// CUDA-NEXT: [[TMP0:%.*]] = call range(i32 0, 1024) i32 
@llvm.nvvm.read.ptx.sreg.tid.x()
  ^
:8:7: note: scanning from here
entry:
  ^
:9:2: note: possible intended match here
 %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 ^

Input file: 
Check file: 
/home/botworker/bbot/builds/openmp-offload-sles-build/llvm.src/clang/test/Headers/gpuintrin_lang.c

-dump-input=help explains the following input dump.

Input was:
<<
   1: ; ModuleID = 
'/home/botworker/bbot/builds/openmp-offload-sles-build/llvm.src/clang/test/Headers/gpuintrin_lang.c'
 
   2: source_filename = 
"/home/botworker/bbot/builds/openmp-offload-sles-build/llvm.src/clang/test/Headers/gpuintrin_lang.c"
 
   3: target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" 
   4: target triple = "nvptx64" 
   5:  
   6: ; Function Attrs: convergent noinline nounwind optnone 
   7: define dso_local i32 @foo() #0 { 
   8: entry: 
next:39'0   X error: no match found
   9:  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 
next:39'0 
next:39'1  ?   possible 
intended match
  10:  ret i32 %0 
next:39'0 
  11: } 
next:39'0 ~~
  12:  
next:39'0 ~
  13: ; Function Attrs: nocallback nofree nosync nounwind speculatable 
willreturn memory(none) 
next:39'0 
~
  14: declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 
next:39'0 
   .
   .
   .
...

```



https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-11-11 Thread LLVM Continuous Integration via cfe-commits

llvm-ci wrote:

LLVM Buildbot has detected a new failure on builder `clang-ve-ninja` running on 
`hpce-ve-main` while building `clang` at step 4 "annotate".

Full details are available at: 
https://lab.llvm.org/buildbot/#/builders/12/builds/9354


Here is the relevant piece of the build log for the reference

```
Step 4 (annotate) failure: 'python 
../llvm-zorg/zorg/buildbot/builders/annotated/ve-linux.py ...' (failure)
...
[295/301] Linking CXX executable tools/clang/unittests/Driver/ClangDriverTests
[296/301] Linking CXX executable tools/clang/unittests/CodeGen/ClangCodeGenTests
[297/301] Linking CXX executable tools/clang/unittests/Tooling/ToolingTests
[298/301] Linking CXX executable tools/clang/unittests/Frontend/FrontendTests
[299/301] Linking CXX executable 
tools/clang/unittests/Interpreter/ExceptionTests/ClangReplInterpreterExceptionTests
[300/301] Linking CXX executable 
tools/clang/unittests/Interpreter/ClangReplInterpreterTests
[300/301] Running the Clang regression tests
-- Testing: 21288 tests, 48 workers --
llvm-lit: 
/scratch/buildbot/bothome/clang-ve-ninja/llvm-project/llvm/utils/lit/lit/llvm/config.py:506:
 note: using clang: 
/scratch/buildbot/bothome/clang-ve-ninja/build/build_llvm/bin/clang
Testing:  0.. 10.. 20.. 30.. 40.. 50.
FAIL: Clang :: Headers/gpuintrin_lang.c (12017 of 21288)
 TEST 'Clang :: Headers/gpuintrin_lang.c' FAILED 

Exit Code: 1

Command Output (stderr):
--
RUN: at line 2: 
/scratch/buildbot/bothome/clang-ve-ninja/build/build_llvm/bin/clang -cc1 
-internal-isystem 
/scratch/buildbot/bothome/clang-ve-ninja/build/build_llvm/lib/clang/20/include 
-nostdsysteminc -internal-isystem 
/scratch/buildbot/bothome/clang-ve-ninja/llvm-project/clang/test/Headers/Inputs/include
-internal-isystem 
/scratch/buildbot/bothome/clang-ve-ninja/llvm-project/clang/test/Headers/../../lib/Headers/cuda_wrappers
-internal-isystem 
/scratch/buildbot/bothome/clang-ve-ninja/llvm-project/clang/test/Headers/../../lib/Headers/
-fcuda-is-device -triple nvptx64 -emit-llvm 
/scratch/buildbot/bothome/clang-ve-ninja/llvm-project/clang/test/Headers/gpuintrin_lang.c
 -o -  | 
/scratch/buildbot/bothome/clang-ve-ninja/build/build_llvm/bin/FileCheck 
/scratch/buildbot/bothome/clang-ve-ninja/llvm-project/clang/test/Headers/gpuintrin_lang.c
 --check-prefix=CUDA
+ /scratch/buildbot/bothome/clang-ve-ninja/build/build_llvm/bin/clang -cc1 
-internal-isystem 
/scratch/buildbot/bothome/clang-ve-ninja/build/build_llvm/lib/clang/20/include 
-nostdsysteminc -internal-isystem 
/scratch/buildbot/bothome/clang-ve-ninja/llvm-project/clang/test/Headers/Inputs/include
 -internal-isystem 
/scratch/buildbot/bothome/clang-ve-ninja/llvm-project/clang/test/Headers/../../lib/Headers/cuda_wrappers
 -internal-isystem 
/scratch/buildbot/bothome/clang-ve-ninja/llvm-project/clang/test/Headers/../../lib/Headers/
 -fcuda-is-device -triple nvptx64 -emit-llvm 
/scratch/buildbot/bothome/clang-ve-ninja/llvm-project/clang/test/Headers/gpuintrin_lang.c
 -o -
+ /scratch/buildbot/bothome/clang-ve-ninja/build/build_llvm/bin/FileCheck 
/scratch/buildbot/bothome/clang-ve-ninja/llvm-project/clang/test/Headers/gpuintrin_lang.c
 --check-prefix=CUDA
/scratch/buildbot/bothome/clang-ve-ninja/llvm-project/clang/test/Headers/gpuintrin_lang.c:39:15:
 error: CUDA-NEXT: expected string not found in input
// CUDA-NEXT: [[TMP0:%.*]] = call range(i32 0, 1024) i32 
@llvm.nvvm.read.ptx.sreg.tid.x()
  ^
:8:7: note: scanning from here
entry:
  ^
:9:2: note: possible intended match here
 %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 ^

Input file: 
Check file: 
/scratch/buildbot/bothome/clang-ve-ninja/llvm-project/clang/test/Headers/gpuintrin_lang.c

-dump-input=help explains the following input dump.

Input was:
<<
   1: ; ModuleID = 
'/scratch/buildbot/bothome/clang-ve-ninja/llvm-project/clang/test/Headers/gpuintrin_lang.c'
 
   2: source_filename = 
"/scratch/buildbot/bothome/clang-ve-ninja/llvm-project/clang/test/Headers/gpuintrin_lang.c"
 
   3: target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" 
   4: target triple = "nvptx64" 
   5:  
   6: ; Function Attrs: convergent noinline nounwind optnone 
   7: define dso_local i32 @foo() #0 { 
   8: entry: 
next:39'0   X error: no match found
   9:  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() 
next:39'0 
next:39'1  ?   possible 
intended match
  10:  ret i32 %0 
Step 8 (check-llvm) failure: check-llvm (failure)
...
[295/301] Linking CXX executable tools/clang/unittests/Driver/ClangDriverTests
[296/301] Linking CXX executable tools/clang/unittests/CodeGen/ClangCodeGenTests
[297/301] Linking CXX executable tools/clang/unittests/Tooling/ToolingTests
[298/301] Linking CXX executable tools/clang/unittests/Frontend/FrontendTests
[299/301] Linking CXX ex

[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-11-11 Thread LLVM Continuous Integration via cfe-commits

llvm-ci wrote:

LLVM Buildbot has detected a new failure on builder 
`llvm-clang-x86_64-sie-ubuntu-fast` running on `sie-linux-worker` while 
building `clang` at step 6 "test-build-unified-tree-check-all".

Full details are available at: 
https://lab.llvm.org/buildbot/#/builders/144/builds/11351


Here is the relevant piece of the build log for the reference

```
Step 6 (test-build-unified-tree-check-all) failure: test (failure)
 TEST 'Clang :: Headers/gpuintrin_lang.c' FAILED 

Exit Code: 1

Command Output (stderr):
--
RUN: at line 2: 
/home/buildbot/buildbot-root/llvm-clang-x86_64-sie-ubuntu-fast/build/bin/clang 
-cc1 -internal-isystem 
/home/buildbot/buildbot-root/llvm-clang-x86_64-sie-ubuntu-fast/build/lib/clang/20/include
 -nostdsysteminc -internal-isystem 
/home/buildbot/buildbot-root/llvm-clang-x86_64-sie-ubuntu-fast/llvm-project/clang/test/Headers/Inputs/include
-internal-isystem 
/home/buildbot/buildbot-root/llvm-clang-x86_64-sie-ubuntu-fast/llvm-project/clang/test/Headers/../../lib/Headers/cuda_wrappers
-internal-isystem 
/home/buildbot/buildbot-root/llvm-clang-x86_64-sie-ubuntu-fast/llvm-project/clang/test/Headers/../../lib/Headers/
-fcuda-is-device -triple nvptx64 -emit-llvm 
/home/buildbot/buildbot-root/llvm-clang-x86_64-sie-ubuntu-fast/llvm-project/clang/test/Headers/gpuintrin_lang.c
 -o -  | 
/home/buildbot/buildbot-root/llvm-clang-x86_64-sie-ubuntu-fast/build/bin/FileCheck
 
/home/buildbot/buildbot-root/llvm-clang-x86_64-sie-ubuntu-fast/llvm-project/clang/test/Headers/gpuintrin_lang.c
 --check-prefix=CUDA
+ 
/home/buildbot/buildbot-root/llvm-clang-x86_64-sie-ubuntu-fast/build/bin/clang 
-cc1 -internal-isystem 
/home/buildbot/buildbot-root/llvm-clang-x86_64-sie-ubuntu-fast/build/lib/clang/20/include
 -nostdsysteminc -internal-isystem 
/home/buildbot/buildbot-root/llvm-clang-x86_64-sie-ubuntu-fast/llvm-project/clang/test/Headers/Inputs/include
 -internal-isystem 
/home/buildbot/buildbot-root/llvm-clang-x86_64-sie-ubuntu-fast/llvm-project/clang/test/Headers/../../lib/Headers/cuda_wrappers
 -internal-isystem 
/home/buildbot/buildbot-root/llvm-clang-x86_64-sie-ubuntu-fast/llvm-project/clang/test/Headers/../../lib/Headers/
 -fcuda-is-device -triple nvptx64 -emit-llvm 
/home/buildbot/buildbot-root/llvm-clang-x86_64-sie-ubuntu-fast/llvm-project/clang/test/Headers/gpuintrin_lang.c
 -o -
+ 
/home/buildbot/buildbot-root/llvm-clang-x86_64-sie-ubuntu-fast/build/bin/FileCheck
 
/home/buildbot/buildbot-root/llvm-clang-x86_64-sie-ubuntu-fast/llvm-project/clang/test/Headers/gpuintrin_lang.c
 --check-prefix=CUDA
/home/buildbot/buildbot-root/llvm-clang-x86_64-sie-ubuntu-fast/llvm-project/clang/test/Headers/gpuintrin_lang.c:39:15:
 error: CUDA-NEXT: expected string not found in input
// CUDA-NEXT: [[TMP0:%.*]] = call range(i32 0, 1024) i32 
@llvm.nvvm.read.ptx.sreg.tid.x()
  ^
:8:7: note: scanning from here
entry:
  ^
:9:2: note: possible intended match here
 %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 ^

Input file: 
Check file: 
/home/buildbot/buildbot-root/llvm-clang-x86_64-sie-ubuntu-fast/llvm-project/clang/test/Headers/gpuintrin_lang.c

-dump-input=help explains the following input dump.

Input was:
<<
1: ; ModuleID = 
'/home/buildbot/buildbot-root/llvm-clang-x86_64-sie-ubuntu-fast/llvm-project/clang/test/Headers/gpuintrin_lang.c'
 
2: source_filename = 
"/home/buildbot/buildbot-root/llvm-clang-x86_64-sie-ubuntu-fast/llvm-project/clang/test/Headers/gpuintrin_lang.c"
 
3: target datalayout = 
"e-i64:64-i128:128-v16:16-v32:32-n16:32:64" 
4: target triple = "nvptx64" 
5:  
6: ; Function Attrs: convergent noinline 
nounwind optnone 
7: define dso_local i32 @foo() #0 
{ 
label:36'0 ^~
label:36'1 ^~
same:37'0^~
same:37'1   ^captured var 
"ATTR0"
8: entry: 
next:38'0  ^~
next:38'1  ^~  captured var "ENTRY"
next:39'0X error: no match found
9:  %0 = call i32 
@llvm.nvvm.read.ptx.sreg.tid.x() 
next:39'0  
next:39'1   ?   
possible intended match
   10:  ret i32 %0 
next:39'0  
[0

[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-11-11 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 closed 
https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-11-11 Thread via cfe-commits

github-actions[bot] wrote:




:warning: C/C++ code formatter, clang-format found issues in your code. 
:warning:



You can test this locally with the following command:


``bash
git-clang-format --diff a9cd941f392dbf99ddfcde9721bd5c485823bdf0 
35f20bbe5ce45194dff68c52018cb3cf04b533f7 --extensions h,c -- 
clang/lib/Headers/amdgpuintrin.h clang/lib/Headers/gpuintrin.h 
clang/lib/Headers/nvptxintrin.h clang/test/Headers/gpuintrin.c 
clang/test/Headers/gpuintrin_lang.c
``





View the diff from clang-format here.


``diff
diff --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h
index e66c13bb1c..c585f71f60 100644
--- a/clang/lib/Headers/nvptxintrin.h
+++ b/clang/lib/Headers/nvptxintrin.h
@@ -191,7 +191,7 @@ _DEFAULT_FN_ATTRS [[noreturn]] static __inline__ void 
__gpu_exit(void) {
 // Suspend the thread briefly to assist the scheduler during busy loops.
 _DEFAULT_FN_ATTRS static __inline__ void __gpu_thread_suspend(void) {
   if (__nvvm_reflect("__CUDA_ARCH") >= 700)
-LIBC_INLINE_ASM("nanosleep.u32 64;" :: : "memory");
+LIBC_INLINE_ASM("nanosleep.u32 64;" ::: "memory");
 }
 
 _Pragma("omp end declare variant");

``




https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-11-08 Thread Johannes Doerfert via cfe-commits

https://github.com/jdoerfert approved this pull request.

LG

https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-11-08 Thread Johannes Doerfert via cfe-commits


@@ -0,0 +1,191 @@
+//===-- gpuintrin.h - Generic GPU intrinsic functions 
-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+//
+// Provides wrappers around the clang builtins for accessing GPU hardware
+// features. The interface is intended to be portable between architectures, 
but
+// some targets may provide different implementations. This header can be
+// included for all the common GPU programming languages, namely OpenMP, HIP,
+// CUDA, and OpenCL.
+//
+//===--===//
+
+#ifndef __GPUINTRIN_H
+#define __GPUINTRIN_H
+
+#if !defined(_DEFAULT_FN_ATTRS)
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_FN_ATTRS __attribute__((device))
+#else
+#define _DEFAULT_FN_ATTRS
+#endif
+#endif
+
+#if defined(__NVPTX__)
+#include 
+#elif defined(__AMDGPU__)
+#include 
+#else
+#error "This header is only meant to be used on GPU architectures."
+#endif
+
+#include 
+
+#if !defined(__cplusplus)
+_Pragma("push_macro(\"bool\")");
+#define bool _Bool
+#endif
+
+_Pragma("omp begin declare target device_type(nohost)");
+_Pragma("omp begin declare variant match(device = {kind(gpu)})");
+
+// Returns the number of blocks in the requested dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks(int __dim) {
+  switch (__dim) {
+  case 0:
+return __gpu_num_blocks_x();
+  case 1:
+return __gpu_num_blocks_y();
+  case 2:
+return __gpu_num_blocks_z();
+  default:
+__builtin_unreachable();
+  }
+}
+
+// Returns the number of block id in the requested dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id(int __dim) {
+  switch (__dim) {
+  case 0:
+return __gpu_block_id_x();
+  case 1:
+return __gpu_block_id_y();
+  case 2:
+return __gpu_block_id_z();
+  default:
+__builtin_unreachable();
+  }
+}
+
+// Returns the number of threads in the requested dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads(int __dim) {
+  switch (__dim) {
+  case 0:
+return __gpu_num_threads_x();
+  case 1:
+return __gpu_num_threads_y();
+  case 2:
+return __gpu_num_threads_z();
+  default:
+__builtin_unreachable();
+  }
+}
+
+// Returns the thread id in the requested dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id(int __dim) {
+  switch (__dim) {
+  case 0:
+return __gpu_thread_id_x();
+  case 1:
+return __gpu_thread_id_y();
+  case 2:
+return __gpu_thread_id_z();
+  default:
+__builtin_unreachable();
+  }
+}
+
+// Get the first active thread inside the lane.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_first_lane_id(uint64_t __lane_mask) {
+  return __builtin_ffsll(__lane_mask) - 1;
+}
+
+// Conditional that is only true for a single thread in a lane.
+_DEFAULT_FN_ATTRS static __inline__ bool
+__gpu_is_first_in_lane(uint64_t __lane_mask) {
+  return __gpu_lane_id() == __gpu_first_lane_id(__lane_mask);
+}
+
+// Gets the first floating point value from the active lanes.
+_DEFAULT_FN_ATTRS static __inline__ float
+__gpu_read_first_lane_f32(uint64_t __lane_mask, float __x) {
+  return __builtin_bit_cast(
+  float, __gpu_read_first_lane_u32(__lane_mask,
+   __builtin_bit_cast(uint32_t, __x)));
+}
+
+// Gets the first floating point value from the active lanes.

jdoerfert wrote:

Copy and paste errors, also below.

https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-11-08 Thread Johannes Doerfert via cfe-commits

https://github.com/jdoerfert edited 
https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-11-07 Thread Aaron Ballman via cfe-commits

https://github.com/AaronBallman approved this pull request.

LGTM!

https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-11-05 Thread Joseph Huber via cfe-commits


@@ -0,0 +1,155 @@
+//===-- amdgpuintrin.h - AMDPGU intrinsic functions 
---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __AMDGPUINTRIN_H
+#define __AMDGPUINTRIN_H
+
+#ifndef __AMDGPU__
+#error "This file is intended for AMDGPU targets or offloading to AMDGPU"
+#endif
+
+#include 
+#include 
+
+#if !defined(_DEFAULT_FN_ATTRS)
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_FN_ATTRS __attribute__((device))
+#else
+#define _DEFAULT_FN_ATTRS
+#endif
+#endif
+
+#pragma omp begin declare target device_type(nohost)
+#pragma omp begin declare variant match(device = {arch(amdgcn)})

jhuber6 wrote:

Done

https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-11-05 Thread Joseph Huber via cfe-commits


@@ -0,0 +1,155 @@
+//===-- amdgpuintrin.h - AMDPGU intrinsic functions 
---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __AMDGPUINTRIN_H
+#define __AMDGPUINTRIN_H
+
+#ifndef __AMDGPU__
+#error "This file is intended for AMDGPU targets or offloading to AMDGPU"
+#endif
+
+#include 
+#include 

jhuber6 wrote:

Done, and a test.

https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-11-05 Thread Joseph Huber via cfe-commits


@@ -0,0 +1,155 @@
+//===-- amdgpuintrin.h - AMDPGU intrinsic functions 
---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __AMDGPUINTRIN_H
+#define __AMDGPUINTRIN_H
+
+#ifndef __AMDGPU__
+#error "This file is intended for AMDGPU targets or offloading to AMDGPU"
+#endif
+
+#include 
+#include 
+
+#if !defined(_DEFAULT_FN_ATTRS)
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_FN_ATTRS __attribute__((device))
+#else
+#define _DEFAULT_FN_ATTRS
+#endif
+#endif
+
+#pragma omp begin declare target device_type(nohost)
+#pragma omp begin declare variant match(device = {arch(amdgcn)})

jhuber6 wrote:

I could probably use `_Pragma("..")` instead.

https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-11-05 Thread Joseph Huber via cfe-commits


@@ -0,0 +1,155 @@
+//===-- amdgpuintrin.h - AMDPGU intrinsic functions 
---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __AMDGPUINTRIN_H
+#define __AMDGPUINTRIN_H
+
+#ifndef __AMDGPU__
+#error "This file is intended for AMDGPU targets or offloading to AMDGPU"
+#endif
+
+#include 
+#include 

jhuber6 wrote:

Alright, guess I could make it work.

https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-11-05 Thread Aaron Ballman via cfe-commits


@@ -0,0 +1,155 @@
+//===-- amdgpuintrin.h - AMDPGU intrinsic functions 
---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __AMDGPUINTRIN_H
+#define __AMDGPUINTRIN_H
+
+#ifndef __AMDGPU__
+#error "This file is intended for AMDGPU targets or offloading to AMDGPU"
+#endif
+
+#include 
+#include 
+
+#if !defined(_DEFAULT_FN_ATTRS)
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_FN_ATTRS __attribute__((device))
+#else
+#define _DEFAULT_FN_ATTRS
+#endif
+#endif
+
+#pragma omp begin declare target device_type(nohost)
+#pragma omp begin declare variant match(device = {arch(amdgcn)})

AaronBallman wrote:

Is there a way we can guard against that? Does OpenMP allow for things like 
`#pragma __omp__ __begin__ ... `?

https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-11-05 Thread Aaron Ballman via cfe-commits


@@ -0,0 +1,155 @@
+//===-- amdgpuintrin.h - AMDPGU intrinsic functions 
---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __AMDGPUINTRIN_H
+#define __AMDGPUINTRIN_H
+
+#ifndef __AMDGPU__
+#error "This file is intended for AMDGPU targets or offloading to AMDGPU"
+#endif
+
+#include 
+#include 

AaronBallman wrote:

Hrmmm. My gut feeling is: this should be clean for C89 and just work because 
there's no reason for it not to. Alternatively, if it shouldn't work in C89, 
perhaps we should explicitly disallow that mode?

https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-11-05 Thread Joseph Huber via cfe-commits


@@ -0,0 +1,119 @@
+//===-- gpuintrin.h - Generic GPU intrinsic functions 
-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+//
+// Provides wrappers around the clang builtins for accessing GPU hardware
+// features. The interface is intended to be portable between architectures, 
but
+// some targets may provide different implementations. This header can be
+// included for all the common GPU programming languages, namely OpenMP, HIP,
+// CUDA, and OpenCL.
+//
+//===--===//
+
+#ifndef __GPUINTRIN_H
+#define __GPUINTRIN_H
+
+#if defined(__NVPTX__)
+#include 
+#elif defined(__AMDGPU__)
+#include 
+#else
+#error "This header is only meant to be used on GPU architectures."
+#endif
+
+// Returns the number of blocks in the requested dimension.
+_DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_blocks(int __dim) {
+  switch (__dim) {
+  case 0:
+return __gpu_num_blocks_x();
+  case 1:
+return __gpu_num_blocks_y();
+  case 2:
+return __gpu_num_blocks_z();
+  default:
+__builtin_unreachable();
+  }
+}
+
+// Returns the number of block id in the requested dimension.
+_DEFAULT_FN_ATTRS static inline uint32_t __gpu_block_id(int __dim) {
+  switch (__dim) {
+  case 0:
+return __gpu_block_id_x();
+  case 1:
+return __gpu_block_id_y();
+  case 2:
+return __gpu_block_id_z();
+  default:
+__builtin_unreachable();
+  }
+}
+
+// Returns the number of threads in the requested dimension.
+_DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_threads(int __dim) {
+  switch (__dim) {
+  case 0:
+return __gpu_num_threads_x();
+  case 1:
+return __gpu_num_threads_y();
+  case 2:
+return __gpu_num_threads_z();
+  default:
+__builtin_unreachable();
+  }
+}
+
+// Returns the thread id in the requested dimension.
+_DEFAULT_FN_ATTRS static inline uint32_t __gpu_thread_id(int __dim) {
+  switch (__dim) {
+  case 0:
+return __gpu_thread_id_x();
+  case 1:
+return __gpu_thread_id_y();
+  case 2:
+return __gpu_thread_id_z();
+  default:
+__builtin_unreachable();
+  }
+}
+
+// Get the first active thread inside the lane.
+_DEFAULT_FN_ATTRS static inline uint64_t
+__gpu_first_lane_id(uint64_t __lane_mask) {
+  return __builtin_ffsll(__lane_mask) - 1;
+}
+
+// Conditional that is only true for a single thread in a lane.
+_DEFAULT_FN_ATTRS static inline bool
+__gpu_is_first_in_lane(uint64_t __lane_mask) {
+  return __gpu_lane_id() == __gpu_first_lane_id(__lane_mask);
+}
+
+// Gets the sum of all lanes inside the warp or wavefront.
+_DEFAULT_FN_ATTRS static inline uint32_t
+__gpu_lane_reduce_u32(uint64_t __lane_mask, uint32_t x) {
+  for (uint32_t step = __gpu_num_lanes() / 2; step > 0; step /= 2) {
+uint32_t index = step + __gpu_lane_id();
+x += __gpu_shuffle_idx_u32(__lane_mask, index, x);
+  }
+  return __gpu_broadcast_u32(__lane_mask, x);
+}
+
+// Gets the accumulator scan of the threads in the warp or wavefront.
+_DEFAULT_FN_ATTRS static inline uint32_t
+__gpu_lane_scan_u32(uint64_t __lane_mask, uint32_t x) {
+  for (uint32_t step = 1; step < __gpu_num_lanes(); step *= 2) {
+uint32_t index = __gpu_lane_id() - step;
+uint32_t bitmask = __gpu_lane_id() >= step;
+x += -bitmask & __gpu_shuffle_idx_u32(__lane_mask, index, x);
+  }
+  return x;
+}
+
+#undef _DEFAULT_FN_ATTRS

jhuber6 wrote:

I kind of included them transitively, but I think it'd be better to just 
duplicate it so people can just include the NVPTX specific ones manually.

https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-11-05 Thread Joseph Huber via cfe-commits


@@ -0,0 +1,155 @@
+//===-- amdgpuintrin.h - AMDPGU intrinsic functions 
---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __AMDGPUINTRIN_H
+#define __AMDGPUINTRIN_H
+
+#ifndef __AMDGPU__
+#error "This file is intended for AMDGPU targets or offloading to AMDGPU"
+#endif
+
+#include 
+#include 

jhuber6 wrote:

Good point, honestly I think it's safe to ignore C89 since C99 has been the 
default for awhile and I don't think we need to worry about that with GPU stuff.

https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-11-05 Thread Joseph Huber via cfe-commits


@@ -0,0 +1,155 @@
+//===-- amdgpuintrin.h - AMDPGU intrinsic functions 
---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __AMDGPUINTRIN_H
+#define __AMDGPUINTRIN_H
+
+#ifndef __AMDGPU__
+#error "This file is intended for AMDGPU targets or offloading to AMDGPU"
+#endif
+
+#include 
+#include 
+
+#if !defined(_DEFAULT_FN_ATTRS)
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_FN_ATTRS __attribute__((device))
+#else
+#define _DEFAULT_FN_ATTRS
+#endif
+#endif
+
+#pragma omp begin declare target device_type(nohost)
+#pragma omp begin declare variant match(device = {arch(amdgcn)})

jhuber6 wrote:

These should be ignored outside of OpenMP mode, but if someone does like 
`#define nohost "foo"` it will definitely break.

https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-11-05 Thread Aaron Ballman via cfe-commits


@@ -0,0 +1,119 @@
+//===-- gpuintrin.h - Generic GPU intrinsic functions 
-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+//
+// Provides wrappers around the clang builtins for accessing GPU hardware
+// features. The interface is intended to be portable between architectures, 
but
+// some targets may provide different implementations. This header can be
+// included for all the common GPU programming languages, namely OpenMP, HIP,
+// CUDA, and OpenCL.
+//
+//===--===//
+
+#ifndef __GPUINTRIN_H
+#define __GPUINTRIN_H
+
+#if defined(__NVPTX__)
+#include 
+#elif defined(__AMDGPU__)
+#include 
+#else
+#error "This header is only meant to be used on GPU architectures."
+#endif
+
+// Returns the number of blocks in the requested dimension.
+_DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_blocks(int __dim) {
+  switch (__dim) {
+  case 0:
+return __gpu_num_blocks_x();
+  case 1:
+return __gpu_num_blocks_y();
+  case 2:
+return __gpu_num_blocks_z();
+  default:
+__builtin_unreachable();
+  }
+}
+
+// Returns the number of block id in the requested dimension.
+_DEFAULT_FN_ATTRS static inline uint32_t __gpu_block_id(int __dim) {
+  switch (__dim) {
+  case 0:
+return __gpu_block_id_x();
+  case 1:
+return __gpu_block_id_y();
+  case 2:
+return __gpu_block_id_z();
+  default:
+__builtin_unreachable();
+  }
+}
+
+// Returns the number of threads in the requested dimension.
+_DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_threads(int __dim) {
+  switch (__dim) {
+  case 0:
+return __gpu_num_threads_x();
+  case 1:
+return __gpu_num_threads_y();
+  case 2:
+return __gpu_num_threads_z();
+  default:
+__builtin_unreachable();
+  }
+}
+
+// Returns the thread id in the requested dimension.
+_DEFAULT_FN_ATTRS static inline uint32_t __gpu_thread_id(int __dim) {
+  switch (__dim) {
+  case 0:
+return __gpu_thread_id_x();
+  case 1:
+return __gpu_thread_id_y();
+  case 2:
+return __gpu_thread_id_z();
+  default:
+__builtin_unreachable();
+  }
+}
+
+// Get the first active thread inside the lane.
+_DEFAULT_FN_ATTRS static inline uint64_t
+__gpu_first_lane_id(uint64_t __lane_mask) {
+  return __builtin_ffsll(__lane_mask) - 1;
+}
+
+// Conditional that is only true for a single thread in a lane.
+_DEFAULT_FN_ATTRS static inline bool
+__gpu_is_first_in_lane(uint64_t __lane_mask) {
+  return __gpu_lane_id() == __gpu_first_lane_id(__lane_mask);
+}
+
+// Gets the sum of all lanes inside the warp or wavefront.
+_DEFAULT_FN_ATTRS static inline uint32_t
+__gpu_lane_reduce_u32(uint64_t __lane_mask, uint32_t x) {
+  for (uint32_t step = __gpu_num_lanes() / 2; step > 0; step /= 2) {
+uint32_t index = step + __gpu_lane_id();
+x += __gpu_shuffle_idx_u32(__lane_mask, index, x);
+  }
+  return __gpu_broadcast_u32(__lane_mask, x);
+}
+
+// Gets the accumulator scan of the threads in the warp or wavefront.
+_DEFAULT_FN_ATTRS static inline uint32_t
+__gpu_lane_scan_u32(uint64_t __lane_mask, uint32_t x) {
+  for (uint32_t step = 1; step < __gpu_num_lanes(); step *= 2) {
+uint32_t index = __gpu_lane_id() - step;
+uint32_t bitmask = __gpu_lane_id() >= step;
+x += -bitmask & __gpu_shuffle_idx_u32(__lane_mask, index, x);
+  }
+  return x;
+}
+
+#undef _DEFAULT_FN_ATTRS

AaronBallman wrote:

Should this also be in the other .h file, or should we remove it here?

https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-11-05 Thread Aaron Ballman via cfe-commits


@@ -0,0 +1,157 @@
+//===-- nvptxintrin.h - NVPTX intrinsic functions 
-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __NVPTXINTRIN_H
+#define __NVPTXINTRIN_H
+
+#ifndef __NVPTX__
+#error "This file is intended for NVPTX targets or offloading to NVPTX"
+#endif
+
+#include 

AaronBallman wrote:

Same here as above.

https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-11-05 Thread Aaron Ballman via cfe-commits


@@ -0,0 +1,155 @@
+//===-- amdgpuintrin.h - AMDPGU intrinsic functions 
---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __AMDGPUINTRIN_H
+#define __AMDGPUINTRIN_H
+
+#ifndef __AMDGPU__
+#error "This file is intended for AMDGPU targets or offloading to AMDGPU"
+#endif
+
+#include 
+#include 

AaronBallman wrote:

I think it's a bit surprising that including this header will transitively 
define `bool` in C modes; I would expect we'd use `_Bool` in this header file.

As for C89 modes, I think this will still work okay, but you should have 
explicit test coverage for C89.

https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-11-05 Thread Aaron Ballman via cfe-commits


@@ -0,0 +1,155 @@
+//===-- amdgpuintrin.h - AMDPGU intrinsic functions 
---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __AMDGPUINTRIN_H
+#define __AMDGPUINTRIN_H
+
+#ifndef __AMDGPU__
+#error "This file is intended for AMDGPU targets or offloading to AMDGPU"
+#endif
+
+#include 
+#include 
+
+#if !defined(_DEFAULT_FN_ATTRS)
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_FN_ATTRS __attribute__((device))
+#else
+#define _DEFAULT_FN_ATTRS
+#endif
+#endif
+
+#pragma omp begin declare target device_type(nohost)
+#pragma omp begin declare variant match(device = {arch(amdgcn)})

AaronBallman wrote:

How much do we have to worry about users defining any of these as macros prior 
to including the system header?

https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-11-05 Thread Joseph Huber via cfe-commits


@@ -0,0 +1,154 @@
+//===-- amdgpuintrin.h - AMDPGU intrinsic functions 
---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __AMDGPUINTRIN_H
+#define __AMDGPUINTRIN_H
+
+#ifndef __AMDGPU__
+#error "This file is intended for AMDGPU targets or offloading to AMDGPU"
+#endif
+
+#include 
+#include 
+
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_ATTRS __attribute__((device))
+#elif !defined(_DEFAULT_ATTRS)
+#define _DEFAULT_ATTRS
+#endif
+
+#pragma omp begin declare target device_type(nohost)
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+// Type aliases to the address spaces used by the AMDGPU backend.
+#define __gpu_private __attribute__((opencl_private))
+#define __gpu_constant __attribute__((opencl_constant))
+#define __gpu_local __attribute__((opencl_local))
+#define __gpu_global __attribute__((opencl_global))
+#define __gpu_generic __attribute__((opencl_generic))
+
+// Attribute to declare a function as a kernel.
+#define __gpu_kernel __attribute__((amdgpu_kernel, visibility("protected")))
+
+// Returns the number of workgroups in the 'x' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_x() {
+  return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
+}
+
+// Returns the number of workgroups in the 'y' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_y() {
+  return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
+}
+
+// Returns the number of workgroups in the 'z' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_z() {
+  return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
+}
+
+// Returns the 'x' dimension of the current AMD workgroup's id.

jhuber6 wrote:

Thanks for the input, is there anything else about this approach that you have 
concerns about? I remember in my original RFC someone suggested waiting for you 
to chime in.

https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-11-05 Thread Aaron Ballman via cfe-commits


@@ -0,0 +1,154 @@
+//===-- amdgpuintrin.h - AMDPGU intrinsic functions 
---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __AMDGPUINTRIN_H
+#define __AMDGPUINTRIN_H
+
+#ifndef __AMDGPU__
+#error "This file is intended for AMDGPU targets or offloading to AMDGPU"
+#endif
+
+#include 
+#include 
+
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_ATTRS __attribute__((device))
+#elif !defined(_DEFAULT_ATTRS)
+#define _DEFAULT_ATTRS
+#endif
+
+#pragma omp begin declare target device_type(nohost)
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+// Type aliases to the address spaces used by the AMDGPU backend.
+#define __gpu_private __attribute__((opencl_private))
+#define __gpu_constant __attribute__((opencl_constant))
+#define __gpu_local __attribute__((opencl_local))
+#define __gpu_global __attribute__((opencl_global))
+#define __gpu_generic __attribute__((opencl_generic))
+
+// Attribute to declare a function as a kernel.
+#define __gpu_kernel __attribute__((amdgpu_kernel, visibility("protected")))
+
+// Returns the number of workgroups in the 'x' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_x() {
+  return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
+}
+
+// Returns the number of workgroups in the 'y' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_y() {
+  return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
+}
+
+// Returns the number of workgroups in the 'z' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_z() {
+  return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
+}
+
+// Returns the 'x' dimension of the current AMD workgroup's id.

AaronBallman wrote:

We're consistently inconsistent, so it doesn't much matter. (Personally, I 
think all of our compiler-provided system headers should be using `/* */` 
comments because they're available in C language modes and it seems silly to me 
that we'd want the overhead of issuing `-Wcomment` diagnostics that get 
suppressed by the diagnostics engine because they're in a system header. But 
that's just me being pedantic. Well, and the compiler being pedantic too, I 
suppose.)

https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-11-05 Thread Matt Arsenault via cfe-commits


@@ -0,0 +1,154 @@
+//===-- amdgpuintrin.h - AMDPGU intrinsic functions 
---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __AMDGPUINTRIN_H
+#define __AMDGPUINTRIN_H
+
+#ifndef __AMDGPU__
+#error "This file is intended for AMDGPU targets or offloading to AMDGPU"
+#endif
+
+#include 
+#include 
+
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_ATTRS __attribute__((device))
+#elif !defined(_DEFAULT_ATTRS)
+#define _DEFAULT_ATTRS
+#endif
+
+#pragma omp begin declare target device_type(nohost)
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+// Type aliases to the address spaces used by the AMDGPU backend.
+#define __gpu_private __attribute__((opencl_private))
+#define __gpu_constant __attribute__((opencl_constant))
+#define __gpu_local __attribute__((opencl_local))
+#define __gpu_global __attribute__((opencl_global))
+#define __gpu_generic __attribute__((opencl_generic))
+
+// Attribute to declare a function as a kernel.
+#define __gpu_kernel __attribute__((amdgpu_kernel, visibility("protected")))
+
+// Returns the number of workgroups in the 'x' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_x() {
+  return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();

arsenm wrote:

Yes, we can and should optimize this 

https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-11-05 Thread Joseph Huber via cfe-commits


@@ -0,0 +1,154 @@
+//===-- nvptxintrin.h - NVPTX intrinsic functions 
-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __NVPTXINTRIN_H
+#define __NVPTXINTRIN_H
+
+#ifndef __NVPTX__
+#error "This file is intended for NVPTX targets or offloading to NVPTX"
+#endif
+
+#include 
+#include 
+
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_ATTRS __attribute__((device))
+#elif !defined(_DEFAULT_ATTRS)
+#define _DEFAULT_ATTRS
+#endif
+
+#pragma omp begin declare target device_type(nohost)
+#pragma omp begin declare variant match(device = {arch(nvptx64)})
+
+// Type aliases to the address spaces used by the NVPTX backend.
+#define __gpu_private __attribute__((opencl_private))
+#define __gpu_constant __attribute__((opencl_constant))
+#define __gpu_local __attribute__((opencl_local))
+#define __gpu_global __attribute__((opencl_global))
+#define __gpu_generic __attribute__((opencl_generic))
+
+// Attribute to declare a function as a kernel.
+#define __gpu_kernel __attribute__((amdgpu_kernel, visibility("protected")))

jhuber6 wrote:

Duh, good catch.

https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-11-05 Thread Joseph Huber via cfe-commits


@@ -0,0 +1,86 @@
+//===-- gpuintrin.h - Generic GPU intrinsic functions 
-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+//
+// Provides wrappers around the clang builtins for accessing GPU hardware
+// features. The interface is intended to be portable between architectures, 
but
+// some targets may provide different implementations. This header can be
+// included for all the common GPU programming languages, namely OpenMP, HIP,
+// CUDA, and OpenCL.
+//
+//===--===//
+
+#ifndef __GPUINTRIN_H
+#define __GPUINTRIN_H
+
+#if defined(__NVPTX__)
+#include 
+#elif defined(__AMDGPU__)
+#include 
+#else
+#error "This header is only meant to be used on GPU architectures."
+#endif
+
+// Returns the total number of blocks / workgroups.
+_DEFAULT_ATTRS static inline uint64_t __gpu_num_blocks() {
+  return __gpu_num_blocks_x() * __gpu_num_blocks_y() * __gpu_num_blocks_z();
+}
+
+// Returns the absolute id of the block / workgroup.
+_DEFAULT_ATTRS static inline uint64_t __gpu_block_id() {
+  return __gpu_block_id_x() +
+ (uint64_t)__gpu_num_blocks_x() * __gpu_block_id_y() +
+ (uint64_t)__gpu_num_blocks_x() * __gpu_num_blocks_y() *
+ __gpu_block_id_z();
+}
+
+// Returns the total number of threads in the block / workgroup.
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_threads() {
+  return __gpu_num_threads_x() * __gpu_num_threads_y() * __gpu_num_threads_z();
+}
+
+// Returns the absolute id of the thread in the current block / workgroup.
+_DEFAULT_ATTRS static inline uint32_t __gpu_thread_id() {
+  return __gpu_thread_id_x() + __gpu_num_threads_x() * __gpu_thread_id_y() +
+ __gpu_num_threads_x() * __gpu_num_threads_y() * __gpu_thread_id_z();
+}
+
+// Get the first active thread inside the lane.
+_DEFAULT_ATTRS static inline uint64_t
+__gpu_first_lane_id(uint64_t __lane_mask) {
+  return __builtin_ffsll(__lane_mask) - 1;
+}
+
+// Conditional that is only true for a single thread in a lane.
+_DEFAULT_ATTRS static inline bool __gpu_is_first_lane(uint64_t __lane_mask) {
+  return __gpu_lane_id() == __gpu_first_lane_id(__lane_mask);
+}
+
+// Gets the sum of all lanes inside the warp or wavefront.
+_DEFAULT_ATTRS static inline uint32_t
+__gpu_lane_reduce_u32(uint64_t __lane_mask, uint32_t x) {
+  for (uint32_t step = __gpu_num_lanes() / 2; step > 0; step /= 2) {
+uint32_t index = step + __gpu_lane_id();
+x += __gpu_shuffle_idx_u32(__lane_mask, index, x);
+  }
+  return __gpu_broadcast_u32(__lane_mask, x);
+}
+
+// Gets the accumulator scan of the threads in the warp or wavefront.
+_DEFAULT_ATTRS static inline uint32_t __gpu_lane_scan_u32(uint64_t __lane_mask,
+  uint32_t x) {
+  for (uint32_t step = 1; step < __gpu_num_lanes(); step *= 2) {
+uint32_t index = __gpu_lane_id() - step;
+uint32_t bitmask = __gpu_lane_id() >= step;
+x += -bitmask & __gpu_shuffle_idx_u32(__lane_mask, index, x);
+  }
+  return x;
+}

jhuber6 wrote:

I'll need to look at how to do that on AMDGPU, maybe can add it in a follow-up.

https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-11-05 Thread Joseph Huber via cfe-commits


@@ -0,0 +1,86 @@
+//===-- gpuintrin.h - Generic GPU intrinsic functions 
-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+//
+// Provides wrappers around the clang builtins for accessing GPU hardware
+// features. The interface is intended to be portable between architectures, 
but
+// some targets may provide different implementations. This header can be
+// included for all the common GPU programming languages, namely OpenMP, HIP,
+// CUDA, and OpenCL.
+//
+//===--===//
+
+#ifndef __GPUINTRIN_H
+#define __GPUINTRIN_H
+
+#if defined(__NVPTX__)
+#include 
+#elif defined(__AMDGPU__)
+#include 
+#else
+#error "This header is only meant to be used on GPU architectures."
+#endif
+
+// Returns the total number of blocks / workgroups.
+_DEFAULT_ATTRS static inline uint64_t __gpu_num_blocks() {
+  return __gpu_num_blocks_x() * __gpu_num_blocks_y() * __gpu_num_blocks_z();
+}
+
+// Returns the absolute id of the block / workgroup.
+_DEFAULT_ATTRS static inline uint64_t __gpu_block_id() {
+  return __gpu_block_id_x() +
+ (uint64_t)__gpu_num_blocks_x() * __gpu_block_id_y() +
+ (uint64_t)__gpu_num_blocks_x() * __gpu_num_blocks_y() *
+ __gpu_block_id_z();
+}
+
+// Returns the total number of threads in the block / workgroup.
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_threads() {

jhuber6 wrote:

I'll probably just remove these helpers since they're probably not going to be 
used as much.

https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-11-05 Thread Joseph Huber via cfe-commits


@@ -0,0 +1,154 @@
+//===-- amdgpuintrin.h - AMDPGU intrinsic functions 
---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __AMDGPUINTRIN_H
+#define __AMDGPUINTRIN_H
+
+#ifndef __AMDGPU__
+#error "This file is intended for AMDGPU targets or offloading to AMDGPU"
+#endif
+
+#include 
+#include 
+
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_ATTRS __attribute__((device))
+#elif !defined(_DEFAULT_ATTRS)
+#define _DEFAULT_ATTRS
+#endif
+
+#pragma omp begin declare target device_type(nohost)
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+// Type aliases to the address spaces used by the AMDGPU backend.
+#define __gpu_private __attribute__((opencl_private))
+#define __gpu_constant __attribute__((opencl_constant))
+#define __gpu_local __attribute__((opencl_local))
+#define __gpu_global __attribute__((opencl_global))
+#define __gpu_generic __attribute__((opencl_generic))
+
+// Attribute to declare a function as a kernel.
+#define __gpu_kernel __attribute__((amdgpu_kernel, visibility("protected")))
+
+// Returns the number of workgroups in the 'x' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_x() {
+  return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
+}
+
+// Returns the number of workgroups in the 'y' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_y() {
+  return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
+}
+
+// Returns the number of workgroups in the 'z' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_z() {
+  return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
+}
+
+// Returns the 'x' dimension of the current AMD workgroup's id.

jhuber6 wrote:

Seems to vary between headers, I'll defer to someone like @AaronBallman.

https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-11-05 Thread Joseph Huber via cfe-commits


@@ -0,0 +1,154 @@
+//===-- amdgpuintrin.h - AMDPGU intrinsic functions 
---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __AMDGPUINTRIN_H
+#define __AMDGPUINTRIN_H
+
+#ifndef __AMDGPU__
+#error "This file is intended for AMDGPU targets or offloading to AMDGPU"
+#endif
+
+#include 
+#include 
+
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_ATTRS __attribute__((device))
+#elif !defined(_DEFAULT_ATTRS)
+#define _DEFAULT_ATTRS
+#endif
+
+#pragma omp begin declare target device_type(nohost)
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+// Type aliases to the address spaces used by the AMDGPU backend.
+#define __gpu_private __attribute__((opencl_private))
+#define __gpu_constant __attribute__((opencl_constant))
+#define __gpu_local __attribute__((opencl_local))
+#define __gpu_global __attribute__((opencl_global))
+#define __gpu_generic __attribute__((opencl_generic))
+
+// Attribute to declare a function as a kernel.
+#define __gpu_kernel __attribute__((amdgpu_kernel, visibility("protected")))
+
+// Returns the number of workgroups in the 'x' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_x() {
+  return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
+}
+
+// Returns the number of workgroups in the 'y' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_y() {

jhuber6 wrote:

They're not technically builtins since they're defined in a header, and it's 
pretty verbose. I'm mostly just trying to follow the same pattern as the other 
intrinsic headers.

https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-11-05 Thread Joseph Huber via cfe-commits


@@ -0,0 +1,154 @@
+//===-- amdgpuintrin.h - AMDPGU intrinsic functions 
---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __AMDGPUINTRIN_H
+#define __AMDGPUINTRIN_H
+
+#ifndef __AMDGPU__
+#error "This file is intended for AMDGPU targets or offloading to AMDGPU"
+#endif
+
+#include 
+#include 
+
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_ATTRS __attribute__((device))
+#elif !defined(_DEFAULT_ATTRS)
+#define _DEFAULT_ATTRS
+#endif
+
+#pragma omp begin declare target device_type(nohost)
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+// Type aliases to the address spaces used by the AMDGPU backend.
+#define __gpu_private __attribute__((opencl_private))
+#define __gpu_constant __attribute__((opencl_constant))
+#define __gpu_local __attribute__((opencl_local))
+#define __gpu_global __attribute__((opencl_global))
+#define __gpu_generic __attribute__((opencl_generic))
+
+// Attribute to declare a function as a kernel.
+#define __gpu_kernel __attribute__((amdgpu_kernel, visibility("protected")))
+
+// Returns the number of workgroups in the 'x' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_x() {
+  return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();

jhuber6 wrote:

I remember talking about this w/ @arsenm and he said that this should just be 
caught by `AMDGPUInstCombine` or something. Maybe I should do that at some 
point.

https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-11-05 Thread Shilei Tian via cfe-commits


@@ -0,0 +1,154 @@
+//===-- amdgpuintrin.h - AMDPGU intrinsic functions 
---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __AMDGPUINTRIN_H
+#define __AMDGPUINTRIN_H
+
+#ifndef __AMDGPU__
+#error "This file is intended for AMDGPU targets or offloading to AMDGPU"
+#endif
+
+#include 
+#include 
+
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_ATTRS __attribute__((device))
+#elif !defined(_DEFAULT_ATTRS)
+#define _DEFAULT_ATTRS
+#endif
+
+#pragma omp begin declare target device_type(nohost)
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+// Type aliases to the address spaces used by the AMDGPU backend.
+#define __gpu_private __attribute__((opencl_private))
+#define __gpu_constant __attribute__((opencl_constant))
+#define __gpu_local __attribute__((opencl_local))
+#define __gpu_global __attribute__((opencl_global))
+#define __gpu_generic __attribute__((opencl_generic))
+
+// Attribute to declare a function as a kernel.
+#define __gpu_kernel __attribute__((amdgpu_kernel, visibility("protected")))
+
+// Returns the number of workgroups in the 'x' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_x() {
+  return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
+}
+
+// Returns the number of workgroups in the 'y' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_y() {

shiltian wrote:

Do we want to have all those APIs prefix with `__builtin` instead of `__gpu`? 
They are builtins anyway. You can have `__builtin_gpu_num_blocks_y`, etc.

https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-11-05 Thread Johannes Doerfert via cfe-commits


@@ -0,0 +1,154 @@
+//===-- amdgpuintrin.h - AMDPGU intrinsic functions 
---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __AMDGPUINTRIN_H
+#define __AMDGPUINTRIN_H
+
+#ifndef __AMDGPU__
+#error "This file is intended for AMDGPU targets or offloading to AMDGPU"
+#endif
+
+#include 
+#include 
+
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_ATTRS __attribute__((device))
+#elif !defined(_DEFAULT_ATTRS)
+#define _DEFAULT_ATTRS
+#endif
+
+#pragma omp begin declare target device_type(nohost)
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+// Type aliases to the address spaces used by the AMDGPU backend.
+#define __private __attribute__((opencl_private))
+#define __constant __attribute__((opencl_constant))
+#define __local __attribute__((opencl_local))
+#define __global __attribute__((opencl_global))
+#define __generic __attribute__((opencl_generic))
+
+// Attribute to declare a function as a kernel.
+#define __kernel __attribute__((amdgpu_kernel, visibility("protected")))
+
+// Returns the number of workgroups in the 'x' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_x() {
+  return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
+}
+
+// Returns the number of workgroups in the 'y' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_y() {
+  return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
+}
+
+// Returns the number of workgroups in the 'z' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_z() {
+  return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
+}
+
+// Returns the 'x' dimension of the current AMD workgroup's id.
+_DEFAULT_ATTRS static inline uint32_t __gpu_block_id_x() {
+  return __builtin_amdgcn_workgroup_id_x();
+}
+
+// Returns the 'y' dimension of the current AMD workgroup's id.
+_DEFAULT_ATTRS static inline uint32_t __gpu_block_id_y() {
+  return __builtin_amdgcn_workgroup_id_y();
+}
+
+// Returns the 'z' dimension of the current AMD workgroup's id.
+_DEFAULT_ATTRS static inline uint32_t __gpu_block_id_z() {
+  return __builtin_amdgcn_workgroup_id_z();
+}
+
+// Returns the number of workitems in the 'x' dimension.
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_threads_x() {
+  return __builtin_amdgcn_workgroup_size_x();
+}
+
+// Returns the number of workitems in the 'y' dimension.
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_threads_y() {
+  return __builtin_amdgcn_workgroup_size_y();
+}
+
+// Returns the number of workitems in the 'z' dimension.
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_threads_z() {
+  return __builtin_amdgcn_workgroup_size_z();
+}
+
+// Returns the 'x' dimension id of the workitem in the current AMD workgroup.
+_DEFAULT_ATTRS static inline uint32_t __gpu_thread_id_x() {
+  return __builtin_amdgcn_workitem_id_x();
+}
+
+// Returns the 'y' dimension id of the workitem in the current AMD workgroup.
+_DEFAULT_ATTRS static inline uint32_t __gpu_thread_id_y() {
+  return __builtin_amdgcn_workitem_id_y();
+}
+
+// Returns the 'z' dimension id of the workitem in the current AMD workgroup.
+_DEFAULT_ATTRS static inline uint32_t __gpu_thread_id_z() {
+  return __builtin_amdgcn_workitem_id_z();
+}
+
+// Returns the size of an AMD wavefront, either 32 or 64 depending on hardware
+// and compilation options.
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_lanes() {
+  return __builtin_amdgcn_wavefrontsize();
+}
+
+// Returns the id of the thread inside of an AMD wavefront executing together.
+_DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t __gpu_lane_id() {
+  return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
+}
+
+// Returns the bit-mask of active threads in the current wavefront.
+_DEFAULT_ATTRS [[clang::convergent]] static inline uint64_t __gpu_lane_mask() {
+  return __builtin_amdgcn_read_exec();
+}
+
+// Copies the value from the first active thread in the wavefront to the rest.
+_DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t
+__gpu_broadcast(uint64_t __lane_mask, uint32_t __x) {
+  return __builtin_amdgcn_readfirstlane(__x);
+}
+
+// Returns a bitmask of threads in the current lane for which \p x is true.
+_DEFAULT_ATTRS [[clang::convergent]] static inline uint64_t
+__gpu_ballot(uint64_t __lane_mask, bool __x) {
+  // The lane_mask & gives the nvptx semantics when lane_mask is a subset of
+  // the active threads
+  return __lane_mask & __builtin_amdgcn_ballot_w64(__x);
+}
+
+// Waits for all the threads in the block to converge and issues a fence.
+_DEFAULT_ATTRS [[clang::convergent]] static inline void __gpu_sync_threads() {
+ 

[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-11-05 Thread Johannes Doerfert via cfe-commits


@@ -0,0 +1,184 @@
+//===-- nvptxintrin.h - NVPTX intrinsic functions 
-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __NVPTXINTRIN_H
+#define __NVPTXINTRIN_H
+
+#ifndef __NVPTX__
+#error "This file is intended for NVPTX targets or offloading to NVPTX
+#endif
+
+#include 
+#include 
+
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_ATTRS __attribute__((device)) __attribute__((always_inline))
+#else
+#define _DEFAULT_ATTRS __attribute__((always_inline))
+#endif
+
+#pragma omp begin declare target device_type(nohost)
+#pragma omp begin declare variant match(device = {arch(nvptx64)})
+
+// Type aliases to the address spaces used by the NVPTX backend.
+#define _private __attribute__((opencl_private))
+#define _constant __attribute__((opencl_constant))
+#define _local __attribute__((opencl_local))
+#define _global __attribute__((opencl_global))
+
+// Attribute to declare a function as a kernel.
+#define _kernel __attribute__((nvptx_kernel))
+
+// Returns the number of CUDA blocks in the 'x' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_x() {
+  return __nvvm_read_ptx_sreg_nctaid_x();
+}
+
+// Returns the number of CUDA blocks in the 'y' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_y() {
+  return __nvvm_read_ptx_sreg_nctaid_y();
+}
+
+// Returns the number of CUDA blocks in the 'z' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_z() {
+  return __nvvm_read_ptx_sreg_nctaid_z();
+}
+
+// Returns the total number of CUDA blocks.
+_DEFAULT_ATTRS static inline uint64_t _get_num_blocks() {
+  return _get_num_blocks_x() * _get_num_blocks_y() * _get_num_blocks_z();
+}
+
+// Returns the 'x' dimension of the current CUDA block's id.
+_DEFAULT_ATTRS static inline uint32_t _get_block_id_x() {
+  return __nvvm_read_ptx_sreg_ctaid_x();
+}
+
+// Returns the 'y' dimension of the current CUDA block's id.
+_DEFAULT_ATTRS static inline uint32_t _get_block_id_y() {
+  return __nvvm_read_ptx_sreg_ctaid_y();
+}
+
+// Returns the 'z' dimension of the current CUDA block's id.
+_DEFAULT_ATTRS static inline uint32_t _get_block_id_z() {
+  return __nvvm_read_ptx_sreg_ctaid_z();
+}
+
+// Returns the absolute id of the CUDA block.
+_DEFAULT_ATTRS static inline uint64_t _get_block_id() {
+  return _get_block_id_x() + _get_num_blocks_x() * _get_block_id_y() +
+ _get_num_blocks_x() * _get_num_blocks_y() * _get_block_id_z();
+}
+
+// Returns the number of CUDA threads in the 'x' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_threads_x() {
+  return __nvvm_read_ptx_sreg_ntid_x();
+}
+
+// Returns the number of CUDA threads in the 'y' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_threads_y() {
+  return __nvvm_read_ptx_sreg_ntid_y();
+}
+
+// Returns the number of CUDA threads in the 'z' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_threads_z() {
+  return __nvvm_read_ptx_sreg_ntid_z();
+}
+
+// Returns the total number of threads in the block.
+_DEFAULT_ATTRS static inline uint64_t _get_num_threads() {
+  return _get_num_threads_x() * _get_num_threads_y() * _get_num_threads_z();
+}
+
+// Returns the 'x' dimension id of the thread in the current CUDA block.
+_DEFAULT_ATTRS static inline uint32_t _get_thread_id_x() {
+  return __nvvm_read_ptx_sreg_tid_x();
+}
+
+// Returns the 'y' dimension id of the thread in the current CUDA block.
+_DEFAULT_ATTRS static inline uint32_t _get_thread_id_y() {
+  return __nvvm_read_ptx_sreg_tid_y();
+}
+
+// Returns the 'z' dimension id of the thread in the current CUDA block.
+_DEFAULT_ATTRS static inline uint32_t _get_thread_id_z() {
+  return __nvvm_read_ptx_sreg_tid_z();
+}
+
+// Returns the absolute id of the thread in the current CUDA block.
+_DEFAULT_ATTRS static inline uint64_t _get_thread_id() {
+  return _get_thread_id_x() + _get_num_threads_x() * _get_thread_id_y() +
+ _get_num_threads_x() * _get_num_threads_y() * _get_thread_id_z();
+}
+
+// Returns the size of a CUDA warp, always 32 on NVIDIA hardware.
+_DEFAULT_ATTRS static inline uint32_t _get_lane_size() { return 32; }
+
+// Returns the id of the thread inside of a CUDA warp executing together.
+_DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t _get_lane_id() {
+  return __nvvm_read_ptx_sreg_laneid();
+}
+
+// Returns the bit-mask of active threads in the current warp.
+_DEFAULT_ATTRS [[clang::convergent]] static inline uint64_t _get_lane_mask() {
+  return __nvvm_activemask();
+}
+
+// Copies the value from the first active thread in the warp to the rest.
+_DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t
+_broadcast_value(uint64_t lane_mask, uint32_t x) {
+  uint32_t mask = static_cast(lane_mask);
+  u

[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-11-05 Thread Johannes Doerfert via cfe-commits


@@ -0,0 +1,86 @@
+//===-- gpuintrin.h - Generic GPU intrinsic functions 
-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+//
+// Provides wrappers around the clang builtins for accessing GPU hardware
+// features. The interface is intended to be portable between architectures, 
but
+// some targets may provide different implementations. This header can be
+// included for all the common GPU programming languages, namely OpenMP, HIP,
+// CUDA, and OpenCL.
+//
+//===--===//
+
+#ifndef __GPUINTRIN_H
+#define __GPUINTRIN_H
+
+#if defined(__NVPTX__)
+#include 
+#elif defined(__AMDGPU__)
+#include 
+#else
+#error "This header is only meant to be used on GPU architectures."
+#endif
+
+// Returns the total number of blocks / workgroups.
+_DEFAULT_ATTRS static inline uint64_t __gpu_num_blocks() {
+  return __gpu_num_blocks_x() * __gpu_num_blocks_y() * __gpu_num_blocks_z();
+}
+
+// Returns the absolute id of the block / workgroup.
+_DEFAULT_ATTRS static inline uint64_t __gpu_block_id() {
+  return __gpu_block_id_x() +
+ (uint64_t)__gpu_num_blocks_x() * __gpu_block_id_y() +
+ (uint64_t)__gpu_num_blocks_x() * __gpu_num_blocks_y() *
+ __gpu_block_id_z();
+}
+
+// Returns the total number of threads in the block / workgroup.
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_threads() {
+  return __gpu_num_threads_x() * __gpu_num_threads_y() * __gpu_num_threads_z();
+}
+
+// Returns the absolute id of the thread in the current block / workgroup.
+_DEFAULT_ATTRS static inline uint32_t __gpu_thread_id() {
+  return __gpu_thread_id_x() + __gpu_num_threads_x() * __gpu_thread_id_y() +
+ __gpu_num_threads_x() * __gpu_num_threads_y() * __gpu_thread_id_z();
+}
+
+// Get the first active thread inside the lane.
+_DEFAULT_ATTRS static inline uint64_t
+__gpu_first_lane_id(uint64_t __lane_mask) {
+  return __builtin_ffsll(__lane_mask) - 1;
+}
+
+// Conditional that is only true for a single thread in a lane.
+_DEFAULT_ATTRS static inline bool __gpu_is_first_lane(uint64_t __lane_mask) {
+  return __gpu_lane_id() == __gpu_first_lane_id(__lane_mask);
+}
+
+// Gets the sum of all lanes inside the warp or wavefront.
+_DEFAULT_ATTRS static inline uint32_t
+__gpu_lane_reduce_u32(uint64_t __lane_mask, uint32_t x) {
+  for (uint32_t step = __gpu_num_lanes() / 2; step > 0; step /= 2) {
+uint32_t index = step + __gpu_lane_id();
+x += __gpu_shuffle_idx_u32(__lane_mask, index, x);
+  }
+  return __gpu_broadcast_u32(__lane_mask, x);
+}
+
+// Gets the accumulator scan of the threads in the warp or wavefront.
+_DEFAULT_ATTRS static inline uint32_t __gpu_lane_scan_u32(uint64_t __lane_mask,
+  uint32_t x) {
+  for (uint32_t step = 1; step < __gpu_num_lanes(); step *= 2) {
+uint32_t index = __gpu_lane_id() - step;
+uint32_t bitmask = __gpu_lane_id() >= step;
+x += -bitmask & __gpu_shuffle_idx_u32(__lane_mask, index, x);
+  }
+  return x;
+}

jdoerfert wrote:

We really want the 64 bit version as well.

https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-11-05 Thread Johannes Doerfert via cfe-commits


@@ -0,0 +1,154 @@
+//===-- nvptxintrin.h - NVPTX intrinsic functions 
-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __NVPTXINTRIN_H
+#define __NVPTXINTRIN_H
+
+#ifndef __NVPTX__
+#error "This file is intended for NVPTX targets or offloading to NVPTX"
+#endif
+
+#include 
+#include 
+
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_ATTRS __attribute__((device))
+#elif !defined(_DEFAULT_ATTRS)
+#define _DEFAULT_ATTRS
+#endif
+
+#pragma omp begin declare target device_type(nohost)
+#pragma omp begin declare variant match(device = {arch(nvptx64)})
+
+// Type aliases to the address spaces used by the NVPTX backend.
+#define __gpu_private __attribute__((opencl_private))
+#define __gpu_constant __attribute__((opencl_constant))
+#define __gpu_local __attribute__((opencl_local))
+#define __gpu_global __attribute__((opencl_global))
+#define __gpu_generic __attribute__((opencl_generic))
+
+// Attribute to declare a function as a kernel.
+#define __gpu_kernel __attribute__((amdgpu_kernel, visibility("protected")))

jdoerfert wrote:

shouldn't we expose this to the user as well? I mean, explicitly in the 
gpuintr.h as defines or as comment.

Also, amdgpu_kernel? 

https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-11-05 Thread Johannes Doerfert via cfe-commits


@@ -0,0 +1,154 @@
+//===-- amdgpuintrin.h - AMDPGU intrinsic functions 
---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __AMDGPUINTRIN_H
+#define __AMDGPUINTRIN_H
+
+#ifndef __AMDGPU__
+#error "This file is intended for AMDGPU targets or offloading to AMDGPU"
+#endif
+
+#include 
+#include 
+
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_ATTRS __attribute__((device))
+#elif !defined(_DEFAULT_ATTRS)
+#define _DEFAULT_ATTRS
+#endif
+
+#pragma omp begin declare target device_type(nohost)
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+// Type aliases to the address spaces used by the AMDGPU backend.
+#define __gpu_private __attribute__((opencl_private))
+#define __gpu_constant __attribute__((opencl_constant))
+#define __gpu_local __attribute__((opencl_local))
+#define __gpu_global __attribute__((opencl_global))
+#define __gpu_generic __attribute__((opencl_generic))
+
+// Attribute to declare a function as a kernel.
+#define __gpu_kernel __attribute__((amdgpu_kernel, visibility("protected")))
+
+// Returns the number of workgroups in the 'x' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_x() {
+  return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
+}
+
+// Returns the number of workgroups in the 'y' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_y() {
+  return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
+}
+
+// Returns the number of workgroups in the 'z' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_z() {
+  return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
+}
+
+// Returns the 'x' dimension of the current AMD workgroup's id.

jdoerfert wrote:

Shouldn't these have three "///"?

https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-11-05 Thread Johannes Doerfert via cfe-commits


@@ -0,0 +1,86 @@
+//===-- gpuintrin.h - Generic GPU intrinsic functions 
-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+//
+// Provides wrappers around the clang builtins for accessing GPU hardware
+// features. The interface is intended to be portable between architectures, 
but
+// some targets may provide different implementations. This header can be
+// included for all the common GPU programming languages, namely OpenMP, HIP,
+// CUDA, and OpenCL.
+//
+//===--===//
+
+#ifndef __GPUINTRIN_H
+#define __GPUINTRIN_H
+
+#if defined(__NVPTX__)
+#include 
+#elif defined(__AMDGPU__)
+#include 
+#else
+#error "This header is only meant to be used on GPU architectures."
+#endif
+
+// Returns the total number of blocks / workgroups.
+_DEFAULT_ATTRS static inline uint64_t __gpu_num_blocks() {
+  return __gpu_num_blocks_x() * __gpu_num_blocks_y() * __gpu_num_blocks_z();
+}
+
+// Returns the absolute id of the block / workgroup.
+_DEFAULT_ATTRS static inline uint64_t __gpu_block_id() {
+  return __gpu_block_id_x() +
+ (uint64_t)__gpu_num_blocks_x() * __gpu_block_id_y() +
+ (uint64_t)__gpu_num_blocks_x() * __gpu_num_blocks_y() *
+ __gpu_block_id_z();
+}
+
+// Returns the total number of threads in the block / workgroup.
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_threads() {

jdoerfert wrote:

I still don't like the mix of 32 and 64 bit types. I think for users 64 
everywhere is much nicer, and it avoids the explicit casts above.

https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-11-05 Thread Johannes Doerfert via cfe-commits


@@ -0,0 +1,154 @@
+//===-- amdgpuintrin.h - AMDPGU intrinsic functions 
---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __AMDGPUINTRIN_H
+#define __AMDGPUINTRIN_H
+
+#ifndef __AMDGPU__
+#error "This file is intended for AMDGPU targets or offloading to AMDGPU"
+#endif
+
+#include 
+#include 
+
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_ATTRS __attribute__((device))
+#elif !defined(_DEFAULT_ATTRS)
+#define _DEFAULT_ATTRS
+#endif
+
+#pragma omp begin declare target device_type(nohost)
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+// Type aliases to the address spaces used by the AMDGPU backend.
+#define __gpu_private __attribute__((opencl_private))
+#define __gpu_constant __attribute__((opencl_constant))
+#define __gpu_local __attribute__((opencl_local))
+#define __gpu_global __attribute__((opencl_global))
+#define __gpu_generic __attribute__((opencl_generic))
+
+// Attribute to declare a function as a kernel.
+#define __gpu_kernel __attribute__((amdgpu_kernel, visibility("protected")))
+
+// Returns the number of workgroups in the 'x' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_x() {
+  return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();

jdoerfert wrote:

I believe the new COV4/5 have this pre-computed and we should likely use it.

https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-11-05 Thread Johannes Doerfert via cfe-commits


@@ -0,0 +1,86 @@
+//===-- gpuintrin.h - Generic GPU intrinsic functions 
-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+//
+// Provides wrappers around the clang builtins for accessing GPU hardware
+// features. The interface is intended to be portable between architectures, 
but
+// some targets may provide different implementations. This header can be
+// included for all the common GPU programming languages, namely OpenMP, HIP,
+// CUDA, and OpenCL.
+//
+//===--===//
+
+#ifndef __GPUINTRIN_H
+#define __GPUINTRIN_H
+
+#if defined(__NVPTX__)
+#include 
+#elif defined(__AMDGPU__)
+#include 
+#else
+#error "This header is only meant to be used on GPU architectures."
+#endif
+

jdoerfert wrote:

Can we please have a dispatch for each _x/y/z function to pass an index? And 
define the indices.



https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-11-05 Thread Johannes Doerfert via cfe-commits


@@ -0,0 +1,154 @@
+//===-- amdgpuintrin.h - AMDPGU intrinsic functions 
---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __AMDGPUINTRIN_H
+#define __AMDGPUINTRIN_H
+
+#ifndef __AMDGPU__
+#error "This file is intended for AMDGPU targets or offloading to AMDGPU"
+#endif
+
+#include 
+#include 
+
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_ATTRS __attribute__((device))
+#elif !defined(_DEFAULT_ATTRS)
+#define _DEFAULT_ATTRS

jdoerfert wrote:

Oddly, you define it in one case and then test if it is not defined already.
That implies on non-hip/cuda it might be defined but otherwise it shouldn't be?
If the above is not true, e.g., if it should not be defined at all, make that 
clearer plz. 

https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-11-05 Thread Johannes Doerfert via cfe-commits


@@ -0,0 +1,86 @@
+//===-- gpuintrin.h - Generic GPU intrinsic functions 
-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+//
+// Provides wrappers around the clang builtins for accessing GPU hardware
+// features. The interface is intended to be portable between architectures, 
but
+// some targets may provide different implementations. This header can be
+// included for all the common GPU programming languages, namely OpenMP, HIP,
+// CUDA, and OpenCL.
+//
+//===--===//
+
+#ifndef __GPUINTRIN_H
+#define __GPUINTRIN_H
+
+#if defined(__NVPTX__)
+#include 
+#elif defined(__AMDGPU__)
+#include 
+#else
+#error "This header is only meant to be used on GPU architectures."
+#endif
+
+// Returns the total number of blocks / workgroups.
+_DEFAULT_ATTRS static inline uint64_t __gpu_num_blocks() {
+  return __gpu_num_blocks_x() * __gpu_num_blocks_y() * __gpu_num_blocks_z();
+}
+
+// Returns the absolute id of the block / workgroup.
+_DEFAULT_ATTRS static inline uint64_t __gpu_block_id() {
+  return __gpu_block_id_x() +
+ (uint64_t)__gpu_num_blocks_x() * __gpu_block_id_y() +
+ (uint64_t)__gpu_num_blocks_x() * __gpu_num_blocks_y() *
+ __gpu_block_id_z();
+}
+
+// Returns the total number of threads in the block / workgroup.
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_threads() {
+  return __gpu_num_threads_x() * __gpu_num_threads_y() * __gpu_num_threads_z();
+}
+
+// Returns the absolute id of the thread in the current block / workgroup.
+_DEFAULT_ATTRS static inline uint32_t __gpu_thread_id() {
+  return __gpu_thread_id_x() + __gpu_num_threads_x() * __gpu_thread_id_y() +
+ __gpu_num_threads_x() * __gpu_num_threads_y() * __gpu_thread_id_z();
+}
+
+// Get the first active thread inside the lane.
+_DEFAULT_ATTRS static inline uint64_t
+__gpu_first_lane_id(uint64_t __lane_mask) {
+  return __builtin_ffsll(__lane_mask) - 1;
+}
+
+// Conditional that is only true for a single thread in a lane.
+_DEFAULT_ATTRS static inline bool __gpu_is_first_lane(uint64_t __lane_mask) {

jdoerfert wrote:

The name is confusing. "first lane" implies lane 0, but this is "first active 
lane" or "fist lane in mask"

https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-11-05 Thread Johannes Doerfert via cfe-commits


@@ -0,0 +1,154 @@
+//===-- amdgpuintrin.h - AMDPGU intrinsic functions 
---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __AMDGPUINTRIN_H
+#define __AMDGPUINTRIN_H
+
+#ifndef __AMDGPU__
+#error "This file is intended for AMDGPU targets or offloading to AMDGPU"
+#endif
+
+#include 
+#include 
+
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_ATTRS __attribute__((device))
+#elif !defined(_DEFAULT_ATTRS)
+#define _DEFAULT_ATTRS
+#endif
+
+#pragma omp begin declare target device_type(nohost)
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+// Type aliases to the address spaces used by the AMDGPU backend.
+#define __gpu_private __attribute__((opencl_private))
+#define __gpu_constant __attribute__((opencl_constant))
+#define __gpu_local __attribute__((opencl_local))
+#define __gpu_global __attribute__((opencl_global))
+#define __gpu_generic __attribute__((opencl_generic))
+
+// Attribute to declare a function as a kernel.
+#define __gpu_kernel __attribute__((amdgpu_kernel, visibility("protected")))
+
+// Returns the number of workgroups in the 'x' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_x() {
+  return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
+}
+
+// Returns the number of workgroups in the 'y' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_y() {
+  return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
+}
+
+// Returns the number of workgroups in the 'z' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_z() {
+  return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
+}
+
+// Returns the 'x' dimension of the current AMD workgroup's id.
+_DEFAULT_ATTRS static inline uint32_t __gpu_block_id_x() {
+  return __builtin_amdgcn_workgroup_id_x();
+}
+
+// Returns the 'y' dimension of the current AMD workgroup's id.
+_DEFAULT_ATTRS static inline uint32_t __gpu_block_id_y() {
+  return __builtin_amdgcn_workgroup_id_y();
+}
+
+// Returns the 'z' dimension of the current AMD workgroup's id.
+_DEFAULT_ATTRS static inline uint32_t __gpu_block_id_z() {
+  return __builtin_amdgcn_workgroup_id_z();
+}
+
+// Returns the number of workitems in the 'x' dimension.
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_threads_x() {
+  return __builtin_amdgcn_workgroup_size_x();
+}
+
+// Returns the number of workitems in the 'y' dimension.
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_threads_y() {
+  return __builtin_amdgcn_workgroup_size_y();
+}
+
+// Returns the number of workitems in the 'z' dimension.
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_threads_z() {
+  return __builtin_amdgcn_workgroup_size_z();
+}
+
+// Returns the 'x' dimension id of the workitem in the current AMD workgroup.
+_DEFAULT_ATTRS static inline uint32_t __gpu_thread_id_x() {
+  return __builtin_amdgcn_workitem_id_x();
+}
+
+// Returns the 'y' dimension id of the workitem in the current AMD workgroup.
+_DEFAULT_ATTRS static inline uint32_t __gpu_thread_id_y() {
+  return __builtin_amdgcn_workitem_id_y();
+}
+
+// Returns the 'z' dimension id of the workitem in the current AMD workgroup.
+_DEFAULT_ATTRS static inline uint32_t __gpu_thread_id_z() {
+  return __builtin_amdgcn_workitem_id_z();
+}
+
+// Returns the size of an AMD wavefront, either 32 or 64 depending on hardware
+// and compilation options.
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_lanes() {
+  return __builtin_amdgcn_wavefrontsize();
+}
+
+// Returns the id of the thread inside of an AMD wavefront executing together.
+_DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t __gpu_lane_id() {
+  return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
+}
+
+// Returns the bit-mask of active threads in the current wavefront.
+_DEFAULT_ATTRS [[clang::convergent]] static inline uint64_t __gpu_lane_mask() {
+  return __builtin_amdgcn_read_exec();
+}
+
+// Copies the value from the first active thread in the wavefront to the rest.
+_DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t
+__gpu_broadcast_u32(uint64_t __lane_mask, uint32_t __x) {

jdoerfert wrote:

The name is too generic, IMHO. Broadcast is more generic than broadcast from 
lane 0.

https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-11-05 Thread Johannes Doerfert via cfe-commits

https://github.com/jdoerfert commented:

I left many minor comments but nothing blocking.

@jhuber6  Were there any conceptual concerns in the thread?

https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-11-05 Thread Johannes Doerfert via cfe-commits

https://github.com/jdoerfert edited 
https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-11-05 Thread Joseph Huber via cfe-commits

jhuber6 wrote:

Ping

https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-10-28 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 updated 
https://github.com/llvm/llvm-project/pull/110179

>From c9431203b10c930587a07eed099df9e3e4ebae00 Mon Sep 17 00:00:00 2001
From: Joseph Huber 
Date: Thu, 26 Sep 2024 16:47:14 -0500
Subject: [PATCH 1/6] [Clang] Implement resource directory headers for common
 GPU intrinsics

Summary:
All GPU based languages provide some way to access things like the
thread ID or other resources. However, this is spread between many
different languages and it varies between targets. The goal here is to
provide a resource directory header that just provides these in an
easier to understand way, primarily so this can be used for C/C++ code.
The interface aims to be common, to faciliate easier porting, but target
specific stuff could be put in the individual headers.
---
 clang/lib/Headers/CMakeLists.txt |  14 +++
 clang/lib/Headers/amdgpuintrin.h | 187 +++
 clang/lib/Headers/gpuintrin.h|  18 +++
 clang/lib/Headers/nvptxintrin.h  | 184 ++
 4 files changed, 403 insertions(+)
 create mode 100644 clang/lib/Headers/amdgpuintrin.h
 create mode 100644 clang/lib/Headers/gpuintrin.h
 create mode 100644 clang/lib/Headers/nvptxintrin.h

diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 0211d1870b30a0..b02a47c07666c1 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -271,6 +271,12 @@ set(x86_files
   cpuid.h
   )
 
+set(gpu_files
+  gpuintrin.h
+  nvptxintrin.h
+  amdgpuintrin.h
+  )
+
 set(windows_only_files
   intrin0.h
   intrin.h
@@ -299,6 +305,7 @@ set(files
   ${systemz_files}
   ${ve_files}
   ${x86_files}
+  ${gpu_files}
   ${webassembly_files}
   ${windows_only_files}
   ${utility_files}
@@ -521,6 +528,7 @@ add_header_target("systemz-resource-headers" 
"${systemz_files};${zos_wrapper_fil
 add_header_target("ve-resource-headers" "${ve_files}")
 add_header_target("webassembly-resource-headers" "${webassembly_files}")
 add_header_target("x86-resource-headers" "${x86_files}")
+add_header_target("gpu-resource-headers" "${gpu_files}")
 
 # Other header groupings
 add_header_target("hlsl-resource-headers" ${hlsl_files})
@@ -707,6 +715,12 @@ install(
   EXCLUDE_FROM_ALL
   COMPONENT x86-resource-headers)
 
+install(
+  FILES ${gpu_files}
+  DESTINATION ${header_install_dir}
+  EXCLUDE_FROM_ALL
+  COMPONENT gpu-resource-headers)
+
 if(NOT CLANG_ENABLE_HLSL)
   set(EXCLUDE_HLSL EXCLUDE_FROM_ALL)
 endif()
diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
new file mode 100644
index 00..95936f86bd15b8
--- /dev/null
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -0,0 +1,187 @@
+//===-- amdgpuintrin.h - AMDPGU intrinsic functions 
---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __AMDGPUINTRIN_H
+#define __AMDGPUINTRIN_H
+
+#ifndef __AMDGPU__
+#error "This file is intended for AMDGPU targets or offloading to AMDGPU
+#endif
+
+#include 
+#include 
+
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_ATTRS __attribute__((device)) __attribute__((always_inline))
+#else
+#define _DEFAULT_ATTRS __attribute__((always_inline))
+#endif
+
+#pragma omp begin declare target device_type(nohost)
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+// Type aliases to the address spaces used by the AMDGPU backend.
+#define _private __attribute__((opencl_private))
+#define _constant __attribute__((opencl_constant))
+#define _local __attribute__((opencl_local))
+#define _global __attribute__((opencl_global))
+
+// Attribute to declare a function as a kernel.
+#define _kernel __attribute__((amdgpu_kernel, visibility("protected")))
+
+// Returns the number of workgroups in the 'x' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_x() {
+  return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
+}
+
+// Returns the number of workgroups in the 'y' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_y() {
+  return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
+}
+
+// Returns the number of workgroups in the 'z' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_z() {
+  return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
+}
+
+// Returns the total number of workgruops in the grid.
+_DEFAULT_ATTRS static inline uint64_t _get_num_blocks() {
+  return _get_num_blocks_x() * _get_num_blocks_y() * _get_num_blocks_z();
+}
+
+// Returns the 'x' dimension of the current AMD workgroup's id.
+_DEFAULT_ATTRS static inline uint32_t _get_block_id_x() {
+  return __builtin_amdgcn_workgroup_id_x();
+}
+
+// Returns the 'y'

[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-10-25 Thread Joseph Huber via cfe-commits


@@ -0,0 +1,76 @@
+//===-- gpuintrin.h - Generic GPU intrinsic functions 
-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __GPUINTRIN_H
+#define __GPUINTRIN_H
+
+#if defined(__NVPTX__)
+#include 
+#elif defined(__AMDGPU__)
+#include 

jhuber6 wrote:

I'm on the fence for providing fallbacks for non-GPU architectures, since I 
have that use-case in the `rpc.h` file.

https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-10-25 Thread Matt Arsenault via cfe-commits


@@ -0,0 +1,154 @@
+//===-- amdgpuintrin.h - AMDPGU intrinsic functions 
---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __AMDGPUINTRIN_H
+#define __AMDGPUINTRIN_H
+
+#ifndef __AMDGPU__
+#error "This file is intended for AMDGPU targets or offloading to AMDGPU"
+#endif
+
+#include 
+#include 
+
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_ATTRS __attribute__((device))
+#elif !defined(_DEFAULT_ATTRS)
+#define _DEFAULT_ATTRS
+#endif
+
+#pragma omp begin declare target device_type(nohost)
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+// Type aliases to the address spaces used by the AMDGPU backend.
+#define __private __attribute__((opencl_private))
+#define __constant __attribute__((opencl_constant))
+#define __local __attribute__((opencl_local))
+#define __global __attribute__((opencl_global))
+#define __generic __attribute__((opencl_generic))

arsenm wrote:

Probably should throw on whatever prefix you're using in the header (which I 
guess right now is __gpu).

Alternatively the ISA manuals and HSA specs have more names to use 

https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-10-25 Thread Joseph Huber via cfe-commits


@@ -0,0 +1,154 @@
+//===-- amdgpuintrin.h - AMDPGU intrinsic functions 
---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __AMDGPUINTRIN_H
+#define __AMDGPUINTRIN_H
+
+#ifndef __AMDGPU__
+#error "This file is intended for AMDGPU targets or offloading to AMDGPU"
+#endif
+
+#include 
+#include 
+
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_ATTRS __attribute__((device))
+#elif !defined(_DEFAULT_ATTRS)
+#define _DEFAULT_ATTRS
+#endif
+
+#pragma omp begin declare target device_type(nohost)
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+// Type aliases to the address spaces used by the AMDGPU backend.
+#define __private __attribute__((opencl_private))
+#define __constant __attribute__((opencl_constant))
+#define __local __attribute__((opencl_local))
+#define __global __attribute__((opencl_global))
+#define __generic __attribute__((opencl_generic))

jhuber6 wrote:

I had the same thought, I was looking for something like `__OPENCL` to ifdef 
it, since I apparently can't use `_Private` since that's C reserved. What's the 
best solution here?

https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-10-25 Thread Joseph Huber via cfe-commits


@@ -0,0 +1,154 @@
+//===-- amdgpuintrin.h - AMDPGU intrinsic functions 
---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __AMDGPUINTRIN_H
+#define __AMDGPUINTRIN_H
+
+#ifndef __AMDGPU__
+#error "This file is intended for AMDGPU targets or offloading to AMDGPU"
+#endif
+
+#include 
+#include 
+
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_ATTRS __attribute__((device))
+#elif !defined(_DEFAULT_ATTRS)
+#define _DEFAULT_ATTRS
+#endif
+
+#pragma omp begin declare target device_type(nohost)
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+// Type aliases to the address spaces used by the AMDGPU backend.
+#define __private __attribute__((opencl_private))
+#define __constant __attribute__((opencl_constant))
+#define __local __attribute__((opencl_local))
+#define __global __attribute__((opencl_global))
+#define __generic __attribute__((opencl_generic))
+
+// Attribute to declare a function as a kernel.
+#define __kernel __attribute__((amdgpu_kernel, visibility("protected")))
+
+// Returns the number of workgroups in the 'x' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_x() {
+  return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
+}
+
+// Returns the number of workgroups in the 'y' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_y() {
+  return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
+}
+
+// Returns the number of workgroups in the 'z' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_z() {
+  return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
+}
+
+// Returns the 'x' dimension of the current AMD workgroup's id.
+_DEFAULT_ATTRS static inline uint32_t __gpu_block_id_x() {
+  return __builtin_amdgcn_workgroup_id_x();
+}
+
+// Returns the 'y' dimension of the current AMD workgroup's id.
+_DEFAULT_ATTRS static inline uint32_t __gpu_block_id_y() {
+  return __builtin_amdgcn_workgroup_id_y();
+}
+
+// Returns the 'z' dimension of the current AMD workgroup's id.
+_DEFAULT_ATTRS static inline uint32_t __gpu_block_id_z() {
+  return __builtin_amdgcn_workgroup_id_z();
+}
+
+// Returns the number of workitems in the 'x' dimension.
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_threads_x() {
+  return __builtin_amdgcn_workgroup_size_x();
+}
+
+// Returns the number of workitems in the 'y' dimension.
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_threads_y() {
+  return __builtin_amdgcn_workgroup_size_y();
+}
+
+// Returns the number of workitems in the 'z' dimension.
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_threads_z() {
+  return __builtin_amdgcn_workgroup_size_z();
+}
+
+// Returns the 'x' dimension id of the workitem in the current AMD workgroup.
+_DEFAULT_ATTRS static inline uint32_t __gpu_thread_id_x() {
+  return __builtin_amdgcn_workitem_id_x();
+}
+
+// Returns the 'y' dimension id of the workitem in the current AMD workgroup.
+_DEFAULT_ATTRS static inline uint32_t __gpu_thread_id_y() {
+  return __builtin_amdgcn_workitem_id_y();
+}
+
+// Returns the 'z' dimension id of the workitem in the current AMD workgroup.
+_DEFAULT_ATTRS static inline uint32_t __gpu_thread_id_z() {
+  return __builtin_amdgcn_workitem_id_z();
+}
+
+// Returns the size of an AMD wavefront, either 32 or 64 depending on hardware
+// and compilation options.
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_lanes() {
+  return __builtin_amdgcn_wavefrontsize();
+}
+
+// Returns the id of the thread inside of an AMD wavefront executing together.
+_DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t __gpu_lane_id() {
+  return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
+}
+
+// Returns the bit-mask of active threads in the current wavefront.
+_DEFAULT_ATTRS [[clang::convergent]] static inline uint64_t __gpu_lane_mask() {
+  return __builtin_amdgcn_read_exec();
+}
+
+// Copies the value from the first active thread in the wavefront to the rest.
+_DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t
+__gpu_broadcast(uint64_t __lane_mask, uint32_t __x) {
+  return __builtin_amdgcn_readfirstlane(__x);
+}
+
+// Returns a bitmask of threads in the current lane for which \p x is true.
+_DEFAULT_ATTRS [[clang::convergent]] static inline uint64_t
+__gpu_ballot(uint64_t __lane_mask, bool __x) {
+  // The lane_mask & gives the nvptx semantics when lane_mask is a subset of
+  // the active threads
+  return __lane_mask & __builtin_amdgcn_ballot_w64(__x);
+}
+
+// Waits for all the threads in the block to converge and issues a fence.
+_DEFAULT_ATTRS [[clang::convergent]] static inline void __gpu_sync_threads() {
+ 

[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-10-25 Thread Matt Arsenault via cfe-commits


@@ -0,0 +1,76 @@
+//===-- gpuintrin.h - Generic GPU intrinsic functions 
-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __GPUINTRIN_H
+#define __GPUINTRIN_H
+
+#if defined(__NVPTX__)
+#include 
+#elif defined(__AMDGPU__)
+#include 

arsenm wrote:

else error? 

https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-10-25 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 updated 
https://github.com/llvm/llvm-project/pull/110179

>From 014742418463fffa0b2d097fe668f02558addcc9 Mon Sep 17 00:00:00 2001
From: Joseph Huber 
Date: Thu, 26 Sep 2024 16:47:14 -0500
Subject: [PATCH 1/5] [Clang] Implement resource directory headers for common
 GPU intrinsics

Summary:
All GPU based languages provide some way to access things like the
thread ID or other resources. However, this is spread between many
different languages and it varies between targets. The goal here is to
provide a resource directory header that just provides these in an
easier to understand way, primarily so this can be used for C/C++ code.
The interface aims to be common, to faciliate easier porting, but target
specific stuff could be put in the individual headers.
---
 clang/lib/Headers/CMakeLists.txt |  14 +++
 clang/lib/Headers/amdgpuintrin.h | 187 +++
 clang/lib/Headers/gpuintrin.h|  18 +++
 clang/lib/Headers/nvptxintrin.h  | 184 ++
 4 files changed, 403 insertions(+)
 create mode 100644 clang/lib/Headers/amdgpuintrin.h
 create mode 100644 clang/lib/Headers/gpuintrin.h
 create mode 100644 clang/lib/Headers/nvptxintrin.h

diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index ff392e7122a448..a0e7ae67b7219a 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -268,6 +268,12 @@ set(x86_files
   cpuid.h
   )
 
+set(gpu_files
+  gpuintrin.h
+  nvptxintrin.h
+  amdgpuintrin.h
+  )
+
 set(windows_only_files
   intrin0.h
   intrin.h
@@ -296,6 +302,7 @@ set(files
   ${systemz_files}
   ${ve_files}
   ${x86_files}
+  ${gpu_files}
   ${webassembly_files}
   ${windows_only_files}
   ${utility_files}
@@ -518,6 +525,7 @@ add_header_target("systemz-resource-headers" 
"${systemz_files};${zos_wrapper_fil
 add_header_target("ve-resource-headers" "${ve_files}")
 add_header_target("webassembly-resource-headers" "${webassembly_files}")
 add_header_target("x86-resource-headers" "${x86_files}")
+add_header_target("gpu-resource-headers" "${gpu_files}")
 
 # Other header groupings
 add_header_target("hlsl-resource-headers" ${hlsl_files})
@@ -704,6 +712,12 @@ install(
   EXCLUDE_FROM_ALL
   COMPONENT x86-resource-headers)
 
+install(
+  FILES ${gpu_files}
+  DESTINATION ${header_install_dir}
+  EXCLUDE_FROM_ALL
+  COMPONENT gpu-resource-headers)
+
 if(NOT CLANG_ENABLE_HLSL)
   set(EXCLUDE_HLSL EXCLUDE_FROM_ALL)
 endif()
diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
new file mode 100644
index 00..95936f86bd15b8
--- /dev/null
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -0,0 +1,187 @@
+//===-- amdgpuintrin.h - AMDPGU intrinsic functions 
---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __AMDGPUINTRIN_H
+#define __AMDGPUINTRIN_H
+
+#ifndef __AMDGPU__
+#error "This file is intended for AMDGPU targets or offloading to AMDGPU
+#endif
+
+#include 
+#include 
+
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_ATTRS __attribute__((device)) __attribute__((always_inline))
+#else
+#define _DEFAULT_ATTRS __attribute__((always_inline))
+#endif
+
+#pragma omp begin declare target device_type(nohost)
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+// Type aliases to the address spaces used by the AMDGPU backend.
+#define _private __attribute__((opencl_private))
+#define _constant __attribute__((opencl_constant))
+#define _local __attribute__((opencl_local))
+#define _global __attribute__((opencl_global))
+
+// Attribute to declare a function as a kernel.
+#define _kernel __attribute__((amdgpu_kernel, visibility("protected")))
+
+// Returns the number of workgroups in the 'x' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_x() {
+  return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
+}
+
+// Returns the number of workgroups in the 'y' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_y() {
+  return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
+}
+
+// Returns the number of workgroups in the 'z' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_z() {
+  return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
+}
+
+// Returns the total number of workgruops in the grid.
+_DEFAULT_ATTRS static inline uint64_t _get_num_blocks() {
+  return _get_num_blocks_x() * _get_num_blocks_y() * _get_num_blocks_z();
+}
+
+// Returns the 'x' dimension of the current AMD workgroup's id.
+_DEFAULT_ATTRS static inline uint32_t _get_block_id_x() {
+  return __builtin_amdgcn_workgroup_id_x();
+}
+
+// Returns the 'y'

[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-10-25 Thread Matt Arsenault via cfe-commits


@@ -0,0 +1,154 @@
+//===-- amdgpuintrin.h - AMDPGU intrinsic functions 
---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __AMDGPUINTRIN_H
+#define __AMDGPUINTRIN_H
+
+#ifndef __AMDGPU__
+#error "This file is intended for AMDGPU targets or offloading to AMDGPU"
+#endif
+
+#include 
+#include 
+
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_ATTRS __attribute__((device))
+#elif !defined(_DEFAULT_ATTRS)
+#define _DEFAULT_ATTRS
+#endif
+
+#pragma omp begin declare target device_type(nohost)
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+// Type aliases to the address spaces used by the AMDGPU backend.
+#define __private __attribute__((opencl_private))
+#define __constant __attribute__((opencl_constant))
+#define __local __attribute__((opencl_local))
+#define __global __attribute__((opencl_global))
+#define __generic __attribute__((opencl_generic))
+
+// Attribute to declare a function as a kernel.
+#define __kernel __attribute__((amdgpu_kernel, visibility("protected")))
+
+// Returns the number of workgroups in the 'x' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_x() {
+  return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
+}
+
+// Returns the number of workgroups in the 'y' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_y() {
+  return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
+}
+
+// Returns the number of workgroups in the 'z' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_z() {
+  return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
+}
+
+// Returns the 'x' dimension of the current AMD workgroup's id.
+_DEFAULT_ATTRS static inline uint32_t __gpu_block_id_x() {
+  return __builtin_amdgcn_workgroup_id_x();
+}
+
+// Returns the 'y' dimension of the current AMD workgroup's id.
+_DEFAULT_ATTRS static inline uint32_t __gpu_block_id_y() {
+  return __builtin_amdgcn_workgroup_id_y();
+}
+
+// Returns the 'z' dimension of the current AMD workgroup's id.
+_DEFAULT_ATTRS static inline uint32_t __gpu_block_id_z() {
+  return __builtin_amdgcn_workgroup_id_z();
+}
+
+// Returns the number of workitems in the 'x' dimension.
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_threads_x() {
+  return __builtin_amdgcn_workgroup_size_x();
+}
+
+// Returns the number of workitems in the 'y' dimension.
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_threads_y() {
+  return __builtin_amdgcn_workgroup_size_y();
+}
+
+// Returns the number of workitems in the 'z' dimension.
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_threads_z() {
+  return __builtin_amdgcn_workgroup_size_z();
+}
+
+// Returns the 'x' dimension id of the workitem in the current AMD workgroup.
+_DEFAULT_ATTRS static inline uint32_t __gpu_thread_id_x() {
+  return __builtin_amdgcn_workitem_id_x();
+}
+
+// Returns the 'y' dimension id of the workitem in the current AMD workgroup.
+_DEFAULT_ATTRS static inline uint32_t __gpu_thread_id_y() {
+  return __builtin_amdgcn_workitem_id_y();
+}
+
+// Returns the 'z' dimension id of the workitem in the current AMD workgroup.
+_DEFAULT_ATTRS static inline uint32_t __gpu_thread_id_z() {
+  return __builtin_amdgcn_workitem_id_z();
+}
+
+// Returns the size of an AMD wavefront, either 32 or 64 depending on hardware
+// and compilation options.
+_DEFAULT_ATTRS static inline uint32_t __gpu_num_lanes() {
+  return __builtin_amdgcn_wavefrontsize();
+}
+
+// Returns the id of the thread inside of an AMD wavefront executing together.
+_DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t __gpu_lane_id() {
+  return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
+}
+
+// Returns the bit-mask of active threads in the current wavefront.
+_DEFAULT_ATTRS [[clang::convergent]] static inline uint64_t __gpu_lane_mask() {
+  return __builtin_amdgcn_read_exec();
+}
+
+// Copies the value from the first active thread in the wavefront to the rest.
+_DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t
+__gpu_broadcast(uint64_t __lane_mask, uint32_t __x) {
+  return __builtin_amdgcn_readfirstlane(__x);
+}
+
+// Returns a bitmask of threads in the current lane for which \p x is true.
+_DEFAULT_ATTRS [[clang::convergent]] static inline uint64_t
+__gpu_ballot(uint64_t __lane_mask, bool __x) {
+  // The lane_mask & gives the nvptx semantics when lane_mask is a subset of
+  // the active threads
+  return __lane_mask & __builtin_amdgcn_ballot_w64(__x);
+}
+
+// Waits for all the threads in the block to converge and issues a fence.
+_DEFAULT_ATTRS [[clang::convergent]] static inline void __gpu_sync_threads() {
+ 

[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-10-25 Thread Matt Arsenault via cfe-commits


@@ -0,0 +1,154 @@
+//===-- amdgpuintrin.h - AMDPGU intrinsic functions 
---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __AMDGPUINTRIN_H
+#define __AMDGPUINTRIN_H
+
+#ifndef __AMDGPU__
+#error "This file is intended for AMDGPU targets or offloading to AMDGPU"
+#endif
+
+#include 
+#include 
+
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_ATTRS __attribute__((device))
+#elif !defined(_DEFAULT_ATTRS)
+#define _DEFAULT_ATTRS
+#endif
+
+#pragma omp begin declare target device_type(nohost)
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+// Type aliases to the address spaces used by the AMDGPU backend.
+#define __private __attribute__((opencl_private))
+#define __constant __attribute__((opencl_constant))
+#define __local __attribute__((opencl_local))
+#define __global __attribute__((opencl_global))
+#define __generic __attribute__((opencl_generic))

arsenm wrote:

I'm not sure defining these directly to the OpenCL names is the best idea 

https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-10-04 Thread Joseph Huber via cfe-commits

jhuber6 wrote:

> IIRC, you discussed once to have GPU-agnostic intrinsics in LLVM-IR. The 
> backends then have to handle the details.

There's three approaches basically, wrapper header, builtins, and intrinsics. 
We could make some generic intrinsics but it would be a lot more work and 
duplicate a bunch of functions. I think the intrinsics were once suggested by 
@JonChesterfield, so maybe he could chime in. Even with this, it's still 
something we could provide in the future.

https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-10-03 Thread Thorsten Schütt via cfe-commits

tschuett wrote:

IIRC, you discussed once to have GPU-agnostic intrinsics in LLVM-IR. The 
backends then have to handle the details.

https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-10-03 Thread Joseph Huber via cfe-commits


@@ -0,0 +1,153 @@
+//===-- amdgpuintrin.h - AMDPGU intrinsic functions 
---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __AMDGPUINTRIN_H
+#define __AMDGPUINTRIN_H
+
+#ifndef __AMDGPU__
+#error "This file is intended for AMDGPU targets or offloading to AMDGPU"
+#endif
+
+#include 
+#include 
+
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_ATTRS __attribute__((device))
+#elif !defined(_DEFAULT_ATTRS)
+#define _DEFAULT_ATTRS
+#endif
+
+#pragma omp begin declare target device_type(nohost)
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+// Type aliases to the address spaces used by the AMDGPU backend.
+#define _Private __attribute__((opencl_private))
+#define _Constant __attribute__((opencl_constant))
+#define _Local __attribute__((opencl_local))
+#define _Global __attribute__((opencl_global))
+
+// Attribute to declare a function as a kernel.
+#define _Kernel __attribute__((amdgpu_kernel, visibility("protected")))

jhuber6 wrote:

The NVPTX target blatantly ignore visibility, so protected doesn't really make 
a huge difference. It does matter for LTO however which is why it's here. The 
GPU targets only ever go to an ELF target right now. We use protected for 
everything because this pretty much does exactly what we want, a symbol visible 
from the GPU ELF that can't be preempted so we don't need weird DSO things.

https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-10-03 Thread Joseph Huber via cfe-commits


@@ -0,0 +1,153 @@
+//===-- amdgpuintrin.h - AMDPGU intrinsic functions 
---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __AMDGPUINTRIN_H
+#define __AMDGPUINTRIN_H
+
+#ifndef __AMDGPU__
+#error "This file is intended for AMDGPU targets or offloading to AMDGPU"
+#endif
+
+#include 
+#include 
+
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_ATTRS __attribute__((device))
+#elif !defined(_DEFAULT_ATTRS)
+#define _DEFAULT_ATTRS
+#endif
+
+#pragma omp begin declare target device_type(nohost)
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+// Type aliases to the address spaces used by the AMDGPU backend.
+#define _Private __attribute__((opencl_private))
+#define _Constant __attribute__((opencl_constant))
+#define _Local __attribute__((opencl_local))
+#define _Global __attribute__((opencl_global))

jhuber6 wrote:

That looks very OpenCL, I guess C just wants to reserve any top level name 
since they like to do `_Thread_local` and stuff? I can change it.

https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-10-03 Thread Saleem Abdulrasool via cfe-commits


@@ -0,0 +1,153 @@
+//===-- amdgpuintrin.h - AMDPGU intrinsic functions 
---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __AMDGPUINTRIN_H
+#define __AMDGPUINTRIN_H
+
+#ifndef __AMDGPU__
+#error "This file is intended for AMDGPU targets or offloading to AMDGPU"
+#endif
+
+#include 
+#include 
+
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_ATTRS __attribute__((device))
+#elif !defined(_DEFAULT_ATTRS)
+#define _DEFAULT_ATTRS
+#endif
+
+#pragma omp begin declare target device_type(nohost)
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+// Type aliases to the address spaces used by the AMDGPU backend.
+#define _Private __attribute__((opencl_private))
+#define _Constant __attribute__((opencl_constant))
+#define _Local __attribute__((opencl_local))
+#define _Global __attribute__((opencl_global))
+
+// Attribute to declare a function as a kernel.
+#define _Kernel __attribute__((amdgpu_kernel, visibility("protected")))

compnerd wrote:

Should this have protected visibility? That is undefined for non-Linux 
environments I believe.

https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-10-03 Thread Saleem Abdulrasool via cfe-commits


@@ -0,0 +1,153 @@
+//===-- amdgpuintrin.h - AMDPGU intrinsic functions 
---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __AMDGPUINTRIN_H
+#define __AMDGPUINTRIN_H
+
+#ifndef __AMDGPU__
+#error "This file is intended for AMDGPU targets or offloading to AMDGPU"
+#endif
+
+#include 
+#include 
+
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_ATTRS __attribute__((device))
+#elif !defined(_DEFAULT_ATTRS)
+#define _DEFAULT_ATTRS
+#endif
+
+#pragma omp begin declare target device_type(nohost)
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+// Type aliases to the address spaces used by the AMDGPU backend.
+#define _Private __attribute__((opencl_private))
+#define _Constant __attribute__((opencl_constant))
+#define _Local __attribute__((opencl_local))
+#define _Global __attribute__((opencl_global))

compnerd wrote:

These keywords are reserved by the C standard, I don't think that you should 
use them. You can use `__private`, `__constant`, `__local`, `__global` if you 
like.

https://github.com/llvm/llvm-project/pull/110179
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

2024-10-03 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 updated 
https://github.com/llvm/llvm-project/pull/110179

>From 4a3348e56950583fb28211879f5ab157c34cbc66 Mon Sep 17 00:00:00 2001
From: Joseph Huber 
Date: Thu, 26 Sep 2024 16:47:14 -0500
Subject: [PATCH 1/4] [Clang] Implement resource directory headers for common
 GPU intrinsics

Summary:
All GPU based languages provide some way to access things like the
thread ID or other resources. However, this is spread between many
different languages and it varies between targets. The goal here is to
provide a resource directory header that just provides these in an
easier to understand way, primarily so this can be used for C/C++ code.
The interface aims to be common, to faciliate easier porting, but target
specific stuff could be put in the individual headers.
---
 clang/lib/Headers/CMakeLists.txt |  14 +++
 clang/lib/Headers/amdgpuintrin.h | 187 +++
 clang/lib/Headers/gpuintrin.h|  18 +++
 clang/lib/Headers/nvptxintrin.h  | 184 ++
 4 files changed, 403 insertions(+)
 create mode 100644 clang/lib/Headers/amdgpuintrin.h
 create mode 100644 clang/lib/Headers/gpuintrin.h
 create mode 100644 clang/lib/Headers/nvptxintrin.h

diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index ff392e7122a448..a0e7ae67b7219a 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -268,6 +268,12 @@ set(x86_files
   cpuid.h
   )
 
+set(gpu_files
+  gpuintrin.h
+  nvptxintrin.h
+  amdgpuintrin.h
+  )
+
 set(windows_only_files
   intrin0.h
   intrin.h
@@ -296,6 +302,7 @@ set(files
   ${systemz_files}
   ${ve_files}
   ${x86_files}
+  ${gpu_files}
   ${webassembly_files}
   ${windows_only_files}
   ${utility_files}
@@ -518,6 +525,7 @@ add_header_target("systemz-resource-headers" 
"${systemz_files};${zos_wrapper_fil
 add_header_target("ve-resource-headers" "${ve_files}")
 add_header_target("webassembly-resource-headers" "${webassembly_files}")
 add_header_target("x86-resource-headers" "${x86_files}")
+add_header_target("gpu-resource-headers" "${gpu_files}")
 
 # Other header groupings
 add_header_target("hlsl-resource-headers" ${hlsl_files})
@@ -704,6 +712,12 @@ install(
   EXCLUDE_FROM_ALL
   COMPONENT x86-resource-headers)
 
+install(
+  FILES ${gpu_files}
+  DESTINATION ${header_install_dir}
+  EXCLUDE_FROM_ALL
+  COMPONENT gpu-resource-headers)
+
 if(NOT CLANG_ENABLE_HLSL)
   set(EXCLUDE_HLSL EXCLUDE_FROM_ALL)
 endif()
diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
new file mode 100644
index 00..95936f86bd15b8
--- /dev/null
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -0,0 +1,187 @@
+//===-- amdgpuintrin.h - AMDPGU intrinsic functions 
---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef __AMDGPUINTRIN_H
+#define __AMDGPUINTRIN_H
+
+#ifndef __AMDGPU__
+#error "This file is intended for AMDGPU targets or offloading to AMDGPU
+#endif
+
+#include 
+#include 
+
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_ATTRS __attribute__((device)) __attribute__((always_inline))
+#else
+#define _DEFAULT_ATTRS __attribute__((always_inline))
+#endif
+
+#pragma omp begin declare target device_type(nohost)
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+// Type aliases to the address spaces used by the AMDGPU backend.
+#define _private __attribute__((opencl_private))
+#define _constant __attribute__((opencl_constant))
+#define _local __attribute__((opencl_local))
+#define _global __attribute__((opencl_global))
+
+// Attribute to declare a function as a kernel.
+#define _kernel __attribute__((amdgpu_kernel, visibility("protected")))
+
+// Returns the number of workgroups in the 'x' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_x() {
+  return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
+}
+
+// Returns the number of workgroups in the 'y' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_y() {
+  return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
+}
+
+// Returns the number of workgroups in the 'z' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_z() {
+  return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
+}
+
+// Returns the total number of workgruops in the grid.
+_DEFAULT_ATTRS static inline uint64_t _get_num_blocks() {
+  return _get_num_blocks_x() * _get_num_blocks_y() * _get_num_blocks_z();
+}
+
+// Returns the 'x' dimension of the current AMD workgroup's id.
+_DEFAULT_ATTRS static inline uint32_t _get_block_id_x() {
+  return __builtin_amdgcn_workgroup_id_x();
+}
+
+// Returns the 'y'