[PATCH] D145401: [AMDGPU] Reserve extra SGPR blocks with XNACK "any" TID Setting

2023-03-17 Thread Austin Kerbow via Phabricator via cfe-commits
This revision was automatically updated to reflect the committed changes.
Closed by commit rG864a2b25beac: [AMDGPU] Reserve extra SGPR blocks with XNACK 
"any" TID Setting (authored by kerbowa).

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D145401/new/

https://reviews.llvm.org/D145401

Files:
  clang/test/Frontend/amdgcn-machine-analysis-remarks.cl
  llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
  llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
  llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
  llvm/test/CodeGen/AMDGPU/occupancy-levels.ll
  llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
  llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
  llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
  llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
  llvm/test/CodeGen/AMDGPU/trap-abis.ll

Index: llvm/test/CodeGen/AMDGPU/trap-abis.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/trap-abis.ll
+++ llvm/test/CodeGen/AMDGPU/trap-abis.ll
@@ -17,7 +17,75 @@
 
 define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) {
 ; NOHSA-TRAP-GFX900-V2-LABEL: trap:
-; NOHSA-TRAP-GFX900-V2:   ; %bb.0:
+; NOHSA-TRAP-GFX900-V2: .amd_kernel_code_t
+; NOHSA-TRAP-GFX900-V2-NEXT: amd_code_version_major = 1
+; NOHSA-TRAP-GFX900-V2-NEXT: amd_code_version_minor = 2
+; NOHSA-TRAP-GFX900-V2-NEXT: amd_machine_kind = 1
+; NOHSA-TRAP-GFX900-V2-NEXT: amd_machine_version_major = 9
+; NOHSA-TRAP-GFX900-V2-NEXT: amd_machine_version_minor = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: amd_machine_version_stepping = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: kernel_code_entry_byte_offset = 256
+; NOHSA-TRAP-GFX900-V2-NEXT: kernel_code_prefetch_byte_size = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: granulated_workitem_vgpr_count = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: granulated_wavefront_sgpr_count = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: priority = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: float_mode = 240
+; NOHSA-TRAP-GFX900-V2-NEXT: priv = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: enable_dx10_clamp = 1
+; NOHSA-TRAP-GFX900-V2-NEXT: debug_mode = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: enable_ieee_mode = 1
+; NOHSA-TRAP-GFX900-V2-NEXT: enable_wgp_mode = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: enable_mem_ordered = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: enable_fwd_progress = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: user_sgpr_count = 4
+; NOHSA-TRAP-GFX900-V2-NEXT: enable_trap_handler = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_workgroup_id_x = 1
+; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_workgroup_id_y = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_workgroup_id_z = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_workgroup_info = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: enable_vgpr_workitem_id = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: enable_exception_msb = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: granulated_lds_size = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: enable_exception = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_private_segment_buffer = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_dispatch_ptr = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_queue_ptr = 1
+; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_kernarg_segment_ptr = 1
+; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_dispatch_id = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_flat_scratch_init = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_private_segment_size = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_grid_workgroup_count_x = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_grid_workgroup_count_y = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_grid_workgroup_count_z = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: enable_wavefront_size32 = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: enable_ordered_append_gds = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: private_element_size = 1
+; NOHSA-TRAP-GFX900-V2-NEXT: is_ptr64 = 1
+; NOHSA-TRAP-GFX900-V2-NEXT: is_dynamic_callstack = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: is_debug_enabled = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: is_xnack_enabled = 1
+; NOHSA-TRAP-GFX900-V2-NEXT: workitem_private_segment_byte_size = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: workgroup_group_segment_byte_size = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: gds_segment_byte_size = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: kernarg_segment_byte_size = 44
+; NOHSA-TRAP-GFX900-V2-NEXT: workgroup_fbarrier_count = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: wavefront_sgpr_count = 8
+; NOHSA-TRAP-GFX900-V2-NEXT: workitem_vgpr_count = 2
+; NOHSA-TRAP-GFX900-V2-NEXT: reserved_vgpr_first = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: reserved_vgpr_count = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: reserved_sgpr_first = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: reserved_sgpr_count = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: debug_wavefront_private_segment_offset_sgpr = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: debug_private_segment_buffer_sgpr = 0
+; 

[PATCH] D145401: [AMDGPU] Reserve extra SGPR blocks with XNACK "any" TID Setting

2023-03-13 Thread Austin Kerbow via Phabricator via cfe-commits
kerbowa added a comment.

Added AMDGPU group to reviewers.

Is there any objection to changing the defaults for subtargets that support 
XNACK so that the extra SGPRs are always reserved unless -xnack is explicitly 
requested? This would impact graphics as well. The old defaults did the 
opposite, reserving the extra SGPRs only with +xnack, so the default behavior 
in the absence of either +xnack or -xnack will change.
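
For reference, a minimal sketch of the proposed new default (illustrative
only: the helper name and the exact SGPR count below are assumptions, not the
committed code):

  // Sketch: reserve the extra SGPR blocks unless XNACK is explicitly
  // disabled, i.e. treat the "any" setting like +xnack for resource
  // accounting purposes.
  unsigned getExtraSGPRsForXnack(bool XnackExplicitlyOff) {
    return XnackExplicitlyOff ? 0 : 4; // the count 4 is illustrative
  }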


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D145401/new/

https://reviews.llvm.org/D145401



[PATCH] D145401: [AMDGPU] Reserve extra SGPR blocks with XNACK "any" TID Setting

2023-03-08 Thread Austin Kerbow via Phabricator via cfe-commits
kerbowa updated this revision to Diff 503461.
kerbowa added a comment.
Herald added a project: clang.
Herald added a subscriber: cfe-commits.

Update tests.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D145401/new/

https://reviews.llvm.org/D145401

Files:
  clang/test/Frontend/amdgcn-machine-analysis-remarks.cl
  llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
  llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
  llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
  llvm/test/CodeGen/AMDGPU/occupancy-levels.ll
  llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
  llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
  llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
  llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
  llvm/test/CodeGen/AMDGPU/trap-abis.ll

Index: llvm/test/CodeGen/AMDGPU/trap-abis.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/trap-abis.ll
+++ llvm/test/CodeGen/AMDGPU/trap-abis.ll
@@ -17,7 +17,75 @@
 
 define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) {
 ; NOHSA-TRAP-GFX900-V2-LABEL: trap:
-; NOHSA-TRAP-GFX900-V2:   ; %bb.0:
+; NOHSA-TRAP-GFX900-V2: .amd_kernel_code_t
+; NOHSA-TRAP-GFX900-V2-NEXT: amd_code_version_major = 1
+; NOHSA-TRAP-GFX900-V2-NEXT: amd_code_version_minor = 2
+; NOHSA-TRAP-GFX900-V2-NEXT: amd_machine_kind = 1
+; NOHSA-TRAP-GFX900-V2-NEXT: amd_machine_version_major = 9
+; NOHSA-TRAP-GFX900-V2-NEXT: amd_machine_version_minor = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: amd_machine_version_stepping = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: kernel_code_entry_byte_offset = 256
+; NOHSA-TRAP-GFX900-V2-NEXT: kernel_code_prefetch_byte_size = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: granulated_workitem_vgpr_count = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: granulated_wavefront_sgpr_count = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: priority = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: float_mode = 240
+; NOHSA-TRAP-GFX900-V2-NEXT: priv = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: enable_dx10_clamp = 1
+; NOHSA-TRAP-GFX900-V2-NEXT: debug_mode = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: enable_ieee_mode = 1
+; NOHSA-TRAP-GFX900-V2-NEXT: enable_wgp_mode = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: enable_mem_ordered = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: enable_fwd_progress = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: user_sgpr_count = 4
+; NOHSA-TRAP-GFX900-V2-NEXT: enable_trap_handler = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_workgroup_id_x = 1
+; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_workgroup_id_y = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_workgroup_id_z = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_workgroup_info = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: enable_vgpr_workitem_id = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: enable_exception_msb = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: granulated_lds_size = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: enable_exception = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_private_segment_buffer = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_dispatch_ptr = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_queue_ptr = 1
+; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_kernarg_segment_ptr = 1
+; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_dispatch_id = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_flat_scratch_init = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_private_segment_size = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_grid_workgroup_count_x = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_grid_workgroup_count_y = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: enable_sgpr_grid_workgroup_count_z = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: enable_wavefront_size32 = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: enable_ordered_append_gds = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: private_element_size = 1
+; NOHSA-TRAP-GFX900-V2-NEXT: is_ptr64 = 1
+; NOHSA-TRAP-GFX900-V2-NEXT: is_dynamic_callstack = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: is_debug_enabled = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: is_xnack_enabled = 1
+; NOHSA-TRAP-GFX900-V2-NEXT: workitem_private_segment_byte_size = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: workgroup_group_segment_byte_size = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: gds_segment_byte_size = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: kernarg_segment_byte_size = 44
+; NOHSA-TRAP-GFX900-V2-NEXT: workgroup_fbarrier_count = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: wavefront_sgpr_count = 8
+; NOHSA-TRAP-GFX900-V2-NEXT: workitem_vgpr_count = 2
+; NOHSA-TRAP-GFX900-V2-NEXT: reserved_vgpr_first = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: reserved_vgpr_count = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: reserved_sgpr_first = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: reserved_sgpr_count = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: debug_wavefront_private_segment_offset_sgpr = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: debug_private_segment_buffer_sgpr = 0
+; NOHSA-TRAP-GFX900-V2-NEXT: 

[PATCH] D132079: [AMDGPU] Add iglp_opt builtin and MFMA GEMM Opt strategy

2022-08-19 Thread Austin Kerbow via Phabricator via cfe-commits
kerbowa added inline comments.



Comment at: llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp:427
 DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
   DAG->addMutation(createIGroupLPDAGMutation());
   DAG->addMutation(createAMDGPUMacroFusionDAGMutation());

jrbyrnes wrote:
> I think you can remove this as well since you're doing it from within the 
> scheduler.
It's not added in the scheduler for plain SCHED_BARRIER.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D132079/new/

https://reviews.llvm.org/D132079



[PATCH] D128158: [AMDGPU] Add amdgcn_sched_group_barrier builtin

2022-07-30 Thread Austin Kerbow via Phabricator via cfe-commits
kerbowa marked an inline comment as done.
kerbowa added inline comments.



Comment at: llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp:314
+
+bool SchedGroup::isFull() const {
+  return MaxSize && Collection.size() >= *MaxSize;

uabelho wrote:
> Compiling with gcc, I get a warning that this function is unused.
> I'm wondering: there seem to be both a const and a non-const version of the 
> isFull method now, but they are identical? Perhaps the non-const version 
> could be removed?
Removed in 7898426a72, thanks!
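
For illustration, a minimal sketch of the surviving shape (member types are
assumed; the real class uses LLVM containers):

#include <optional>
#include <vector>

struct SchedGroup {
  std::optional<unsigned> MaxSize;
  std::vector<int> Collection; // element type simplified for the sketch
  // Only the const overload is needed: it can be called on both const and
  // non-const objects, and it does not mutate any state.
  bool isFull() const { return MaxSize && Collection.size() >= *MaxSize; }
};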


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D128158/new/

https://reviews.llvm.org/D128158



[PATCH] D128158: [AMDGPU] Add amdgcn_sched_group_barrier builtin

2022-07-28 Thread Austin Kerbow via Phabricator via cfe-commits
This revision was landed with ongoing or failed builds.
This revision was automatically updated to reflect the committed changes.
Closed by commit rGf5b21680d122: [AMDGPU] Add amdgcn_sched_group_barrier 
builtin (authored by kerbowa).

Changed prior to commit:
  https://reviews.llvm.org/D128158?vs=445965&id=448378#toc

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D128158/new/

https://reviews.llvm.org/D128158

Files:
  clang/include/clang/Basic/BuiltinsAMDGPU.def
  clang/test/CodeGenOpenCL/builtins-amdgcn.cl
  clang/test/SemaOpenCL/builtins-amdgcn-error.cl
  llvm/include/llvm/IR/IntrinsicsAMDGPU.td
  llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
  llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
  llvm/lib/Target/AMDGPU/SIInstructions.td
  llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
  llvm/test/CodeGen/AMDGPU/sched-group-barrier-pre-RA.mir

Index: llvm/test/CodeGen/AMDGPU/sched-group-barrier-pre-RA.mir
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/sched-group-barrier-pre-RA.mir
@@ -0,0 +1,254 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx908 -misched-cluster=false -run-pass=machine-scheduler -verify-misched -o - %s | FileCheck %s
+
+--- |
+  define amdgpu_kernel void @no_sched_group_barrier(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+  define amdgpu_kernel void @sched_group_barrier_1_VMEM_READ_1_VALU_5_MFMA_1_VMEM_READ_3_VALU_2_VMEM_WRITE(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+  define amdgpu_kernel void @sched_group_barrier_2_VMEM_1000_ALU_5_MFMA_2_VMEM_WRITE(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+  define amdgpu_kernel void @sched_group_barrier_MFMA_VALU_and_SALU_alternating(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+
+  !0 = distinct !{!0}
+  !1 = !{!1, !0}
+...
+
+---
+name: no_sched_group_barrier
+tracksRegLiveness: true
+body: |
+  bb.0:
+; CHECK-LABEL: name: no_sched_group_barrier
+; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+; CHECK-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF
+; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
+; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec
+; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec
+; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec
+; CHECK-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec
+; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_2:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec
+; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_3:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_2]], 0, 0, 0, implicit $mode, implicit $exec
+; CHECK-NEXT: [[V_MUL_LO_U32_e64_3:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec
+; CHECK-NEXT: S_NOP 0
+; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_4:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_3]], 0, 0, 0, implicit $mode, implicit $exec
+; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_3]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+; CHECK-NEXT: S_ENDPGM 0, implicit [[V_MUL_LO_U32_e64_1]], implicit [[V_MUL_LO_U32_e64_2]], implicit [[V_MFMA_F32_4X4X1F32_e64_4]]
+%0:sreg_64 = IMPLICIT_DEF
+%1:vgpr_32 = IMPLICIT_DEF
+%2:areg_128 = IMPLICIT_DEF
+%3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from 

[PATCH] D128158: [AMDGPU] Add amdgcn_sched_group_barrier builtin

2022-07-19 Thread Austin Kerbow via Phabricator via cfe-commits
kerbowa updated this revision to Diff 445965.
kerbowa added a comment.

Fix some bugs. Add better pipeline fitting. Address comments.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D128158/new/

https://reviews.llvm.org/D128158

Files:
  clang/include/clang/Basic/BuiltinsAMDGPU.def
  clang/test/CodeGenOpenCL/builtins-amdgcn.cl
  clang/test/SemaOpenCL/builtins-amdgcn-error.cl
  llvm/include/llvm/IR/IntrinsicsAMDGPU.td
  llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
  llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
  llvm/lib/Target/AMDGPU/SIInstructions.td
  llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
  llvm/test/CodeGen/AMDGPU/sched-group-barrier-pre-RA.mir

Index: llvm/test/CodeGen/AMDGPU/sched-group-barrier-pre-RA.mir
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/sched-group-barrier-pre-RA.mir
@@ -0,0 +1,254 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx908 -misched-cluster=false -run-pass=machine-scheduler -verify-misched -o - %s | FileCheck %s
+
+--- |
+  define amdgpu_kernel void @no_sched_group_barrier(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+  define amdgpu_kernel void @sched_group_barrier_1_VMEM_READ_1_VALU_5_MFMA_1_VMEM_READ_3_VALU_2_VMEM_WRITE(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+  define amdgpu_kernel void @sched_group_barrier_2_VMEM_1000_ALU_5_MFMA_2_VMEM_WRITE(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+  define amdgpu_kernel void @sched_group_barrier_MFMA_VALU_and_SALU_alternating(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+
+  !0 = distinct !{!0}
+  !1 = !{!1, !0}
+...
+
+---
+name: no_sched_group_barrier
+tracksRegLiveness: true
+body: |
+  bb.0:
+; CHECK-LABEL: name: no_sched_group_barrier
+; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+; CHECK-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF
+; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
+; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec
+; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec
+; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec
+; CHECK-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec
+; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_2:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec
+; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_3:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_2]], 0, 0, 0, implicit $mode, implicit $exec
+; CHECK-NEXT: [[V_MUL_LO_U32_e64_3:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec
+; CHECK-NEXT: S_NOP 0
+; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_4:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_3]], 0, 0, 0, implicit $mode, implicit $exec
+; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_3]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+; CHECK-NEXT: S_ENDPGM 0, implicit [[V_MUL_LO_U32_e64_1]], implicit [[V_MUL_LO_U32_e64_2]], implicit [[V_MFMA_F32_4X4X1F32_e64_4]]
+%0:sreg_64 = IMPLICIT_DEF
+%1:vgpr_32 = IMPLICIT_DEF
+%2:areg_128 = IMPLICIT_DEF
+%3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+%4:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %3, implicit $exec
+GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 0, 0, implicit $exec :: (store (s32) 

[PATCH] D128158: [AMDGPU] Add amdgcn_sched_group_barrier builtin

2022-06-20 Thread Austin Kerbow via Phabricator via cfe-commits
kerbowa added a comment.

Somewhat WIP needs more tests and cleanup. Posted for dependent work.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D128158/new/

https://reviews.llvm.org/D128158



[PATCH] D128158: [AMDGPU] Add amdgcn_sched_group_barrier builtin

2022-06-20 Thread Austin Kerbow via Phabricator via cfe-commits
kerbowa created this revision.
kerbowa added reviewers: rampitec, jrbyrnes, vangthao95, arsenm.
Herald added subscribers: kosarev, jsilvanus, foad, hiraditya, t-tye, tpr, 
dstuttard, yaxunl, nhaehnle, jvesely, kzhuravl.
Herald added a project: All.
kerbowa requested review of this revision.
Herald added subscribers: llvm-commits, cfe-commits, wdng.
Herald added projects: clang, LLVM.

This builtin allows the creation of custom scheduling pipelines on a per-region
basis. Like the sched_barrier builtin, this is intended to be used either for
testing, in situations where the default scheduler heuristics cannot be
improved, or in critical kernels where users are trying to get performance
close to handwritten assembly. Naturally, using these builtins requires extra
work from the kernel writer to maintain the desired behavior.

The builtin can be used to create groups of instructions called "scheduling
groups" where ordering between the groups is enforced by the scheduler.
__builtin_amdgcn_sched_group_barrier takes three parameters. The first parameter
is a mask that determines the types of instructions that you would like to
synchronize around and add to a scheduling group. These instructions will be
selected from the bottom up starting from the sched_group_barrier's location
during instruction scheduling. The second parameter is the number of matching
instructions that will be associated with this sched_group_barrier. The third
parameter is an identifier that describes which other sched_group_barriers
this one should be synchronized with. Note that multiple
sched_group_barriers must be added in order for them to be useful since they
only synchronize with other sched_group_barriers. Only "scheduling groups" with
a matching third parameter will have any enforced ordering between them.

As an example, the code below tries to create a pipeline of 1 VMEM_READ
instruction, followed by 1 VALU instruction, followed by 5 MFMA instructions,
and so on:
// 1 VMEM_READ
__builtin_amdgcn_sched_group_barrier(32, 1, 0);
// 1 VALU
__builtin_amdgcn_sched_group_barrier(2, 1, 0);
// 5 MFMA
__builtin_amdgcn_sched_group_barrier(8, 5, 0);
// 1 VMEM_READ
__builtin_amdgcn_sched_group_barrier(32, 1, 0);
// 3 VALU
__builtin_amdgcn_sched_group_barrier(2, 3, 0);
// 2 VMEM_WRITE
__builtin_amdgcn_sched_group_barrier(64, 2, 0);
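
As a further hedged sketch of the matching-ID rule described above (kernel
context assumed; the mask values follow the commit message):

// Groups sharing sync ID 0 form one ordered pipeline; groups with sync ID 1
// form a second pipeline that is ordered independently of the first.
__builtin_amdgcn_sched_group_barrier(32, 1, 0); // pipeline 0: 1 VMEM_READ
__builtin_amdgcn_sched_group_barrier(8, 5, 0);  // pipeline 0: 5 MFMA
__builtin_amdgcn_sched_group_barrier(2, 3, 1);  // pipeline 1: 3 VALU
__builtin_amdgcn_sched_group_barrier(64, 2, 1); // pipeline 1: 2 VMEM_WRITE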


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D128158

Files:
  clang/include/clang/Basic/BuiltinsAMDGPU.def
  clang/test/CodeGenOpenCL/builtins-amdgcn.cl
  clang/test/SemaOpenCL/builtins-amdgcn-error.cl
  llvm/include/llvm/IR/IntrinsicsAMDGPU.td
  llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
  llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
  llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
  llvm/lib/Target/AMDGPU/SIInstructions.td
  llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
  llvm/test/CodeGen/AMDGPU/sched-group-barrier-pre-RA.mir

Index: llvm/test/CodeGen/AMDGPU/sched-group-barrier-pre-RA.mir
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/sched-group-barrier-pre-RA.mir
@@ -0,0 +1,173 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx908 -misched-cluster=false -amdgpu-disable-power-sched=true -run-pass=machine-scheduler -verify-misched -o - %s | FileCheck %s
+
+--- |
+  define amdgpu_kernel void @no_sched_group_barrier(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+  define amdgpu_kernel void @sched_group_barrier_1_VMEM_READ_1_VALU_5_MFMA_1_VMEM_READ_3_VALU_2_VMEM_WRITE(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+  define amdgpu_kernel void @sched_group_barrier_2_VMEM_1000_ALU_5_MFMA_2_VMEM_WRITE(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+
+  !0 = distinct !{!0}
+  !1 = !{!1, !0}
+...
+
+---
+name: no_sched_group_barrier
+tracksRegLiveness: true
+body: |
+  bb.0:
+; CHECK-LABEL: name: no_sched_group_barrier
+; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+; CHECK-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF
+; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
+; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec
+; CHECK-NEXT: 

[PATCH] D124700: [AMDGPU] Add llvm.amdgcn.sched.barrier intrinsic

2022-05-11 Thread Austin Kerbow via Phabricator via cfe-commits
This revision was automatically updated to reflect the committed changes.
Closed by commit rG2db700215a2e: [AMDGPU] Add llvm.amdgcn.sched.barrier 
intrinsic (authored by kerbowa).
Herald added a subscriber: kosarev.

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D124700/new/

https://reviews.llvm.org/D124700

Files:
  clang/include/clang/Basic/BuiltinsAMDGPU.def
  clang/test/CodeGenOpenCL/builtins-amdgcn.cl
  clang/test/SemaOpenCL/builtins-amdgcn-error.cl
  llvm/include/llvm/IR/IntrinsicsAMDGPU.td
  llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
  llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
  llvm/lib/Target/AMDGPU/SIInstructions.td
  llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
  llvm/test/CodeGen/AMDGPU/hazard-pseudo-machineinstrs.mir
  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.barrier.ll
  llvm/test/CodeGen/AMDGPU/sched_barrier.mir

Index: llvm/test/CodeGen/AMDGPU/sched_barrier.mir
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/sched_barrier.mir
@@ -0,0 +1,99 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass=machine-scheduler -verify-misched -o - %s | FileCheck %s
+
+--- |
+  define amdgpu_kernel void @no_sched_barrier(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+  define amdgpu_kernel void @sched_barrier_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+  define amdgpu_kernel void @sched_barrier_1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+
+  !0 = distinct !{!0}
+  !1 = !{!1, !0}
+...
+
+---
+name: no_sched_barrier
+tracksRegLiveness: true
+body: |
+  bb.0:
+; CHECK-LABEL: name: no_sched_barrier
+; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
+; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec
+; CHECK-NEXT: S_NOP 0
+; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_1]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+; CHECK-NEXT: S_ENDPGM 0
+%0:sreg_64 = IMPLICIT_DEF
+%1:vgpr_32 = IMPLICIT_DEF
+%3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+%4:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %3, implicit $exec
+GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+S_NOP 0
+%5:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+%6:vgpr_32 = nsw V_MUL_LO_U32_e64 %5, %5, implicit $exec
+GLOBAL_STORE_DWORD_SADDR %1, %6, %0, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+S_ENDPGM 0
+...
+
+---
+name: sched_barrier_0
+tracksRegLiveness: true
+body: |
+  bb.0:
+; CHECK-LABEL: name: sched_barrier_0
+; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
+; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+; CHECK-NEXT: S_NOP 0
+; CHECK-NEXT: SCHED_BARRIER 0
+; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec
+; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_1]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, 

[PATCH] D124700: [AMDGPU] Add llvm.amdgcn.sched.barrier intrinsic

2022-05-06 Thread Austin Kerbow via Phabricator via cfe-commits
kerbowa updated this revision to Diff 427747.
kerbowa added a comment.
Herald added a subscriber: jsilvanus.

Use i32.
Output hex.
Fix hazard rec tests for pseudo instructions.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D124700/new/

https://reviews.llvm.org/D124700

Files:
  clang/include/clang/Basic/BuiltinsAMDGPU.def
  clang/test/CodeGenOpenCL/builtins-amdgcn.cl
  clang/test/SemaOpenCL/builtins-amdgcn-error.cl
  llvm/include/llvm/IR/IntrinsicsAMDGPU.td
  llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
  llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
  llvm/lib/Target/AMDGPU/SIInstructions.td
  llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
  llvm/test/CodeGen/AMDGPU/hazard-pseudo-machineinstrs.mir
  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.barrier.ll
  llvm/test/CodeGen/AMDGPU/sched_barrier.mir

Index: llvm/test/CodeGen/AMDGPU/sched_barrier.mir
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/sched_barrier.mir
@@ -0,0 +1,99 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass=machine-scheduler -verify-misched -o - %s | FileCheck %s
+
+--- |
+  define amdgpu_kernel void @no_sched_barrier(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+  define amdgpu_kernel void @sched_barrier_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+  define amdgpu_kernel void @sched_barrier_1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+
+  !0 = distinct !{!0}
+  !1 = !{!1, !0}
+...
+
+---
+name: no_sched_barrier
+tracksRegLiveness: true
+body: |
+  bb.0:
+; CHECK-LABEL: name: no_sched_barrier
+; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
+; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec
+; CHECK-NEXT: S_NOP 0
+; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_1]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+; CHECK-NEXT: S_ENDPGM 0
+%0:sreg_64 = IMPLICIT_DEF
+%1:vgpr_32 = IMPLICIT_DEF
+%3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+%4:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %3, implicit $exec
+GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+S_NOP 0
+%5:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+%6:vgpr_32 = nsw V_MUL_LO_U32_e64 %5, %5, implicit $exec
+GLOBAL_STORE_DWORD_SADDR %1, %6, %0, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+S_ENDPGM 0
+...
+
+---
+name: sched_barrier_0
+tracksRegLiveness: true
+body: |
+  bb.0:
+; CHECK-LABEL: name: sched_barrier_0
+; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
+; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+; CHECK-NEXT: S_NOP 0
+; CHECK-NEXT: SCHED_BARRIER 0
+; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec
+; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_1]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+; 

[PATCH] D124700: [AMDGPU] Add llvm.amdgcn.sched.barrier intrinsic

2022-04-29 Thread Austin Kerbow via Phabricator via cfe-commits
kerbowa added a comment.

In D124700#3483633, @rampitec wrote:

> In D124700#3483609, @kerbowa wrote:
>
>> In D124700#3483556, @rampitec wrote:
>>
>>> You do not handle masks other than 0 yet?
>>
>> We handle 0 and 1 only.
>
> Do you mean 1 is supported simply because it has side effects? If I
> understand it right, you will need to remove this to support more flexible
> masks, right?

Yes.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D124700/new/

https://reviews.llvm.org/D124700



[PATCH] D124700: [AMDGPU] Add llvm.amdgcn.sched.barrier intrinsic

2022-04-29 Thread Austin Kerbow via Phabricator via cfe-commits
kerbowa added a comment.

In D124700#3483556, @rampitec wrote:

> You do not handle masks other than 0 yet?

We handle 0 and 1 only.




Comment at: llvm/include/llvm/IR/IntrinsicsAMDGPU.td:219
+// MASK = 0: No instructions may be scheduled across SCHED_BARRIER.
+// MASK = 1: Non-memory, non-side-effect producing instructions may be
+//   scheduled across SCHED_BARRIER, i.e. allow ALU instructions to pass.

rampitec wrote:
> Since you are going to extend it, I'd suggest this be -1. Then you will start 
> carving bits out of it. That way, if someone starts to use it, it will still 
> work after an update.
Since the most common use case will be to block all instruction types, I 
thought having that be MASK = 0 made the most sense. After that, we carve out 
bits for the types of instructions that should be allowed to be scheduled 
across it.

There may be modes where we restrict certain types of memops, so we cannot 
have MASK = 1 above changed to -1. Since MASK = 1 allows all ALU across, we 
could define which bits mean VALU/SALU/MFMA etc. and use that mask if you 
think it's better. I'm worried we won't be able to anticipate all the types 
that we could want to be maskable. It might be better to just have a single 
bit that can mean all ALU, or all MemOps, and so on to avoid this problem.
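
To make the bit-carving idea concrete, a hypothetical layout (these bit
assignments are for discussion only and are not defined by the patch):

// Each set bit names an instruction class that MAY cross the barrier.
#define SCHED_BARRIER_ALLOW_NONE 0x0 // mask 0: nothing crosses
#define SCHED_BARRIER_ALLOW_ALU  0x1 // mask 1: non-memory, non-side-effect ALU
// Future bits could split ALU into SALU/VALU/MFMA, or carve out kinds of
// memory ops, which is exactly the granularity question raised above.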



Comment at: llvm/include/llvm/IR/IntrinsicsAMDGPU.td:222
+def int_amdgcn_sched_barrier : GCCBuiltin<"__builtin_amdgcn_sched_barrier">,
+  Intrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
+IntrHasSideEffects, IntrConvergent, IntrWillReturn]>;

rampitec wrote:
> Why not full i32? This is immediate anyway but you will have more bits for 
> the future.
Good point thanks.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D124700/new/

https://reviews.llvm.org/D124700



[PATCH] D124700: [AMDGPU] Add llvm.amdgcn.sched.barrier intrinsic

2022-04-29 Thread Austin Kerbow via Phabricator via cfe-commits
kerbowa updated this revision to Diff 426169.
kerbowa added a comment.

Add mir tests.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D124700/new/

https://reviews.llvm.org/D124700

Files:
  clang/include/clang/Basic/BuiltinsAMDGPU.def
  clang/test/CodeGenOpenCL/builtins-amdgcn.cl
  clang/test/SemaOpenCL/builtins-amdgcn-error.cl
  llvm/include/llvm/IR/IntrinsicsAMDGPU.td
  llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
  llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
  llvm/lib/Target/AMDGPU/SIInstructions.td
  llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.barrier.ll
  llvm/test/CodeGen/AMDGPU/sched_barrier.mir

Index: llvm/test/CodeGen/AMDGPU/sched_barrier.mir
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/sched_barrier.mir
@@ -0,0 +1,99 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass=machine-scheduler -verify-misched -o - %s | FileCheck %s
+
+--- |
+  define amdgpu_kernel void @no_sched_barrier(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+  define amdgpu_kernel void @sched_barrier_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+  define amdgpu_kernel void @sched_barrier_1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void }
+
+  !0 = distinct !{!0}
+  !1 = !{!1, !0}
+...
+
+---
+name: no_sched_barrier
+tracksRegLiveness: true
+body: |
+  bb.0:
+; CHECK-LABEL: name: no_sched_barrier
+; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
+; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec
+; CHECK-NEXT: S_NOP 0
+; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_1]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+; CHECK-NEXT: S_ENDPGM 0
+%0:sreg_64 = IMPLICIT_DEF
+%1:vgpr_32 = IMPLICIT_DEF
+%3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+%4:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %3, implicit $exec
+GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+S_NOP 0
+%5:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+%6:vgpr_32 = nsw V_MUL_LO_U32_e64 %5, %5, implicit $exec
+GLOBAL_STORE_DWORD_SADDR %1, %6, %0, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+S_ENDPGM 0
+...
+
+---
+name: sched_barrier_0
+tracksRegLiveness: true
+body: |
+  bb.0:
+; CHECK-LABEL: name: sched_barrier_0
+; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
+; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+; CHECK-NEXT: S_NOP 0
+; CHECK-NEXT: SCHED_BARRIER 0
+; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec
+; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_1]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+; CHECK-NEXT: S_ENDPGM 0
+%0:sreg_64 = IMPLICIT_DEF
+%1:vgpr_32 = IMPLICIT_DEF
+%3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit 

[PATCH] D124700: [AMDGPU] Add llvm.amdgcn.sched.barrier intrinsic

2022-04-29 Thread Austin Kerbow via Phabricator via cfe-commits
kerbowa created this revision.
Herald added subscribers: hsmhsm, foad, hiraditya, t-tye, tpr, dstuttard, 
yaxunl, nhaehnle, jvesely, kzhuravl, arsenm.
Herald added a project: All.
kerbowa requested review of this revision.
Herald added subscribers: llvm-commits, cfe-commits, wdng.
Herald added projects: clang, LLVM.

Adds an intrinsic/builtin that can be used to fine-tune scheduler behavior. If
there is a need for highly optimized codegen, and kernel developers have
knowledge of inter-wave runtime behavior that is unknown to the compiler, this
builtin can be used to tune scheduling.

This intrinsic creates a barrier between scheduling regions. The immediate
parameter is a mask to determine the types of instructions that should be
prevented from crossing the sched_barrier. In this initial patch, there are only
two variations. A mask of 0 means that no instructions may be scheduled across
the sched_barrier. A mask of 1 means that non-memory, non-side-effect inducing
instructions may cross the sched_barrier.

Note that this intrinsic is only meant to work with the scheduling passes. Any
other transformations that may move code will not be impacted in the ways
described above.
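
A minimal usage sketch (hypothetical OpenCL kernel; mask semantics as
described above):

kernel void fenced(global int *p) {
  p[0] = 1;
  __builtin_amdgcn_sched_barrier(0); // mask 0: nothing may cross this point
  p[1] = 2;
  __builtin_amdgcn_sched_barrier(1); // mask 1: ALU may cross, memory may not
  p[2] = 3;
}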


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D124700

Files:
  clang/include/clang/Basic/BuiltinsAMDGPU.def
  clang/test/CodeGenOpenCL/builtins-amdgcn.cl
  clang/test/SemaOpenCL/builtins-amdgcn-error.cl
  llvm/include/llvm/IR/IntrinsicsAMDGPU.td
  llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
  llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
  llvm/lib/Target/AMDGPU/SIInstructions.td
  llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.barrier.ll

Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.barrier.ll
===
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.barrier.ll
@@ -0,0 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+define amdgpu_kernel void @test_wave_barrier() #0 {
+; GCN-LABEL: test_wave_barrier:
+; GCN:   ; %bb.0: ; %entry
+; GCN-NEXT:; sched_barrier mask(0)
+; GCN-NEXT:; sched_barrier mask(1)
+; GCN-NEXT:; sched_barrier mask(4)
+; GCN-NEXT:; sched_barrier mask(15)
+; GCN-NEXT:s_endpgm
+entry:
+  call void @llvm.amdgcn.sched.barrier(i16 0) #1
+  call void @llvm.amdgcn.sched.barrier(i16 1) #1
+  call void @llvm.amdgcn.sched.barrier(i16 4) #1
+  call void @llvm.amdgcn.sched.barrier(i16 15) #1
+  ret void
+}
+
+declare void @llvm.amdgcn.sched.barrier(i16) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { convergent nounwind }
Index: llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
+++ llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
@@ -148,6 +148,7 @@
 switch (II->getIntrinsicID()) {
 case Intrinsic::amdgcn_s_barrier:
 case Intrinsic::amdgcn_wave_barrier:
+case Intrinsic::amdgcn_sched_barrier:
   return false;
 default:
   break;
Index: llvm/lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstructions.td
+++ llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -313,6 +313,18 @@
   let Size = 0;
 }
 
+def SCHED_BARRIER : SPseudoInstSI<(outs), (ins i16imm:$mask),
+  [(int_amdgcn_sched_barrier (i16 timm:$mask))]> {
+  let SchedRW = [];
+  let hasNoSchedulingInfo = 1;
+  let hasSideEffects = 1;
+  let mayLoad = 0;
+  let mayStore = 0;
+  let isConvergent = 1;
+  let FixedSize = 1;
+  let Size = 0;
+}
+
 // SI pseudo instructions. These are used by the CFG structurizer pass
 // and should be lowered to ISA instructions prior to codegen.
 
Index: llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1773,6 +1773,7 @@
   // hazard, even if one exist, won't really be visible. Should we handle it?
   case AMDGPU::SI_MASKED_UNREACHABLE:
   case AMDGPU::WAVE_BARRIER:
+  case AMDGPU::SCHED_BARRIER:
 return 0;
   }
 }
@@ -3490,6 +3491,9 @@
   if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
 return true;
 
+  if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0)
+return true;
+
   // Target-independent instructions do not have an implicit-use of EXEC, even
   // when they operate on VGPRs. Treating EXEC modifications as scheduling
   // boundaries prevents incorrect movements of such instructions.
Index: llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -207,6 +207,14 @@
   return;
  

[PATCH] D120976: [AMDGPU] Add llvm.amdgcn.s.setprio intrinsic

2022-03-12 Thread Austin Kerbow via Phabricator via cfe-commits
This revision was landed with ongoing or failed builds.
This revision was automatically updated to reflect the committed changes.
Closed by commit rG62bcfcb5a588: [AMDGPU] Add llvm.amdgcn.s.setprio intrinsic 
(authored by kerbowa).

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D120976/new/

https://reviews.llvm.org/D120976

Files:
  clang/include/clang/Basic/BuiltinsAMDGPU.def
  clang/test/CodeGenOpenCL/builtins-amdgcn.cl
  clang/test/SemaOpenCL/builtins-amdgcn-error.cl
  llvm/include/llvm/IR/IntrinsicsAMDGPU.td
  llvm/lib/Target/AMDGPU/SOPInstructions.td
  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.setprio.ll

Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.setprio.ll
===
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.setprio.ll
@@ -0,0 +1,49 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx90a -show-mc-encoding -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -march=amdgcn -show-mc-encoding -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -global-isel -march=amdgcn -show-mc-encoding -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx90a -show-mc-encoding -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
+
+declare void @llvm.amdgcn.s.setprio(i16) #0
+
+define void @test_llvm_amdgcn_s_setprio() #0 {
+; GFX9-LABEL: test_llvm_amdgcn_s_setprio:
+; GFX9:   ; %bb.0:
+; GFX9-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
+; GFX9-NEXT:s_setprio 0 ; encoding: [0x00,0x00,0x8f,0xbf]
+; GFX9-NEXT:s_setprio 1 ; encoding: [0x01,0x00,0x8f,0xbf]
+; GFX9-NEXT:s_setprio 2 ; encoding: [0x02,0x00,0x8f,0xbf]
+; GFX9-NEXT:s_setprio 3 ; encoding: [0x03,0x00,0x8f,0xbf]
+; GFX9-NEXT:s_setprio 10 ; encoding: [0x0a,0x00,0x8f,0xbf]
+; GFX9-NEXT:s_setprio -1 ; encoding: [0xff,0xff,0x8f,0xbf]
+; GFX9-NEXT:s_setprio 0 ; encoding: [0x00,0x00,0x8f,0xbf]
+; GFX9-NEXT:s_setprio 1 ; encoding: [0x01,0x00,0x8f,0xbf]
+; GFX9-NEXT:s_setprio -1 ; encoding: [0xff,0xff,0x8f,0xbf]
+; GFX9-NEXT:s_setpc_b64 s[30:31] ; encoding: [0x1e,0x1d,0x80,0xbe]
+;
+; SI-LABEL: test_llvm_amdgcn_s_setprio:
+; SI:   ; %bb.0:
+; SI-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
+; SI-NEXT:s_setprio 0 ; encoding: [0x00,0x00,0x8f,0xbf]
+; SI-NEXT:s_setprio 1 ; encoding: [0x01,0x00,0x8f,0xbf]
+; SI-NEXT:s_setprio 2 ; encoding: [0x02,0x00,0x8f,0xbf]
+; SI-NEXT:s_setprio 3 ; encoding: [0x03,0x00,0x8f,0xbf]
+; SI-NEXT:s_setprio 10 ; encoding: [0x0a,0x00,0x8f,0xbf]
+; SI-NEXT:s_setprio -1 ; encoding: [0xff,0xff,0x8f,0xbf]
+; SI-NEXT:s_setprio 0 ; encoding: [0x00,0x00,0x8f,0xbf]
+; SI-NEXT:s_setprio 1 ; encoding: [0x01,0x00,0x8f,0xbf]
+; SI-NEXT:s_setprio -1 ; encoding: [0xff,0xff,0x8f,0xbf]
+; SI-NEXT:s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe]
+  call void @llvm.amdgcn.s.setprio(i16 0)
+  call void @llvm.amdgcn.s.setprio(i16 1)
+  call void @llvm.amdgcn.s.setprio(i16 2)
+  call void @llvm.amdgcn.s.setprio(i16 3)
+  call void @llvm.amdgcn.s.setprio(i16 10)
+  call void @llvm.amdgcn.s.setprio(i16 65535)
+  call void @llvm.amdgcn.s.setprio(i16 65536)
+  call void @llvm.amdgcn.s.setprio(i16 65537)
+  call void @llvm.amdgcn.s.setprio(i16 -1)
+  ret void
+}
+
+attributes #0 = { nounwind }
Index: llvm/lib/Target/AMDGPU/SOPInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1278,7 +1278,10 @@
   let hasSideEffects = 1;
 }
 
-def S_SETPRIO : SOPP_Pseudo <"s_setprio" , (ins i16imm:$simm16), "$simm16">;
+def S_SETPRIO : SOPP_Pseudo <"s_setprio", (ins i16imm:$simm16), "$simm16",
+  [(int_amdgcn_s_setprio timm:$simm16)]> {
+  let hasSideEffects = 1;
+}
 
 let Uses = [EXEC, M0] in {
 // FIXME: Should this be mayLoad+mayStore?
Index: llvm/include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1329,6 +1329,11 @@
   Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
 IntrHasSideEffects, IntrWillReturn]>;
 
+def int_amdgcn_s_setprio :
+  GCCBuiltin<"__builtin_amdgcn_s_setprio">,
+  Intrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
+IntrHasSideEffects, IntrWillReturn]>;
+
 def int_amdgcn_s_getreg :
   GCCBuiltin<"__builtin_amdgcn_s_getreg">,
   Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
Index: clang/test/SemaOpenCL/builtins-amdgcn-error.cl
===================================================================
--- clang/test/SemaOpenCL/builtins-amdgcn-error.cl
+++ clang/test/SemaOpenCL/builtins-amdgcn-error.cl

[PATCH] D120976: [AMDGPU] Add llvm.amdgcn.s.setprio intrinsic

2022-03-10 Thread Austin Kerbow via Phabricator via cfe-commits
kerbowa updated this revision to Diff 414559.
kerbowa added a comment.
Herald added a project: clang.
Herald added a subscriber: cfe-commits.

Add clang builtin and tests.
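
A brief usage sketch of the new builtin (hypothetical placement; the
immediate is the wave priority, 0-3, as exercised by the tests below):

__builtin_amdgcn_s_setprio(3); // raise this wave's issue priority
// ...latency-sensitive section...
__builtin_amdgcn_s_setprio(0); // restore the default priority of 0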


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D120976/new/

https://reviews.llvm.org/D120976

Files:
  clang/include/clang/Basic/BuiltinsAMDGPU.def
  clang/test/CodeGenOpenCL/builtins-amdgcn.cl
  clang/test/SemaOpenCL/builtins-amdgcn-error.cl
  llvm/include/llvm/IR/IntrinsicsAMDGPU.td
  llvm/lib/Target/AMDGPU/SOPInstructions.td
  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.setprio.ll

Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.setprio.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.setprio.ll
@@ -0,0 +1,49 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx90a -show-mc-encoding -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -march=amdgcn -show-mc-encoding -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -global-isel -march=amdgcn -show-mc-encoding -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx90a -show-mc-encoding -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
+
+declare void @llvm.amdgcn.s.setprio(i16) #0
+
+define void @test_llvm_amdgcn_s_setprio() #0 {
+; GFX9-LABEL: test_llvm_amdgcn_s_setprio:
+; GFX9:   ; %bb.0:
+; GFX9-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
+; GFX9-NEXT:s_setprio 0 ; encoding: [0x00,0x00,0x8f,0xbf]
+; GFX9-NEXT:s_setprio 1 ; encoding: [0x01,0x00,0x8f,0xbf]
+; GFX9-NEXT:s_setprio 2 ; encoding: [0x02,0x00,0x8f,0xbf]
+; GFX9-NEXT:s_setprio 3 ; encoding: [0x03,0x00,0x8f,0xbf]
+; GFX9-NEXT:s_setprio 10 ; encoding: [0x0a,0x00,0x8f,0xbf]
+; GFX9-NEXT:s_setprio -1 ; encoding: [0xff,0xff,0x8f,0xbf]
+; GFX9-NEXT:s_setprio 0 ; encoding: [0x00,0x00,0x8f,0xbf]
+; GFX9-NEXT:s_setprio 1 ; encoding: [0x01,0x00,0x8f,0xbf]
+; GFX9-NEXT:s_setprio -1 ; encoding: [0xff,0xff,0x8f,0xbf]
+; GFX9-NEXT:s_setpc_b64 s[30:31] ; encoding: [0x1e,0x1d,0x80,0xbe]
+;
+; SI-LABEL: test_llvm_amdgcn_s_setprio:
+; SI:   ; %bb.0:
+; SI-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
+; SI-NEXT:s_setprio 0 ; encoding: [0x00,0x00,0x8f,0xbf]
+; SI-NEXT:s_setprio 1 ; encoding: [0x01,0x00,0x8f,0xbf]
+; SI-NEXT:s_setprio 2 ; encoding: [0x02,0x00,0x8f,0xbf]
+; SI-NEXT:s_setprio 3 ; encoding: [0x03,0x00,0x8f,0xbf]
+; SI-NEXT:s_setprio 10 ; encoding: [0x0a,0x00,0x8f,0xbf]
+; SI-NEXT:s_setprio -1 ; encoding: [0xff,0xff,0x8f,0xbf]
+; SI-NEXT:s_setprio 0 ; encoding: [0x00,0x00,0x8f,0xbf]
+; SI-NEXT:s_setprio 1 ; encoding: [0x01,0x00,0x8f,0xbf]
+; SI-NEXT:s_setprio -1 ; encoding: [0xff,0xff,0x8f,0xbf]
+; SI-NEXT:s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe]
+  call void @llvm.amdgcn.s.setprio(i16 0)
+  call void @llvm.amdgcn.s.setprio(i16 1)
+  call void @llvm.amdgcn.s.setprio(i16 2)
+  call void @llvm.amdgcn.s.setprio(i16 3)
+  call void @llvm.amdgcn.s.setprio(i16 10)
+  call void @llvm.amdgcn.s.setprio(i16 65535)
+  call void @llvm.amdgcn.s.setprio(i16 65536)
+  call void @llvm.amdgcn.s.setprio(i16 65537)
+  call void @llvm.amdgcn.s.setprio(i16 -1)
+  ret void
+}
+
+attributes #0 = { nounwind }
Index: llvm/lib/Target/AMDGPU/SOPInstructions.td
===
--- llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1278,7 +1278,10 @@
   let hasSideEffects = 1;
 }
 
-def S_SETPRIO : SOPP_Pseudo <"s_setprio" , (ins i16imm:$simm16), "$simm16">;
+def S_SETPRIO : SOPP_Pseudo <"s_setprio", (ins i16imm:$simm16), "$simm16",
+  [(int_amdgcn_s_setprio timm:$simm16)]> {
+  let hasSideEffects = 1;
+}
 
 let Uses = [EXEC, M0] in {
 // FIXME: Should this be mayLoad+mayStore?
Index: llvm/include/llvm/IR/IntrinsicsAMDGPU.td
===
--- llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1329,6 +1329,11 @@
   Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
                                 IntrHasSideEffects, IntrWillReturn]>;
 
+def int_amdgcn_s_setprio :
+  GCCBuiltin<"__builtin_amdgcn_s_setprio">,
+  Intrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
+                                IntrHasSideEffects, IntrWillReturn]>;
+
 def int_amdgcn_s_getreg :
   GCCBuiltin<"__builtin_amdgcn_s_getreg">,
   Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
Index: clang/test/SemaOpenCL/builtins-amdgcn-error.cl
===
--- clang/test/SemaOpenCL/builtins-amdgcn-error.cl
+++ clang/test/SemaOpenCL/builtins-amdgcn-error.cl
@@ -54,6 +54,12 @@
   __builtin_amdgcn_s_decperflevel(x); 

[PATCH] D79213: [hip] Add noalias on restrict qualified coerced hip pointers

2020-05-04 Thread Austin Kerbow via Phabricator via cfe-commits
kerbowa added a comment.

In D79213#2018820, @hliao wrote:

> Any more comments? Since this is a performance-critical issue, shall we
> reach a conclusion and make progress on the next step?


We applied the current version of the patch internally for testing, which is
the reason for the lack of urgency, but hopefully we can get the final generic
solution submitted upstream early this week. I believe I understand your
comments, and it makes sense to me that it should be safe for pointers whose
coerced type is also a single pointer to retain the same attributes.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D79213/new/

https://reviews.llvm.org/D79213





[PATCH] D79213: [hip] Add noalias on restrict qualified coerced hip pointers

2020-04-30 Thread Austin Kerbow via Phabricator via cfe-commits
kerbowa added inline comments.



Comment at: clang/lib/CodeGen/CGCall.cpp:2270
+         CGF.ConvertType(Ty)->getPointerAddressSpace() == 0 &&
+         ArgI.getCoerceToType()->getPointerElementType() ==
+             CGF.ConvertType(Ty)->getPointerElementType();

yaxunl wrote:
> kerbowa wrote:
> > yaxunl wrote:
> > > For structs containing pointers, we do recursive coercion:
> > > 
> > > https://github.com/llvm/llvm-project/blob/master/clang/lib/CodeGen/TargetInfo.cpp#L8224
> > > 
> > > So solely comparing the element type will fail. We need to add a test
> > > case for a struct containing a pointer, and we need a recursive
> > > comparison similar to the code above.
> > I can add it, but are we sure it's what we want? I think OpenCL/hcc won't
> > have the same behavior because of
> > https://github.com/llvm/llvm-project/blob/master/clang/lib/CodeGen/CGCall.cpp#L2435.
> Michael said that without it there were extra flat loads/stores. That's why
> he did it intentionally.
Okay. What I mean is that OpenCL/hcc don't add noalias recursively to struct
members that are restrict-qualified. Is there any reason not to do it?
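To make the struct case concrete, here is a hypothetical HIP snippet (an
illustration, not code from this patch; the struct and kernel names are
invented) showing the kind of argument the recursive-coercion question is
about:

  #include <hip/hip_runtime.h>

  // A by-value kernel argument whose pointer member is coerced by the AMDGPU
  // ABI from the generic address space to global (addrspace(1)).
  struct Wrapped {
    int *__restrict p;
  };

  // The open question: should the restrict on the member also be propagated
  // as noalias onto the coerced member pointer?
  __global__ void kernel_struct(Wrapped w) {
    w.p[0]++;
  }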


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D79213/new/

https://reviews.llvm.org/D79213





[PATCH] D79213: [hip] Add noalias on restrict qualified coerced hip pointers

2020-04-30 Thread Austin Kerbow via Phabricator via cfe-commits
kerbowa added inline comments.



Comment at: clang/lib/CodeGen/CGCall.cpp:2270
+         CGF.ConvertType(Ty)->getPointerAddressSpace() == 0 &&
+         ArgI.getCoerceToType()->getPointerElementType() ==
+             CGF.ConvertType(Ty)->getPointerElementType();

yaxunl wrote:
> For structs containing pointers, we do recursive coercion:
> 
> https://github.com/llvm/llvm-project/blob/master/clang/lib/CodeGen/TargetInfo.cpp#L8224
> 
> So solely comparing the element type will fail. We need to add a test case
> for a struct containing a pointer, and we need a recursive comparison
> similar to the code above.
I can add it, but are we sure it's what we want? I think OpenCL/hcc won't have
the same behavior because of
https://github.com/llvm/llvm-project/blob/master/clang/lib/CodeGen/CGCall.cpp#L2435.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D79213/new/

https://reviews.llvm.org/D79213





[PATCH] D79213: [hip] Add noalias on restrict qualified coerced hip pointers

2020-04-30 Thread Austin Kerbow via Phabricator via cfe-commits
kerbowa updated this revision to Diff 261388.
kerbowa added a comment.

Fix test formatting.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D79213/new/

https://reviews.llvm.org/D79213

Files:
  clang/lib/CodeGen/CGCall.cpp
  clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu


Index: clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
===
--- clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
+++ clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
@@ -67,3 +67,10 @@
   t.x[0][0] += 1.f;
   t.x[1][0] += 2.f;
 }
+
+// Check that coerced pointers retain the noalias attribute when qualified with __restrict.
+// CHECK: define amdgpu_kernel void @_Z7kernel7Pi(i32 addrspace(1)* noalias %x.coerce)
+// HOST: define void @_Z22__device_stub__kernel7Pi(i32* noalias %x)
+__global__ void kernel7(int *__restrict x) {
+  x[0]++;
+}
Index: clang/lib/CodeGen/CGCall.cpp
===
--- clang/lib/CodeGen/CGCall.cpp
+++ clang/lib/CodeGen/CGCall.cpp
@@ -2259,6 +2259,18 @@
   return CGF.Builder.CreateFPCast(value, varType, "arg.unpromote");
 }
 
+/// Returns true if the argument is a generic HIP pointer that was coerced to a
+/// global pointer.
+bool isCoercedHIPGlobalPointer(CodeGenFunction &CGF,
+                               const LangOptions &LangOpts,
+                               const ABIArgInfo &ArgI, const QualType &Ty) {
+  return LangOpts.HIP && isa<llvm::PointerType>(ArgI.getCoerceToType()) &&
+         ArgI.getCoerceToType()->getPointerAddressSpace() == 1 &&
+         CGF.ConvertType(Ty)->getPointerAddressSpace() == 0 &&
+         ArgI.getCoerceToType()->getPointerElementType() ==
+             CGF.ConvertType(Ty)->getPointerElementType();
+}
+
 /// Returns the attribute (either parameter attribute, or function
 /// attribute), which declares argument ArgNo to be non-null.
static const NonNullAttr *getNonNullAttr(const Decl *FD, const ParmVarDecl *PVD,
@@ -2541,6 +2553,14 @@
   // Pointer to store into.
   Address Ptr = emitAddressAtOffset(*this, Alloca, ArgI);
 
+  // Restrict qualified HIP pointers that were coerced to global pointers
+  // can be marked with the noalias attribute.
+  if (isCoercedHIPGlobalPointer(*this, getLangOpts(), ArgI, Ty) &&
+      Arg->getType().isRestrictQualified()) {
+    auto AI = cast<llvm::Argument>(FnArgs[FirstIRArg]);
+    AI->addAttr(llvm::Attribute::NoAlias);
+  }
+
   // Fast-isel and the optimizer generally like scalar values better than
   // FCAs, so we flatten them if this is safe to do for this argument.
   llvm::StructType *STy = dyn_cast<llvm::StructType>(ArgI.getCoerceToType());



[PATCH] D79213: [hip] Add noalias on restrict qualified coerced hip pointers

2020-04-30 Thread Austin Kerbow via Phabricator via cfe-commits
kerbowa created this revision.
kerbowa added reviewers: yaxunl, hliao.
Herald added subscribers: cfe-commits, nhaehnle, jvesely.
Herald added a project: clang.

Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D79213

Files:
  clang/lib/CodeGen/CGCall.cpp
  clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu


Index: clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
===
--- clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
+++ clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
@@ -67,3 +67,10 @@
   t.x[0][0] += 1.f;
   t.x[1][0] += 2.f;
 }
+
+// Check that coerced pointers retain the noalias attribute when qualified with __restrict.
+// CHECK: define amdgpu_kernel void @_Z7kernel7Pi(i32 addrspace(1)* noalias %x.coerce)
+// HOST: define void @_Z22__device_stub__kernel7Pi(i32* noalias %x)
+__global__ void kernel7(int* __restrict x) {
+  x[0]++;
+}
Index: clang/lib/CodeGen/CGCall.cpp
===
--- clang/lib/CodeGen/CGCall.cpp
+++ clang/lib/CodeGen/CGCall.cpp
@@ -2259,6 +2259,18 @@
   return CGF.Builder.CreateFPCast(value, varType, "arg.unpromote");
 }
 
+/// Returns true if the argument is a generic HIP pointer that was coerced to a
+/// global pointer.
+bool isCoercedHIPGlobalPointer(CodeGenFunction &CGF,
+                               const LangOptions &LangOpts,
+                               const ABIArgInfo &ArgI, const QualType &Ty) {
+  return LangOpts.HIP && isa<llvm::PointerType>(ArgI.getCoerceToType()) &&
+         ArgI.getCoerceToType()->getPointerAddressSpace() == 1 &&
+         CGF.ConvertType(Ty)->getPointerAddressSpace() == 0 &&
+         ArgI.getCoerceToType()->getPointerElementType() ==
+             CGF.ConvertType(Ty)->getPointerElementType();
+}
+
 /// Returns the attribute (either parameter attribute, or function
 /// attribute), which declares argument ArgNo to be non-null.
static const NonNullAttr *getNonNullAttr(const Decl *FD, const ParmVarDecl *PVD,
@@ -2541,6 +2553,14 @@
   // Pointer to store into.
   Address Ptr = emitAddressAtOffset(*this, Alloca, ArgI);
 
+  // Restrict qualified HIP pointers that were coerced to global pointers
+  // can be marked with the noalias attribute.
+  if (isCoercedHIPGlobalPointer(*this, getLangOpts(), ArgI, Ty) &&
+      Arg->getType().isRestrictQualified()) {
+    auto AI = cast<llvm::Argument>(FnArgs[FirstIRArg]);
+    AI->addAttr(llvm::Attribute::NoAlias);
+  }
+
   // Fast-isel and the optimizer generally like scalar values better than
   // FCAs, so we flatten them if this is safe to do for this argument.
   llvm::StructType *STy = dyn_cast<llvm::StructType>(ArgI.getCoerceToType());

