https://github.com/ergawy updated https://github.com/llvm/llvm-project/pull/165714
>From 8b235bc5aa943b06477d461c32912cde74052e58 Mon Sep 17 00:00:00 2001 From: ergawy <[email protected]> Date: Fri, 17 Oct 2025 08:35:07 -0500 Subject: [PATCH 1/4] [OpenMP][flang] Add initial support for by-ref reductions on the GPU Adds initial support for GPU by-ref reductions. In particular, this diff adds support for reductions on scalar allocatables where reductions happen on loops nested in `target` regions. For example: ```fortran integer :: i real, allocatable :: scalar_alloc allocate(scalar_alloc) scalar_alloc = 0 !$omp target map(tofrom: scalar_alloc) !$omp parallel do reduction(+: scalar_alloc) do i = 1, 1000000 scalar_alloc = scalar_alloc + 1 end do !$omp end target ``` This PR supports by-ref reductions on the intra- and inter-warp levels. So far, there are still steps to be taken for full support of by-ref reductions, for example: * Inter-block value combination is still not supported. Therefore, `target teams distribute parallel do` is still not supported. * Support for dynamically-sized arrays still needs to be added. * Support for more than one allocatable/array on the same `reduction` clause still needs to be added. 
--- clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp | 4 +- .../include/flang/Optimizer/Dialect/FIROps.td | 3 +- .../lib/Lower/Support/ReductionProcessor.cpp | 10 +- .../OpenMP/DoConcurrentConversion.cpp | 3 +- .../delayed-privatization-reduction-byref.f90 | 2 +- .../parallel-reduction-allocatable-array.f90 | 2 +- .../OpenMP/parallel-reduction-array-lb.f90 | 2 +- .../Lower/OpenMP/parallel-reduction-array.f90 | 2 +- .../OpenMP/parallel-reduction-array2.f90 | 2 +- .../parallel-reduction-pointer-array.f90 | 2 +- .../test/Lower/OpenMP/parallel-reduction3.f90 | 2 +- .../OpenMP/reduction-array-intrinsic.f90 | 2 +- .../Lower/OpenMP/sections-array-reduction.f90 | 2 +- .../OpenMP/taskgroup-task-array-reduction.f90 | 2 +- ...oop-reduction-allocatable-array-minmax.f90 | 4 +- .../OpenMP/wsloop-reduction-allocatable.f90 | 2 +- .../wsloop-reduction-array-assumed-shape.f90 | 2 +- .../OpenMP/wsloop-reduction-array-lb.f90 | 2 +- .../OpenMP/wsloop-reduction-array-lb2.f90 | 2 +- .../Lower/OpenMP/wsloop-reduction-array.f90 | 2 +- .../Lower/OpenMP/wsloop-reduction-array2.f90 | 2 +- .../wsloop-reduction-multiple-clauses.f90 | 2 +- .../Lower/OpenMP/wsloop-reduction-pointer.f90 | 2 +- .../do_concurrent_reduce_allocatable.f90 | 2 +- .../llvm/Frontend/OpenMP/OMPIRBuilder.h | 53 ++++-- llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 161 ++++++++++++++---- mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td | 4 +- .../Conversion/SCFToOpenMP/SCFToOpenMP.cpp | 3 +- .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 24 ++- .../LLVMIR/allocatable_gpu_reduction.mlir | 92 ++++++++++ .../omptarget-multi-block-reduction.mlir | 6 +- .../LLVMIR/omptarget-multi-reduction.mlir | 8 +- .../omptarget-teams-distribute-reduction.mlir | 2 +- .../LLVMIR/omptarget-teams-reduction.mlir | 2 +- 34 files changed, 327 insertions(+), 90 deletions(-) create mode 100644 mlir/test/Target/LLVMIR/allocatable_gpu_reduction.mlir diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp index 
2f69a53787f0c..766e990e5e677 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -1786,8 +1786,8 @@ void CGOpenMPRuntimeGPU::emitReduction( llvm::OpenMPIRBuilder::InsertPointTy AfterIP = cantFail(OMPBuilder.createReductionsGPU( - OmpLoc, AllocaIP, CodeGenIP, ReductionInfos, false, TeamsReduction, - llvm::OpenMPIRBuilder::ReductionGenCBKind::Clang, + OmpLoc, AllocaIP, CodeGenIP, ReductionInfos, /*IsByRef=*/{}, false, + TeamsReduction, llvm::OpenMPIRBuilder::ReductionGenCBKind::Clang, CGF.getTarget().getGridValue(), C.getLangOpts().OpenMPCUDAReductionBufNum, RTLoc)); CGF.Builder.restoreIP(AfterIP); diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td index d416d6c61f178..392456d766c3c 100644 --- a/flang/include/flang/Optimizer/Dialect/FIROps.td +++ b/flang/include/flang/Optimizer/Dialect/FIROps.td @@ -3792,7 +3792,8 @@ def fir_DeclareReductionOp : fir_Op<"declare_reduction", [IsolatedFromAbove, }]; let arguments = (ins SymbolNameAttr:$sym_name, - TypeAttr:$type); + TypeAttr:$type, + OptionalAttr<TypeAttr>:$byref_element_type); let regions = (region MaxSizedRegion<1>:$allocRegion, AnyRegion:$initializerRegion, diff --git a/flang/lib/Lower/Support/ReductionProcessor.cpp b/flang/lib/Lower/Support/ReductionProcessor.cpp index 605a5b6b20b94..1bce6b3b3c832 100644 --- a/flang/lib/Lower/Support/ReductionProcessor.cpp +++ b/flang/lib/Lower/Support/ReductionProcessor.cpp @@ -573,10 +573,18 @@ OpType ReductionProcessor::createDeclareReduction( mlir::OpBuilder modBuilder(module.getBodyRegion()); mlir::Type valTy = fir::unwrapRefType(type); + // For by-ref reductions, we want to keep track of the + // boxed/referenced/allocated type. For example, a for `real, allocatable` + // variable, `real` should be stored. 
+ mlir::TypeAttr boxedTy{}; + if (!isByRef) type = valTy; - decl = OpType::create(modBuilder, loc, reductionOpName, type); + if (isByRef) + boxedTy = mlir::TypeAttr::get(fir::unwrapPassByRefType(valTy)); + + decl = OpType::create(modBuilder, loc, reductionOpName, type, boxedTy); createReductionAllocAndInitRegions(converter, loc, decl, redId, type, isByRef); diff --git a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp index 9aad8cddc60a1..1012a9608aa27 100644 --- a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp +++ b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp @@ -848,7 +848,8 @@ class DoConcurrentConversion if (!ompReducer) { ompReducer = mlir::omp::DeclareReductionOp::create( rewriter, firReducer.getLoc(), ompReducerName, - firReducer.getTypeAttr().getValue()); + firReducer.getTypeAttr().getValue(), + firReducer.getByrefElementTypeAttr()); cloneFIRRegionToOMP(rewriter, firReducer.getAllocRegion(), ompReducer.getAllocRegion()); diff --git a/flang/test/Lower/OpenMP/delayed-privatization-reduction-byref.f90 b/flang/test/Lower/OpenMP/delayed-privatization-reduction-byref.f90 index 4b6a643f94059..4c7b6ac5f5f9b 100644 --- a/flang/test/Lower/OpenMP/delayed-privatization-reduction-byref.f90 +++ b/flang/test/Lower/OpenMP/delayed-privatization-reduction-byref.f90 @@ -22,7 +22,7 @@ subroutine red_and_delayed_private ! CHECK-SAME: @[[PRIVATIZER_SYM:.*]] : i32 ! CHECK-LABEL: omp.declare_reduction -! CHECK-SAME: @[[REDUCTION_SYM:.*]] : !fir.ref<i32> alloc +! CHECK-SAME: @[[REDUCTION_SYM:.*]] : !fir.ref<i32> attributes {byref_element_type = i32} alloc ! CHECK-LABEL: _QPred_and_delayed_private ! 
CHECK: omp.parallel diff --git a/flang/test/Lower/OpenMP/parallel-reduction-allocatable-array.f90 b/flang/test/Lower/OpenMP/parallel-reduction-allocatable-array.f90 index 41c7d69ebb3ba..f56875dcb518b 100644 --- a/flang/test/Lower/OpenMP/parallel-reduction-allocatable-array.f90 +++ b/flang/test/Lower/OpenMP/parallel-reduction-allocatable-array.f90 @@ -18,7 +18,7 @@ program reduce end program -! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_heap_Uxi32 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> alloc { +! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_heap_Uxi32 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> attributes {byref_element_type = !fir.array<?xi32>} alloc { ! CHECK: %[[VAL_10:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xi32>>> ! CHECK: omp.yield(%[[VAL_10]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) ! CHECK-LABEL: } init { diff --git a/flang/test/Lower/OpenMP/parallel-reduction-array-lb.f90 b/flang/test/Lower/OpenMP/parallel-reduction-array-lb.f90 index aa91e1e0e8b15..d9ba3bed464f8 100644 --- a/flang/test/Lower/OpenMP/parallel-reduction-array-lb.f90 +++ b/flang/test/Lower/OpenMP/parallel-reduction-array-lb.f90 @@ -12,7 +12,7 @@ program reduce end program -! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_3x2xi32 : !fir.ref<!fir.box<!fir.array<3x2xi32>>> alloc { +! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_3x2xi32 : !fir.ref<!fir.box<!fir.array<3x2xi32>>> {{.*}} alloc { ! CHECK: %[[VAL_15:.*]] = fir.alloca !fir.box<!fir.array<3x2xi32>> ! CHECK: omp.yield(%[[VAL_15]] : !fir.ref<!fir.box<!fir.array<3x2xi32>>>) ! CHECK-LABEL: } init { diff --git a/flang/test/Lower/OpenMP/parallel-reduction-array.f90 b/flang/test/Lower/OpenMP/parallel-reduction-array.f90 index 59595de338d50..636660f279e85 100644 --- a/flang/test/Lower/OpenMP/parallel-reduction-array.f90 +++ b/flang/test/Lower/OpenMP/parallel-reduction-array.f90 @@ -17,7 +17,7 @@ program reduce print *,i end program -! 
CPU-LABEL: omp.declare_reduction @add_reduction_byref_box_3xi32 : !fir.ref<!fir.box<!fir.array<3xi32>>> alloc { +! CPU-LABEL: omp.declare_reduction @add_reduction_byref_box_3xi32 : !fir.ref<!fir.box<!fir.array<3xi32>>> attributes {byref_element_type = !fir.array<3xi32>} alloc { ! CPU: %[[VAL_8:.*]] = fir.alloca !fir.box<!fir.array<3xi32>> ! CPU: omp.yield(%[[VAL_8]] : !fir.ref<!fir.box<!fir.array<3xi32>>>) ! CPU-LABEL: } init { diff --git a/flang/test/Lower/OpenMP/parallel-reduction-array2.f90 b/flang/test/Lower/OpenMP/parallel-reduction-array2.f90 index 14338c6f50817..9cf8a63427ed1 100644 --- a/flang/test/Lower/OpenMP/parallel-reduction-array2.f90 +++ b/flang/test/Lower/OpenMP/parallel-reduction-array2.f90 @@ -13,7 +13,7 @@ program reduce print *,i end program -! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_3xi32 : !fir.ref<!fir.box<!fir.array<3xi32>>> alloc { +! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_3xi32 : !fir.ref<!fir.box<!fir.array<3xi32>>> {{.*}} alloc { ! CHECK: %[[VAL_8:.*]] = fir.alloca !fir.box<!fir.array<3xi32>> ! CHECK: omp.yield(%[[VAL_8]] : !fir.ref<!fir.box<!fir.array<3xi32>>>) ! CHECK-LABEL: } init { diff --git a/flang/test/Lower/OpenMP/parallel-reduction-pointer-array.f90 b/flang/test/Lower/OpenMP/parallel-reduction-pointer-array.f90 index 36344458d1cae..3de2ba8f61f8e 100644 --- a/flang/test/Lower/OpenMP/parallel-reduction-pointer-array.f90 +++ b/flang/test/Lower/OpenMP/parallel-reduction-pointer-array.f90 @@ -19,7 +19,7 @@ program reduce end program -! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_ptr_Uxi32 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>> alloc { +! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_ptr_Uxi32 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>> attributes {byref_element_type = !fir.array<?xi32>} alloc { ! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>> ! 
CHECK: omp.yield(%[[VAL_3]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>) ! CHECK-LABEL: } init { diff --git a/flang/test/Lower/OpenMP/parallel-reduction3.f90 b/flang/test/Lower/OpenMP/parallel-reduction3.f90 index 6ff7f96b2b9bf..7437e1d35a624 100644 --- a/flang/test/Lower/OpenMP/parallel-reduction3.f90 +++ b/flang/test/Lower/OpenMP/parallel-reduction3.f90 @@ -1,7 +1,7 @@ ! RUN: bbc -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s ! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s -! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_Uxi32 : !fir.ref<!fir.box<!fir.array<?xi32>>> alloc { +! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_Uxi32 : !fir.ref<!fir.box<!fir.array<?xi32>>> {{.*}} alloc { ! CHECK: %[[VAL_8:.*]] = fir.alloca !fir.box<!fir.array<?xi32>> ! CHECK: omp.yield(%[[VAL_8]] : !fir.ref<!fir.box<!fir.array<?xi32>>>) ! CHECK-LABEL: } init { diff --git a/flang/test/Lower/OpenMP/reduction-array-intrinsic.f90 b/flang/test/Lower/OpenMP/reduction-array-intrinsic.f90 index bd91fa51a6988..779322712dbfe 100644 --- a/flang/test/Lower/OpenMP/reduction-array-intrinsic.f90 +++ b/flang/test/Lower/OpenMP/reduction-array-intrinsic.f90 @@ -9,7 +9,7 @@ subroutine max_array_reduction(l, r) !$omp end parallel end subroutine -! CHECK-LABEL: omp.declare_reduction @max_byref_box_Uxi32 : !fir.ref<!fir.box<!fir.array<?xi32>>> alloc { +! CHECK-LABEL: omp.declare_reduction @max_byref_box_Uxi32 : !fir.ref<!fir.box<!fir.array<?xi32>>> {{.*}} alloc { ! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.box<!fir.array<?xi32>> ! CHECK: omp.yield(%[[VAL_3]] : !fir.ref<!fir.box<!fir.array<?xi32>>>) ! 
CHECK-LABEL: } init { diff --git a/flang/test/Lower/OpenMP/sections-array-reduction.f90 b/flang/test/Lower/OpenMP/sections-array-reduction.f90 index 1d286008a11f3..57e46c7bc8cae 100644 --- a/flang/test/Lower/OpenMP/sections-array-reduction.f90 +++ b/flang/test/Lower/OpenMP/sections-array-reduction.f90 @@ -14,7 +14,7 @@ subroutine sectionsReduction(x) end subroutine -! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_Uxf32 : !fir.ref<!fir.box<!fir.array<?xf32>>> alloc { +! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_Uxf32 : !fir.ref<!fir.box<!fir.array<?xf32>>> {{.*}} alloc { ! [...] ! CHECK: omp.yield ! CHECK-LABEL: } init { diff --git a/flang/test/Lower/OpenMP/taskgroup-task-array-reduction.f90 b/flang/test/Lower/OpenMP/taskgroup-task-array-reduction.f90 index 18a4f75b86309..3a63bb09c59de 100644 --- a/flang/test/Lower/OpenMP/taskgroup-task-array-reduction.f90 +++ b/flang/test/Lower/OpenMP/taskgroup-task-array-reduction.f90 @@ -1,7 +1,7 @@ ! RUN: bbc -emit-hlfir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s ! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s -! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_Uxf32 : !fir.ref<!fir.box<!fir.array<?xf32>>> alloc { +! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_Uxf32 : !fir.ref<!fir.box<!fir.array<?xf32>>> {{.*}} alloc { ! [...] ! CHECK: omp.yield ! CHECK-LABEL: } init { diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-allocatable-array-minmax.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-allocatable-array-minmax.f90 index 2cd953de0dffa..ed81577ecce16 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-allocatable-array-minmax.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-allocatable-array-minmax.f90 @@ -32,7 +32,7 @@ program reduce15 print *,"min: ", mins end program -! CHECK-LABEL: omp.declare_reduction @min_byref_box_heap_Uxi32 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> alloc { +! 
CHECK-LABEL: omp.declare_reduction @min_byref_box_heap_Uxi32 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> {{.*}} alloc { ! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xi32>>> ! CHECK: omp.yield(%[[VAL_3]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) ! CHECK-LABEL: } init { @@ -93,7 +93,7 @@ program reduce15 ! CHECK: omp.yield ! CHECK: } -! CHECK-LABEL: omp.declare_reduction @max_byref_box_heap_Uxi32 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> alloc { +! CHECK-LABEL: omp.declare_reduction @max_byref_box_heap_Uxi32 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> {{.*}} alloc { ! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xi32>>> ! CHECK: omp.yield(%[[VAL_3]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) ! CHECK-LABEL: } init { diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-allocatable.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-allocatable.f90 index 663851cba46c6..d8c0a36db126e 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-allocatable.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-allocatable.f90 @@ -18,7 +18,7 @@ program reduce end program -! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_heap_i32 : !fir.ref<!fir.box<!fir.heap<i32>>> alloc { +! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_heap_i32 : !fir.ref<!fir.box<!fir.heap<i32>>> attributes {byref_element_type = i32} alloc { ! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.box<!fir.heap<i32>> ! CHECK: omp.yield(%[[VAL_2]] : !fir.ref<!fir.box<!fir.heap<i32>>>) ! CHECK-LABEL: } init { diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f90 index 7184b3b102fd8..7ce1be03682b4 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f90 @@ -22,7 +22,7 @@ subroutine reduce(r) end subroutine end program -! 
CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_Uxf64 : !fir.ref<!fir.box<!fir.array<?xf64>>> alloc { +! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_Uxf64 : !fir.ref<!fir.box<!fir.array<?xf64>>> {{.*}} alloc { ! CHECK: %[[VAL_8:.*]] = fir.alloca !fir.box<!fir.array<?xf64>> ! CHECK: omp.yield(%[[VAL_8]] : !fir.ref<!fir.box<!fir.array<?xf64>>>) ! CHECK-LABEL: } init { diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array-lb.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array-lb.f90 index 2233a74600948..ec448cf20f111 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-array-lb.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-array-lb.f90 @@ -11,7 +11,7 @@ program reduce !$omp end parallel do end program -! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_2xi32 : !fir.ref<!fir.box<!fir.array<2xi32>>> alloc { +! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_2xi32 : !fir.ref<!fir.box<!fir.array<2xi32>>> {{.*}} alloc { ! CHECK: } combiner { ! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.box<!fir.array<2xi32>>>, %[[ARG1:.*]]: !fir.ref<!fir.box<!fir.array<2xi32>>>): ! CHECK: %[[ARR0:.*]] = fir.load %[[ARG0]] : !fir.ref<!fir.box<!fir.array<2xi32>>> diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array-lb2.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array-lb2.f90 index 211bde19da8db..9da05a290ec21 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-array-lb2.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-array-lb2.f90 @@ -19,7 +19,7 @@ subroutine sub(a, lb, ub) end program -! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_Uxi32 : !fir.ref<!fir.box<!fir.array<?xi32>>> alloc { +! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_Uxi32 : !fir.ref<!fir.box<!fir.array<?xi32>>> {{.*}} alloc { ! CHECK: } combiner { ! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.box<!fir.array<?xi32>>>, %[[ARG1:.*]]: !fir.ref<!fir.box<!fir.array<?xi32>>>): ! 
CHECK: %[[ARR0:.*]] = fir.load %[[ARG0]] : !fir.ref<!fir.box<!fir.array<?xi32>>> diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array.f90 index afaeba27c5eae..14b657c8e180d 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-array.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-array.f90 @@ -14,7 +14,7 @@ program reduce print *,r end program -! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_2xi32 : !fir.ref<!fir.box<!fir.array<2xi32>>> alloc { +! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_2xi32 : !fir.ref<!fir.box<!fir.array<2xi32>>> attributes {byref_element_type = !fir.array<2xi32>} alloc { ! CHECK: %[[VAL_8:.*]] = fir.alloca !fir.box<!fir.array<2xi32>> ! CHECK: omp.yield(%[[VAL_8]] : !fir.ref<!fir.box<!fir.array<2xi32>>>) ! CHECK-LABEL: } init { diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array2.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array2.f90 index 25b2e97a1b7f7..d0a0c38e4ccb1 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-array2.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-array2.f90 @@ -14,7 +14,7 @@ program reduce print *,r end program -! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_2xi32 : !fir.ref<!fir.box<!fir.array<2xi32>>> alloc { +! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_2xi32 : !fir.ref<!fir.box<!fir.array<2xi32>>> {{.*}} alloc { ! CHECK: %[[VAL_8:.*]] = fir.alloca !fir.box<!fir.array<2xi32>> ! CHECK: omp.yield(%[[VAL_8]] : !fir.ref<!fir.box<!fir.array<2xi32>>>) ! CHECK-LABEL: } init { diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-multiple-clauses.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-multiple-clauses.f90 index edd2bcb1d6be8..60a162d8f8002 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-multiple-clauses.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-multiple-clauses.f90 @@ -24,7 +24,7 @@ program main endprogram -! 
CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_3x3xf64 : !fir.ref<!fir.box<!fir.array<3x3xf64>>> alloc { +! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_3x3xf64 : !fir.ref<!fir.box<!fir.array<3x3xf64>>> {{.*}} alloc { ! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.box<!fir.array<3x3xf64>> ! CHECK: omp.yield(%[[VAL_3]] : !fir.ref<!fir.box<!fir.array<3x3xf64>>>) ! CHECK-LABEL: } init { diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-pointer.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-pointer.f90 index 27b726376fbeb..f640f5caddf76 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-pointer.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-pointer.f90 @@ -18,7 +18,7 @@ program reduce_pointer deallocate(v) end program -! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_ptr_i32 : !fir.ref<!fir.box<!fir.ptr<i32>>> alloc { +! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_ptr_i32 : !fir.ref<!fir.box<!fir.ptr<i32>>> {{.*}} alloc { ! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.box<!fir.ptr<i32>> ! CHECK: omp.yield(%[[VAL_3]] : !fir.ref<!fir.box<!fir.ptr<i32>>>) ! CHECK-LABEL: } init { diff --git a/flang/test/Lower/do_concurrent_reduce_allocatable.f90 b/flang/test/Lower/do_concurrent_reduce_allocatable.f90 index 873fd10dd1b97..4fb67c094b594 100644 --- a/flang/test/Lower/do_concurrent_reduce_allocatable.f90 +++ b/flang/test/Lower/do_concurrent_reduce_allocatable.f90 @@ -8,7 +8,7 @@ subroutine do_concurrent_allocatable end do end subroutine -! CHECK: fir.declare_reduction @[[RED_OP:.*]] : ![[RED_TYPE:.*]] alloc { +! CHECK: fir.declare_reduction @[[RED_OP:.*]] : ![[RED_TYPE:.*]] attributes {byref_element_type = !fir.array<?x?xf32>} alloc { ! CHECK: %[[ALLOC:.*]] = fir.alloca ! CHECK: fir.yield(%[[ALLOC]] : ![[RED_TYPE]]) ! 
CHECK: } init { diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h index f864a895a1259..dfffd5fa398f7 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -1454,17 +1454,22 @@ class OpenMPIRBuilder { ReductionInfo(Type *ElementType, Value *Variable, Value *PrivateVariable, EvalKind EvaluationKind, ReductionGenCBTy ReductionGen, ReductionGenClangCBTy ReductionGenClang, - ReductionGenAtomicCBTy AtomicReductionGen) + ReductionGenAtomicCBTy AtomicReductionGen, + Type *ByRefAllocatedType = nullptr, + Type *ByRefElementType = nullptr) : ElementType(ElementType), Variable(Variable), PrivateVariable(PrivateVariable), EvaluationKind(EvaluationKind), ReductionGen(ReductionGen), ReductionGenClang(ReductionGenClang), - AtomicReductionGen(AtomicReductionGen) {} + AtomicReductionGen(AtomicReductionGen), + ByRefAllocatedType(ByRefAllocatedType), + ByRefElementType(ByRefElementType) {} ReductionInfo(Value *PrivateVariable) : ElementType(nullptr), Variable(nullptr), PrivateVariable(PrivateVariable), EvaluationKind(EvalKind::Scalar), ReductionGen(), ReductionGenClang(), AtomicReductionGen() {} - /// Reduction element type, must match pointee type of variable. + /// Reduction element type, must match pointee type of variable. For by-ref + /// reductions, this would be just an opaque `ptr`. Type *ElementType; /// Reduction variable of pointer type. @@ -1491,6 +1496,18 @@ class OpenMPIRBuilder { /// reduction. If null, the implementation will use the non-atomic version /// along with the appropriate synchronization mechanisms. ReductionGenAtomicCBTy AtomicReductionGen; + + /// For by-ref reductions, we need to keep track of 2 extra types that are + /// potentially different: + /// * The allocated type is the type of the storage allocated by the + /// reduction op's `alloc` region. 
For example, for allocatables and arrays, + /// this type would be the descriptor/box struct. + Type *ByRefAllocatedType; + /// * The by-ref element type is the type of the actual storage needed for + /// the data of the allocatable or array. For example, an float allocatable + /// of would need some float storage to store intermediate reduction + /// results. + Type *ByRefElementType; }; enum class CopyAction : unsigned { @@ -1535,14 +1552,15 @@ class OpenMPIRBuilder { /// Function to shuffle over the value from the remote lane. void shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr, Value *DstAddr, - Type *ElementType, Value *Offset, - Type *ReductionArrayTy); + Type *ElementType, Value *Offset, Type *ReductionArrayTy, + bool IsByRefElem); /// Emit instructions to copy a Reduce list, which contains partially /// aggregated values, in the specified direction. void emitReductionListCopy( InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy, ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase, + ArrayRef<bool> IsByRef, CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}); /// Emit a helper that reduces data across two OpenMP threads (lanes) @@ -1616,11 +1634,13 @@ class OpenMPIRBuilder { /// \param ReduceFn The reduction function. /// \param FuncAttrs Optional param to specify any function attributes that /// need to be copied to the new function. + /// \param IsByRef For each reduction clause, whether the reduction is by-ref + /// or not. /// /// \return The ShuffleAndReduce function. 
Function *emitShuffleAndReduceFunction( ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos, - Function *ReduceFn, AttributeList FuncAttrs); + Function *ReduceFn, AttributeList FuncAttrs, ArrayRef<bool> IsByRef); /// Helper function for CreateCanonicalScanLoops to create InputLoop /// in the firstGen and Scan Loop in the SecondGen @@ -1680,12 +1700,14 @@ class OpenMPIRBuilder { /// \param ReductionInfos Array type containing the ReductionOps. /// \param FuncAttrs Optional param to specify any function attributes that /// need to be copied to the new function. + /// \param IsByRef For each reduction clause, whether the reduction is by-ref + /// or not. /// /// \return The InterWarpCopy function. Expected<Function *> emitInterWarpCopyFunction(const LocationDescription &Loc, ArrayRef<ReductionInfo> ReductionInfos, - AttributeList FuncAttrs); + AttributeList FuncAttrs, ArrayRef<bool> IsByRef); /// This function emits a helper that copies all the reduction variables from /// the team into the provided global buffer for the reduction variables. @@ -1779,6 +1801,7 @@ class OpenMPIRBuilder { /// \return The reduction function. Expected<Function *> createReductionFunction( StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos, + ArrayRef<bool> IsByRef, ReductionGenCBKind ReductionGenCBKind = ReductionGenCBKind::MLIR, AttributeList FuncAttrs = {}); @@ -2031,11 +2054,14 @@ class OpenMPIRBuilder { /// reduction variables. /// \param AllocaIP An insertion point suitable for allocas usable /// in reductions. - /// \param CodeGenIP An insertion point suitable for code - /// generation. \param ReductionInfos A list of info on each reduction - /// variable. \param IsNoWait Optional flag set if the reduction is - /// marked as - /// nowait. + /// \param CodeGenIP An insertion point suitable for code + /// generation. + /// \param ReductionInfos A list of info on each reduction + /// variable. 
+ /// \param IsNoWait Optional flag set if the reduction is + /// marked as nowait. + /// \param IsByRef For each reduction clause, whether the reduction is by-ref + /// or not. /// \param IsTeamsReduction Optional flag set if it is a teams /// reduction. /// \param GridValue Optional GPU grid value. @@ -2045,7 +2071,8 @@ class OpenMPIRBuilder { LLVM_ABI InsertPointOrErrorTy createReductionsGPU( const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos, - bool IsNoWait = false, bool IsTeamsReduction = false, + ArrayRef<bool> IsByRef, bool IsNoWait = false, + bool IsTeamsReduction = false, ReductionGenCBKind ReductionGenCBKind = ReductionGenCBKind::MLIR, std::optional<omp::GV> GridValue = {}, unsigned ReductionBufNum = 1024, Value *SrcLocInfo = nullptr); diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 5101717526263..b7410efa70e47 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -2465,7 +2465,8 @@ Value *OpenMPIRBuilder::createRuntimeShuffleFunction(InsertPointTy AllocaIP, void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr, Value *DstAddr, Type *ElemType, - Value *Offset, Type *ReductionArrayTy) { + Value *Offset, Type *ReductionArrayTy, + bool IsByRefElem) { uint64_t Size = M.getDataLayout().getTypeStoreSize(ElemType); // Create the loop over the big sized data. 
// ptr = (void*)Elem; @@ -2550,7 +2551,7 @@ void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr, void OpenMPIRBuilder::emitReductionListCopy( InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy, ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase, - CopyOptionsTy CopyOptions) { + ArrayRef<bool> IsByRef, CopyOptionsTy CopyOptions) { Type *IndexTy = Builder.getIndexTy( M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace()); Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset; @@ -2560,6 +2561,7 @@ void OpenMPIRBuilder::emitReductionListCopy( for (auto En : enumerate(ReductionInfos)) { const ReductionInfo &RI = En.value(); Value *SrcElementAddr = nullptr; + AllocaInst *DestAlloca = nullptr; Value *DestElementAddr = nullptr; Value *DestElementPtrAddr = nullptr; // Should we shuffle in an element from a remote lane? @@ -2579,14 +2581,18 @@ void OpenMPIRBuilder::emitReductionListCopy( DestElementPtrAddr = Builder.CreateInBoundsGEP( ReductionArrayTy, DestBase, {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())}); + bool IsByRefElem = (!IsByRef.empty() && IsByRef[En.index()]); switch (Action) { case CopyAction::RemoteLaneToThread: { InsertPointTy CurIP = Builder.saveIP(); Builder.restoreIP(AllocaIP); - AllocaInst *DestAlloca = Builder.CreateAlloca(RI.ElementType, nullptr, - ".omp.reduction.element"); + + Type *DestAllocaType = + IsByRefElem ? 
RI.ByRefAllocatedType : RI.ElementType; + DestAlloca = Builder.CreateAlloca(DestAllocaType, nullptr, + ".omp.reduction.element"); DestAlloca->setAlignment( - M.getDataLayout().getPrefTypeAlign(RI.ElementType)); + M.getDataLayout().getPrefTypeAlign(DestAllocaType)); DestElementAddr = DestAlloca; DestElementAddr = Builder.CreateAddrSpaceCast(DestElementAddr, Builder.getPtrTy(), @@ -2606,8 +2612,49 @@ void OpenMPIRBuilder::emitReductionListCopy( // Now that all active lanes have read the element in the // Reduce list, shuffle over the value from the remote lane. if (ShuffleInElement) { - shuffleAndStore(AllocaIP, SrcElementAddr, DestElementAddr, RI.ElementType, - RemoteLaneOffset, ReductionArrayTy); + Type *ShuffleType = RI.ElementType; + Value *ShuffleSrcAddr = SrcElementAddr; + Value *ShuffleDestAddr = DestElementAddr; + Value *Zero = ConstantInt::get(Builder.getInt32Ty(), 0); + AllocaInst *LocalStorage = nullptr; + + if (IsByRefElem) { + assert(RI.ByRefElementType && "Expected by-ref element type to be set"); + assert(RI.ByRefAllocatedType && + "Expected by-ref allocated type to be set"); + // For by-ref reductions, we need to copy from the remote lane the + // actual value of the partial reduction computed by that remote lane; + // rather than, for example, a pointer to that data or, even worse, a + // pointer to the descriptor of the by-ref reduction element. 
+ ShuffleType = RI.ByRefElementType; + + ShuffleSrcAddr = Builder.CreateGEP(RI.ByRefAllocatedType, + ShuffleSrcAddr, {Zero, Zero}); + ShuffleSrcAddr = Builder.CreateLoad(Builder.getPtrTy(), ShuffleSrcAddr); + + { + auto OldIP = Builder.saveIP(); + Builder.restoreIP(AllocaIP); + + LocalStorage = Builder.CreateAlloca(ShuffleType); + Builder.restoreIP(OldIP); + ShuffleDestAddr = LocalStorage; + } + } + + shuffleAndStore(AllocaIP, ShuffleSrcAddr, ShuffleDestAddr, ShuffleType, + RemoteLaneOffset, ReductionArrayTy, IsByRefElem); + + if (IsByRefElem) { + auto *GEP = + Builder.CreateGEP(RI.ByRefAllocatedType, + Builder.CreatePointerBitCastOrAddrSpaceCast( + DestAlloca, Builder.getPtrTy(), ".ascast"), + {Zero, Zero}); + Builder.CreateStore(Builder.CreatePointerBitCastOrAddrSpaceCast( + LocalStorage, Builder.getPtrTy(), ".ascast"), + GEP); + } } else { switch (RI.EvaluationKind) { case EvalKind::Scalar: { @@ -2662,7 +2709,7 @@ void OpenMPIRBuilder::emitReductionListCopy( Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction( const LocationDescription &Loc, ArrayRef<ReductionInfo> ReductionInfos, - AttributeList FuncAttrs) { + AttributeList FuncAttrs, ArrayRef<bool> IsByRef) { InsertPointTy SavedIP = Builder.saveIP(); LLVMContext &Ctx = M.getContext(); FunctionType *FuncTy = FunctionType::get( @@ -2743,7 +2790,9 @@ Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction( // memory. // const ReductionInfo &RI = En.value(); - unsigned RealTySize = M.getDataLayout().getTypeAllocSize(RI.ElementType); + bool IsByRefElem = !IsByRef.empty() && IsByRef[En.index()]; + unsigned RealTySize = M.getDataLayout().getTypeAllocSize( + IsByRefElem ? 
RI.ByRefElementType : RI.ElementType); for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) { Type *CType = Builder.getIntNTy(TySize * 8); @@ -2806,6 +2855,15 @@ Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction( ConstantInt::get(IndexTy, En.index())}); // elemptr = ((CopyType*)(elemptrptr)) + I Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr); + + if (IsByRefElem) { + Type *Int32Ty = Builder.getInt32Ty(); + Constant *Zero = ConstantInt::get(Int32Ty, 0); + ElemPtr = + Builder.CreateGEP(RI.ByRefAllocatedType, ElemPtr, {Zero, Zero}); + ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtr); + } + if (NumIters > 1) ElemPtr = Builder.CreateGEP(Builder.getInt32Ty(), ElemPtr, Cnt); @@ -2861,6 +2919,15 @@ Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction( Value *TargetElemPtrVal = Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtrPtr); Value *TargetElemPtr = TargetElemPtrVal; + + if (IsByRefElem) { + Type *Int32Ty = Builder.getInt32Ty(); + Constant *Zero = ConstantInt::get(Int32Ty, 0); + TargetElemPtr = Builder.CreateGEP(RI.ByRefAllocatedType, TargetElemPtr, + {Zero, Zero}); + TargetElemPtr = Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtr); + } + if (NumIters > 1) TargetElemPtr = Builder.CreateGEP(Builder.getInt32Ty(), TargetElemPtr, Cnt); @@ -2897,7 +2964,7 @@ Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction( Function *OpenMPIRBuilder::emitShuffleAndReduceFunction( ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn, - AttributeList FuncAttrs) { + AttributeList FuncAttrs, ArrayRef<bool> IsByRef) { LLVMContext &Ctx = M.getContext(); FunctionType *FuncTy = FunctionType::get(Builder.getVoidTy(), @@ -2976,9 +3043,10 @@ Function *OpenMPIRBuilder::emitShuffleAndReduceFunction( // This loop iterates through the list of reduce elements and copies, // element by element, from a remote lane in the warp to RemoteReduceList, // hosted on the thread's stack. 
- emitReductionListCopy( - AllocaIP, CopyAction::RemoteLaneToThread, RedListArrayTy, ReductionInfos, - ReduceList, RemoteListAddrCast, {RemoteLaneOffset, nullptr, nullptr}); + emitReductionListCopy(AllocaIP, CopyAction::RemoteLaneToThread, + RedListArrayTy, ReductionInfos, ReduceList, + RemoteListAddrCast, IsByRef, + {RemoteLaneOffset, nullptr, nullptr}); // The actions to be performed on the Remote Reduce list is dependent // on the algorithm version. @@ -3047,7 +3115,8 @@ Function *OpenMPIRBuilder::emitShuffleAndReduceFunction( emitBlock(CpyThenBB, Builder.GetInsertBlock()->getParent()); emitReductionListCopy(AllocaIP, CopyAction::ThreadCopy, RedListArrayTy, - ReductionInfos, RemoteListAddrCast, ReduceList); + ReductionInfos, RemoteListAddrCast, ReduceList, + IsByRef); Builder.CreateBr(CpyMergeBB); emitBlock(CpyElseBB, Builder.GetInsertBlock()->getParent()); @@ -3452,7 +3521,8 @@ std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const { Expected<Function *> OpenMPIRBuilder::createReductionFunction( StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos, - ReductionGenCBKind ReductionGenCBKind, AttributeList FuncAttrs) { + ArrayRef<bool> IsByRef, ReductionGenCBKind ReductionGenCBKind, + AttributeList FuncAttrs) { auto *FuncTy = FunctionType::get(Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getPtrTy()}, /* IsVarArg */ false); @@ -3513,8 +3583,14 @@ Expected<Function *> OpenMPIRBuilder::createReductionFunction( LHSPtrs.emplace_back(LHSPtr); RHSPtrs.emplace_back(RHSPtr); } else { - Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr); - Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr); + Value *LHS = LHSPtr; + Value *RHS = RHSPtr; + + if (!IsByRef.empty() && !IsByRef[En.index()]) { + LHS = Builder.CreateLoad(RI.ElementType, LHSPtr); + RHS = Builder.CreateLoad(RI.ElementType, RHSPtr); + } + Value *Reduced; InsertPointOrErrorTy AfterIP = RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced); @@ -3524,7 +3600,9 @@ Expected<Function 
*> OpenMPIRBuilder::createReductionFunction( return ReductionFunc; Builder.restoreIP(*AfterIP); - Builder.CreateStore(Reduced, LHSPtr); + + if (!IsByRef.empty() && !IsByRef[En.index()]) + Builder.CreateStore(Reduced, LHSPtr); } } @@ -3577,9 +3655,9 @@ checkReductionInfos(ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos, OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU( const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos, - bool IsNoWait, bool IsTeamsReduction, ReductionGenCBKind ReductionGenCBKind, - std::optional<omp::GV> GridValue, unsigned ReductionBufNum, - Value *SrcLocInfo) { + ArrayRef<bool> IsByRef, bool IsNoWait, bool IsTeamsReduction, + ReductionGenCBKind ReductionGenCBKind, std::optional<omp::GV> GridValue, + unsigned ReductionBufNum, Value *SrcLocInfo) { if (!updateToLocation(Loc)) return InsertPointTy(); Builder.restoreIP(CodeGenIP); @@ -3615,9 +3693,9 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU( FuncAttrs = FuncAttrs.addFnAttributes(Ctx, AttrBldr); CodeGenIP = Builder.saveIP(); - Expected<Function *> ReductionResult = - createReductionFunction(Builder.GetInsertBlock()->getParent()->getName(), - ReductionInfos, ReductionGenCBKind, FuncAttrs); + Expected<Function *> ReductionResult = createReductionFunction( + Builder.GetInsertBlock()->getParent()->getName(), ReductionInfos, IsByRef, + ReductionGenCBKind, FuncAttrs); if (!ReductionResult) return ReductionResult.takeError(); Function *ReductionFunc = *ReductionResult; @@ -3656,15 +3734,21 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU( Value *ElemPtr = Builder.CreateInBoundsGEP( RedArrayTy, ReductionList, {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())}); + + auto *PrviateVar = RI.PrivateVariable; + bool IsByRefElem = !IsByRef.empty() && IsByRef[En.index()]; + if (IsByRefElem) + PrviateVar = 
Builder.CreateLoad(RI.ElementType, PrviateVar); + Value *CastElem = - Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy); + Builder.CreatePointerBitCastOrAddrSpaceCast(PrviateVar, PtrTy); Builder.CreateStore(CastElem, ElemPtr); } CodeGenIP = Builder.saveIP(); - Function *SarFunc = - emitShuffleAndReduceFunction(ReductionInfos, ReductionFunc, FuncAttrs); + Function *SarFunc = emitShuffleAndReduceFunction( + ReductionInfos, ReductionFunc, FuncAttrs, IsByRef); Expected<Function *> CopyResult = - emitInterWarpCopyFunction(Loc, ReductionInfos, FuncAttrs); + emitInterWarpCopyFunction(Loc, ReductionInfos, FuncAttrs, IsByRef); if (!CopyResult) return CopyResult.takeError(); Function *WcFunc = *CopyResult; @@ -3743,7 +3827,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU( // Add emission of __kmpc_end_reduce{_nowait}(<gtid>); for (auto En : enumerate(ReductionInfos)) { const ReductionInfo &RI = En.value(); - Value *LHS = RI.Variable; + Type *ValueType = RI.ElementType; + Value *RedValue = RI.Variable; Value *RHS = Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy); @@ -3754,7 +3839,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU( // Fix the CallBack code genereated to use the correct Values for the LHS // and RHS - LHSPtr->replaceUsesWithIf(LHS, [ReductionFunc](const Use &U) { + LHSPtr->replaceUsesWithIf(RedValue, [ReductionFunc](const Use &U) { return cast<Instruction>(U.getUser())->getParent()->getParent() == ReductionFunc; }); @@ -3763,15 +3848,21 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU( ReductionFunc; }); } else { - Value *LHSValue = Builder.CreateLoad(RI.ElementType, LHS, "final.lhs"); - Value *RHSValue = Builder.CreateLoad(RI.ElementType, RHS, "final.rhs"); + if (IsByRef.empty() || !IsByRef[En.index()]) { + RedValue = Builder.CreateLoad(ValueType, RI.Variable, + "red.value." 
+ Twine(En.index())); + } + Value *PrivateRedValue = Builder.CreateLoad( + ValueType, RHS, "red.private.value" + Twine(En.index())); Value *Reduced; InsertPointOrErrorTy AfterIP = - RI.ReductionGen(Builder.saveIP(), RHSValue, LHSValue, Reduced); + RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced); if (!AfterIP) return AfterIP.takeError(); Builder.restoreIP(*AfterIP); - Builder.CreateStore(Reduced, LHS, false); + + if (!IsByRef.empty() && !IsByRef[En.index()]) + Builder.CreateStore(Reduced, RI.Variable); } } emitBlock(ExitBB, CurFunc); @@ -3872,7 +3963,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductions( assert(ReductionInfos.size() == IsByRef.size()); if (Config.isGPU()) return createReductionsGPU(Loc, AllocaIP, Builder.saveIP(), ReductionInfos, - IsNoWait, IsTeamsReduction); + IsByRef, IsNoWait, IsTeamsReduction); checkReductionInfos(ReductionInfos, /*IsGPU*/ false); diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td index 377f1febf6b8f..386174a36d52c 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td @@ -2011,7 +2011,9 @@ def DeclareReductionOp : OpenMP_Op<"declare_reduction", [IsolatedFromAbove, }]; let arguments = (ins SymbolNameAttr:$sym_name, - TypeAttr:$type); + TypeAttr:$type, + OptionalAttr<TypeAttr>:$byref_element_type + ); let regions = (region MaxSizedRegion<1>:$allocRegion, AnyRegion:$initializerRegion, diff --git a/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp b/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp index 460595ba9f254..6423d49859c97 100644 --- a/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp +++ b/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp @@ -188,7 +188,8 @@ createDecl(PatternRewriter &builder, SymbolTable &symbolTable, OpBuilder::InsertionGuard guard(builder); Type type = reduce.getOperands()[reductionIndex].getType(); auto decl = omp::DeclareReductionOp::create(builder, 
reduce.getLoc(), - "__scf_reduction", type); + "__scf_reduction", type, + /*byref_element_type=*/{}); symbolTable.insert(decl); builder.createBlock(&decl.getInitializerRegion(), diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 8edec990eaaba..d0852b52f4193 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -1311,7 +1311,8 @@ static void collectReductionInfo( SmallVectorImpl<OwningReductionGen> &owningReductionGens, SmallVectorImpl<OwningAtomicReductionGen> &owningAtomicReductionGens, const ArrayRef<llvm::Value *> privateReductionVariables, - SmallVectorImpl<llvm::OpenMPIRBuilder::ReductionInfo> &reductionInfos) { + SmallVectorImpl<llvm::OpenMPIRBuilder::ReductionInfo> &reductionInfos, + ArrayRef<bool> isByRef) { unsigned numReductions = loop.getNumReductionVars(); for (unsigned i = 0; i < numReductions; ++i) { @@ -1329,12 +1330,27 @@ static void collectReductionInfo( atomicGen = owningAtomicReductionGens[i]; llvm::Value *variable = moduleTranslation.lookupValue(loop.getReductionVars()[i]); + mlir::Type allocatedType; + reductionDecls[i].getAllocRegion().walk([&](mlir::Operation *op) { + if (auto alloca = mlir::dyn_cast<LLVM::AllocaOp>(op)) { + allocatedType = alloca.getElemType(); + return mlir::WalkResult::interrupt(); + } + + return mlir::WalkResult::advance(); + }); + reductionInfos.push_back( {moduleTranslation.convertType(reductionDecls[i].getType()), variable, privateReductionVariables[i], /*EvaluationKind=*/llvm::OpenMPIRBuilder::EvalKind::Scalar, owningReductionGens[i], - /*ReductionGenClang=*/nullptr, atomicGen}); + /*ReductionGenClang=*/nullptr, atomicGen, + allocatedType ? moduleTranslation.convertType(allocatedType) : nullptr, + reductionDecls[i].getByrefElementType() + ? 
moduleTranslation.convertType( + *reductionDecls[i].getByrefElementType()) + : nullptr}); } } @@ -1400,7 +1416,7 @@ static LogicalResult createReductionsAndCleanup( // ReductionInfo only accepts references to the generators. collectReductionInfo(op, builder, moduleTranslation, reductionDecls, owningReductionGens, owningAtomicReductionGens, - privateReductionVariables, reductionInfos); + privateReductionVariables, reductionInfos, isByRef); // The call to createReductions below expects the block to have a // terminator. Create an unreachable instruction to serve as terminator @@ -2732,7 +2748,7 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, SmallVector<llvm::OpenMPIRBuilder::ReductionInfo> reductionInfos; collectReductionInfo(opInst, builder, moduleTranslation, reductionDecls, owningReductionGens, owningAtomicReductionGens, - privateReductionVariables, reductionInfos); + privateReductionVariables, reductionInfos, isByRef); // Move to region cont block builder.SetInsertPoint((*regionBlock)->getTerminator()); diff --git a/mlir/test/Target/LLVMIR/allocatable_gpu_reduction.mlir b/mlir/test/Target/LLVMIR/allocatable_gpu_reduction.mlir new file mode 100644 index 0000000000000..af3f5e68b6ddb --- /dev/null +++ b/mlir/test/Target/LLVMIR/allocatable_gpu_reduction.mlir @@ -0,0 +1,92 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +module attributes {dlti.dl_spec = #dlti.dl_spec<"dlti.alloca_memory_space" = 5 : ui64, "dlti.global_memory_space" = 1 : ui64>, llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_gpu = true, omp.is_target_device = true} { + omp.private {type = private} @_QFfooEi_private_i32 : i32 + omp.declare_reduction @add_reduction_byref_box_heap_f32 : !llvm.ptr attributes {byref_element_type = f32} alloc { + %0 = llvm.mlir.constant(1 : i64) : i64 + %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> : (i64) -> !llvm.ptr<5> + %2 = llvm.addrspacecast %1 : !llvm.ptr<5> to !llvm.ptr + omp.yield(%2 : !llvm.ptr) + 
} init { + ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr): + omp.yield(%arg1 : !llvm.ptr) + } combiner { + ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr): + %0 = llvm.mlir.constant(1 : i32) : i32 + %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr<5> + %2 = llvm.addrspacecast %1 : !llvm.ptr<5> to !llvm.ptr + %3 = llvm.mlir.constant(1 : i32) : i32 + %4 = llvm.alloca %3 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr<5> + %5 = llvm.addrspacecast %4 : !llvm.ptr<5> to !llvm.ptr + %6 = llvm.mlir.constant(24 : i32) : i32 + "llvm.intr.memcpy"(%5, %arg0, %6) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> () + %7 = llvm.mlir.constant(24 : i32) : i32 + "llvm.intr.memcpy"(%2, %arg1, %7) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> () + %8 = llvm.getelementptr %5[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + %9 = llvm.load %8 : !llvm.ptr -> !llvm.ptr + %10 = llvm.getelementptr %2[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + %11 = llvm.load %10 : !llvm.ptr -> !llvm.ptr + %12 = llvm.load %9 : !llvm.ptr -> f32 + %13 = llvm.load %11 : !llvm.ptr -> f32 + %14 = llvm.fadd %12, %13 {fastmathFlags = #llvm.fastmath<contract>} : f32 + llvm.store %14, %9 : f32, !llvm.ptr + omp.yield(%arg0 : !llvm.ptr) + } + llvm.func @foo_() { + %0 = llvm.mlir.constant(1 : i64) : i64 + %4 = llvm.alloca %0 x i1 : (i64) -> !llvm.ptr<5> + %5 = llvm.addrspacecast %4 : !llvm.ptr<5> to !llvm.ptr + %8 = llvm.getelementptr %5[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + %9 = omp.map.info var_ptr(%5 : !llvm.ptr, f32) map_clauses(implicit, tofrom) capture(ByRef) var_ptr_ptr(%8 : !llvm.ptr) -> !llvm.ptr {name = ""} + %10 = omp.map.info var_ptr(%5 : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>) map_clauses(always, implicit, descriptor, to) capture(ByRef) members(%9 : [0] : !llvm.ptr) -> 
!llvm.ptr {name = "scalar_alloc"} + omp.target map_entries(%10 -> %arg0 : !llvm.ptr) { + %13 = llvm.mlir.constant(1000 : i32) : i32 + %14 = llvm.mlir.constant(1 : i32) : i32 + omp.parallel { + omp.wsloop reduction(byref @add_reduction_byref_box_heap_f32 %arg0 -> %arg4 : !llvm.ptr) { + omp.loop_nest (%arg5) : i32 = (%14) to (%13) inclusive step (%14) { + omp.yield + } + } + omp.terminator + } + omp.terminator + } + llvm.return + } +} + +// CHECK: define {{.*}} @_omp_reduction_shuffle_and_reduce_func({{.*}}) {{.*}} { +// CHECK: %[[REMOTE_RED_LIST:.omp.reduction.remote_reduce_list]] = alloca [1 x ptr], align 8, addrspace(5) +// CHECK: %[[RED_ELEM:.omp.reduction.element]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8, addrspace(5) +// CHECK: %[[RED_ELEM_1:.*]] = addrspacecast ptr addrspace(5) %[[RED_ELEM]] to ptr + +// CHECK: %[[SHUFFLE_ELEM:.*]] = alloca float, align 4, addrspace(5) +// CHECK: %[[REMOTE_RED_LIST_ASCAST:.*]] = addrspacecast ptr addrspace(5) %[[REMOTE_RED_LIST]] to ptr + +// CHECK: %[[REMOTE_RED_LIST_ELEM0:.*]] = getelementptr inbounds [1 x ptr], ptr %[[REMOTE_RED_LIST_ASCAST]], i64 0, i64 0 + +// CHECK: %[[SHUFFLE_ELEM_ASCAST:.*]] = addrspacecast ptr addrspace(5) %[[SHUFFLE_ELEM]] to ptr +// CHECK: %[[SHUFFLE_RES:.*]] = call i32 @__kmpc_shuffle_int32({{.*}}) +// CHECK: store i32 %[[SHUFFLE_RES]], ptr %[[SHUFFLE_ELEM_ASCAST]], align 4 + +// CHECK: %[[RED_ELEM_ASCAST:.*]] = addrspacecast ptr addrspace(5) %[[RED_ELEM]] to ptr +// CHECK: %[[RED_ALLOC_PTR:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[RED_ELEM_ASCAST]], i32 0, i32 0 +// CHECK: %[[SHUFFLE_ELEM_ASCAST:.*]] = addrspacecast ptr addrspace(5) %[[SHUFFLE_ELEM]] to ptr +// CHECK: store ptr %[[SHUFFLE_ELEM_ASCAST]], ptr %[[RED_ALLOC_PTR]], align 8 +// CHECK: store ptr %[[RED_ELEM_1]], ptr %[[REMOTE_RED_LIST_ELEM0]], align 8 +// CHECK: } + +// CHECK: define {{.*}} @_omp_reduction_inter_warp_copy_func({{.*}}) {{.*}} { +// CHECK: %[[WARP_MASTER_CMP:.*]] = icmp eq i32 
%nvptx_lane_id, 0 +// CHECK: br i1 %[[WARP_MASTER_CMP]], label %[[WARP_MASTER_BB:.*]], label %{{.*}} + +// CHECK: [[WARP_MASTER_BB]]: +// CHECK: %[[WARP_RESULT_PTR:.*]] = getelementptr inbounds [1 x ptr], ptr %{{.*}}, i64 0, i64 0 +// CHECK: %[[WARP_RESULT:.*]] = load ptr, ptr %[[WARP_RESULT_PTR]], align 8 +// CHECK: %[[ALLOC_MEM_PTR:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[WARP_RESULT]], i32 0, i32 0 +// CHECK: %[[ALLOC_MEM:.*]] = load ptr, ptr %[[ALLOC_MEM_PTR]], align 8 +// CHECK: %[[WARP_TRANSFER_SLOT:.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 %nvptx_warp_id +// CHECK: %[[WARP_RED_RES:.*]] = load i32, ptr %[[ALLOC_MEM]], align 4 +// CHECK: store volatile i32 %[[WARP_RED_RES]], ptr addrspace(3) %[[WARP_TRANSFER_SLOT]], align 4 +// CHECK: } diff --git a/mlir/test/Target/LLVMIR/omptarget-multi-block-reduction.mlir b/mlir/test/Target/LLVMIR/omptarget-multi-block-reduction.mlir index 87ff0ba786648..08a738c8fe4c6 100644 --- a/mlir/test/Target/LLVMIR/omptarget-multi-block-reduction.mlir +++ b/mlir/test/Target/LLVMIR/omptarget-multi-block-reduction.mlir @@ -7,7 +7,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<"dlti.alloca_memory_space" = 5 : llvm.func @bar() {} llvm.func @baz() {} - omp.declare_reduction @add_reduction_byref_box_5xf32 : !llvm.ptr alloc { + omp.declare_reduction @add_reduction_byref_box_5xf32 : !llvm.ptr attributes {byref_element_type = !llvm.array<5 x f32>} alloc { %0 = llvm.mlir.constant(1 : i64) : i64 %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> : (i64) -> !llvm.ptr<5> %2 = llvm.addrspacecast %1 : !llvm.ptr<5> to !llvm.ptr @@ -67,9 +67,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<"dlti.alloca_memory_space" = 5 : // CHECK: br label %[[CONT_BB:.*]] // CHECK: [[CONT_BB]]: -// CHECK-NEXT: %[[RED_RHS:.*]] = phi ptr [ %final.rhs, %{{.*}} ] -// CHECK-NEXT: store ptr %[[RED_RHS]], ptr %{{.*}}, 
align 8 -// CHECK-NEXT: br label %.omp.reduction.done +// CHECK-NEXT: %[[RED_RHS:.*]] = phi ptr [ %{{.*}}, %{{.*}} ] // CHECK: } // CHECK: define internal void @"{{.*}}$reduction$reduction_func"(ptr noundef %0, ptr noundef %1) #0 { diff --git a/mlir/test/Target/LLVMIR/omptarget-multi-reduction.mlir b/mlir/test/Target/LLVMIR/omptarget-multi-reduction.mlir index b8b7c780a74d0..8950db3fc48aa 100644 --- a/mlir/test/Target/LLVMIR/omptarget-multi-reduction.mlir +++ b/mlir/test/Target/LLVMIR/omptarget-multi-reduction.mlir @@ -109,19 +109,19 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo // CHECK: icmp eq i32 %[[MASTER]], 1 // CHECK: i1 %{{.+}}, label %[[THEN:[A-Za-z0-9_.]*]], label %[[DONE:[A-Za-z0-9_.]*]] // CHECK: [[THEN]]: -// CHECK-NEXT: %[[FINAL_RHS0:[A-Za-z0-9_.]*]] = load double // CHECK-NEXT: %[[FINAL_LHS0:[A-Za-z0-9_.]*]] = load double +// CHECK-NEXT: %[[FINAL_RHS0:[A-Za-z0-9_.]*]] = load double // CHECK-NEXT: %[[FINAL_RESULT0:[A-Za-z0-9_.]*]] = fadd contract double %[[FINAL_LHS0]], %[[FINAL_RHS0]] // CHECK-NEXT: store double %[[FINAL_RESULT0]] -// CHECK-NEXT: %[[FINAL_RHS1:[A-Za-z0-9_.]*]] = load double // CHECK-NEXT: %[[FINAL_LHS1:[A-Za-z0-9_.]*]] = load double +// CHECK-NEXT: %[[FINAL_RHS1:[A-Za-z0-9_.]*]] = load double // CHECK-NEXT: %[[FINAL_RESULT1:[A-Za-z0-9_.]*]] = fadd contract double %[[FINAL_LHS1]], %[[FINAL_RHS1]] // CHECK-NEXT: store double %[[FINAL_RESULT1]] -// CHECK-NEXT: %[[FINAL_RHS2:[A-Za-z0-9_.]*]] = load float // CHECK-NEXT: %[[FINAL_LHS2:[A-Za-z0-9_.]*]] = load float +// CHECK-NEXT: %[[FINAL_RHS2:[A-Za-z0-9_.]*]] = load float // CHECK-NEXT: %[[FINAL_RESULT2:[A-Za-z0-9_.]*]] = fadd contract float %[[FINAL_LHS2]], %[[FINAL_RHS2]] // CHECK-NEXT: store float %[[FINAL_RESULT2]] -// CHECK-NEXT: %[[FINAL_RHS3:[A-Za-z0-9_.]*]] = load float // CHECK-NEXT: %[[FINAL_LHS3:[A-Za-z0-9_.]*]] = load float +// CHECK-NEXT: %[[FINAL_RHS3:[A-Za-z0-9_.]*]] = load float // CHECK-NEXT: %[[FINAL_RESULT3:[A-Za-z0-9_.]*]] = fadd 
contract float %[[FINAL_LHS3]], %[[FINAL_RHS3]] // CHECK-NEXT: store float %[[FINAL_RESULT3]] diff --git a/mlir/test/Target/LLVMIR/omptarget-teams-distribute-reduction.mlir b/mlir/test/Target/LLVMIR/omptarget-teams-distribute-reduction.mlir index 9aba72dabf13c..b7cb1026967f3 100644 --- a/mlir/test/Target/LLVMIR/omptarget-teams-distribute-reduction.mlir +++ b/mlir/test/Target/LLVMIR/omptarget-teams-distribute-reduction.mlir @@ -59,8 +59,8 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo // CHECK: call void @__kmpc_barrier // CHECK: [[THEN]]: -// CHECK-NEXT: %[[FINAL_RHS:[A-Za-z0-9_.]*]] = load i32 // CHECK-NEXT: %[[FINAL_LHS:[A-Za-z0-9_.]*]] = load i32 +// CHECK-NEXT: %[[FINAL_RHS:[A-Za-z0-9_.]*]] = load i32 // CHECK-NEXT: %[[FINAL_RESULT:[A-Za-z0-9_.]*]] = add i32 %[[FINAL_LHS]], %[[FINAL_RHS]] // CHECK-NEXT: store i32 %[[FINAL_RESULT]] diff --git a/mlir/test/Target/LLVMIR/omptarget-teams-reduction.mlir b/mlir/test/Target/LLVMIR/omptarget-teams-reduction.mlir index dc22fe11666cf..36eb280dfcfa2 100644 --- a/mlir/test/Target/LLVMIR/omptarget-teams-reduction.mlir +++ b/mlir/test/Target/LLVMIR/omptarget-teams-reduction.mlir @@ -62,8 +62,8 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo // CHECK: icmp eq i32 %[[MASTER]], 1 // CHECK: i1 %{{.+}}, label %[[THEN:[A-Za-z0-9_.]*]], label %[[DONE:[A-Za-z0-9_.]*]] // CHECK: [[THEN]]: -// CHECK-NEXT: %[[FINAL_RHS:[A-Za-z0-9_.]*]] = load i32 // CHECK-NEXT: %[[FINAL_LHS:[A-Za-z0-9_.]*]] = load i32 +// CHECK-NEXT: %[[FINAL_RHS:[A-Za-z0-9_.]*]] = load i32 // CHECK-NEXT: %[[FINAL_RESULT:[A-Za-z0-9_.]*]] = add i32 %[[FINAL_LHS]], %[[FINAL_RHS]] // CHECK-NEXT: store i32 %[[FINAL_RESULT]] >From 0c9f4d9493708f61a53ec675f8256868f2084a74 Mon Sep 17 00:00:00 2001 From: ergawy <[email protected]> Date: Wed, 5 Nov 2025 07:19:23 -0600 Subject: [PATCH 2/4] review comments, Michael --- llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h | 4 ++-- 
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h index dfffd5fa398f7..803c3c5e30eed 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -1503,6 +1503,7 @@ class OpenMPIRBuilder { /// reduction op's `alloc` region. For example, for allocatables and arrays, /// this type would be the descriptor/box struct. Type *ByRefAllocatedType; + /// * The by-ref element type is the type of the actual storage needed for /// the data of the allocatable or array. For example, an float allocatable /// of would need some float storage to store intermediate reduction @@ -2060,8 +2061,7 @@ class OpenMPIRBuilder { /// variable. /// \param IsNoWait Optional flag set if the reduction is /// marked as nowait. - /// \param IsByRef For each reduction clause, whether the reduction is by-ref - /// or not. + /// \param IsByRef For each reduction clause, whether the reduction is by-ref. /// \param IsTeamsReduction Optional flag set if it is a teams /// reduction. /// \param GridValue Optional GPU grid value. 
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index b7410efa70e47..1d518d6ecdd41 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -2633,7 +2633,7 @@ void OpenMPIRBuilder::emitReductionListCopy( ShuffleSrcAddr = Builder.CreateLoad(Builder.getPtrTy(), ShuffleSrcAddr); { - auto OldIP = Builder.saveIP(); + InsertPointTy OldIP = Builder.saveIP(); Builder.restoreIP(AllocaIP); LocalStorage = Builder.CreateAlloca(ShuffleType); @@ -3735,13 +3735,13 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU( RedArrayTy, ReductionList, {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())}); - auto *PrviateVar = RI.PrivateVariable; + Value *PrivateVar = RI.PrivateVariable; bool IsByRefElem = !IsByRef.empty() && IsByRef[En.index()]; if (IsByRefElem) - PrviateVar = Builder.CreateLoad(RI.ElementType, PrviateVar); + PrivateVar = Builder.CreateLoad(RI.ElementType, PrivateVar); Value *CastElem = - Builder.CreatePointerBitCastOrAddrSpaceCast(PrviateVar, PtrTy); + Builder.CreatePointerBitCastOrAddrSpaceCast(PrivateVar, PtrTy); Builder.CreateStore(CastElem, ElemPtr); } CodeGenIP = Builder.saveIP(); >From 7cf435dcd77a3477c373a7a722fd54d4430f2a46 Mon Sep 17 00:00:00 2001 From: ergawy <[email protected]> Date: Thu, 6 Nov 2025 03:37:23 -0600 Subject: [PATCH 3/4] review comments, Tom --- flang/lib/Lower/Support/ReductionProcessor.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/flang/lib/Lower/Support/ReductionProcessor.cpp b/flang/lib/Lower/Support/ReductionProcessor.cpp index 1bce6b3b3c832..bd161cab0a11c 100644 --- a/flang/lib/Lower/Support/ReductionProcessor.cpp +++ b/flang/lib/Lower/Support/ReductionProcessor.cpp @@ -578,11 +578,10 @@ OpType ReductionProcessor::createDeclareReduction( // variable, `real` should be stored. 
mlir::TypeAttr boxedTy{}; - if (!isByRef) - type = valTy; - if (isByRef) boxedTy = mlir::TypeAttr::get(fir::unwrapPassByRefType(valTy)); + else + type = valTy; decl = OpType::create(modBuilder, loc, reductionOpName, type, boxedTy); createReductionAllocAndInitRegions(converter, loc, decl, redId, type, >From 7da2821570cd785151680f76469b708e995c316f Mon Sep 17 00:00:00 2001 From: ergawy <[email protected]> Date: Mon, 24 Nov 2025 00:32:17 -0600 Subject: [PATCH 4/4] Add `data_ptr_ptr` region to `declare_reduction` op. --- clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp | 3 +- .../include/flang/Optimizer/Dialect/FIROps.td | 9 +- .../lib/Lower/Support/ReductionProcessor.cpp | 44 ++++++++-- .../llvm/Frontend/OpenMP/OMPIRBuilder.h | 16 +++- llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 82 ++++++++++++------- .../Frontend/OpenMPIRBuilderTest.cpp | 15 ++-- mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td | 13 ++- .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 37 +++++++++ .../LLVMIR/allocatable_gpu_reduction.mlir | 5 ++ .../omptarget-multi-block-reduction.mlir | 5 ++ 10 files changed, 182 insertions(+), 47 deletions(-) diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp index 766e990e5e677..8a2b670403c31 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -1780,7 +1780,8 @@ void CGOpenMPRuntimeGPU::emitReduction( }; ReductionInfos.emplace_back(llvm::OpenMPIRBuilder::ReductionInfo( ElementType, Variable, PrivateVariable, EvalKind, - /*ReductionGen=*/nullptr, ReductionGen, AtomicReductionGen)); + /*ReductionGen=*/nullptr, ReductionGen, AtomicReductionGen, + /*DataPtrPtrGen=*/nullptr)); Idx++; } diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td index 392456d766c3c..115fcb5f82e60 100644 --- a/flang/include/flang/Optimizer/Dialect/FIROps.td +++ b/flang/include/flang/Optimizer/Dialect/FIROps.td @@ -3782,6 +3782,9 @@ def fir_DeclareReductionOp 
: fir_Op<"declare_reduction", [IsolatedFromAbove, allocated by the initializer region. The region has an argument that contains the value of the thread-local reduction accumulator. This will be executed after the reduction has completed. + 6. The DataPtrPtr region specifies how to access the base address of a + boxed value. This is used, in particular, for GPU reductions in order + to know where partial reduction results are stored in remote lanes. Note that the MLIR type system does not allow for type-polymorphic reductions. Separate reduction declarations should be created for different @@ -3799,14 +3802,16 @@ def fir_DeclareReductionOp : fir_Op<"declare_reduction", [IsolatedFromAbove, AnyRegion:$initializerRegion, AnyRegion:$reductionRegion, AnyRegion:$atomicReductionRegion, - AnyRegion:$cleanupRegion); + AnyRegion:$cleanupRegion, + AnyRegion:$dataPtrPtrRegion); let assemblyFormat = "$sym_name `:` $type attr-dict-with-keyword " "( `alloc` $allocRegion^ )? " "`init` $initializerRegion " "`combiner` $reductionRegion " "( `atomic` $atomicReductionRegion^ )? " - "( `cleanup` $cleanupRegion^ )? "; + "( `cleanup` $cleanupRegion^ )? " + "( `data_ptr_ptr` $dataPtrPtrRegion^ )? "; let extraClassDeclaration = [{ mlir::BlockArgument getAllocMoldArg() { diff --git a/flang/lib/Lower/Support/ReductionProcessor.cpp b/flang/lib/Lower/Support/ReductionProcessor.cpp index bd161cab0a11c..3cbf0558f5e08 100644 --- a/flang/lib/Lower/Support/ReductionProcessor.cpp +++ b/flang/lib/Lower/Support/ReductionProcessor.cpp @@ -573,17 +573,20 @@ OpType ReductionProcessor::createDeclareReduction( mlir::OpBuilder modBuilder(module.getBodyRegion()); mlir::Type valTy = fir::unwrapRefType(type); + // For by-ref reductions, we want to keep track of the // boxed/referenced/allocated type. For example, for a `real, allocatable` // variable, `real` should be stored.
- mlir::TypeAttr boxedTy{}; + mlir::TypeAttr boxedTyAttr{}; + mlir::Type boxedTy; - if (isByRef) - boxedTy = mlir::TypeAttr::get(fir::unwrapPassByRefType(valTy)); - else + if (isByRef) { + boxedTy = fir::unwrapPassByRefType(valTy); + boxedTyAttr = mlir::TypeAttr::get(boxedTy); + } else type = valTy; - decl = OpType::create(modBuilder, loc, reductionOpName, type, boxedTy); + decl = OpType::create(modBuilder, loc, reductionOpName, type, boxedTyAttr); createReductionAllocAndInitRegions(converter, loc, decl, redId, type, isByRef); @@ -596,6 +599,37 @@ OpType ReductionProcessor::createDeclareReduction( mlir::Value op2 = decl.getReductionRegion().front().getArgument(1); genCombiner<OpType>(builder, loc, redId, type, op1, op2, isByRef); + if (isByRef && fir::isa_box_type(valTy)) { + bool isBoxReductionSupported = [&]() { + auto offloadMod = llvm::dyn_cast<mlir::omp::OffloadModuleInterface>( + *builder.getModule()); + + // This check tests the implementation status on the GPU. Box reductions + // are fully supported on the CPU. + if (!offloadMod.getIsGPU()) + return true; + + auto seqTy = mlir::dyn_cast<fir::SequenceType>(boxedTy); + + // Dynamically-shaped arrays are not supported yet on the GPU. 
+ return !seqTy || !fir::sequenceWithNonConstantShape(seqTy); + }(); + + if (!isBoxReductionSupported) { + TODO(loc, "Reductions of dynamically-shaped arrays are not supported yet " + "on the GPU."); + } + + mlir::Region &dataPtrPtrRegion = decl.getDataPtrPtrRegion(); + mlir::Block &dataAddrBlock = *builder.createBlock( + &dataPtrPtrRegion, dataPtrPtrRegion.end(), {type}, {loc}); + builder.setInsertionPointToEnd(&dataAddrBlock); + mlir::Value boxRefOperand = dataAddrBlock.getArgument(0); + mlir::Value baseAddrOffset = fir::BoxOffsetOp::create( + builder, loc, boxRefOperand, fir::BoxFieldAttr::base_addr); + genYield<OpType>(builder, loc, baseAddrOffset); + } + return decl; } diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h index 803c3c5e30eed..7b097d1ac0ee0 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -1446,6 +1446,9 @@ class OpenMPIRBuilder { using ReductionGenAtomicCBTy = std::function<InsertPointOrErrorTy( InsertPointTy, Type *, Value *, Value *)>; + using ReductionGenDataPtrPtrCBTy = std::function<InsertPointOrErrorTy( + InsertPointTy, Value *ByRefVal, Value *&Res)>; + /// Enum class for reduction evaluation types scalar, complex and aggregate.
enum class EvalKind { Scalar, Complex, Aggregate }; @@ -1455,18 +1458,21 @@ class OpenMPIRBuilder { EvalKind EvaluationKind, ReductionGenCBTy ReductionGen, ReductionGenClangCBTy ReductionGenClang, ReductionGenAtomicCBTy AtomicReductionGen, + ReductionGenDataPtrPtrCBTy DataPtrPtrGen, Type *ByRefAllocatedType = nullptr, Type *ByRefElementType = nullptr) : ElementType(ElementType), Variable(Variable), PrivateVariable(PrivateVariable), EvaluationKind(EvaluationKind), ReductionGen(ReductionGen), ReductionGenClang(ReductionGenClang), - AtomicReductionGen(AtomicReductionGen), + AtomicReductionGen(AtomicReductionGen), DataPtrPtrGen(DataPtrPtrGen), ByRefAllocatedType(ByRefAllocatedType), ByRefElementType(ByRefElementType) {} + ReductionInfo(Value *PrivateVariable) : ElementType(nullptr), Variable(nullptr), PrivateVariable(PrivateVariable), EvaluationKind(EvalKind::Scalar), - ReductionGen(), ReductionGenClang(), AtomicReductionGen() {} + ReductionGen(), ReductionGenClang(), AtomicReductionGen(), + DataPtrPtrGen() {} /// Reduction element type, must match pointee type of variable. For by-ref /// reductions, this would be just an opaque `ptr`. @@ -1497,6 +1503,8 @@ class OpenMPIRBuilder { /// along with the appropriate synchronization mechanisms. ReductionGenAtomicCBTy AtomicReductionGen; + ReductionGenDataPtrPtrCBTy DataPtrPtrGen; + /// For by-ref reductions, we need to keep track of 2 extra types that are /// potentially different: /// * The allocated type is the type of the storage allocated by the @@ -1558,7 +1566,7 @@ class OpenMPIRBuilder { /// Emit instructions to copy a Reduce list, which contains partially /// aggregated values, in the specified direction. - void emitReductionListCopy( + Error emitReductionListCopy( InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy, ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase, ArrayRef<bool> IsByRef, @@ -1639,7 +1647,7 @@ class OpenMPIRBuilder { /// or not. 
/// /// \return The ShuffleAndReduce function. - Function *emitShuffleAndReduceFunction( + Expected<Function *> emitShuffleAndReduceFunction( ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos, Function *ReduceFn, AttributeList FuncAttrs, ArrayRef<bool> IsByRef); diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 1d518d6ecdd41..c962368859730 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -2548,7 +2548,7 @@ void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr, } } -void OpenMPIRBuilder::emitReductionListCopy( +Error OpenMPIRBuilder::emitReductionListCopy( InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy, ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase, ArrayRef<bool> IsByRef, CopyOptionsTy CopyOptions) { @@ -2615,7 +2615,6 @@ void OpenMPIRBuilder::emitReductionListCopy( Type *ShuffleType = RI.ElementType; Value *ShuffleSrcAddr = SrcElementAddr; Value *ShuffleDestAddr = DestElementAddr; - Value *Zero = ConstantInt::get(Builder.getInt32Ty(), 0); AllocaInst *LocalStorage = nullptr; if (IsByRefElem) { @@ -2628,8 +2627,12 @@ void OpenMPIRBuilder::emitReductionListCopy( // pointer to the descriptor of the by-ref reduction element. 
ShuffleType = RI.ByRefElementType; - ShuffleSrcAddr = Builder.CreateGEP(RI.ByRefAllocatedType, - ShuffleSrcAddr, {Zero, Zero}); + InsertPointOrErrorTy GenResult = + RI.DataPtrPtrGen(Builder.saveIP(), ShuffleSrcAddr, ShuffleSrcAddr); + + if (!GenResult) + return GenResult.takeError(); + ShuffleSrcAddr = Builder.CreateLoad(Builder.getPtrTy(), ShuffleSrcAddr); { @@ -2646,11 +2649,16 @@ void OpenMPIRBuilder::emitReductionListCopy( RemoteLaneOffset, ReductionArrayTy, IsByRefElem); if (IsByRefElem) { - auto *GEP = - Builder.CreateGEP(RI.ByRefAllocatedType, - Builder.CreatePointerBitCastOrAddrSpaceCast( - DestAlloca, Builder.getPtrTy(), ".ascast"), - {Zero, Zero}); + Value *GEP; + InsertPointOrErrorTy GenResult = + RI.DataPtrPtrGen(Builder.saveIP(), + Builder.CreatePointerBitCastOrAddrSpaceCast( + DestAlloca, Builder.getPtrTy(), ".ascast"), + GEP); + + if (!GenResult) + return GenResult.takeError(); + Builder.CreateStore(Builder.CreatePointerBitCastOrAddrSpaceCast( LocalStorage, Builder.getPtrTy(), ".ascast"), GEP); @@ -2705,6 +2713,8 @@ void OpenMPIRBuilder::emitReductionListCopy( Builder.CreateStore(CastDestAddr, DestElementPtrAddr); } } + + return Error::success(); } Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction( @@ -2857,10 +2867,12 @@ Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction( Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr); if (IsByRefElem) { - Type *Int32Ty = Builder.getInt32Ty(); - Constant *Zero = ConstantInt::get(Int32Ty, 0); - ElemPtr = - Builder.CreateGEP(RI.ByRefAllocatedType, ElemPtr, {Zero, Zero}); + InsertPointOrErrorTy GenRes = + RI.DataPtrPtrGen(Builder.saveIP(), ElemPtr, ElemPtr); + + if (!GenRes) + return GenRes.takeError(); + ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtr); } @@ -2921,10 +2933,12 @@ Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction( Value *TargetElemPtr = TargetElemPtrVal; if (IsByRefElem) { - Type *Int32Ty = Builder.getInt32Ty(); - Constant *Zero = 
ConstantInt::get(Int32Ty, 0); - TargetElemPtr = Builder.CreateGEP(RI.ByRefAllocatedType, TargetElemPtr, - {Zero, Zero}); + InsertPointOrErrorTy GenRes = + RI.DataPtrPtrGen(Builder.saveIP(), TargetElemPtr, TargetElemPtr); + + if (!GenRes) + return GenRes.takeError(); + TargetElemPtr = Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtr); } @@ -2962,7 +2976,7 @@ Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction( return WcFunc; } -Function *OpenMPIRBuilder::emitShuffleAndReduceFunction( +Expected<Function *> OpenMPIRBuilder::emitShuffleAndReduceFunction( ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn, AttributeList FuncAttrs, ArrayRef<bool> IsByRef) { LLVMContext &Ctx = M.getContext(); @@ -3043,10 +3057,13 @@ Function *OpenMPIRBuilder::emitShuffleAndReduceFunction( // This loop iterates through the list of reduce elements and copies, // element by element, from a remote lane in the warp to RemoteReduceList, // hosted on the thread's stack. - emitReductionListCopy(AllocaIP, CopyAction::RemoteLaneToThread, - RedListArrayTy, ReductionInfos, ReduceList, - RemoteListAddrCast, IsByRef, - {RemoteLaneOffset, nullptr, nullptr}); + Error EmitRedLsCpRes = emitReductionListCopy( + AllocaIP, CopyAction::RemoteLaneToThread, RedListArrayTy, ReductionInfos, + ReduceList, RemoteListAddrCast, IsByRef, + {RemoteLaneOffset, nullptr, nullptr}); + + if (EmitRedLsCpRes) + return EmitRedLsCpRes; // The actions to be performed on the Remote Reduce list is dependent // on the algorithm version. 
@@ -3114,9 +3131,14 @@ Function *OpenMPIRBuilder::emitShuffleAndReduceFunction( Builder.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB); emitBlock(CpyThenBB, Builder.GetInsertBlock()->getParent()); - emitReductionListCopy(AllocaIP, CopyAction::ThreadCopy, RedListArrayTy, - ReductionInfos, RemoteListAddrCast, ReduceList, - IsByRef); + + EmitRedLsCpRes = emitReductionListCopy( + AllocaIP, CopyAction::ThreadCopy, RedListArrayTy, ReductionInfos, + RemoteListAddrCast, ReduceList, IsByRef); + + if (EmitRedLsCpRes) + return EmitRedLsCpRes; + Builder.CreateBr(CpyMergeBB); emitBlock(CpyElseBB, Builder.GetInsertBlock()->getParent()); @@ -3745,8 +3767,12 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU( Builder.CreateStore(CastElem, ElemPtr); } CodeGenIP = Builder.saveIP(); - Function *SarFunc = emitShuffleAndReduceFunction( + Expected<Function *> SarFunc = emitShuffleAndReduceFunction( ReductionInfos, ReductionFunc, FuncAttrs, IsByRef); + + if (!SarFunc) + return SarFunc.takeError(); + Expected<Function *> CopyResult = emitInterWarpCopyFunction(Loc, ReductionInfos, FuncAttrs, IsByRef); if (!CopyResult) @@ -3768,7 +3794,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU( Builder.getInt64(MaxDataSize * ReductionInfos.size()); if (!IsTeamsReduction) { Value *SarFuncCast = - Builder.CreatePointerBitCastOrAddrSpaceCast(SarFunc, FuncPtrTy); + Builder.CreatePointerBitCastOrAddrSpaceCast(*SarFunc, FuncPtrTy); Value *WcFuncCast = Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, FuncPtrTy); Value *Args[] = {SrcLocInfo, ReductionDataSize, RL, SarFuncCast, @@ -3800,7 +3826,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU( Builder.getInt32(ReductionBufNum), ReductionDataSize, RL, - SarFunc, + *SarFunc, WcFunc, LtGCFunc, LtGRFunc, diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp index 0b3ae643e1494..a479606d10e2d 100644 --- 
a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp +++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp @@ -5298,10 +5298,12 @@ TEST_F(OpenMPIRBuilderTest, CreateReductions) { OpenMPIRBuilder::ReductionInfo ReductionInfos[] = { {SumType, SumReduced, SumPrivatized, /*EvaluationKind=*/OpenMPIRBuilder::EvalKind::Scalar, sumReduction, - /*ReductionGenClang=*/nullptr, sumAtomicReduction}, + /*ReductionGenClang=*/nullptr, sumAtomicReduction, + /*DataPtrPtrGen=*/nullptr}, {XorType, XorReduced, XorPrivatized, /*EvaluationKind=*/OpenMPIRBuilder::EvalKind::Scalar, xorReduction, - /*ReductionGenClang=*/nullptr, xorAtomicReduction}}; + /*ReductionGenClang=*/nullptr, xorAtomicReduction, + /*DataPtrPtrGen=*/nullptr}}; OMPBuilder.Config.setIsGPU(false); bool ReduceVariableByRef[] = {false, false}; @@ -5536,7 +5538,8 @@ TEST_F(OpenMPIRBuilderTest, ScanReduction) { SmallVector<OpenMPIRBuilder::ReductionInfo> ReductionInfos = { {Builder.getFloatTy(), OrigVar, ScanVar, /*EvaluationKind=*/OpenMPIRBuilder::EvalKind::Scalar, sumReduction, - /*ReductionGenClang=*/nullptr, sumAtomicReduction}}; + /*ReductionGenClang=*/nullptr, sumAtomicReduction, + /*DataPtrPtrGen=*/nullptr}}; OpenMPIRBuilder::LocationDescription RedLoc({InputLoop->getAfterIP(), DL}); llvm::BasicBlock *Cont = splitBB(Builder, false, "omp.scan.loop.cont"); ASSERT_EXPECTED_INIT( @@ -5708,7 +5711,8 @@ TEST_F(OpenMPIRBuilderTest, CreateTwoReductions) { FirstBodyIP, FirstBodyAllocaIP, {{SumType, SumReduced, SumPrivatized, /*EvaluationKind=*/OpenMPIRBuilder::EvalKind::Scalar, sumReduction, - /*ReductionGenClang=*/nullptr, sumAtomicReduction}}, + /*ReductionGenClang=*/nullptr, sumAtomicReduction, + /*DataPtrPtrGen=*/nullptr}}, ReduceVariableByRef), Succeeded()); ASSERT_THAT_EXPECTED( @@ -5716,7 +5720,8 @@ TEST_F(OpenMPIRBuilderTest, CreateTwoReductions) { SecondBodyIP, SecondBodyAllocaIP, {{XorType, XorReduced, XorPrivatized, /*EvaluationKind=*/OpenMPIRBuilder::EvalKind::Scalar, xorReduction, - /*ReductionGenClang=*/nullptr, 
xorAtomicReduction}}, + /*ReductionGenClang=*/nullptr, xorAtomicReduction, + /*DataPtrPtrGen=*/nullptr}}, ReduceVariableByRef), Succeeded()); diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td index 386174a36d52c..056a252620b1e 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td @@ -2001,6 +2001,9 @@ def DeclareReductionOp : OpenMP_Op<"declare_reduction", [IsolatedFromAbove, allocated by the initializer region. The region has an argument that contains the value of the thread-local reduction accumulator. This will be executed after the reduction has completed. + 6. The DataPtrPtr region specifies how to access the base address of a + boxed value. This is used, in particular, for GPU reductions in order + to know where partial reduction results are stored in remote lanes. Note that the MLIR type system does not allow for type-polymorphic reductions. Separate reduction declarations should be created for different @@ -2019,14 +2022,16 @@ def DeclareReductionOp : OpenMP_Op<"declare_reduction", [IsolatedFromAbove, AnyRegion:$initializerRegion, AnyRegion:$reductionRegion, AnyRegion:$atomicReductionRegion, - AnyRegion:$cleanupRegion); + AnyRegion:$cleanupRegion, + AnyRegion:$dataPtrPtrRegion); let assemblyFormat = "$sym_name `:` $type attr-dict-with-keyword " "( `alloc` $allocRegion^ )? " "`init` $initializerRegion " "`combiner` $reductionRegion " "( `atomic` $atomicReductionRegion^ )? " - "( `cleanup` $cleanupRegion^ )? "; + "( `cleanup` $cleanupRegion^ )? " + "( `data_ptr_ptr` $dataPtrPtrRegion^ )? "; let extraClassDeclaration = [{ BlockArgument getAllocMoldArg() { @@ -2058,6 +2063,10 @@ def DeclareReductionOp : OpenMP_Op<"declare_reduction", [IsolatedFromAbove, auto &region = getCleanupRegion(); return region.empty() ? nullptr : region.getArgument(0); } + BlockArgument getDataPtrPtrRegionArg() { + auto &region = getDataPtrPtrRegion(); + return region.empty() ?
nullptr : region.getArgument(0); + } PointerLikeType getAccumulatorType() { if (getAtomicReductionRegion().empty()) diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index d0852b52f4193..5f2f15b4d9346 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -953,6 +953,9 @@ using OwningAtomicReductionGen = std::function<llvm::OpenMPIRBuilder::InsertPointOrErrorTy( llvm::OpenMPIRBuilder::InsertPointTy, llvm::Type *, llvm::Value *, llvm::Value *)>; +using OwningDataPtrPtrReductionGen = + std::function<llvm::OpenMPIRBuilder::InsertPointOrErrorTy( + llvm::OpenMPIRBuilder::InsertPointTy, llvm::Value *, llvm::Value *&)>; } // namespace /// Create an OpenMPIRBuilder-compatible reduction generator for the given @@ -1017,6 +1020,31 @@ makeAtomicReductionGen(omp::DeclareReductionOp decl, return atomicGen; } +static OwningDataPtrPtrReductionGen +makeRefDataPtrGen(omp::DeclareReductionOp decl, llvm::IRBuilderBase &builder, + LLVM::ModuleTranslation &moduleTranslation, bool isByRef) { + if (!isByRef) + return OwningDataPtrPtrReductionGen(); + + OwningDataPtrPtrReductionGen refDataPtrGen = + [&, decl](llvm::OpenMPIRBuilder::InsertPointTy insertPoint, + llvm::Value *byRefVal, llvm::Value *&result) mutable + -> llvm::OpenMPIRBuilder::InsertPointOrErrorTy { + moduleTranslation.mapValue(decl.getDataPtrPtrRegionArg(), byRefVal); + builder.restoreIP(insertPoint); + SmallVector<llvm::Value *> phis; + if (failed(inlineConvertOmpRegions(decl.getDataPtrPtrRegion(), + "omp.data_ptr_ptr.body", builder, + moduleTranslation, &phis))) + return llvm::createStringError( + "failed to inline `data_ptr_ptr` region of `omp.declare_reduction`"); + result = llvm::getSingleElement(phis); + return builder.saveIP(); + }; + + return refDataPtrGen; +} + /// Converts an OpenMP 'ordered' operation into LLVM IR 
using OpenMPIRBuilder. static LogicalResult convertOmpOrdered(Operation &opInst, llvm::IRBuilderBase &builder, @@ -1310,6 +1338,7 @@ static void collectReductionInfo( SmallVectorImpl<omp::DeclareReductionOp> &reductionDecls, SmallVectorImpl<OwningReductionGen> &owningReductionGens, SmallVectorImpl<OwningAtomicReductionGen> &owningAtomicReductionGens, + SmallVector<OwningDataPtrPtrReductionGen> &owningDataPtrPtrReductionGens, const ArrayRef<llvm::Value *> privateReductionVariables, SmallVectorImpl<llvm::OpenMPIRBuilder::ReductionInfo> &reductionInfos, ArrayRef<bool> isByRef) { @@ -1320,6 +1349,8 @@ static void collectReductionInfo( makeReductionGen(reductionDecls[i], builder, moduleTranslation)); owningAtomicReductionGens.push_back( makeAtomicReductionGen(reductionDecls[i], builder, moduleTranslation)); + owningDataPtrPtrReductionGens.push_back(makeRefDataPtrGen( + reductionDecls[i], builder, moduleTranslation, isByRef[i])); } // Collect the reduction information. @@ -1346,6 +1377,7 @@ static void collectReductionInfo( /*EvaluationKind=*/llvm::OpenMPIRBuilder::EvalKind::Scalar, owningReductionGens[i], /*ReductionGenClang=*/nullptr, atomicGen, + owningDataPtrPtrReductionGens[i], allocatedType ? moduleTranslation.convertType(allocatedType) : nullptr, reductionDecls[i].getByrefElementType() ? moduleTranslation.convertType( @@ -1408,6 +1440,7 @@ static LogicalResult createReductionsAndCleanup( SmallVector<OwningReductionGen> owningReductionGens; SmallVector<OwningAtomicReductionGen> owningAtomicReductionGens; + SmallVector<OwningDataPtrPtrReductionGen> owningReductionGenRefDataPtrGens; SmallVector<llvm::OpenMPIRBuilder::ReductionInfo> reductionInfos; llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); @@ -1416,6 +1449,7 @@ static LogicalResult createReductionsAndCleanup( // ReductionInfo only accepts references to the generators. 
collectReductionInfo(op, builder, moduleTranslation, reductionDecls, owningReductionGens, owningAtomicReductionGens, + owningReductionGenRefDataPtrGens, privateReductionVariables, reductionInfos, isByRef); // The call to createReductions below expects the block to have a @@ -2745,9 +2779,12 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, // Collect reduction info SmallVector<OwningReductionGen> owningReductionGens; SmallVector<OwningAtomicReductionGen> owningAtomicReductionGens; + SmallVector<OwningDataPtrPtrReductionGen> + owningReductionGenRefDataPtrGens; SmallVector<llvm::OpenMPIRBuilder::ReductionInfo> reductionInfos; collectReductionInfo(opInst, builder, moduleTranslation, reductionDecls, owningReductionGens, owningAtomicReductionGens, + owningReductionGenRefDataPtrGens, privateReductionVariables, reductionInfos, isByRef); // Move to region cont block diff --git a/mlir/test/Target/LLVMIR/allocatable_gpu_reduction.mlir b/mlir/test/Target/LLVMIR/allocatable_gpu_reduction.mlir index af3f5e68b6ddb..df606150b760a 100644 --- a/mlir/test/Target/LLVMIR/allocatable_gpu_reduction.mlir +++ b/mlir/test/Target/LLVMIR/allocatable_gpu_reduction.mlir @@ -31,7 +31,12 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<"dlti.alloca_memory_space" = 5 : %14 = llvm.fadd %12, %13 {fastmathFlags = #llvm.fastmath<contract>} : f32 llvm.store %14, %9 : f32, !llvm.ptr omp.yield(%arg0 : !llvm.ptr) + } data_ptr_ptr { + ^bb0(%arg0: !llvm.ptr): + %0 = llvm.getelementptr %arg0[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + omp.yield(%0 : !llvm.ptr) } + llvm.func @foo_() { %0 = llvm.mlir.constant(1 : i64) : i64 %4 = llvm.alloca %0 x i1 : (i64) -> !llvm.ptr<5> diff --git a/mlir/test/Target/LLVMIR/omptarget-multi-block-reduction.mlir b/mlir/test/Target/LLVMIR/omptarget-multi-block-reduction.mlir index 08a738c8fe4c6..2a3628b4cee03 100644 --- a/mlir/test/Target/LLVMIR/omptarget-multi-block-reduction.mlir +++ 
b/mlir/test/Target/LLVMIR/omptarget-multi-block-reduction.mlir @@ -23,7 +23,12 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<"dlti.alloca_memory_space" = 5 : ^bb3: // pred: ^bb1 llvm.call @baz() : () -> () omp.yield(%arg0 : !llvm.ptr) + } data_ptr_ptr { + ^bb0(%arg0: !llvm.ptr): + %0 = llvm.getelementptr %arg0[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + omp.yield(%0 : !llvm.ptr) } + llvm.func @foo_() { %c1 = llvm.mlir.constant(1 : i64) : i64 %10 = llvm.alloca %c1 x !llvm.array<5 x f32> {bindc_name = "x"} : (i64) -> !llvm.ptr<5> _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
