Author: Kareem Ergawy
Date: 2025-11-26T11:59:22+01:00
New Revision: f481f5bef90d0dd10a9d885b2e4bbfd015e6606d
URL: https://github.com/llvm/llvm-project/commit/f481f5bef90d0dd10a9d885b2e4bbfd015e6606d
DIFF: https://github.com/llvm/llvm-project/commit/f481f5bef90d0dd10a9d885b2e4bbfd015e6606d.diff

LOG: [OpenMP][flang] Add initial support for by-ref reductions on the GPU (#165714)

Adds initial support for GPU by-ref reductions. The main problem for reduction
by reference is that, prior to this PR, we were shuffling (from remote lanes
within the same warp or across different warps within the block)
pointers/references to the private reduction values rather than the private
reduction values themselves.

In particular, this diff adds support for reductions on scalar allocatables
where the reduction happens on a loop nested inside a `target` region. For
example:
```fortran
integer :: i
real, allocatable :: scalar_alloc

allocate(scalar_alloc)
scalar_alloc = 0

!$omp target map(tofrom: scalar_alloc)
!$omp parallel do reduction(+: scalar_alloc)
do i = 1, 1000000
  scalar_alloc = scalar_alloc + 1
end do
!$omp end target
```

This PR supports by-ref reductions at the intra- and inter-warp levels. So far,
there are still steps to be taken for full support of by-ref reductions, for
example:
* Inter-block value combination is still not supported. Therefore,
  `target teams distribute parallel do` is still not supported.
* Support for dynamically-sized arrays still needs to be added.
* Support for more than one allocatable/array on the same `reduction` clause
  still needs to be added.

Added:
    mlir/test/Target/LLVMIR/allocatable_gpu_reduction.mlir

Modified:
    clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
    flang/include/flang/Optimizer/Dialect/FIROps.td
    flang/lib/Lower/Support/ReductionProcessor.cpp
    flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
    flang/test/Lower/OpenMP/delayed-privatization-reduction-byref.f90
    flang/test/Lower/OpenMP/parallel-reduction-allocatable-array.f90
    flang/test/Lower/OpenMP/parallel-reduction-array-lb.f90
    flang/test/Lower/OpenMP/parallel-reduction-array.f90
    flang/test/Lower/OpenMP/parallel-reduction-array2.f90
    flang/test/Lower/OpenMP/parallel-reduction-pointer-array.f90
    flang/test/Lower/OpenMP/parallel-reduction3.f90
    flang/test/Lower/OpenMP/reduction-array-intrinsic.f90
    flang/test/Lower/OpenMP/sections-array-reduction.f90
    flang/test/Lower/OpenMP/taskgroup-task-array-reduction.f90
    flang/test/Lower/OpenMP/wsloop-reduction-allocatable-array-minmax.f90
    flang/test/Lower/OpenMP/wsloop-reduction-allocatable.f90
    flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f90
    flang/test/Lower/OpenMP/wsloop-reduction-array-lb.f90
    flang/test/Lower/OpenMP/wsloop-reduction-array-lb2.f90
    flang/test/Lower/OpenMP/wsloop-reduction-array.f90
    flang/test/Lower/OpenMP/wsloop-reduction-array2.f90
    flang/test/Lower/OpenMP/wsloop-reduction-multiple-clauses.f90
    flang/test/Lower/OpenMP/wsloop-reduction-pointer.f90
    flang/test/Lower/do_concurrent_reduce_allocatable.f90
    llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
    llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
    llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
    mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
    mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp
    mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
    mlir/test/Target/LLVMIR/omptarget-multi-block-reduction.mlir
    mlir/test/Target/LLVMIR/omptarget-multi-reduction.mlir
    mlir/test/Target/LLVMIR/omptarget-teams-distribute-reduction.mlir
    mlir/test/Target/LLVMIR/omptarget-teams-reduction.mlir

Removed:

################################################################################
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp index 2f69a53787f0c..572d59edb99b2 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -1727,7 +1727,7 @@ void CGOpenMPRuntimeGPU::emitReduction( CGF.Builder.GetInsertPoint()); llvm::OpenMPIRBuilder::LocationDescription OmpLoc( CodeGenIP, CGF.SourceLocToDebugLoc(Loc)); - llvm::SmallVector<llvm::OpenMPIRBuilder::ReductionInfo> ReductionInfos; + llvm::SmallVector<llvm::OpenMPIRBuilder::ReductionInfo, 2> ReductionInfos; CodeGenFunction::OMPPrivateScope Scope(CGF); unsigned Idx = 0; @@ -1780,14 +1780,15 @@ void CGOpenMPRuntimeGPU::emitReduction( }; ReductionInfos.emplace_back(llvm::OpenMPIRBuilder::ReductionInfo( ElementType, Variable, PrivateVariable, EvalKind, - /*ReductionGen=*/nullptr, ReductionGen, AtomicReductionGen)); + /*ReductionGen=*/nullptr, ReductionGen, AtomicReductionGen, + /*DataPtrPtrGen=*/nullptr)); Idx++; } llvm::OpenMPIRBuilder::InsertPointTy AfterIP = cantFail(OMPBuilder.createReductionsGPU( - OmpLoc, AllocaIP, CodeGenIP, ReductionInfos, false, TeamsReduction, - llvm::OpenMPIRBuilder::ReductionGenCBKind::Clang, + OmpLoc, AllocaIP, CodeGenIP, ReductionInfos, /*IsByRef=*/{}, false, + TeamsReduction, llvm::OpenMPIRBuilder::ReductionGenCBKind::Clang, CGF.getTarget().getGridValue(), C.getLangOpts().OpenMPCUDAReductionBufNum, RTLoc)); CGF.Builder.restoreIP(AfterIP); diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td index d416d6c61f178..5d16b9816e318 100644 --- a/flang/include/flang/Optimizer/Dialect/FIROps.td +++ b/flang/include/flang/Optimizer/Dialect/FIROps.td @@ -3753,7 +3753,7 @@ def fir_DeclareReductionOp : fir_Op<"declare_reduction", [IsolatedFromAbove, duplication at the moment. TODO Combine both ops into one. See: https://discourse.llvm.org/t/dialect-for-data-locality-sharing-specifiers-clauses-in-openmp-openacc-and-do-concurrent/86108. - Declares a `do concurrent` reduction. This requires two mandatory and three + Declares a `do concurrent` reduction. This requires two mandatory and four optional regions. 1. The optional alloc region specifies how to allocate the thread-local @@ -3782,6 +3782,9 @@ def fir_DeclareReductionOp : fir_Op<"declare_reduction", [IsolatedFromAbove, allocated by the initializer region. The region has an argument that contains the value of the thread-local reduction accumulator. This will be executed after the reduction has completed. + 6. The DataPtrPtr region specifies how to access the base address of a + boxed-value. This is used, in particular, for GPU reductions in order + know where partial reduction results are stored in remote lanes. Note that the MLIR type system does not allow for type-polymorphic reductions. Separate reduction declarations should be created for diff erent @@ -3789,23 +3792,30 @@ def fir_DeclareReductionOp : fir_Op<"declare_reduction", [IsolatedFromAbove, For initializer and reduction regions, the operand to `fir.yield` must match the parent operation's results. + + * `$byref_element_type`: For by-ref reductions, we want to keep track of the + boxed/allocated type. For example, for a `real, allocatable` variable, + `real` should be stored in this attribute. 
}]; let arguments = (ins SymbolNameAttr:$sym_name, - TypeAttr:$type); + TypeAttr:$type, + OptionalAttr<TypeAttr>:$byref_element_type); let regions = (region MaxSizedRegion<1>:$allocRegion, AnyRegion:$initializerRegion, AnyRegion:$reductionRegion, AnyRegion:$atomicReductionRegion, - AnyRegion:$cleanupRegion); + AnyRegion:$cleanupRegion, + AnyRegion:$dataPtrPtrRegion); let assemblyFormat = "$sym_name `:` $type attr-dict-with-keyword " "( `alloc` $allocRegion^ )? " "`init` $initializerRegion " "`combiner` $reductionRegion " "( `atomic` $atomicReductionRegion^ )? " - "( `cleanup` $cleanupRegion^ )? "; + "( `cleanup` $cleanupRegion^ )? " + "( `data_ptr_ptr` $dataPtrPtrRegion^ )? "; let extraClassDeclaration = [{ mlir::BlockArgument getAllocMoldArg() { diff --git a/flang/lib/Lower/Support/ReductionProcessor.cpp b/flang/lib/Lower/Support/ReductionProcessor.cpp index 721cb45cd7d24..db8ad909b1d2f 100644 --- a/flang/lib/Lower/Support/ReductionProcessor.cpp +++ b/flang/lib/Lower/Support/ReductionProcessor.cpp @@ -572,10 +572,21 @@ DeclareRedType ReductionProcessor::createDeclareReductionHelper( mlir::OpBuilder modBuilder(module.getBodyRegion()); mlir::Type valTy = fir::unwrapRefType(type); - if (!isByRef) + + // For by-ref reductions, we want to keep track of the + // boxed/referenced/allocated type. For example, for a `real, allocatable` + // variable, `real` should be stored. + mlir::TypeAttr boxedTyAttr{}; + mlir::Type boxedTy; + + if (isByRef) { + boxedTy = fir::unwrapPassByRefType(valTy); + boxedTyAttr = mlir::TypeAttr::get(boxedTy); + } else type = valTy; - decl = DeclareRedType::create(modBuilder, loc, reductionOpName, type); + decl = DeclareRedType::create(modBuilder, loc, reductionOpName, type, + boxedTyAttr); createReductionAllocAndInitRegions(converter, loc, decl, genInitValueCB, type, isByRef); builder.createBlock(&decl.getReductionRegion(), @@ -585,6 +596,38 @@ DeclareRedType ReductionProcessor::createDeclareReductionHelper( mlir::Value op1 = decl.getReductionRegion().front().getArgument(0); mlir::Value op2 = decl.getReductionRegion().front().getArgument(1); genCombinerCB(builder, loc, type, op1, op2, isByRef); + + if (isByRef && fir::isa_box_type(valTy)) { + bool isBoxReductionSupported = [&]() { + auto offloadMod = llvm::dyn_cast<mlir::omp::OffloadModuleInterface>( + *builder.getModule()); + + // This check tests the implementation status on the GPU. Box reductions + // are fully supported on the CPU. + if (!offloadMod.getIsGPU()) + return true; + + auto seqTy = mlir::dyn_cast<fir::SequenceType>(boxedTy); + + // Dynamically-shaped arrays are not supported yet on the GPU. 
+ return !seqTy || !fir::sequenceWithNonConstantShape(seqTy); + }(); + + if (!isBoxReductionSupported) { + TODO(loc, "Reduction of dynamically-shaped arrays are not supported yet " + "on the GPU."); + } + + mlir::Region &dataPtrPtrRegion = decl.getDataPtrPtrRegion(); + mlir::Block &dataAddrBlock = *builder.createBlock( + &dataPtrPtrRegion, dataPtrPtrRegion.end(), {type}, {loc}); + builder.setInsertionPointToEnd(&dataAddrBlock); + mlir::Value boxRefOperand = dataAddrBlock.getArgument(0); + mlir::Value baseAddrOffset = fir::BoxOffsetOp::create( + builder, loc, boxRefOperand, fir::BoxFieldAttr::base_addr); + genYield<DeclareRedType>(builder, loc, baseAddrOffset); + } + return decl; } diff --git a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp index 9aad8cddc60a1..1012a9608aa27 100644 --- a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp +++ b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp @@ -848,7 +848,8 @@ class DoConcurrentConversion if (!ompReducer) { ompReducer = mlir::omp::DeclareReductionOp::create( rewriter, firReducer.getLoc(), ompReducerName, - firReducer.getTypeAttr().getValue()); + firReducer.getTypeAttr().getValue(), + firReducer.getByrefElementTypeAttr()); cloneFIRRegionToOMP(rewriter, firReducer.getAllocRegion(), ompReducer.getAllocRegion()); diff --git a/flang/test/Lower/OpenMP/delayed-privatization-reduction-byref.f90 b/flang/test/Lower/OpenMP/delayed-privatization-reduction-byref.f90 index 4b6a643f94059..4c7b6ac5f5f9b 100644 --- a/flang/test/Lower/OpenMP/delayed-privatization-reduction-byref.f90 +++ b/flang/test/Lower/OpenMP/delayed-privatization-reduction-byref.f90 @@ -22,7 +22,7 @@ subroutine red_and_delayed_private ! CHECK-SAME: @[[PRIVATIZER_SYM:.*]] : i32 ! CHECK-LABEL: omp.declare_reduction -! CHECK-SAME: @[[REDUCTION_SYM:.*]] : !fir.ref<i32> alloc +! CHECK-SAME: @[[REDUCTION_SYM:.*]] : !fir.ref<i32> attributes {byref_element_type = i32} alloc ! CHECK-LABEL: _QPred_and_delayed_private ! CHECK: omp.parallel diff --git a/flang/test/Lower/OpenMP/parallel-reduction-allocatable-array.f90 b/flang/test/Lower/OpenMP/parallel-reduction-allocatable-array.f90 index 41c7d69ebb3ba..f56875dcb518b 100644 --- a/flang/test/Lower/OpenMP/parallel-reduction-allocatable-array.f90 +++ b/flang/test/Lower/OpenMP/parallel-reduction-allocatable-array.f90 @@ -18,7 +18,7 @@ program reduce end program -! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_heap_Uxi32 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> alloc { +! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_heap_Uxi32 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> attributes {byref_element_type = !fir.array<?xi32>} alloc { ! CHECK: %[[VAL_10:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xi32>>> ! CHECK: omp.yield(%[[VAL_10]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) ! CHECK-LABEL: } init { diff --git a/flang/test/Lower/OpenMP/parallel-reduction-array-lb.f90 b/flang/test/Lower/OpenMP/parallel-reduction-array-lb.f90 index aa91e1e0e8b15..d9ba3bed464f8 100644 --- a/flang/test/Lower/OpenMP/parallel-reduction-array-lb.f90 +++ b/flang/test/Lower/OpenMP/parallel-reduction-array-lb.f90 @@ -12,7 +12,7 @@ program reduce end program -! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_3x2xi32 : !fir.ref<!fir.box<!fir.array<3x2xi32>>> alloc { +! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_3x2xi32 : !fir.ref<!fir.box<!fir.array<3x2xi32>>> {{.*}} alloc { ! 
CHECK: %[[VAL_15:.*]] = fir.alloca !fir.box<!fir.array<3x2xi32>> ! CHECK: omp.yield(%[[VAL_15]] : !fir.ref<!fir.box<!fir.array<3x2xi32>>>) ! CHECK-LABEL: } init { diff --git a/flang/test/Lower/OpenMP/parallel-reduction-array.f90 b/flang/test/Lower/OpenMP/parallel-reduction-array.f90 index 59595de338d50..636660f279e85 100644 --- a/flang/test/Lower/OpenMP/parallel-reduction-array.f90 +++ b/flang/test/Lower/OpenMP/parallel-reduction-array.f90 @@ -17,7 +17,7 @@ program reduce print *,i end program -! CPU-LABEL: omp.declare_reduction @add_reduction_byref_box_3xi32 : !fir.ref<!fir.box<!fir.array<3xi32>>> alloc { +! CPU-LABEL: omp.declare_reduction @add_reduction_byref_box_3xi32 : !fir.ref<!fir.box<!fir.array<3xi32>>> attributes {byref_element_type = !fir.array<3xi32>} alloc { ! CPU: %[[VAL_8:.*]] = fir.alloca !fir.box<!fir.array<3xi32>> ! CPU: omp.yield(%[[VAL_8]] : !fir.ref<!fir.box<!fir.array<3xi32>>>) ! CPU-LABEL: } init { diff --git a/flang/test/Lower/OpenMP/parallel-reduction-array2.f90 b/flang/test/Lower/OpenMP/parallel-reduction-array2.f90 index 14338c6f50817..9cf8a63427ed1 100644 --- a/flang/test/Lower/OpenMP/parallel-reduction-array2.f90 +++ b/flang/test/Lower/OpenMP/parallel-reduction-array2.f90 @@ -13,7 +13,7 @@ program reduce print *,i end program -! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_3xi32 : !fir.ref<!fir.box<!fir.array<3xi32>>> alloc { +! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_3xi32 : !fir.ref<!fir.box<!fir.array<3xi32>>> {{.*}} alloc { ! CHECK: %[[VAL_8:.*]] = fir.alloca !fir.box<!fir.array<3xi32>> ! CHECK: omp.yield(%[[VAL_8]] : !fir.ref<!fir.box<!fir.array<3xi32>>>) ! CHECK-LABEL: } init { diff --git a/flang/test/Lower/OpenMP/parallel-reduction-pointer-array.f90 b/flang/test/Lower/OpenMP/parallel-reduction-pointer-array.f90 index 36344458d1cae..3de2ba8f61f8e 100644 --- a/flang/test/Lower/OpenMP/parallel-reduction-pointer-array.f90 +++ b/flang/test/Lower/OpenMP/parallel-reduction-pointer-array.f90 @@ -19,7 +19,7 @@ program reduce end program -! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_ptr_Uxi32 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>> alloc { +! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_ptr_Uxi32 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>> attributes {byref_element_type = !fir.array<?xi32>} alloc { ! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>> ! CHECK: omp.yield(%[[VAL_3]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>) ! CHECK-LABEL: } init { diff --git a/flang/test/Lower/OpenMP/parallel-reduction3.f90 b/flang/test/Lower/OpenMP/parallel-reduction3.f90 index 6ff7f96b2b9bf..7437e1d35a624 100644 --- a/flang/test/Lower/OpenMP/parallel-reduction3.f90 +++ b/flang/test/Lower/OpenMP/parallel-reduction3.f90 @@ -1,7 +1,7 @@ ! RUN: bbc -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s ! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s -! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_Uxi32 : !fir.ref<!fir.box<!fir.array<?xi32>>> alloc { +! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_Uxi32 : !fir.ref<!fir.box<!fir.array<?xi32>>> {{.*}} alloc { ! CHECK: %[[VAL_8:.*]] = fir.alloca !fir.box<!fir.array<?xi32>> ! CHECK: omp.yield(%[[VAL_8]] : !fir.ref<!fir.box<!fir.array<?xi32>>>) ! 
CHECK-LABEL: } init { diff --git a/flang/test/Lower/OpenMP/reduction-array-intrinsic.f90 b/flang/test/Lower/OpenMP/reduction-array-intrinsic.f90 index bd91fa51a6988..779322712dbfe 100644 --- a/flang/test/Lower/OpenMP/reduction-array-intrinsic.f90 +++ b/flang/test/Lower/OpenMP/reduction-array-intrinsic.f90 @@ -9,7 +9,7 @@ subroutine max_array_reduction(l, r) !$omp end parallel end subroutine -! CHECK-LABEL: omp.declare_reduction @max_byref_box_Uxi32 : !fir.ref<!fir.box<!fir.array<?xi32>>> alloc { +! CHECK-LABEL: omp.declare_reduction @max_byref_box_Uxi32 : !fir.ref<!fir.box<!fir.array<?xi32>>> {{.*}} alloc { ! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.box<!fir.array<?xi32>> ! CHECK: omp.yield(%[[VAL_3]] : !fir.ref<!fir.box<!fir.array<?xi32>>>) ! CHECK-LABEL: } init { diff --git a/flang/test/Lower/OpenMP/sections-array-reduction.f90 b/flang/test/Lower/OpenMP/sections-array-reduction.f90 index 1d286008a11f3..57e46c7bc8cae 100644 --- a/flang/test/Lower/OpenMP/sections-array-reduction.f90 +++ b/flang/test/Lower/OpenMP/sections-array-reduction.f90 @@ -14,7 +14,7 @@ subroutine sectionsReduction(x) end subroutine -! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_Uxf32 : !fir.ref<!fir.box<!fir.array<?xf32>>> alloc { +! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_Uxf32 : !fir.ref<!fir.box<!fir.array<?xf32>>> {{.*}} alloc { ! [...] ! CHECK: omp.yield ! CHECK-LABEL: } init { diff --git a/flang/test/Lower/OpenMP/taskgroup-task-array-reduction.f90 b/flang/test/Lower/OpenMP/taskgroup-task-array-reduction.f90 index 18a4f75b86309..3a63bb09c59de 100644 --- a/flang/test/Lower/OpenMP/taskgroup-task-array-reduction.f90 +++ b/flang/test/Lower/OpenMP/taskgroup-task-array-reduction.f90 @@ -1,7 +1,7 @@ ! RUN: bbc -emit-hlfir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s ! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s -! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_Uxf32 : !fir.ref<!fir.box<!fir.array<?xf32>>> alloc { +! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_Uxf32 : !fir.ref<!fir.box<!fir.array<?xf32>>> {{.*}} alloc { ! [...] ! CHECK: omp.yield ! CHECK-LABEL: } init { diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-allocatable-array-minmax.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-allocatable-array-minmax.f90 index 2cd953de0dffa..ed81577ecce16 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-allocatable-array-minmax.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-allocatable-array-minmax.f90 @@ -32,7 +32,7 @@ program reduce15 print *,"min: ", mins end program -! CHECK-LABEL: omp.declare_reduction @min_byref_box_heap_Uxi32 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> alloc { +! CHECK-LABEL: omp.declare_reduction @min_byref_box_heap_Uxi32 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> {{.*}} alloc { ! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xi32>>> ! CHECK: omp.yield(%[[VAL_3]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) ! CHECK-LABEL: } init { @@ -93,7 +93,7 @@ program reduce15 ! CHECK: omp.yield ! CHECK: } -! CHECK-LABEL: omp.declare_reduction @max_byref_box_heap_Uxi32 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> alloc { +! CHECK-LABEL: omp.declare_reduction @max_byref_box_heap_Uxi32 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> {{.*}} alloc { ! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xi32>>> ! CHECK: omp.yield(%[[VAL_3]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) ! 
CHECK-LABEL: } init { diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-allocatable.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-allocatable.f90 index 663851cba46c6..d8c0a36db126e 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-allocatable.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-allocatable.f90 @@ -18,7 +18,7 @@ program reduce end program -! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_heap_i32 : !fir.ref<!fir.box<!fir.heap<i32>>> alloc { +! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_heap_i32 : !fir.ref<!fir.box<!fir.heap<i32>>> attributes {byref_element_type = i32} alloc { ! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.box<!fir.heap<i32>> ! CHECK: omp.yield(%[[VAL_2]] : !fir.ref<!fir.box<!fir.heap<i32>>>) ! CHECK-LABEL: } init { diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f90 index 7184b3b102fd8..7ce1be03682b4 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f90 @@ -22,7 +22,7 @@ subroutine reduce(r) end subroutine end program -! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_Uxf64 : !fir.ref<!fir.box<!fir.array<?xf64>>> alloc { +! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_Uxf64 : !fir.ref<!fir.box<!fir.array<?xf64>>> {{.*}} alloc { ! CHECK: %[[VAL_8:.*]] = fir.alloca !fir.box<!fir.array<?xf64>> ! CHECK: omp.yield(%[[VAL_8]] : !fir.ref<!fir.box<!fir.array<?xf64>>>) ! CHECK-LABEL: } init { diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array-lb.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array-lb.f90 index 2233a74600948..ec448cf20f111 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-array-lb.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-array-lb.f90 @@ -11,7 +11,7 @@ program reduce !$omp end parallel do end program -! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_2xi32 : !fir.ref<!fir.box<!fir.array<2xi32>>> alloc { +! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_2xi32 : !fir.ref<!fir.box<!fir.array<2xi32>>> {{.*}} alloc { ! CHECK: } combiner { ! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.box<!fir.array<2xi32>>>, %[[ARG1:.*]]: !fir.ref<!fir.box<!fir.array<2xi32>>>): ! CHECK: %[[ARR0:.*]] = fir.load %[[ARG0]] : !fir.ref<!fir.box<!fir.array<2xi32>>> diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array-lb2.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array-lb2.f90 index 211bde19da8db..9da05a290ec21 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-array-lb2.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-array-lb2.f90 @@ -19,7 +19,7 @@ subroutine sub(a, lb, ub) end program -! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_Uxi32 : !fir.ref<!fir.box<!fir.array<?xi32>>> alloc { +! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_Uxi32 : !fir.ref<!fir.box<!fir.array<?xi32>>> {{.*}} alloc { ! CHECK: } combiner { ! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.box<!fir.array<?xi32>>>, %[[ARG1:.*]]: !fir.ref<!fir.box<!fir.array<?xi32>>>): ! CHECK: %[[ARR0:.*]] = fir.load %[[ARG0]] : !fir.ref<!fir.box<!fir.array<?xi32>>> diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array.f90 index afaeba27c5eae..14b657c8e180d 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-array.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-array.f90 @@ -14,7 +14,7 @@ program reduce print *,r end program -! 
CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_2xi32 : !fir.ref<!fir.box<!fir.array<2xi32>>> alloc { +! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_2xi32 : !fir.ref<!fir.box<!fir.array<2xi32>>> attributes {byref_element_type = !fir.array<2xi32>} alloc { ! CHECK: %[[VAL_8:.*]] = fir.alloca !fir.box<!fir.array<2xi32>> ! CHECK: omp.yield(%[[VAL_8]] : !fir.ref<!fir.box<!fir.array<2xi32>>>) ! CHECK-LABEL: } init { diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array2.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array2.f90 index 25b2e97a1b7f7..d0a0c38e4ccb1 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-array2.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-array2.f90 @@ -14,7 +14,7 @@ program reduce print *,r end program -! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_2xi32 : !fir.ref<!fir.box<!fir.array<2xi32>>> alloc { +! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_2xi32 : !fir.ref<!fir.box<!fir.array<2xi32>>> {{.*}} alloc { ! CHECK: %[[VAL_8:.*]] = fir.alloca !fir.box<!fir.array<2xi32>> ! CHECK: omp.yield(%[[VAL_8]] : !fir.ref<!fir.box<!fir.array<2xi32>>>) ! CHECK-LABEL: } init { diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-multiple-clauses.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-multiple-clauses.f90 index edd2bcb1d6be8..60a162d8f8002 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-multiple-clauses.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-multiple-clauses.f90 @@ -24,7 +24,7 @@ program main endprogram -! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_3x3xf64 : !fir.ref<!fir.box<!fir.array<3x3xf64>>> alloc { +! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_3x3xf64 : !fir.ref<!fir.box<!fir.array<3x3xf64>>> {{.*}} alloc { ! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.box<!fir.array<3x3xf64>> ! CHECK: omp.yield(%[[VAL_3]] : !fir.ref<!fir.box<!fir.array<3x3xf64>>>) ! CHECK-LABEL: } init { diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-pointer.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-pointer.f90 index 27b726376fbeb..f640f5caddf76 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-pointer.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-pointer.f90 @@ -18,7 +18,7 @@ program reduce_pointer deallocate(v) end program -! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_ptr_i32 : !fir.ref<!fir.box<!fir.ptr<i32>>> alloc { +! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_ptr_i32 : !fir.ref<!fir.box<!fir.ptr<i32>>> {{.*}} alloc { ! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.box<!fir.ptr<i32>> ! CHECK: omp.yield(%[[VAL_3]] : !fir.ref<!fir.box<!fir.ptr<i32>>>) ! CHECK-LABEL: } init { diff --git a/flang/test/Lower/do_concurrent_reduce_allocatable.f90 b/flang/test/Lower/do_concurrent_reduce_allocatable.f90 index 873fd10dd1b97..4fb67c094b594 100644 --- a/flang/test/Lower/do_concurrent_reduce_allocatable.f90 +++ b/flang/test/Lower/do_concurrent_reduce_allocatable.f90 @@ -8,7 +8,7 @@ subroutine do_concurrent_allocatable end do end subroutine -! CHECK: fir.declare_reduction @[[RED_OP:.*]] : ![[RED_TYPE:.*]] alloc { +! CHECK: fir.declare_reduction @[[RED_OP:.*]] : ![[RED_TYPE:.*]] attributes {byref_element_type = !fir.array<?x?xf32>} alloc { ! CHECK: %[[ALLOC:.*]] = fir.alloca ! CHECK: fir.yield(%[[ALLOC]] : ![[RED_TYPE]]) ! 
CHECK: } init { diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h index f864a895a1259..7b097d1ac0ee0 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -1446,6 +1446,9 @@ class OpenMPIRBuilder { using ReductionGenAtomicCBTy = std::function<InsertPointOrErrorTy( InsertPointTy, Type *, Value *, Value *)>; + using ReductionGenDataPtrPtrCBTy = std::function<InsertPointOrErrorTy( + InsertPointTy, Value *ByRefVal, Value *&Res)>; + /// Enum class for reduction evaluation types scalar, complex and aggregate. enum class EvalKind { Scalar, Complex, Aggregate }; @@ -1454,17 +1457,25 @@ class OpenMPIRBuilder { ReductionInfo(Type *ElementType, Value *Variable, Value *PrivateVariable, EvalKind EvaluationKind, ReductionGenCBTy ReductionGen, ReductionGenClangCBTy ReductionGenClang, - ReductionGenAtomicCBTy AtomicReductionGen) + ReductionGenAtomicCBTy AtomicReductionGen, + ReductionGenDataPtrPtrCBTy DataPtrPtrGen, + Type *ByRefAllocatedType = nullptr, + Type *ByRefElementType = nullptr) : ElementType(ElementType), Variable(Variable), PrivateVariable(PrivateVariable), EvaluationKind(EvaluationKind), ReductionGen(ReductionGen), ReductionGenClang(ReductionGenClang), - AtomicReductionGen(AtomicReductionGen) {} + AtomicReductionGen(AtomicReductionGen), DataPtrPtrGen(DataPtrPtrGen), + ByRefAllocatedType(ByRefAllocatedType), + ByRefElementType(ByRefElementType) {} + ReductionInfo(Value *PrivateVariable) : ElementType(nullptr), Variable(nullptr), PrivateVariable(PrivateVariable), EvaluationKind(EvalKind::Scalar), - ReductionGen(), ReductionGenClang(), AtomicReductionGen() {} + ReductionGen(), ReductionGenClang(), AtomicReductionGen(), + DataPtrPtrGen() {} - /// Reduction element type, must match pointee type of variable. + /// Reduction element type, must match pointee type of variable. For by-ref + /// reductions, this would be just an opaque `ptr`. Type *ElementType; /// Reduction variable of pointer type. @@ -1491,6 +1502,21 @@ class OpenMPIRBuilder { /// reduction. If null, the implementation will use the non-atomic version /// along with the appropriate synchronization mechanisms. ReductionGenAtomicCBTy AtomicReductionGen; + + ReductionGenDataPtrPtrCBTy DataPtrPtrGen; + + /// For by-ref reductions, we need to keep track of 2 extra types that are + /// potentially diff erent: + /// * The allocated type is the type of the storage allocated by the + /// reduction op's `alloc` region. For example, for allocatables and arrays, + /// this type would be the descriptor/box struct. + Type *ByRefAllocatedType; + + /// * The by-ref element type is the type of the actual storage needed for + /// the data of the allocatable or array. For example, an float allocatable + /// of would need some float storage to store intermediate reduction + /// results. + Type *ByRefElementType; }; enum class CopyAction : unsigned { @@ -1535,14 +1561,15 @@ class OpenMPIRBuilder { /// Function to shuffle over the value from the remote lane. void shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr, Value *DstAddr, - Type *ElementType, Value *Offset, - Type *ReductionArrayTy); + Type *ElementType, Value *Offset, Type *ReductionArrayTy, + bool IsByRefElem); /// Emit instructions to copy a Reduce list, which contains partially /// aggregated values, in the specified direction. 
- void emitReductionListCopy( + Error emitReductionListCopy( InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy, ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase, + ArrayRef<bool> IsByRef, CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}); /// Emit a helper that reduces data across two OpenMP threads (lanes) @@ -1616,11 +1643,13 @@ class OpenMPIRBuilder { /// \param ReduceFn The reduction function. /// \param FuncAttrs Optional param to specify any function attributes that /// need to be copied to the new function. + /// \param IsByRef For each reduction clause, whether the reduction is by-ref + /// or not. /// /// \return The ShuffleAndReduce function. - Function *emitShuffleAndReduceFunction( + Expected<Function *> emitShuffleAndReduceFunction( ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos, - Function *ReduceFn, AttributeList FuncAttrs); + Function *ReduceFn, AttributeList FuncAttrs, ArrayRef<bool> IsByRef); /// Helper function for CreateCanonicalScanLoops to create InputLoop /// in the firstGen and Scan Loop in the SecondGen @@ -1680,12 +1709,14 @@ class OpenMPIRBuilder { /// \param ReductionInfos Array type containing the ReductionOps. /// \param FuncAttrs Optional param to specify any function attributes that /// need to be copied to the new function. + /// \param IsByRef For each reduction clause, whether the reduction is by-ref + /// or not. /// /// \return The InterWarpCopy function. Expected<Function *> emitInterWarpCopyFunction(const LocationDescription &Loc, ArrayRef<ReductionInfo> ReductionInfos, - AttributeList FuncAttrs); + AttributeList FuncAttrs, ArrayRef<bool> IsByRef); /// This function emits a helper that copies all the reduction variables from /// the team into the provided global buffer for the reduction variables. @@ -1779,6 +1810,7 @@ class OpenMPIRBuilder { /// \return The reduction function. Expected<Function *> createReductionFunction( StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos, + ArrayRef<bool> IsByRef, ReductionGenCBKind ReductionGenCBKind = ReductionGenCBKind::MLIR, AttributeList FuncAttrs = {}); @@ -2031,11 +2063,13 @@ class OpenMPIRBuilder { /// reduction variables. /// \param AllocaIP An insertion point suitable for allocas usable /// in reductions. - /// \param CodeGenIP An insertion point suitable for code - /// generation. \param ReductionInfos A list of info on each reduction - /// variable. \param IsNoWait Optional flag set if the reduction is - /// marked as - /// nowait. + /// \param CodeGenIP An insertion point suitable for code + /// generation. + /// \param ReductionInfos A list of info on each reduction + /// variable. + /// \param IsNoWait Optional flag set if the reduction is + /// marked as nowait. + /// \param IsByRef For each reduction clause, whether the reduction is by-ref. /// \param IsTeamsReduction Optional flag set if it is a teams /// reduction. /// \param GridValue Optional GPU grid value. 
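[Editor's note] For readers of the new `OMPIRBuilder` interface in the hunks above, the following is a minimal, hypothetical sketch of how a caller could describe a by-ref (descriptor-based) reduction, including the new `DataPtrPtrGen` callback and the `ByRefAllocatedType`/`ByRefElementType` fields. `DescTy` (a descriptor struct whose field 0 holds the base address), `ElemTy`, and the function name are illustrative assumptions, not names from this patch.

```cpp
#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
using namespace llvm;

// Sketch only: build a ReductionInfo for a by-ref scalar reduction. DescTy and
// ElemTy are assumed inputs (e.g. a Fortran box struct and float); Combiner is
// an already-written reduction-combiner callback.
OpenMPIRBuilder::ReductionInfo
makeByRefReductionInfo(IRBuilderBase &Builder, StructType *DescTy, Type *ElemTy,
                       Value *Var, Value *PrivVar,
                       OpenMPIRBuilder::ReductionGenCBTy Combiner) {
  // DataPtrPtrGen: given a pointer to the descriptor, return the address of
  // its base-address field, i.e. a pointer to the pointer to the actual data.
  OpenMPIRBuilder::ReductionGenDataPtrPtrCBTy DataPtrPtrGen =
      [&Builder, DescTy](OpenMPIRBuilder::InsertPointTy IP, Value *ByRefVal,
                         Value *&Res) -> OpenMPIRBuilder::InsertPointOrErrorTy {
    Builder.restoreIP(IP);
    Res = Builder.CreateStructGEP(DescTy, ByRefVal, 0, "data.ptr.ptr");
    return Builder.saveIP();
  };

  // For by-ref reductions the element type is just an opaque pointer; the
  // descriptor and data types travel in the two trailing fields.
  return OpenMPIRBuilder::ReductionInfo(
      /*ElementType=*/Builder.getPtrTy(), Var, PrivVar,
      OpenMPIRBuilder::EvalKind::Scalar, Combiner,
      /*ReductionGenClang=*/nullptr, /*AtomicReductionGen=*/nullptr,
      DataPtrPtrGen,
      /*ByRefAllocatedType=*/DescTy,
      /*ByRefElementType=*/ElemTy);
}
```

Whether the base address sits at field 0 depends on the frontend's descriptor layout; the MLIR path in this patch derives the callback from the reduction declaration's new `data_ptr_ptr` region instead (see `makeRefDataPtrGen` further down).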
@@ -2045,7 +2079,8 @@ class OpenMPIRBuilder { LLVM_ABI InsertPointOrErrorTy createReductionsGPU( const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos, - bool IsNoWait = false, bool IsTeamsReduction = false, + ArrayRef<bool> IsByRef, bool IsNoWait = false, + bool IsTeamsReduction = false, ReductionGenCBKind ReductionGenCBKind = ReductionGenCBKind::MLIR, std::optional<omp::GV> GridValue = {}, unsigned ReductionBufNum = 1024, Value *SrcLocInfo = nullptr); diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 5101717526263..c962368859730 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -2465,7 +2465,8 @@ Value *OpenMPIRBuilder::createRuntimeShuffleFunction(InsertPointTy AllocaIP, void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr, Value *DstAddr, Type *ElemType, - Value *Offset, Type *ReductionArrayTy) { + Value *Offset, Type *ReductionArrayTy, + bool IsByRefElem) { uint64_t Size = M.getDataLayout().getTypeStoreSize(ElemType); // Create the loop over the big sized data. // ptr = (void*)Elem; @@ -2547,10 +2548,10 @@ void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr, } } -void OpenMPIRBuilder::emitReductionListCopy( +Error OpenMPIRBuilder::emitReductionListCopy( InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy, ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase, - CopyOptionsTy CopyOptions) { + ArrayRef<bool> IsByRef, CopyOptionsTy CopyOptions) { Type *IndexTy = Builder.getIndexTy( M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace()); Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset; @@ -2560,6 +2561,7 @@ void OpenMPIRBuilder::emitReductionListCopy( for (auto En : enumerate(ReductionInfos)) { const ReductionInfo &RI = En.value(); Value *SrcElementAddr = nullptr; + AllocaInst *DestAlloca = nullptr; Value *DestElementAddr = nullptr; Value *DestElementPtrAddr = nullptr; // Should we shuffle in an element from a remote lane? @@ -2579,14 +2581,18 @@ void OpenMPIRBuilder::emitReductionListCopy( DestElementPtrAddr = Builder.CreateInBoundsGEP( ReductionArrayTy, DestBase, {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())}); + bool IsByRefElem = (!IsByRef.empty() && IsByRef[En.index()]); switch (Action) { case CopyAction::RemoteLaneToThread: { InsertPointTy CurIP = Builder.saveIP(); Builder.restoreIP(AllocaIP); - AllocaInst *DestAlloca = Builder.CreateAlloca(RI.ElementType, nullptr, - ".omp.reduction.element"); + + Type *DestAllocaType = + IsByRefElem ? RI.ByRefAllocatedType : RI.ElementType; + DestAlloca = Builder.CreateAlloca(DestAllocaType, nullptr, + ".omp.reduction.element"); DestAlloca->setAlignment( - M.getDataLayout().getPrefTypeAlign(RI.ElementType)); + M.getDataLayout().getPrefTypeAlign(DestAllocaType)); DestElementAddr = DestAlloca; DestElementAddr = Builder.CreateAddrSpaceCast(DestElementAddr, Builder.getPtrTy(), @@ -2606,8 +2612,57 @@ void OpenMPIRBuilder::emitReductionListCopy( // Now that all active lanes have read the element in the // Reduce list, shuffle over the value from the remote lane. 
if (ShuffleInElement) { - shuffleAndStore(AllocaIP, SrcElementAddr, DestElementAddr, RI.ElementType, - RemoteLaneOffset, ReductionArrayTy); + Type *ShuffleType = RI.ElementType; + Value *ShuffleSrcAddr = SrcElementAddr; + Value *ShuffleDestAddr = DestElementAddr; + AllocaInst *LocalStorage = nullptr; + + if (IsByRefElem) { + assert(RI.ByRefElementType && "Expected by-ref element type to be set"); + assert(RI.ByRefAllocatedType && + "Expected by-ref allocated type to be set"); + // For by-ref reductions, we need to copy from the remote lane the + // actual value of the partial reduction computed by that remote lane; + // rather than, for example, a pointer to that data or, even worse, a + // pointer to the descriptor of the by-ref reduction element. + ShuffleType = RI.ByRefElementType; + + InsertPointOrErrorTy GenResult = + RI.DataPtrPtrGen(Builder.saveIP(), ShuffleSrcAddr, ShuffleSrcAddr); + + if (!GenResult) + return GenResult.takeError(); + + ShuffleSrcAddr = Builder.CreateLoad(Builder.getPtrTy(), ShuffleSrcAddr); + + { + InsertPointTy OldIP = Builder.saveIP(); + Builder.restoreIP(AllocaIP); + + LocalStorage = Builder.CreateAlloca(ShuffleType); + Builder.restoreIP(OldIP); + ShuffleDestAddr = LocalStorage; + } + } + + shuffleAndStore(AllocaIP, ShuffleSrcAddr, ShuffleDestAddr, ShuffleType, + RemoteLaneOffset, ReductionArrayTy, IsByRefElem); + + if (IsByRefElem) { + Value *GEP; + InsertPointOrErrorTy GenResult = + RI.DataPtrPtrGen(Builder.saveIP(), + Builder.CreatePointerBitCastOrAddrSpaceCast( + DestAlloca, Builder.getPtrTy(), ".ascast"), + GEP); + + if (!GenResult) + return GenResult.takeError(); + + Builder.CreateStore(Builder.CreatePointerBitCastOrAddrSpaceCast( + LocalStorage, Builder.getPtrTy(), ".ascast"), + GEP); + } } else { switch (RI.EvaluationKind) { case EvalKind::Scalar: { @@ -2658,11 +2713,13 @@ void OpenMPIRBuilder::emitReductionListCopy( Builder.CreateStore(CastDestAddr, DestElementPtrAddr); } } + + return Error::success(); } Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction( const LocationDescription &Loc, ArrayRef<ReductionInfo> ReductionInfos, - AttributeList FuncAttrs) { + AttributeList FuncAttrs, ArrayRef<bool> IsByRef) { InsertPointTy SavedIP = Builder.saveIP(); LLVMContext &Ctx = M.getContext(); FunctionType *FuncTy = FunctionType::get( @@ -2743,7 +2800,9 @@ Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction( // memory. // const ReductionInfo &RI = En.value(); - unsigned RealTySize = M.getDataLayout().getTypeAllocSize(RI.ElementType); + bool IsByRefElem = !IsByRef.empty() && IsByRef[En.index()]; + unsigned RealTySize = M.getDataLayout().getTypeAllocSize( + IsByRefElem ? 
RI.ByRefElementType : RI.ElementType); for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) { Type *CType = Builder.getIntNTy(TySize * 8); @@ -2806,6 +2865,17 @@ Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction( ConstantInt::get(IndexTy, En.index())}); // elemptr = ((CopyType*)(elemptrptr)) + I Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr); + + if (IsByRefElem) { + InsertPointOrErrorTy GenRes = + RI.DataPtrPtrGen(Builder.saveIP(), ElemPtr, ElemPtr); + + if (!GenRes) + return GenRes.takeError(); + + ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtr); + } + if (NumIters > 1) ElemPtr = Builder.CreateGEP(Builder.getInt32Ty(), ElemPtr, Cnt); @@ -2861,6 +2931,17 @@ Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction( Value *TargetElemPtrVal = Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtrPtr); Value *TargetElemPtr = TargetElemPtrVal; + + if (IsByRefElem) { + InsertPointOrErrorTy GenRes = + RI.DataPtrPtrGen(Builder.saveIP(), TargetElemPtr, TargetElemPtr); + + if (!GenRes) + return GenRes.takeError(); + + TargetElemPtr = Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtr); + } + if (NumIters > 1) TargetElemPtr = Builder.CreateGEP(Builder.getInt32Ty(), TargetElemPtr, Cnt); @@ -2895,9 +2976,9 @@ Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction( return WcFunc; } -Function *OpenMPIRBuilder::emitShuffleAndReduceFunction( +Expected<Function *> OpenMPIRBuilder::emitShuffleAndReduceFunction( ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn, - AttributeList FuncAttrs) { + AttributeList FuncAttrs, ArrayRef<bool> IsByRef) { LLVMContext &Ctx = M.getContext(); FunctionType *FuncTy = FunctionType::get(Builder.getVoidTy(), @@ -2976,9 +3057,13 @@ Function *OpenMPIRBuilder::emitShuffleAndReduceFunction( // This loop iterates through the list of reduce elements and copies, // element by element, from a remote lane in the warp to RemoteReduceList, // hosted on the thread's stack. - emitReductionListCopy( + Error EmitRedLsCpRes = emitReductionListCopy( AllocaIP, CopyAction::RemoteLaneToThread, RedListArrayTy, ReductionInfos, - ReduceList, RemoteListAddrCast, {RemoteLaneOffset, nullptr, nullptr}); + ReduceList, RemoteListAddrCast, IsByRef, + {RemoteLaneOffset, nullptr, nullptr}); + + if (EmitRedLsCpRes) + return EmitRedLsCpRes; // The actions to be performed on the Remote Reduce list is dependent // on the algorithm version. 
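[Editor's note] To see why the by-ref branch of `emitReductionListCopy` above goes through `DataPtrPtrGen`, here is a small host-side analogy (not part of the patch): shuffling the descriptor's base pointer from a remote lane would hand this lane an address in the remote lane's private memory, so instead the value behind the pointer is copied into thread-local storage and the local descriptor is repointed at it.

```cpp
#include <cstdio>

// Host-side analogy only. `Descriptor` is a stand-in for a Fortran box whose
// first field is the base address of the data.
struct Descriptor {
  float *BaseAddr;
};

// Pull a remote lane's partial result into this lane's state: copy the
// pointed-to value (not the pointer) and patch the local descriptor.
void copyFromRemoteLane(Descriptor &Local, const Descriptor &Remote,
                        float &LocalStorage) {
  // The pre-patch behaviour would effectively do:
  //   Local.BaseAddr = Remote.BaseAddr;
  // leaving Local pointing into the remote lane's private memory.
  LocalStorage = *Remote.BaseAddr; // shuffle the partial *value* across lanes
  Local.BaseAddr = &LocalStorage;  // repoint the local descriptor
}

int main() {
  float RemoteVal = 2.5f, LocalBuf = 0.0f;
  Descriptor Remote{&RemoteVal}, Local{nullptr};
  copyFromRemoteLane(Local, Remote, LocalBuf);
  std::printf("%.1f\n", *Local.BaseAddr); // prints 2.5
  return 0;
}
```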
@@ -3046,8 +3131,14 @@ Function *OpenMPIRBuilder::emitShuffleAndReduceFunction( Builder.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB); emitBlock(CpyThenBB, Builder.GetInsertBlock()->getParent()); - emitReductionListCopy(AllocaIP, CopyAction::ThreadCopy, RedListArrayTy, - ReductionInfos, RemoteListAddrCast, ReduceList); + + EmitRedLsCpRes = emitReductionListCopy( + AllocaIP, CopyAction::ThreadCopy, RedListArrayTy, ReductionInfos, + RemoteListAddrCast, ReduceList, IsByRef); + + if (EmitRedLsCpRes) + return EmitRedLsCpRes; + Builder.CreateBr(CpyMergeBB); emitBlock(CpyElseBB, Builder.GetInsertBlock()->getParent()); @@ -3452,7 +3543,8 @@ std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const { Expected<Function *> OpenMPIRBuilder::createReductionFunction( StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos, - ReductionGenCBKind ReductionGenCBKind, AttributeList FuncAttrs) { + ArrayRef<bool> IsByRef, ReductionGenCBKind ReductionGenCBKind, + AttributeList FuncAttrs) { auto *FuncTy = FunctionType::get(Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getPtrTy()}, /* IsVarArg */ false); @@ -3513,8 +3605,14 @@ Expected<Function *> OpenMPIRBuilder::createReductionFunction( LHSPtrs.emplace_back(LHSPtr); RHSPtrs.emplace_back(RHSPtr); } else { - Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr); - Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr); + Value *LHS = LHSPtr; + Value *RHS = RHSPtr; + + if (!IsByRef.empty() && !IsByRef[En.index()]) { + LHS = Builder.CreateLoad(RI.ElementType, LHSPtr); + RHS = Builder.CreateLoad(RI.ElementType, RHSPtr); + } + Value *Reduced; InsertPointOrErrorTy AfterIP = RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced); @@ -3524,7 +3622,9 @@ Expected<Function *> OpenMPIRBuilder::createReductionFunction( return ReductionFunc; Builder.restoreIP(*AfterIP); - Builder.CreateStore(Reduced, LHSPtr); + + if (!IsByRef.empty() && !IsByRef[En.index()]) + Builder.CreateStore(Reduced, LHSPtr); } } @@ -3577,9 +3677,9 @@ checkReductionInfos(ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos, OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU( const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos, - bool IsNoWait, bool IsTeamsReduction, ReductionGenCBKind ReductionGenCBKind, - std::optional<omp::GV> GridValue, unsigned ReductionBufNum, - Value *SrcLocInfo) { + ArrayRef<bool> IsByRef, bool IsNoWait, bool IsTeamsReduction, + ReductionGenCBKind ReductionGenCBKind, std::optional<omp::GV> GridValue, + unsigned ReductionBufNum, Value *SrcLocInfo) { if (!updateToLocation(Loc)) return InsertPointTy(); Builder.restoreIP(CodeGenIP); @@ -3615,9 +3715,9 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU( FuncAttrs = FuncAttrs.addFnAttributes(Ctx, AttrBldr); CodeGenIP = Builder.saveIP(); - Expected<Function *> ReductionResult = - createReductionFunction(Builder.GetInsertBlock()->getParent()->getName(), - ReductionInfos, ReductionGenCBKind, FuncAttrs); + Expected<Function *> ReductionResult = createReductionFunction( + Builder.GetInsertBlock()->getParent()->getName(), ReductionInfos, IsByRef, + ReductionGenCBKind, FuncAttrs); if (!ReductionResult) return ReductionResult.takeError(); Function *ReductionFunc = *ReductionResult; @@ -3656,15 +3756,25 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU( Value *ElemPtr = Builder.CreateInBoundsGEP( RedArrayTy, ReductionList, {ConstantInt::get(IndexTy, 0), 
ConstantInt::get(IndexTy, En.index())}); + + Value *PrivateVar = RI.PrivateVariable; + bool IsByRefElem = !IsByRef.empty() && IsByRef[En.index()]; + if (IsByRefElem) + PrivateVar = Builder.CreateLoad(RI.ElementType, PrivateVar); + Value *CastElem = - Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy); + Builder.CreatePointerBitCastOrAddrSpaceCast(PrivateVar, PtrTy); Builder.CreateStore(CastElem, ElemPtr); } CodeGenIP = Builder.saveIP(); - Function *SarFunc = - emitShuffleAndReduceFunction(ReductionInfos, ReductionFunc, FuncAttrs); + Expected<Function *> SarFunc = emitShuffleAndReduceFunction( + ReductionInfos, ReductionFunc, FuncAttrs, IsByRef); + + if (!SarFunc) + return SarFunc.takeError(); + Expected<Function *> CopyResult = - emitInterWarpCopyFunction(Loc, ReductionInfos, FuncAttrs); + emitInterWarpCopyFunction(Loc, ReductionInfos, FuncAttrs, IsByRef); if (!CopyResult) return CopyResult.takeError(); Function *WcFunc = *CopyResult; @@ -3684,7 +3794,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU( Builder.getInt64(MaxDataSize * ReductionInfos.size()); if (!IsTeamsReduction) { Value *SarFuncCast = - Builder.CreatePointerBitCastOrAddrSpaceCast(SarFunc, FuncPtrTy); + Builder.CreatePointerBitCastOrAddrSpaceCast(*SarFunc, FuncPtrTy); Value *WcFuncCast = Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, FuncPtrTy); Value *Args[] = {SrcLocInfo, ReductionDataSize, RL, SarFuncCast, @@ -3716,7 +3826,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU( Builder.getInt32(ReductionBufNum), ReductionDataSize, RL, - SarFunc, + *SarFunc, WcFunc, LtGCFunc, LtGRFunc, @@ -3743,7 +3853,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU( // Add emission of __kmpc_end_reduce{_nowait}(<gtid>); for (auto En : enumerate(ReductionInfos)) { const ReductionInfo &RI = En.value(); - Value *LHS = RI.Variable; + Type *ValueType = RI.ElementType; + Value *RedValue = RI.Variable; Value *RHS = Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy); @@ -3754,7 +3865,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU( // Fix the CallBack code genereated to use the correct Values for the LHS // and RHS - LHSPtr->replaceUsesWithIf(LHS, [ReductionFunc](const Use &U) { + LHSPtr->replaceUsesWithIf(RedValue, [ReductionFunc](const Use &U) { return cast<Instruction>(U.getUser())->getParent()->getParent() == ReductionFunc; }); @@ -3763,15 +3874,21 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU( ReductionFunc; }); } else { - Value *LHSValue = Builder.CreateLoad(RI.ElementType, LHS, "final.lhs"); - Value *RHSValue = Builder.CreateLoad(RI.ElementType, RHS, "final.rhs"); + if (IsByRef.empty() || !IsByRef[En.index()]) { + RedValue = Builder.CreateLoad(ValueType, RI.Variable, + "red.value." 
+ Twine(En.index())); + } + Value *PrivateRedValue = Builder.CreateLoad( + ValueType, RHS, "red.private.value" + Twine(En.index())); Value *Reduced; InsertPointOrErrorTy AfterIP = - RI.ReductionGen(Builder.saveIP(), RHSValue, LHSValue, Reduced); + RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced); if (!AfterIP) return AfterIP.takeError(); Builder.restoreIP(*AfterIP); - Builder.CreateStore(Reduced, LHS, false); + + if (!IsByRef.empty() && !IsByRef[En.index()]) + Builder.CreateStore(Reduced, RI.Variable); } } emitBlock(ExitBB, CurFunc); @@ -3872,7 +3989,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductions( assert(ReductionInfos.size() == IsByRef.size()); if (Config.isGPU()) return createReductionsGPU(Loc, AllocaIP, Builder.saveIP(), ReductionInfos, - IsNoWait, IsTeamsReduction); + IsByRef, IsNoWait, IsTeamsReduction); checkReductionInfos(ReductionInfos, /*IsGPU*/ false); diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp index 0b3ae643e1494..1f35b7a5cfaa4 100644 --- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp +++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp @@ -5298,10 +5298,12 @@ TEST_F(OpenMPIRBuilderTest, CreateReductions) { OpenMPIRBuilder::ReductionInfo ReductionInfos[] = { {SumType, SumReduced, SumPrivatized, /*EvaluationKind=*/OpenMPIRBuilder::EvalKind::Scalar, sumReduction, - /*ReductionGenClang=*/nullptr, sumAtomicReduction}, + /*ReductionGenClang=*/nullptr, sumAtomicReduction, + /*DataPtrPtrGen=*/nullptr}, {XorType, XorReduced, XorPrivatized, /*EvaluationKind=*/OpenMPIRBuilder::EvalKind::Scalar, xorReduction, - /*ReductionGenClang=*/nullptr, xorAtomicReduction}}; + /*ReductionGenClang=*/nullptr, xorAtomicReduction, + /*DataPtrPtrGen=*/nullptr}}; OMPBuilder.Config.setIsGPU(false); bool ReduceVariableByRef[] = {false, false}; @@ -5533,10 +5535,11 @@ TEST_F(OpenMPIRBuilderTest, ScanReduction) { EXPECT_EQ(ScanLoop->getAfter(), Builder.GetInsertBlock()); EXPECT_EQ(NumBodiesGenerated, 2U); - SmallVector<OpenMPIRBuilder::ReductionInfo> ReductionInfos = { + SmallVector<OpenMPIRBuilder::ReductionInfo, 2> ReductionInfos = { {Builder.getFloatTy(), OrigVar, ScanVar, /*EvaluationKind=*/OpenMPIRBuilder::EvalKind::Scalar, sumReduction, - /*ReductionGenClang=*/nullptr, sumAtomicReduction}}; + /*ReductionGenClang=*/nullptr, sumAtomicReduction, + /*DataPtrPtrGen=*/nullptr}}; OpenMPIRBuilder::LocationDescription RedLoc({InputLoop->getAfterIP(), DL}); llvm::BasicBlock *Cont = splitBB(Builder, false, "omp.scan.loop.cont"); ASSERT_EXPECTED_INIT( @@ -5708,7 +5711,8 @@ TEST_F(OpenMPIRBuilderTest, CreateTwoReductions) { FirstBodyIP, FirstBodyAllocaIP, {{SumType, SumReduced, SumPrivatized, /*EvaluationKind=*/OpenMPIRBuilder::EvalKind::Scalar, sumReduction, - /*ReductionGenClang=*/nullptr, sumAtomicReduction}}, + /*ReductionGenClang=*/nullptr, sumAtomicReduction, + /*DataPtrPtrGen=*/nullptr}}, ReduceVariableByRef), Succeeded()); ASSERT_THAT_EXPECTED( @@ -5716,7 +5720,8 @@ TEST_F(OpenMPIRBuilderTest, CreateTwoReductions) { SecondBodyIP, SecondBodyAllocaIP, {{XorType, XorReduced, XorPrivatized, /*EvaluationKind=*/OpenMPIRBuilder::EvalKind::Scalar, xorReduction, - /*ReductionGenClang=*/nullptr, xorAtomicReduction}}, + /*ReductionGenClang=*/nullptr, xorAtomicReduction, + /*DataPtrPtrGen=*/nullptr}}, ReduceVariableByRef), Succeeded()); diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td index 377f1febf6b8f..bbfe805eefe48 100644 --- 
a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td @@ -1972,7 +1972,7 @@ def DeclareReductionOp : OpenMP_Op<"declare_reduction", [IsolatedFromAbove, Symbol]> { let summary = "declares a reduction kind"; let description = [{ - Declares an OpenMP reduction kind. This requires two mandatory and three + Declares an OpenMP reduction kind. This requires two mandatory and four optional regions. 1. The optional alloc region specifies how to allocate the thread-local @@ -2001,6 +2001,9 @@ def DeclareReductionOp : OpenMP_Op<"declare_reduction", [IsolatedFromAbove, allocated by the initializer region. The region has an argument that contains the value of the thread-local reduction accumulator. This will be executed after the reduction has completed. + 6. The DataPtrPtr region specifies how to access the base address of a + descriptor. This is used, in particular, for GPU reductions in order + know where partial reduction results are stored in remote lanes. Note that the MLIR type system does not allow for type-polymorphic reductions. Separate reduction declarations should be created for diff erent @@ -2008,23 +2011,32 @@ def DeclareReductionOp : OpenMP_Op<"declare_reduction", [IsolatedFromAbove, For initializer and reduction regions, the operand to `omp.yield` must match the parent operation's results. + + * `$byref_element_type`: For by-ref reductions, we want to keep track of the + boxed/allocated type. For example, for a `real, allocatable` variable, + `real` should be stored in this attribute. + }]; let arguments = (ins SymbolNameAttr:$sym_name, - TypeAttr:$type); + TypeAttr:$type, + OptionalAttr<TypeAttr>:$byref_element_type + ); let regions = (region MaxSizedRegion<1>:$allocRegion, AnyRegion:$initializerRegion, AnyRegion:$reductionRegion, AnyRegion:$atomicReductionRegion, - AnyRegion:$cleanupRegion); + AnyRegion:$cleanupRegion, + MaxSizedRegion<1>:$dataPtrPtrRegion); let assemblyFormat = "$sym_name `:` $type attr-dict-with-keyword " "( `alloc` $allocRegion^ )? " "`init` $initializerRegion " "`combiner` $reductionRegion " "( `atomic` $atomicReductionRegion^ )? " - "( `cleanup` $cleanupRegion^ )? "; + "( `cleanup` $cleanupRegion^ )? " + "( `data_ptr_ptr` $dataPtrPtrRegion^ )? "; let extraClassDeclaration = [{ BlockArgument getAllocMoldArg() { @@ -2056,6 +2068,10 @@ def DeclareReductionOp : OpenMP_Op<"declare_reduction", [IsolatedFromAbove, auto ®ion = getCleanupRegion(); return region.empty() ? nullptr : region.getArgument(0); } + BlockArgument getDataPtrPtrRegionArg() { + auto ®ion = getDataPtrPtrRegion(); + return region.empty() ? 
nullptr : region.getArgument(0); + } PointerLikeType getAccumulatorType() { if (getAtomicReductionRegion().empty()) diff --git a/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp b/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp index 460595ba9f254..6423d49859c97 100644 --- a/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp +++ b/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp @@ -188,7 +188,8 @@ createDecl(PatternRewriter &builder, SymbolTable &symbolTable, OpBuilder::InsertionGuard guard(builder); Type type = reduce.getOperands()[reductionIndex].getType(); auto decl = omp::DeclareReductionOp::create(builder, reduce.getLoc(), - "__scf_reduction", type); + "__scf_reduction", type, + /*byref_element_type=*/{}); symbolTable.insert(decl); builder.createBlock(&decl.getInitializerRegion(), diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 98775c2d18bd4..0378033b55b38 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -953,6 +953,9 @@ using OwningAtomicReductionGen = std::function<llvm::OpenMPIRBuilder::InsertPointOrErrorTy( llvm::OpenMPIRBuilder::InsertPointTy, llvm::Type *, llvm::Value *, llvm::Value *)>; +using OwningDataPtrPtrReductionGen = + std::function<llvm::OpenMPIRBuilder::InsertPointOrErrorTy( + llvm::OpenMPIRBuilder::InsertPointTy, llvm::Value *, llvm::Value *&)>; } // namespace /// Create an OpenMPIRBuilder-compatible reduction generator for the given @@ -1017,6 +1020,35 @@ makeAtomicReductionGen(omp::DeclareReductionOp decl, return atomicGen; } +/// Create an OpenMPIRBuilder-compatible `data_ptr_ptr` reduction generator for +/// the given reduction declaration. The generator uses `builder` but ignores +/// its insertion point. Returns null if there is no `data_ptr_ptr` region +/// available in the reduction declaration. +static OwningDataPtrPtrReductionGen +makeRefDataPtrGen(omp::DeclareReductionOp decl, llvm::IRBuilderBase &builder, + LLVM::ModuleTranslation &moduleTranslation, bool isByRef) { + if (!isByRef) + return OwningDataPtrPtrReductionGen(); + + OwningDataPtrPtrReductionGen refDataPtrGen = + [&, decl](llvm::OpenMPIRBuilder::InsertPointTy insertPoint, + llvm::Value *byRefVal, llvm::Value *&result) mutable + -> llvm::OpenMPIRBuilder::InsertPointOrErrorTy { + moduleTranslation.mapValue(decl.getDataPtrPtrRegionArg(), byRefVal); + builder.restoreIP(insertPoint); + SmallVector<llvm::Value *> phis; + if (failed(inlineConvertOmpRegions(decl.getDataPtrPtrRegion(), + "omp.data_ptr_ptr.body", builder, + moduleTranslation, &phis))) + return llvm::createStringError( + "failed to inline `data_ptr_ptr` region of `omp.declare_reduction`"); + result = llvm::getSingleElement(phis); + return builder.saveIP(); + }; + + return refDataPtrGen; +} + /// Converts an OpenMP 'ordered' operation into LLVM IR using OpenMPIRBuilder. 
 static LogicalResult
 convertOmpOrdered(Operation &opInst, llvm::IRBuilderBase &builder,
@@ -1320,8 +1352,10 @@ static void collectReductionInfo(
     SmallVectorImpl<omp::DeclareReductionOp> &reductionDecls,
     SmallVectorImpl<OwningReductionGen> &owningReductionGens,
     SmallVectorImpl<OwningAtomicReductionGen> &owningAtomicReductionGens,
+    SmallVector<OwningDataPtrPtrReductionGen> &owningDataPtrPtrReductionGens,
     const ArrayRef<llvm::Value *> privateReductionVariables,
-    SmallVectorImpl<llvm::OpenMPIRBuilder::ReductionInfo> &reductionInfos) {
+    SmallVectorImpl<llvm::OpenMPIRBuilder::ReductionInfo> &reductionInfos,
+    ArrayRef<bool> isByRef) {
   unsigned numReductions = loop.getNumReductionVars();

   for (unsigned i = 0; i < numReductions; ++i) {
@@ -1329,6 +1363,8 @@ static void collectReductionInfo(
         makeReductionGen(reductionDecls[i], builder, moduleTranslation));
     owningAtomicReductionGens.push_back(
         makeAtomicReductionGen(reductionDecls[i], builder, moduleTranslation));
+    owningDataPtrPtrReductionGens.push_back(makeRefDataPtrGen(
+        reductionDecls[i], builder, moduleTranslation, isByRef[i]));
   }

   // Collect the reduction information.
@@ -1339,12 +1375,28 @@ static void collectReductionInfo(
       atomicGen = owningAtomicReductionGens[i];
     llvm::Value *variable =
         moduleTranslation.lookupValue(loop.getReductionVars()[i]);
+    mlir::Type allocatedType;
+    reductionDecls[i].getAllocRegion().walk([&](mlir::Operation *op) {
+      if (auto alloca = mlir::dyn_cast<LLVM::AllocaOp>(op)) {
+        allocatedType = alloca.getElemType();
+        return mlir::WalkResult::interrupt();
+      }
+
+      return mlir::WalkResult::advance();
+    });
+
     reductionInfos.push_back(
         {moduleTranslation.convertType(reductionDecls[i].getType()), variable,
         privateReductionVariables[i],
         /*EvaluationKind=*/llvm::OpenMPIRBuilder::EvalKind::Scalar,
         owningReductionGens[i],
-        /*ReductionGenClang=*/nullptr, atomicGen});
+        /*ReductionGenClang=*/nullptr, atomicGen,
+        owningDataPtrPtrReductionGens[i],
+        allocatedType ? moduleTranslation.convertType(allocatedType) : nullptr,
+        reductionDecls[i].getByrefElementType()
+            ? moduleTranslation.convertType(
+                  *reductionDecls[i].getByrefElementType())
+            : nullptr});
   }
 }

@@ -1402,7 +1454,8 @@ static LogicalResult createReductionsAndCleanup(

   SmallVector<OwningReductionGen> owningReductionGens;
   SmallVector<OwningAtomicReductionGen> owningAtomicReductionGens;
-  SmallVector<llvm::OpenMPIRBuilder::ReductionInfo> reductionInfos;
+  SmallVector<OwningDataPtrPtrReductionGen> owningReductionGenRefDataPtrGens;
+  SmallVector<llvm::OpenMPIRBuilder::ReductionInfo, 2> reductionInfos;

   llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();

@@ -1410,7 +1463,8 @@ static LogicalResult createReductionsAndCleanup(
   // ReductionInfo only accepts references to the generators.
   collectReductionInfo(op, builder, moduleTranslation, reductionDecls,
                        owningReductionGens, owningAtomicReductionGens,
-                       privateReductionVariables, reductionInfos);
+                       owningReductionGenRefDataPtrGens,
+                       privateReductionVariables, reductionInfos, isByRef);

   // The call to createReductions below expects the block to have a
   // terminator. Create an unreachable instruction to serve as terminator
@@ -2739,10 +2793,13 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
     // Collect reduction info
     SmallVector<OwningReductionGen> owningReductionGens;
     SmallVector<OwningAtomicReductionGen> owningAtomicReductionGens;
-    SmallVector<llvm::OpenMPIRBuilder::ReductionInfo> reductionInfos;
+    SmallVector<OwningDataPtrPtrReductionGen>
+        owningReductionGenRefDataPtrGens;
+    SmallVector<llvm::OpenMPIRBuilder::ReductionInfo, 2> reductionInfos;
     collectReductionInfo(opInst, builder, moduleTranslation, reductionDecls,
                          owningReductionGens, owningAtomicReductionGens,
-                         privateReductionVariables, reductionInfos);
+                         owningReductionGenRefDataPtrGens,
+                         privateReductionVariables, reductionInfos, isByRef);

     // Move to region cont block
     builder.SetInsertPoint((*regionBlock)->getTerminator());
diff --git a/mlir/test/Target/LLVMIR/allocatable_gpu_reduction.mlir b/mlir/test/Target/LLVMIR/allocatable_gpu_reduction.mlir
new file mode 100644
index 0000000000000..df606150b760a
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/allocatable_gpu_reduction.mlir
@@ -0,0 +1,97 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<"dlti.alloca_memory_space" = 5 : ui64, "dlti.global_memory_space" = 1 : ui64>, llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_gpu = true, omp.is_target_device = true} {
+  omp.private {type = private} @_QFfooEi_private_i32 : i32
+  omp.declare_reduction @add_reduction_byref_box_heap_f32 : !llvm.ptr attributes {byref_element_type = f32} alloc {
+    %0 = llvm.mlir.constant(1 : i64) : i64
+    %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> : (i64) -> !llvm.ptr<5>
+    %2 = llvm.addrspacecast %1 : !llvm.ptr<5> to !llvm.ptr
+    omp.yield(%2 : !llvm.ptr)
+  } init {
+  ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
+    omp.yield(%arg1 : !llvm.ptr)
+  } combiner {
+  ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
+    %0 = llvm.mlir.constant(1 : i32) : i32
+    %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr<5>
+    %2 = llvm.addrspacecast %1 : !llvm.ptr<5> to !llvm.ptr
+    %3 = llvm.mlir.constant(1 : i32) : i32
+    %4 = llvm.alloca %3 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr<5>
+    %5 = llvm.addrspacecast %4 : !llvm.ptr<5> to !llvm.ptr
+    %6 = llvm.mlir.constant(24 : i32) : i32
+    "llvm.intr.memcpy"(%5, %arg0, %6) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
+    %7 = llvm.mlir.constant(24 : i32) : i32
+    "llvm.intr.memcpy"(%2, %arg1, %7) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
+    %8 = llvm.getelementptr %5[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
+    %9 = llvm.load %8 : !llvm.ptr -> !llvm.ptr
+    %10 = llvm.getelementptr %2[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
+    %11 = llvm.load %10 : !llvm.ptr -> !llvm.ptr
+    %12 = llvm.load %9 : !llvm.ptr -> f32
+    %13 = llvm.load %11 : !llvm.ptr -> f32
+    %14 = llvm.fadd %12, %13 {fastmathFlags = #llvm.fastmath<contract>} : f32
+    llvm.store %14, %9 : f32, !llvm.ptr
+    omp.yield(%arg0 : !llvm.ptr)
+  } data_ptr_ptr {
+  ^bb0(%arg0: !llvm.ptr):
+    %0 = llvm.getelementptr %arg0[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
+    omp.yield(%0 : !llvm.ptr)
+  }
+
+  llvm.func @foo_() {
+    %0 = llvm.mlir.constant(1 : i64) : i64
+    %4 = llvm.alloca %0 x i1 : (i64) -> !llvm.ptr<5>
+    %5 = llvm.addrspacecast %4 : !llvm.ptr<5> to !llvm.ptr
+    %8 = llvm.getelementptr %5[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
+    %9 = omp.map.info var_ptr(%5 : !llvm.ptr, f32) map_clauses(implicit, tofrom) capture(ByRef) var_ptr_ptr(%8 : !llvm.ptr) -> !llvm.ptr {name = ""}
+    %10 = omp.map.info var_ptr(%5 : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>) map_clauses(always, implicit, descriptor, to) capture(ByRef) members(%9 : [0] : !llvm.ptr) -> !llvm.ptr {name = "scalar_alloc"}
+    omp.target map_entries(%10 -> %arg0 : !llvm.ptr) {
+      %13 = llvm.mlir.constant(1000 : i32) : i32
+      %14 = llvm.mlir.constant(1 : i32) : i32
+      omp.parallel {
+        omp.wsloop reduction(byref @add_reduction_byref_box_heap_f32 %arg0 -> %arg4 : !llvm.ptr) {
+          omp.loop_nest (%arg5) : i32 = (%14) to (%13) inclusive step (%14) {
+            omp.yield
+          }
+        }
+        omp.terminator
+      }
+      omp.terminator
+    }
+    llvm.return
+  }
+}
+
+// CHECK: define {{.*}} @_omp_reduction_shuffle_and_reduce_func({{.*}}) {{.*}} {
+// CHECK: %[[REMOTE_RED_LIST:.omp.reduction.remote_reduce_list]] = alloca [1 x ptr], align 8, addrspace(5)
+// CHECK: %[[RED_ELEM:.omp.reduction.element]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8, addrspace(5)
+// CHECK: %[[RED_ELEM_1:.*]] = addrspacecast ptr addrspace(5) %[[RED_ELEM]] to ptr
+
+// CHECK: %[[SHUFFLE_ELEM:.*]] = alloca float, align 4, addrspace(5)
+// CHECK: %[[REMOTE_RED_LIST_ASCAST:.*]] = addrspacecast ptr addrspace(5) %[[REMOTE_RED_LIST]] to ptr
+
+// CHECK: %[[REMOTE_RED_LIST_ELEM0:.*]] = getelementptr inbounds [1 x ptr], ptr %[[REMOTE_RED_LIST_ASCAST]], i64 0, i64 0
+
+// CHECK: %[[SHUFFLE_ELEM_ASCAST:.*]] = addrspacecast ptr addrspace(5) %[[SHUFFLE_ELEM]] to ptr
+// CHECK: %[[SHUFFLE_RES:.*]] = call i32 @__kmpc_shuffle_int32({{.*}})
+// CHECK: store i32 %[[SHUFFLE_RES]], ptr %[[SHUFFLE_ELEM_ASCAST]], align 4
+
+// CHECK: %[[RED_ELEM_ASCAST:.*]] = addrspacecast ptr addrspace(5) %[[RED_ELEM]] to ptr
+// CHECK: %[[RED_ALLOC_PTR:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[RED_ELEM_ASCAST]], i32 0, i32 0
+// CHECK: %[[SHUFFLE_ELEM_ASCAST:.*]] = addrspacecast ptr addrspace(5) %[[SHUFFLE_ELEM]] to ptr
+// CHECK: store ptr %[[SHUFFLE_ELEM_ASCAST]], ptr %[[RED_ALLOC_PTR]], align 8
+// CHECK: store ptr %[[RED_ELEM_1]], ptr %[[REMOTE_RED_LIST_ELEM0]], align 8
+// CHECK: }
+
+// CHECK: define {{.*}} @_omp_reduction_inter_warp_copy_func({{.*}}) {{.*}} {
+// CHECK: %[[WARP_MASTER_CMP:.*]] = icmp eq i32 %nvptx_lane_id, 0
+// CHECK: br i1 %[[WARP_MASTER_CMP]], label %[[WARP_MASTER_BB:.*]], label %{{.*}}
+
+// CHECK: [[WARP_MASTER_BB]]:
+// CHECK: %[[WARP_RESULT_PTR:.*]] = getelementptr inbounds [1 x ptr], ptr %{{.*}}, i64 0, i64 0
+// CHECK: %[[WARP_RESULT:.*]] = load ptr, ptr %[[WARP_RESULT_PTR]], align 8
+// CHECK: %[[ALLOC_MEM_PTR:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[WARP_RESULT]], i32 0, i32 0
+// CHECK: %[[ALLOC_MEM:.*]] = load ptr, ptr %[[ALLOC_MEM_PTR]], align 8
+// CHECK: %[[WARP_TRANSFER_SLOT:.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 %nvptx_warp_id
+// CHECK: %[[WARP_RED_RES:.*]] = load i32, ptr %[[ALLOC_MEM]], align 4
+// CHECK: store volatile i32 %[[WARP_RED_RES]], ptr addrspace(3) %[[WARP_TRANSFER_SLOT]], align 4
+// CHECK: }
diff --git a/mlir/test/Target/LLVMIR/omptarget-multi-block-reduction.mlir b/mlir/test/Target/LLVMIR/omptarget-multi-block-reduction.mlir
index 87ff0ba786648..2a3628b4cee03 100644
--- a/mlir/test/Target/LLVMIR/omptarget-multi-block-reduction.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-multi-block-reduction.mlir
@@ -7,7 +7,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<"dlti.alloca_memory_space" = 5 :
   llvm.func @bar() {}
   llvm.func @baz() {}

-  omp.declare_reduction @add_reduction_byref_box_5xf32 : !llvm.ptr alloc {
+  omp.declare_reduction @add_reduction_byref_box_5xf32 : !llvm.ptr attributes {byref_element_type = !llvm.array<5 x f32>} alloc {
     %0 = llvm.mlir.constant(1 : i64) : i64
     %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> : (i64) -> !llvm.ptr<5>
     %2 = llvm.addrspacecast %1 : !llvm.ptr<5> to !llvm.ptr
@@ -23,7 +23,12 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<"dlti.alloca_memory_space" = 5 :
   ^bb3:  // pred: ^bb1
     llvm.call @baz() : () -> ()
     omp.yield(%arg0 : !llvm.ptr)
+  } data_ptr_ptr {
+  ^bb0(%arg0: !llvm.ptr):
+    %0 = llvm.getelementptr %arg0[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
+    omp.yield(%0 : !llvm.ptr)
   }
+
   llvm.func @foo_() {
     %c1 = llvm.mlir.constant(1 : i64) : i64
     %10 = llvm.alloca %c1 x !llvm.array<5 x f32> {bindc_name = "x"} : (i64) -> !llvm.ptr<5>
@@ -67,9 +72,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<"dlti.alloca_memory_space" = 5 :
 // CHECK: br label %[[CONT_BB:.*]]

 // CHECK: [[CONT_BB]]:
-// CHECK-NEXT: %[[RED_RHS:.*]] = phi ptr [ %final.rhs, %{{.*}} ]
-// CHECK-NEXT: store ptr %[[RED_RHS]], ptr %{{.*}}, align 8
-// CHECK-NEXT: br label %.omp.reduction.done
+// CHECK-NEXT: %[[RED_RHS:.*]] = phi ptr [ %{{.*}}, %{{.*}} ]
 // CHECK: }

 // CHECK: define internal void @"{{.*}}$reduction$reduction_func"(ptr noundef %0, ptr noundef %1) #0 {
diff --git a/mlir/test/Target/LLVMIR/omptarget-multi-reduction.mlir b/mlir/test/Target/LLVMIR/omptarget-multi-reduction.mlir
index b8b7c780a74d0..8950db3fc48aa 100644
--- a/mlir/test/Target/LLVMIR/omptarget-multi-reduction.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-multi-reduction.mlir
@@ -109,19 +109,19 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo
 // CHECK: icmp eq i32 %[[MASTER]], 1
 // CHECK: i1 %{{.+}}, label %[[THEN:[A-Za-z0-9_.]*]], label %[[DONE:[A-Za-z0-9_.]*]]
 // CHECK: [[THEN]]:
-// CHECK-NEXT: %[[FINAL_RHS0:[A-Za-z0-9_.]*]] = load double
 // CHECK-NEXT: %[[FINAL_LHS0:[A-Za-z0-9_.]*]] = load double
+// CHECK-NEXT: %[[FINAL_RHS0:[A-Za-z0-9_.]*]] = load double
 // CHECK-NEXT: %[[FINAL_RESULT0:[A-Za-z0-9_.]*]] = fadd contract double %[[FINAL_LHS0]], %[[FINAL_RHS0]]
 // CHECK-NEXT: store double %[[FINAL_RESULT0]]
-// CHECK-NEXT: %[[FINAL_RHS1:[A-Za-z0-9_.]*]] = load double
 // CHECK-NEXT: %[[FINAL_LHS1:[A-Za-z0-9_.]*]] = load double
+// CHECK-NEXT: %[[FINAL_RHS1:[A-Za-z0-9_.]*]] = load double
 // CHECK-NEXT: %[[FINAL_RESULT1:[A-Za-z0-9_.]*]] = fadd contract double %[[FINAL_LHS1]], %[[FINAL_RHS1]]
 // CHECK-NEXT: store double %[[FINAL_RESULT1]]
-// CHECK-NEXT: %[[FINAL_RHS2:[A-Za-z0-9_.]*]] = load float
 // CHECK-NEXT: %[[FINAL_LHS2:[A-Za-z0-9_.]*]] = load float
+// CHECK-NEXT: %[[FINAL_RHS2:[A-Za-z0-9_.]*]] = load float
 // CHECK-NEXT: %[[FINAL_RESULT2:[A-Za-z0-9_.]*]] = fadd contract float %[[FINAL_LHS2]], %[[FINAL_RHS2]]
 // CHECK-NEXT: store float %[[FINAL_RESULT2]]
-// CHECK-NEXT: %[[FINAL_RHS3:[A-Za-z0-9_.]*]] = load float
 // CHECK-NEXT: %[[FINAL_LHS3:[A-Za-z0-9_.]*]] = load float
+// CHECK-NEXT: %[[FINAL_RHS3:[A-Za-z0-9_.]*]] = load float
 // CHECK-NEXT: %[[FINAL_RESULT3:[A-Za-z0-9_.]*]] = fadd contract float %[[FINAL_LHS3]], %[[FINAL_RHS3]]
 // CHECK-NEXT: store float %[[FINAL_RESULT3]]
diff --git a/mlir/test/Target/LLVMIR/omptarget-teams-distribute-reduction.mlir b/mlir/test/Target/LLVMIR/omptarget-teams-distribute-reduction.mlir
index 9aba72dabf13c..b7cb1026967f3 100644
--- a/mlir/test/Target/LLVMIR/omptarget-teams-distribute-reduction.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-teams-distribute-reduction.mlir
@@ -59,8 +59,8 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo
 // CHECK: call void @__kmpc_barrier

 // CHECK: [[THEN]]:
-// CHECK-NEXT: %[[FINAL_RHS:[A-Za-z0-9_.]*]] = load i32
 // CHECK-NEXT: %[[FINAL_LHS:[A-Za-z0-9_.]*]] = load i32
+// CHECK-NEXT: %[[FINAL_RHS:[A-Za-z0-9_.]*]] = load i32
 // CHECK-NEXT: %[[FINAL_RESULT:[A-Za-z0-9_.]*]] = add i32 %[[FINAL_LHS]], %[[FINAL_RHS]]
 // CHECK-NEXT: store i32 %[[FINAL_RESULT]]
diff --git a/mlir/test/Target/LLVMIR/omptarget-teams-reduction.mlir b/mlir/test/Target/LLVMIR/omptarget-teams-reduction.mlir
index dc22fe11666cf..36eb280dfcfa2 100644
--- a/mlir/test/Target/LLVMIR/omptarget-teams-reduction.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-teams-reduction.mlir
@@ -62,8 +62,8 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo
 // CHECK: icmp eq i32 %[[MASTER]], 1
 // CHECK: i1 %{{.+}}, label %[[THEN:[A-Za-z0-9_.]*]], label %[[DONE:[A-Za-z0-9_.]*]]
 // CHECK: [[THEN]]:
-// CHECK-NEXT: %[[FINAL_RHS:[A-Za-z0-9_.]*]] = load i32
 // CHECK-NEXT: %[[FINAL_LHS:[A-Za-z0-9_.]*]] = load i32
+// CHECK-NEXT: %[[FINAL_RHS:[A-Za-z0-9_.]*]] = load i32
 // CHECK-NEXT: %[[FINAL_RESULT:[A-Za-z0-9_.]*]] = add i32 %[[FINAL_LHS]], %[[FINAL_RHS]]
 // CHECK-NEXT: store i32 %[[FINAL_RESULT]]
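For readers skimming the diff, the overall shape of an extended by-ref reduction declaration is sketched below. This is a minimal, hand-written illustration distilled from the new allocatable_gpu_reduction.mlir test above; it is not part of the committed change. The symbol name `@example_byref_add_f32` is made up, the init and combiner bodies are elided to no-ops, and the descriptor struct mirrors the boxed `real, allocatable` case shown in the test.

```mlir
// Hypothetical, pared-down declaration showing the two new pieces added by
// this patch: the `byref_element_type` attribute and the `data_ptr_ptr` region.
omp.declare_reduction @example_byref_add_f32 : !llvm.ptr attributes {byref_element_type = f32} alloc {
  %c1 = llvm.mlir.constant(1 : i64) : i64
  // Thread-local storage for the descriptor (box) itself.
  %box = llvm.alloca %c1 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> : (i64) -> !llvm.ptr
  omp.yield(%box : !llvm.ptr)
} init {
^bb0(%mold: !llvm.ptr, %alloc: !llvm.ptr):
  // Initialization of the private copy elided; yield the allocated box.
  omp.yield(%alloc : !llvm.ptr)
} combiner {
^bb0(%lhs: !llvm.ptr, %rhs: !llvm.ptr):
  // Combination of the boxed values elided for brevity.
  omp.yield(%lhs : !llvm.ptr)
} data_ptr_ptr {
^bb0(%box: !llvm.ptr):
  // Yield the address of the descriptor's base-address field, i.e. where the
  // pointer to the actual f32 payload lives, so the GPU reduction can shuffle
  // the payload from remote lanes rather than the pointer to it.
  %0 = llvm.getelementptr %box[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
  omp.yield(%0 : !llvm.ptr)
}
```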
