Author: erichkeane Date: 2025-05-09T12:54:42-07:00 New Revision: 31fd77aa51a643245f8eb277483554509b771832
URL: https://github.com/llvm/llvm-project/commit/31fd77aa51a643245f8eb277483554509b771832 DIFF: https://github.com/llvm/llvm-project/commit/31fd77aa51a643245f8eb277483554509b771832.diff LOG: [OpenACC][CIR] worker/vector lowering for combined constructs Another set of 2 line changes, but makes sure to add sufficient testing. Added: Modified: clang/lib/CIR/CodeGen/CIRGenOpenACCClause.h clang/test/CIR/CodeGenOpenACC/combined.cpp Removed: ################################################################################ diff --git a/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.h b/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.h index 97624ea509ade..1661e3d0c4b62 100644 --- a/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.h +++ b/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.h @@ -465,6 +465,8 @@ class OpenACCClauseCIREmitter final else operation.addEmptyWorker(builder.getContext(), lastDeviceTypeValues); + } else if constexpr (isCombinedType<OpTy>) { + applyToLoopOp(clause); } else { // TODO: When we've implemented this for everything, switch this to an // unreachable. Combined constructs remain. @@ -481,6 +483,8 @@ class OpenACCClauseCIREmitter final else operation.addEmptyVector(builder.getContext(), lastDeviceTypeValues); + } else if constexpr (isCombinedType<OpTy>) { + applyToLoopOp(clause); } else { // TODO: When we've implemented this for everything, switch this to an // unreachable. Combined constructs remain. diff --git a/clang/test/CIR/CodeGenOpenACC/combined.cpp b/clang/test/CIR/CodeGenOpenACC/combined.cpp index b9fb82af46cca..d79f964f19ebf 100644 --- a/clang/test/CIR/CodeGenOpenACC/combined.cpp +++ b/clang/test/CIR/CodeGenOpenACC/combined.cpp @@ -401,4 +401,176 @@ extern "C" void acc_combined(int N) { // CHECK-NEXT: } loc // CHECK-NEXT: acc.terminator // CHECK-NEXT: } loc + +#pragma acc kernels loop worker + for(unsigned I = 0; I < N; ++I); + // CHECK-NEXT: acc.kernels combined(loop) { + // CHECK-NEXT: acc.loop combined(kernels) worker { + // CHECK: acc.yield + // CHECK-NEXT: } loc + // CHECK: acc.terminator + // CHECK-NEXT: } loc + +#pragma acc kernels loop worker(N) + for(unsigned I = 0; I < N; ++I); + // CHECK-NEXT: acc.kernels combined(loop) { + // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i + // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32 + // CHECK-NEXT: acc.loop combined(kernels) worker(%[[N_CONV]] : si32) { + // CHECK: acc.yield + // CHECK-NEXT: } loc + // CHECK: acc.terminator + // CHECK-NEXT: } loc + +#pragma acc kernels loop worker device_type(nvidia, radeon) worker + for(unsigned I = 0; I < N; ++I); + // CHECK-NEXT: acc.kernels combined(loop) { + // CHECK-NEXT: acc.loop combined(kernels) worker([#acc.device_type<none>, #acc.device_type<nvidia>, #acc.device_type<radeon>]) { + // CHECK: acc.yield + // CHECK-NEXT: } loc + // CHECK: acc.terminator + // CHECK-NEXT: } loc + +#pragma acc kernels loop worker(N) device_type(nvidia, radeon) worker + for(unsigned I = 0; I < N; ++I); + // CHECK-NEXT: acc.kernels combined(loop) { + // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i + // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32 + // CHECK-NEXT: acc.loop combined(kernels) worker([#acc.device_type<nvidia>, #acc.device_type<radeon>], %[[N_CONV]] : si32) { + // CHECK: acc.yield + // CHECK-NEXT: } loc + // CHECK: acc.terminator + // CHECK-NEXT: } loc + +#pragma acc kernels loop worker device_type(nvidia, radeon) worker(N) + for(unsigned I = 0; I < N; ++I); + // CHECK-NEXT: acc.kernels combined(loop) { + // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i + // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32 + // CHECK-NEXT: acc.loop combined(kernels) worker([#acc.device_type<none>], %[[N_CONV]] : si32 [#acc.device_type<nvidia>], %[[N_CONV]] : si32 [#acc.device_type<radeon>]) { + // CHECK: acc.yield + // CHECK-NEXT: } loc + // CHECK: acc.terminator + // CHECK-NEXT: } loc + +#pragma acc kernels loop worker(N) device_type(nvidia, radeon) worker(N + 1) + for(unsigned I = 0; I < N; ++I); + // CHECK-NEXT: acc.kernels combined(loop) { + // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i + // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32 + // CHECK-NEXT: %[[N_LOAD2:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i + // CHECK-NEXT: %[[ONE_CONST:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[N_PLUS_ONE:.*]] = cir.binop(add, %[[N_LOAD2]], %[[ONE_CONST]]) nsw : !s32i + // CHECK-NEXT: %[[N_PLUS_ONE_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_PLUS_ONE]] : !s32i to si32 + // CHECK-NEXT: acc.loop combined(kernels) worker(%[[N_CONV]] : si32, %[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<nvidia>], %[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<radeon>]) { + // CHECK: acc.yield + // CHECK-NEXT: } loc + // CHECK: acc.terminator + // CHECK-NEXT: } loc + +#pragma acc kernels loop device_type(nvidia, radeon) worker(num:N + 1) + for(unsigned I = 0; I < N; ++I); + // CHECK-NEXT: acc.kernels combined(loop) { + // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i + // CHECK-NEXT: %[[ONE_CONST:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[N_PLUS_ONE:.*]] = cir.binop(add, %[[N_LOAD]], %[[ONE_CONST]]) nsw : !s32i + // CHECK-NEXT: %[[N_PLUS_ONE_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_PLUS_ONE]] : !s32i to si32 + // CHECK-NEXT: acc.loop combined(kernels) worker(%[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<nvidia>], %[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<radeon>]) { + // CHECK: acc.terminator + // CHECK-NEXT: } loc + + +#pragma acc kernels loop worker vector device_type(nvidia) worker vector + for(unsigned I = 0; I < N; ++I); + // CHECK-NEXT: acc.kernels combined(loop) { + // CHECK-NEXT: acc.loop combined(kernels) worker([#acc.device_type<none>, #acc.device_type<nvidia>]) vector([#acc.device_type<none>, #acc.device_type<nvidia>]) + // CHECK: acc.yield + // CHECK-NEXT: } loc + // CHECK: acc.terminator + // CHECK-NEXT: } loc + +#pragma acc kernels loop vector + for(unsigned I = 0; I < N; ++I); + // CHECK-NEXT: acc.kernels combined(loop) { + // CHECK: acc.loop combined(kernels) vector { + // CHECK: acc.yield + // CHECK-NEXT: } loc + // CHECK: acc.terminator + // CHECK-NEXT: } loc + +#pragma acc kernels loop vector(N) + for(unsigned I = 0; I < N; ++I); + // CHECK-NEXT: acc.kernels combined(loop) { + // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i + // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32 + // CHECK-NEXT: acc.loop combined(kernels) vector(%[[N_CONV]] : si32) { + // CHECK: acc.yield + // CHECK-NEXT: } loc + // CHECK: acc.terminator + // CHECK-NEXT: } loc + +#pragma acc kernels loop vector device_type(nvidia, radeon) vector + for(unsigned I = 0; I < N; ++I); + // CHECK-NEXT: acc.kernels combined(loop) { + // CHECK-NEXT: acc.loop combined(kernels) vector([#acc.device_type<none>, #acc.device_type<nvidia>, #acc.device_type<radeon>]) { + // CHECK: acc.yield + // CHECK-NEXT: } loc + // CHECK: acc.terminator + // CHECK-NEXT: } loc + +#pragma acc kernels loop vector(N) device_type(nvidia, radeon) vector + for(unsigned I = 0; I < N; ++I); + // CHECK-NEXT: acc.kernels combined(loop) { + // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i + // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32 + // CHECK-NEXT: acc.loop combined(kernels) vector([#acc.device_type<nvidia>, #acc.device_type<radeon>], %[[N_CONV]] : si32) { + // CHECK: acc.yield + // CHECK-NEXT: } loc + // CHECK: acc.terminator + // CHECK-NEXT: } loc + +#pragma acc kernels loop vector(N) device_type(nvidia, radeon) vector(N + 1) + for(unsigned I = 0; I < N; ++I); + // CHECK-NEXT: acc.kernels combined(loop) { + // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i + // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32 + // CHECK-NEXT: %[[N_LOAD2:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i + // CHECK-NEXT: %[[ONE_CONST:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[N_PLUS_ONE:.*]] = cir.binop(add, %[[N_LOAD2]], %[[ONE_CONST]]) nsw : !s32i + // CHECK-NEXT: %[[N_PLUS_ONE_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_PLUS_ONE]] : !s32i to si32 + // CHECK-NEXT: acc.loop combined(kernels) vector(%[[N_CONV]] : si32, %[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<nvidia>], %[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<radeon>]) { + // CHECK: acc.yield + // CHECK-NEXT: } loc + // CHECK: acc.terminator + // CHECK-NEXT: } loc + +#pragma acc kernels loop device_type(nvidia, radeon) vector(length:N + 1) + for(unsigned I = 0; I < N; ++I); + // CHECK-NEXT: acc.kernels combined(loop) { + // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i + // CHECK-NEXT: %[[ONE_CONST:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[N_PLUS_ONE:.*]] = cir.binop(add, %[[N_LOAD]], %[[ONE_CONST]]) nsw : !s32i + // CHECK-NEXT: %[[N_PLUS_ONE_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_PLUS_ONE]] : !s32i to si32 + // CHECK-NEXT: acc.loop combined(kernels) vector(%[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<nvidia>], %[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<radeon>]) { + // CHECK: acc.yield + // CHECK-NEXT: } loc + // CHECK: acc.terminator + // CHECK-NEXT: } loc + +#pragma acc kernels loop worker(N) vector(N) device_type(nvidia) worker(N) vector(N) + for(unsigned I = 0; I < N; ++I); + // CHECK-NEXT: acc.kernels combined(loop) { + // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i + // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32 + // CHECK-NEXT: %[[N_LOAD2:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i + // CHECK-NEXT: %[[N_CONV2:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD2]] : !s32i to si32 + // CHECK-NEXT: %[[N_LOAD3:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i + // CHECK-NEXT: %[[N_CONV3:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD3]] : !s32i to si32 + // CHECK-NEXT: %[[N_LOAD4:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i + // CHECK-NEXT: %[[N_CONV4:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD4]] : !s32i to si32 + // CHECK-NEXT: acc.loop combined(kernels) worker(%[[N_CONV]] : si32, %[[N_CONV3]] : si32 [#acc.device_type<nvidia>]) vector(%[[N_CONV2]] : si32, %[[N_CONV4]] : si32 [#acc.device_type<nvidia>]) { + // CHECK: acc.yield + // CHECK-NEXT: } loc + // CHECK: acc.terminator + // CHECK-NEXT: } loc } _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits