llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-clang-driver Author: Nicole Aschenbrenner (nicebert) <details> <summary>Changes</summary> This patch series adds a framework of OpenMP GPU optimization flags to enable more efficient code generation for GPU offloading. The series consists of three patches: 1. Add negative flag variants (-fno-*) for existing assume options to allow explicit disabling of optimizations. 2. Add -fopenmp-target-ignore-env-vars flag to indicate that the OpenMP runtime can ignore environment variables during code generation, enabling optimizations like skipping runtime checks and eliminating conditional branches. 3. Add -fopenmp-target-fast meta-flag that implies the above optimization flags. This convenience flag is automatically enabled by -Ofast and provides a simple interface for aggressive GPU optimizations. The flags benefit all GPU targets (NVPTX, AMDGPU, Intel GPU) by providing a standard way to enable common GPU optimization patterns. Individual flags can be selectively overridden while keeping others enabled. 
--- Full diff: https://github.com/llvm/llvm-project/pull/178914.diff 4 Files Affected: - (modified) clang/include/clang/Basic/LangOptions.def (+1) - (modified) clang/include/clang/Options/Options.td (+19) - (modified) clang/lib/Driver/ToolChains/Clang.cpp (+48-2) - (added) clang/test/Driver/openmp-target-fast-flag.c (+48) ``````````diff diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def index ba12e522f331f..a2cc47a6528c4 100644 --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -239,6 +239,7 @@ LANGOPT(OpenMPThreadSubscription , 1, 0, NotCompatible, "Assume work-shared loo LANGOPT(OpenMPTeamSubscription , 1, 0, NotCompatible, "Assume distributed loops do not have more iterations than participating teams.") LANGOPT(OpenMPNoThreadState , 1, 0, NotCompatible, "Assume that no thread in a parallel region will modify an ICV.") LANGOPT(OpenMPNoNestedParallelism , 1, 0, NotCompatible, "Assume that no thread in a parallel region will encounter a parallel region") +LANGOPT(OpenMPTargetIgnoreEnvVars, 1, 0, NotCompatible, "Assume that the OpenMP runtime can ignore environment variables during code generation for GPU offload") LANGOPT(OpenMPOffloadMandatory , 1, 0, NotCompatible, "Assert that offloading is mandatory and do not create a host fallback.") LANGOPT(OpenMPForceUSM , 1, 0, NotCompatible, "Enable OpenMP unified shared memory mode via compiler.") LANGOPT(NoGPULib , 1, 0, NotCompatible, "Indicate a build without the standard GPU libraries.") diff --git a/clang/include/clang/Options/Options.td b/clang/include/clang/Options/Options.td index 421208a812bbc..2080cfa5fc091 100644 --- a/clang/include/clang/Options/Options.td +++ b/clang/include/clang/Options/Options.td @@ -3986,9 +3986,13 @@ def fno_openmp_assume_threads_oversubscription : Flag<["-"], "fno-openmp-assume- def fopenmp_assume_no_thread_state : Flag<["-"], "fopenmp-assume-no-thread-state">, HelpText<"Assert no thread in 
a parallel region modifies an ICV">, MarshallingInfoFlag<LangOpts<"OpenMPNoThreadState">>; +def fno_openmp_assume_no_thread_state : Flag<["-"], "fno-openmp-assume-no-thread-state">, + HelpText<"Assert that a thread in a parallel region may modify an ICV">; def fopenmp_assume_no_nested_parallelism : Flag<["-"], "fopenmp-assume-no-nested-parallelism">, HelpText<"Assert no nested parallel regions in the GPU">, MarshallingInfoFlag<LangOpts<"OpenMPNoNestedParallelism">>; +def fno_openmp_assume_no_nested_parallelism : Flag<["-"], "fno-openmp-assume-no-nested-parallelism">, + HelpText<"Assert that a nested parallel region may be used in the GPU">; } // let Group = f_Group } // let Visibility = [ClangOption, CC1Option, FC1Option] @@ -4012,6 +4016,21 @@ def fopenmp_target_new_runtime : Flag<["-"], "fopenmp-target-new-runtime">, Group<f_Group>, Flags<[HelpHidden]>, Visibility<[ClangOption, CC1Option]>; def fno_openmp_target_new_runtime : Flag<["-"], "fno-openmp-target-new-runtime">, Group<f_Group>, Flags<[HelpHidden]>, Visibility<[ClangOption, CC1Option]>; +def fopenmp_target_ignore_env_vars : Flag<["-"], "fopenmp-target-ignore-env-vars">, + Group<f_Group>, Flags<[NoArgumentUnused, HelpHidden]>, + Visibility<[ClangOption, CC1Option]>, + HelpText<"Assume that the OpenMP runtime can ignore environment variables during code generation for GPU offload">, + MarshallingInfoFlag<LangOpts<"OpenMPTargetIgnoreEnvVars">>; +def fno_openmp_target_ignore_env_vars : Flag<["-"], "fno-openmp-target-ignore-env-vars">, + Group<f_Group>, Flags<[NoArgumentUnused, HelpHidden]>, + Visibility<[ClangOption, CC1Option]>; +def fopenmp_target_fast : Flag<["-"], "fopenmp-target-fast">, + Group<f_Group>, Flags<[NoArgumentUnused, HelpHidden]>, + Visibility<[ClangOption, CC1Option]>, + HelpText<"Convenience flag to enable aggressive OpenMP GPU optimizations">; +def fno_openmp_target_fast : Flag<["-"], "fno-openmp-target-fast">, + Group<f_Group>, Flags<[NoArgumentUnused, HelpHidden]>, + 
Visibility<[ClangOption, CC1Option]>; defm openmp_optimistic_collapse : BoolFOption<"openmp-optimistic-collapse", LangOpts<"OpenMPOptimisticCollapse">, DefaultFalse, PosFlag<SetTrue, [], [ClangOption, CC1Option]>, diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 0293b04217673..4c90d7cd33d4e 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -6675,6 +6675,11 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, options::OPT_fno_offload_via_llvm, false) && (JA.isDeviceOffloading(Action::OFK_None) || JA.isDeviceOffloading(Action::OFK_OpenMP))) { + + // Determine if target-fast optimizations should be enabled + bool TargetFastUsed = Args.hasFlag(options::OPT_fopenmp_target_fast, + options::OPT_fno_openmp_target_fast, + OFastEnabled); switch (D.getOpenMPRuntime(Args)) { case Driver::OMPRT_OMP: case Driver::OMPRT_IOMP5: @@ -6725,10 +6730,51 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, options::OPT_fno_openmp_assume_threads_oversubscription, /*Default=*/false)) CmdArgs.push_back("-fopenmp-assume-threads-oversubscription"); - if (Args.hasArg(options::OPT_fopenmp_assume_no_thread_state)) + + // Handle -fopenmp-target-fast + if (Arg *A = Args.getLastArg(options::OPT_fopenmp_target_fast, + options::OPT_fno_openmp_target_fast)) { + if (A->getOption().matches(options::OPT_fopenmp_target_fast)) + CmdArgs.push_back("-fopenmp-target-fast"); + else + CmdArgs.push_back("-fno-openmp-target-fast"); + } else if (OFastEnabled) { + CmdArgs.push_back("-fopenmp-target-fast"); + } + + // Handle -fopenmp-target-ignore-env-vars (implied by target-fast) + if (Arg *A = Args.getLastArg(options::OPT_fopenmp_target_ignore_env_vars, + options::OPT_fno_openmp_target_ignore_env_vars)) { + if (A->getOption().matches(options::OPT_fopenmp_target_ignore_env_vars)) + CmdArgs.push_back("-fopenmp-target-ignore-env-vars"); + else + 
CmdArgs.push_back("-fno-openmp-target-ignore-env-vars"); + } else if (TargetFastUsed) { + CmdArgs.push_back("-fopenmp-target-ignore-env-vars"); + } + + // Handle -fopenmp-assume-no-thread-state (implied by target-fast) + if (Arg *A = Args.getLastArg(options::OPT_fopenmp_assume_no_thread_state, + options::OPT_fno_openmp_assume_no_thread_state)) { + if (A->getOption().matches(options::OPT_fopenmp_assume_no_thread_state)) + CmdArgs.push_back("-fopenmp-assume-no-thread-state"); + else + CmdArgs.push_back("-fno-openmp-assume-no-thread-state"); + } else if (TargetFastUsed) { CmdArgs.push_back("-fopenmp-assume-no-thread-state"); - if (Args.hasArg(options::OPT_fopenmp_assume_no_nested_parallelism)) + } + + // Handle -fopenmp-assume-no-nested-parallelism (implied by target-fast) + if (Arg *A = Args.getLastArg(options::OPT_fopenmp_assume_no_nested_parallelism, + options::OPT_fno_openmp_assume_no_nested_parallelism)) { + if (A->getOption().matches(options::OPT_fopenmp_assume_no_nested_parallelism)) + CmdArgs.push_back("-fopenmp-assume-no-nested-parallelism"); + else + CmdArgs.push_back("-fno-openmp-assume-no-nested-parallelism"); + } else if (TargetFastUsed) { CmdArgs.push_back("-fopenmp-assume-no-nested-parallelism"); + } + if (Args.hasArg(options::OPT_fopenmp_offload_mandatory)) CmdArgs.push_back("-fopenmp-offload-mandatory"); if (Args.hasArg(options::OPT_fopenmp_force_usm)) diff --git a/clang/test/Driver/openmp-target-fast-flag.c b/clang/test/Driver/openmp-target-fast-flag.c new file mode 100644 index 0000000000000..703df0efaeedd --- /dev/null +++ b/clang/test/Driver/openmp-target-fast-flag.c @@ -0,0 +1,48 @@ +// REQUIRES: amdgpu-registered-target + +// RUN: %clang -### -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx90a %s -O0 2>&1 \ +// RUN: | FileCheck -check-prefixes=DefaultTFast,DefaultEnV,DefaultTState,DefaultNoNestParallel %s + +// RUN: %clang -### -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa 
-march=gfx90a -O0 -fopenmp-target-fast %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=TFast,EnV,TState,NestParallel %s + +// RUN: %clang -### -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx90a -O3 %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=O3,DefaultTFast,DefaultEnV,DefaultTState,DefaultNoNestParallel %s + +// RUN: %clang -### -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx90a -O3 -fno-openmp-target-fast %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=O3,NoTFast,DefaultEnV,DefaultTState,DefaultNoNestParallel %s + +// RUN: %clang -### -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx90a -Ofast %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=OFast,TFast,EnV,TState,NestParallel %s + +// RUN: %clang -### -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx90a -Ofast -fno-openmp-target-fast %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=OFast,NoTFast,DefaultEnV,DefaultTState,DefaultNoNestParallel %s + +// RUN: %clang -### -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx90a -fopenmp-target-fast -fno-openmp-target-ignore-env-vars %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=TFast,NoEnV,TState,NestParallel %s + +// RUN: %clang -### -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx90a -O0 -fno-openmp-target-fast -fopenmp-target-fast %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=TFast,EnV,TState,NestParallel %s + +// O3: -O3 +// OFast: -Ofast + +// TFast: "-fopenmp-target-fast" +// TFast-NOT: "-fno-openmp-target-fast" +// NoTFast: "-fno-openmp-target-fast" +// NoTFast-NOT: "-fopenmp-target-fast" +// DefaultTFast-NOT: {{"-f(no-)?openmp-target-fast"}} + +// EnV: "-fopenmp-target-ignore-env-vars" +// EnV-NOT: "-fno-openmp-target-ignore-env-vars" +// NoEnV: "-fno-openmp-target-ignore-env-vars" +// NoEnV-NOT: "-fopenmp-target-ignore-env-vars" 
+// DefaultEnV-NOT: {{"-f(no-)?openmp-target-ignore-env-vars"}} + +// TState: "-fopenmp-assume-no-thread-state" +// TState-NOT: "-fno-openmp-assume-no-thread-state" +// DefaultTState-NOT: {{"-f(no-)?openmp-assume-no-thread-state"}} + +// NestParallel: "-fopenmp-assume-no-nested-parallelism" +// NestParallel-NOT: "-fno-openmp-assume-no-nested-parallelism" +// DefaultNoNestParallel-NOT: {{"-f(-no-)?openmp-assume-no-nested-parallelism"}} `````````` </details> https://github.com/llvm/llvm-project/pull/178914 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
