[PATCH] D55928: [OpenMP] Add flag for preventing the extension to 64 bits for the collapse loop counter
This revision was automatically updated to reflect the committed changes. Closed by commit rC350758: [OpenMP] Add flag for preventing the extension to 64 bits for the collapse loop… (authored by gbercea, committed by ). Changed prior to commit: https://reviews.llvm.org/D55928?vs=179078=180906#toc Repository: rC Clang CHANGES SINCE LAST ACTION https://reviews.llvm.org/D55928/new/ https://reviews.llvm.org/D55928 Files: docs/OpenMPSupport.rst include/clang/Basic/LangOptions.def include/clang/Driver/Options.td lib/Driver/ToolChains/Clang.cpp lib/Frontend/CompilerInvocation.cpp lib/Sema/SemaOpenMP.cpp test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp Index: docs/OpenMPSupport.rst === --- docs/OpenMPSupport.rst +++ docs/OpenMPSupport.rst @@ -108,6 +108,16 @@ between the threads and it is user responsibility to share the required data between the threads in the parallel regions. +Collapsed loop nest counter +--- + +When using the collapse clause on a loop nest the default behaviour is to +automatically extend the representation of the loop counter to 64 bits for +the cases where the sizes of the collapsed loops are not known at compile +time. To prevent this conservative choice and use at most 32 bits, +compile your program with the `-fopenmp-optimistic-collapse`. + + Features not supported or with limited support for Cuda devices --- Index: include/clang/Driver/Options.td === --- include/clang/Driver/Options.td +++ include/clang/Driver/Options.td @@ -1574,6 +1574,10 @@ Flags<[CC1Option, NoArgumentUnused, HelpHidden]>; def fopenmp_cuda_blocks_per_sm_EQ : Joined<["-"], "fopenmp-cuda-blocks-per-sm=">, Group, Flags<[CC1Option, NoArgumentUnused, HelpHidden]>; +def fopenmp_optimistic_collapse : Flag<["-"], "fopenmp-optimistic-collapse">, Group, + Flags<[CC1Option, NoArgumentUnused, HelpHidden]>; +def fno_openmp_optimistic_collapse : Flag<["-"], "fno-openmp-optimistic-collapse">, Group, + Flags<[NoArgumentUnused, HelpHidden]>; def fno_optimize_sibling_calls : Flag<["-"], "fno-optimize-sibling-calls">, Group; def foptimize_sibling_calls : Flag<["-"], "foptimize-sibling-calls">, Group; def fno_escaping_block_tail_calls : Flag<["-"], "fno-escaping-block-tail-calls">, Group, Flags<[CC1Option]>; Index: include/clang/Basic/LangOptions.def === --- include/clang/Basic/LangOptions.def +++ include/clang/Basic/LangOptions.def @@ -207,6 +207,7 @@ LANGOPT(OpenMPHostCXXExceptions, 1, 0, "C++ exceptions handling in the host code.") LANGOPT(OpenMPCUDANumSMs , 32, 0, "Number of SMs for CUDA devices.") LANGOPT(OpenMPCUDABlocksPerSM , 32, 0, "Number of blocks per SM for CUDA devices.") +LANGOPT(OpenMPOptimisticCollapse , 1, 0, "Use at most 32 bits to represent the collapsed loop nest counter.") LANGOPT(RenderScript , 1, 0, "RenderScript") LANGOPT(CUDAIsDevice , 1, 0, "compiling for CUDA device") Index: test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp === --- test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp +++ test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp @@ -1,6 +1,7 @@ // Test target codegen - host bc file has to be created first. // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix CHECK-DIV64 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -fopenmp-optimistic-collapse -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-DIV32 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda
[PATCH] D55928: [OpenMP] Add flag for preventing the extension to 64 bits for the collapse loop counter
gtbercea marked an inline comment as done. gtbercea added inline comments. Comment at: docs/OpenMPSupport.rst:120 +compile your program with the `-fopenmp-optimistic-collapse`. + + hfinkel wrote: > Can you please clarify here what happens when the loop induction variables > are already 64 bits. If any of them are already 64 bits, then we still use 64 > bits overall? > This flag is for the user to guarantee that the total size of the collapsed loops can be represented using at most 32 bits (regardless of the actual width of the individual loop induction variables). Repository: rC Clang CHANGES SINCE LAST ACTION https://reviews.llvm.org/D55928/new/ https://reviews.llvm.org/D55928 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D55928: [OpenMP] Add flag for preventing the extension to 64 bits for the collapse loop counter
hfinkel added inline comments. Comment at: docs/OpenMPSupport.rst:120 +compile your program with the `-fopenmp-optimistic-collapse`. + + Can you please clarify here what happens when the loop induction variables are already 64 bits. If any of them are already 64 bits, then we still use 64 bits overall? Repository: rC Clang CHANGES SINCE LAST ACTION https://reviews.llvm.org/D55928/new/ https://reviews.llvm.org/D55928 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D55928: [OpenMP] Add flag for preventing the extension to 64 bits for the collapse loop counter
gtbercea added inline comments. Comment at: docs/OpenMPSupport.rst:119 +time. To prevent this conservative choice and use at most 32 bits, +compile your program with the `-fopenmp-max-32bit-collapse-width`. + kkwli0 wrote: > ABataev wrote: > > -fopenmp-optimistic-collapse > How about -fopenmp-use-32bit-collapse-parameter? Alexey suggested we have no numbers in the flag name so we tried to find something that avoids that. I'm happy to give it a different name than optimistic collapse. Repository: rC Clang CHANGES SINCE LAST ACTION https://reviews.llvm.org/D55928/new/ https://reviews.llvm.org/D55928 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D55928: [OpenMP] Add flag for preventing the extension to 64 bits for the collapse loop counter
ABataev accepted this revision. ABataev added a comment. This revision is now accepted and ready to land. LG Repository: rC Clang CHANGES SINCE LAST ACTION https://reviews.llvm.org/D55928/new/ https://reviews.llvm.org/D55928 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D55928: [OpenMP] Add flag for preventing the extension to 64 bits for the collapse loop counter
gtbercea updated this revision to Diff 179078. gtbercea added a comment. - Address comments. Repository: rC Clang CHANGES SINCE LAST ACTION https://reviews.llvm.org/D55928/new/ https://reviews.llvm.org/D55928 Files: docs/OpenMPSupport.rst include/clang/Basic/LangOptions.def include/clang/Driver/Options.td lib/Driver/ToolChains/Clang.cpp lib/Frontend/CompilerInvocation.cpp lib/Sema/SemaOpenMP.cpp test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp Index: test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp === --- test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp +++ test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp @@ -1,6 +1,7 @@ // Test target codegen - host bc file has to be created first. // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix CHECK-DIV64 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -fopenmp-optimistic-collapse -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-DIV32 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 @@ -9,11 +10,12 @@ #define HEADER // Check that the execution mode of all 5 target regions on the gpu is set to SPMD Mode. -// CHECK-DAG: {{@__omp_offloading_.+l32}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l38}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l43}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l48}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l56}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l34}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l40}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l45}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l50}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l58}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l65}}_exec_mode = weak constant i8 0 #define N 1000 #define M 10 @@ -80,7 +82,7 @@ // CHECK-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4 // CHECK-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1 -// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l32( +// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l34( // CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0) // CHECK: call void @__kmpc_get_team_static_memory(i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} 4, i16 1, i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) @@ -214,16 +216,19 @@ // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) // CHECK: ret void -// CHECK: define internal void [[OUTL4]]( +// CHECK-32: define internal void [[OUTL4]]( +// CHECK-64: define internal void [[OUTL4]]( // CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 33, // CHECK: call void @__kmpc_for_static_fini( // CHECK: ret void -// CHECK: define weak void @__omp_offloading_{{.*}}_l56(i[[SZ:64|32]] %{{[^,]+}}, [10 x [10 x i32]]*
[PATCH] D55928: [OpenMP] Add flag for preventing the extension to 64 bits for the collapse loop counter
ABataev added inline comments. Comment at: lib/Driver/ToolChains/Clang.cpp:4429 Args.AddAllArgs(CmdArgs, options::OPT_fopenmp_cuda_blocks_per_sm_EQ); + Args.AddLastArg(CmdArgs, options::OPT_fopenmp_optimistic_collapse, + options::OPT_fno_openmp_optimistic_collapse); fno_openmp_optimistic_collapse is not a frontend option. better to check for ``` if (Args.hasFlag(options::OPT_fopenmp_optimistic_collapse, options::OPT_fno_openmp_optimistic_collapse, /*Default=*/false)) CmdArgs.push_back("-fopenmp-optimistic-collapse"); ``` here rather than in the frontend. In the frontend, you can just check for the presence of the fopenmp_optimistic_collapse flag Repository: rC Clang CHANGES SINCE LAST ACTION https://reviews.llvm.org/D55928/new/ https://reviews.llvm.org/D55928 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D55928: [OpenMP] Add flag for preventing the extension to 64 bits for the collapse loop counter
gtbercea added inline comments. Comment at: include/clang/Basic/LangOptions.def:210 LANGOPT(OpenMPCUDABlocksPerSM , 32, 0, "Number of blocks per SM for CUDA devices.") +LANGOPT(OpenMPMax32BitCollapseWidth , 1, 0, "Use at most 32 bits to represent the collapsed loop nest counter.") LANGOPT(RenderScript , 1, 0, "RenderScript") ABataev wrote: > Fix the description and the option in accordance with the option name The description accurately describes what Sema is doing. We allow an extension to up to 32 bits but no further. Repository: rC Clang CHANGES SINCE LAST ACTION https://reviews.llvm.org/D55928/new/ https://reviews.llvm.org/D55928 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D55928: [OpenMP] Add flag for preventing the extension to 64 bits for the collapse loop counter
gtbercea updated this revision to Diff 179076. gtbercea marked 4 inline comments as done. gtbercea edited the summary of this revision. gtbercea added a comment. - Address comments. Repository: rC Clang CHANGES SINCE LAST ACTION https://reviews.llvm.org/D55928/new/ https://reviews.llvm.org/D55928 Files: docs/OpenMPSupport.rst include/clang/Basic/LangOptions.def include/clang/Driver/Options.td lib/Driver/ToolChains/Clang.cpp lib/Frontend/CompilerInvocation.cpp lib/Sema/SemaOpenMP.cpp test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp Index: test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp === --- test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp +++ test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp @@ -1,6 +1,7 @@ // Test target codegen - host bc file has to be created first. // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix CHECK-DIV64 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -fopenmp-optimistic-collapse -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-DIV32 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 @@ -9,11 +10,12 @@ #define HEADER // Check that the execution mode of all 5 target regions on the gpu is set to SPMD Mode. -// CHECK-DAG: {{@__omp_offloading_.+l32}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l38}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l43}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l48}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l56}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l34}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l40}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l45}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l50}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l58}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l65}}_exec_mode = weak constant i8 0 #define N 1000 #define M 10 @@ -80,7 +82,7 @@ // CHECK-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4 // CHECK-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1 -// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l32( +// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l34( // CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0) // CHECK: call void @__kmpc_get_team_static_memory(i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} 4, i16 1, i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) @@ -214,16 +216,19 @@ // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) // CHECK: ret void -// CHECK: define internal void [[OUTL4]]( +// CHECK-32: define internal void [[OUTL4]]( +// CHECK-64: define internal void [[OUTL4]]( // CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 33, // CHECK: call void @__kmpc_for_static_fini( // CHECK: ret void -// CHECK: define weak void
[PATCH] D55928: [OpenMP] Add flag for preventing the extension to 64 bits for the collapse loop counter
kkwli0 added inline comments. Comment at: docs/OpenMPSupport.rst:119 +time. To prevent this conservative choice and use at most 32 bits, +compile your program with the `-fopenmp-max-32bit-collapse-width`. + ABataev wrote: > -fopenmp-optimistic-collapse How about -fopenmp-use-32bit-collapse-parameter? Repository: rC Clang CHANGES SINCE LAST ACTION https://reviews.llvm.org/D55928/new/ https://reviews.llvm.org/D55928 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D55928: [OpenMP] Add flag for preventing the extension to 64 bits for the collapse loop counter
ABataev added inline comments. Comment at: docs/OpenMPSupport.rst:119 +time. To prevent this conservative choice and use at most 32 bits, +compile your program with the `-fopenmp-max-32bit-collapse-width`. + -fopenmp-optimistic-collapse Comment at: include/clang/Basic/LangOptions.def:210 LANGOPT(OpenMPCUDABlocksPerSM , 32, 0, "Number of blocks per SM for CUDA devices.") +LANGOPT(OpenMPMax32BitCollapseWidth , 1, 0, "Use at most 32 bits to represent the collapsed loop nest counter.") LANGOPT(RenderScript , 1, 0, "RenderScript") Fix the description and the option in accordance with the option name Comment at: include/clang/Driver/Options.td:1577 Flags<[CC1Option, NoArgumentUnused, HelpHidden]>; +def fopenmp_max_32bit_collapse_width : Flag<["-"], "fopenmp-max-32bit-collapse-width">, Group, + Flags<[CC1Option, NoArgumentUnused, HelpHidden]>; Missed -fno... option Comment at: lib/Driver/ToolChains/Clang.cpp:4429 Args.AddAllArgs(CmdArgs, options::OPT_fopenmp_cuda_blocks_per_sm_EQ); + Args.AddAllArgs(CmdArgs, options::OPT_fopenmp_max_32bit_collapse_width); You need to change the processing taking into account -fno... option. Comment at: lib/Frontend/CompilerInvocation.cpp:2848 + // OpenMP collapse clause. + Opts.OpenMPMax32BitCollapseWidth = + Args.hasArg(options::OPT_fopenmp_max_32bit_collapse_width) ? 1 : 0; Also, need to check for -fno... option Repository: rC Clang CHANGES SINCE LAST ACTION https://reviews.llvm.org/D55928/new/ https://reviews.llvm.org/D55928 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D55928: [OpenMP] Add flag for preventing the extension to 64 bits for the collapse loop counter
gtbercea created this revision. gtbercea added reviewers: ABataev, caomhin. Herald added subscribers: cfe-commits, guansong. Introduce a compiler flag for cases when the user knows that the collapsed loop counter can be safely represented using at most 32 bits. This will prevent the emission of expensive mathematical operations (such as the div operation) on the iteration variable using 64 bits where 32 bit operations are sufficient. Repository: rC Clang https://reviews.llvm.org/D55928 Files: docs/OpenMPSupport.rst include/clang/Basic/LangOptions.def include/clang/Driver/Options.td lib/Driver/ToolChains/Clang.cpp lib/Frontend/CompilerInvocation.cpp lib/Sema/SemaOpenMP.cpp test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp Index: test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp === --- test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp +++ test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp @@ -1,6 +1,7 @@ // Test target codegen - host bc file has to be created first. // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix CHECK-DIV64 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -fopenmp-max-32bit-collapse-width -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-DIV32 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 @@ -9,11 +10,12 @@ #define HEADER // Check that the execution mode of all 5 target regions on the gpu is set to SPMD Mode. -// CHECK-DAG: {{@__omp_offloading_.+l32}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l38}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l43}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l48}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l56}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l34}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l40}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l45}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l50}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l58}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l65}}_exec_mode = weak constant i8 0 #define N 1000 #define M 10 @@ -80,7 +82,7 @@ // CHECK-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4 // CHECK-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1 -// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l32( +// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l34( // CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0) // CHECK: call void @__kmpc_get_team_static_memory(i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} 4, i16 1, i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) @@ -214,16 +216,19 @@ // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) // CHECK: ret void -// CHECK: define internal void [[OUTL4]]( +// CHECK-32: define internal void [[OUTL4]]( +// CHECK-64: define