[PATCH] D48287: [HIP] Support -fcuda-flush-denormals-to-zero for amdgcn
This revision was automatically updated to reflect the committed changes. Closed by commit rC337639: [HIP] Support -fcuda-flush-denormals-to-zero for amdgcn (authored by yaxunl, committed by ). Repository: rC Clang https://reviews.llvm.org/D48287 Files: include/clang/Basic/LangOptions.def lib/CodeGen/CGCall.cpp lib/CodeGen/CodeGenModule.cpp lib/Frontend/CompilerInvocation.cpp test/CodeGenCUDA/flush-denormals.cu Index: include/clang/Basic/LangOptions.def === --- include/clang/Basic/LangOptions.def +++ include/clang/Basic/LangOptions.def @@ -209,7 +209,6 @@ LANGOPT(CUDAIsDevice , 1, 0, "compiling for CUDA device") LANGOPT(CUDAAllowVariadicFunctions, 1, 0, "allowing variadic functions in CUDA device code") LANGOPT(CUDAHostDeviceConstexpr, 1, 1, "treating unattributed constexpr functions as __host__ __device__") -LANGOPT(CUDADeviceFlushDenormalsToZero, 1, 0, "flushing denormals to zero") LANGOPT(CUDADeviceApproxTranscendentals, 1, 0, "using approximate transcendental functions") LANGOPT(CUDARelocatableDeviceCode, 1, 0, "generate relocatable device code") Index: test/CodeGenCUDA/flush-denormals.cu === --- test/CodeGenCUDA/flush-denormals.cu +++ test/CodeGenCUDA/flush-denormals.cu @@ -5,18 +5,33 @@ // RUN: -triple nvptx-nvidia-cuda -emit-llvm -o - %s | \ // RUN: FileCheck %s -check-prefix CHECK -check-prefix FTZ +// RUN: %clang_cc1 -fcuda-is-device -x hip \ +// RUN: -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm -o - %s | \ +// RUN: FileCheck %s -check-prefix CHECK -check-prefix AMDNOFTZ +// RUN: %clang_cc1 -fcuda-is-device -x hip -fcuda-flush-denormals-to-zero \ +// RUN: -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm -o - %s | \ +// RUN: FileCheck %s -check-prefix CHECK -check-prefix AMDFTZ + #include "Inputs/cuda.h" // Checks that device function calls get emitted with the "ntpvx-f32ftz" // attribute set to "true" when we compile CUDA device code with // -fcuda-flush-denormals-to-zero. Further, check that we reflect the presence // or absence of -fcuda-flush-denormals-to-zero in a module flag. +// AMDGCN targets always have +fp64-fp16-denormals. +// AMDGCN targets without fast FMAF (e.g. gfx803) always have +fp32-denormals. +// For AMDGCN target with fast FMAF (e.g. gfx900), it has +fp32-denormals +// by default and -fp32-denormals when there is option +// -fcuda-flush-denormals-to-zero. + // CHECK-LABEL: define void @foo() #0 extern "C" __device__ void foo() {} // FTZ: attributes #0 = {{.*}} "nvptx-f32ftz"="true" // NOFTZ-NOT: attributes #0 = {{.*}} "nvptx-f32ftz" +// AMDNOFTZ: attributes #0 = {{.*}}+fp32-denormals{{.*}}+fp64-fp16-denormals +// AMDFTZ: attributes #0 = {{.*}}+fp64-fp16-denormals{{.*}}-fp32-denormals // FTZ:!llvm.module.flags = !{{{.*}}[[MODFLAG:![0-9]+]]} // FTZ:[[MODFLAG]] = !{i32 4, !"nvvm-reflect-ftz", i32 1} Index: lib/Frontend/CompilerInvocation.cpp === --- lib/Frontend/CompilerInvocation.cpp +++ lib/Frontend/CompilerInvocation.cpp @@ -690,7 +690,9 @@ Args.hasArg(OPT_cl_unsafe_math_optimizations) || Args.hasArg(OPT_cl_fast_relaxed_math)); Opts.Reassociate = Args.hasArg(OPT_mreassociate); - Opts.FlushDenorm = Args.hasArg(OPT_cl_denorms_are_zero); + Opts.FlushDenorm = Args.hasArg(OPT_cl_denorms_are_zero) || + (Args.hasArg(OPT_fcuda_is_device) && + Args.hasArg(OPT_fcuda_flush_denormals_to_zero)); Opts.CorrectlyRoundedDivSqrt = Args.hasArg(OPT_cl_fp32_correctly_rounded_divide_sqrt); Opts.UniformWGSize = @@ -2191,9 +2193,6 @@ if (Args.hasArg(OPT_fno_cuda_host_device_constexpr)) Opts.CUDAHostDeviceConstexpr = 0; - if (Opts.CUDAIsDevice && Args.hasArg(OPT_fcuda_flush_denormals_to_zero)) -Opts.CUDADeviceFlushDenormalsToZero = 1; - if (Opts.CUDAIsDevice && Args.hasArg(OPT_fcuda_approx_transcendentals)) Opts.CUDADeviceApproxTranscendentals = 1; Index: lib/CodeGen/CodeGenModule.cpp === --- lib/CodeGen/CodeGenModule.cpp +++ lib/CodeGen/CodeGenModule.cpp @@ -526,7 +526,7 @@ // floating point values to 0. (This corresponds to its "__CUDA_FTZ" // property.) getModule().addModuleFlag(llvm::Module::Override, "nvvm-reflect-ftz", - LangOpts.CUDADeviceFlushDenormalsToZero ? 1 : 0); + CodeGenOpts.FlushDenorm ? 1 : 0); } // Emit OpenCL specific module metadata: OpenCL/SPIR version. Index: lib/CodeGen/CGCall.cpp === --- lib/CodeGen/CGCall.cpp +++ lib/CodeGen/CGCall.cpp @@ -1800,7 +1800,7 @@ FuncAttrs.addAttribute(llvm::Attribute::NoUnwind); // Respect -fcuda-flush-denormals-to-zero. -if
[PATCH] D48287: [HIP] Support -fcuda-flush-denormals-to-zero for amdgcn
tra accepted this revision. tra added a comment. This revision is now accepted and ready to land. Thank you. That should work. https://reviews.llvm.org/D48287 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D48287: [HIP] Support -fcuda-flush-denormals-to-zero for amdgcn
yaxunl updated this revision to Diff 155580. yaxunl added a comment. Replace LangOpts.CUDADeviceFlushDenormalsToZero with CodeGenOpts.FlushDenorm. https://reviews.llvm.org/D48287 Files: include/clang/Basic/LangOptions.def lib/CodeGen/CGCall.cpp lib/CodeGen/CodeGenModule.cpp lib/Frontend/CompilerInvocation.cpp test/CodeGenCUDA/flush-denormals.cu Index: test/CodeGenCUDA/flush-denormals.cu === --- test/CodeGenCUDA/flush-denormals.cu +++ test/CodeGenCUDA/flush-denormals.cu @@ -5,18 +5,33 @@ // RUN: -triple nvptx-nvidia-cuda -emit-llvm -o - %s | \ // RUN: FileCheck %s -check-prefix CHECK -check-prefix FTZ +// RUN: %clang_cc1 -fcuda-is-device -x hip \ +// RUN: -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm -o - %s | \ +// RUN: FileCheck %s -check-prefix CHECK -check-prefix AMDNOFTZ +// RUN: %clang_cc1 -fcuda-is-device -x hip -fcuda-flush-denormals-to-zero \ +// RUN: -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm -o - %s | \ +// RUN: FileCheck %s -check-prefix CHECK -check-prefix AMDFTZ + #include "Inputs/cuda.h" // Checks that device function calls get emitted with the "ntpvx-f32ftz" // attribute set to "true" when we compile CUDA device code with // -fcuda-flush-denormals-to-zero. Further, check that we reflect the presence // or absence of -fcuda-flush-denormals-to-zero in a module flag. +// AMDGCN targets always have +fp64-fp16-denormals. +// AMDGCN targets without fast FMAF (e.g. gfx803) always have +fp32-denormals. +// For AMDGCN target with fast FMAF (e.g. gfx900), it has +fp32-denormals +// by default and -fp32-denormals when there is option +// -fcuda-flush-denormals-to-zero. + // CHECK-LABEL: define void @foo() #0 extern "C" __device__ void foo() {} // FTZ: attributes #0 = {{.*}} "nvptx-f32ftz"="true" // NOFTZ-NOT: attributes #0 = {{.*}} "nvptx-f32ftz" +// AMDNOFTZ: attributes #0 = {{.*}}+fp32-denormals{{.*}}+fp64-fp16-denormals +// AMDFTZ: attributes #0 = {{.*}}+fp64-fp16-denormals{{.*}}-fp32-denormals // FTZ:!llvm.module.flags = !{{{.*}}[[MODFLAG:![0-9]+]]} // FTZ:[[MODFLAG]] = !{i32 4, !"nvvm-reflect-ftz", i32 1} Index: lib/Frontend/CompilerInvocation.cpp === --- lib/Frontend/CompilerInvocation.cpp +++ lib/Frontend/CompilerInvocation.cpp @@ -690,7 +690,9 @@ Args.hasArg(OPT_cl_unsafe_math_optimizations) || Args.hasArg(OPT_cl_fast_relaxed_math)); Opts.Reassociate = Args.hasArg(OPT_mreassociate); - Opts.FlushDenorm = Args.hasArg(OPT_cl_denorms_are_zero); + Opts.FlushDenorm = Args.hasArg(OPT_cl_denorms_are_zero) || + (Args.hasArg(OPT_fcuda_is_device) && + Args.hasArg(OPT_fcuda_flush_denormals_to_zero)); Opts.CorrectlyRoundedDivSqrt = Args.hasArg(OPT_cl_fp32_correctly_rounded_divide_sqrt); Opts.UniformWGSize = @@ -2186,9 +2188,6 @@ if (Args.hasArg(OPT_fno_cuda_host_device_constexpr)) Opts.CUDAHostDeviceConstexpr = 0; - if (Opts.CUDAIsDevice && Args.hasArg(OPT_fcuda_flush_denormals_to_zero)) -Opts.CUDADeviceFlushDenormalsToZero = 1; - if (Opts.CUDAIsDevice && Args.hasArg(OPT_fcuda_approx_transcendentals)) Opts.CUDADeviceApproxTranscendentals = 1; Index: lib/CodeGen/CodeGenModule.cpp === --- lib/CodeGen/CodeGenModule.cpp +++ lib/CodeGen/CodeGenModule.cpp @@ -526,7 +526,7 @@ // floating point values to 0. (This corresponds to its "__CUDA_FTZ" // property.) getModule().addModuleFlag(llvm::Module::Override, "nvvm-reflect-ftz", - LangOpts.CUDADeviceFlushDenormalsToZero ? 1 : 0); + CodeGenOpts.FlushDenorm ? 1 : 0); } // Emit OpenCL specific module metadata: OpenCL/SPIR version. Index: lib/CodeGen/CGCall.cpp === --- lib/CodeGen/CGCall.cpp +++ lib/CodeGen/CGCall.cpp @@ -1798,7 +1798,7 @@ FuncAttrs.addAttribute(llvm::Attribute::NoUnwind); // Respect -fcuda-flush-denormals-to-zero. -if (getLangOpts().CUDADeviceFlushDenormalsToZero) +if (CodeGenOpts.FlushDenorm) FuncAttrs.addAttribute("nvptx-f32ftz", "true"); } } Index: include/clang/Basic/LangOptions.def === --- include/clang/Basic/LangOptions.def +++ include/clang/Basic/LangOptions.def @@ -209,7 +209,6 @@ LANGOPT(CUDAIsDevice , 1, 0, "compiling for CUDA device") LANGOPT(CUDAAllowVariadicFunctions, 1, 0, "allowing variadic functions in CUDA device code") LANGOPT(CUDAHostDeviceConstexpr, 1, 1, "treating unattributed constexpr functions as __host__ __device__") -LANGOPT(CUDADeviceFlushDenormalsToZero, 1, 0, "flushing denormals to zero") LANGOPT(CUDADeviceApproxTranscendentals, 1, 0, "using approximate transcendental functions")
[PATCH] D48287: [HIP] Support -fcuda-flush-denormals-to-zero for amdgcn
yaxunl added a comment. In https://reviews.llvm.org/D48287#1138262, @tra wrote: > Using OpenCL's flag for the purpose adds a *third* way we handle denormals > flushing in clang. Now it would be HIP (which is CUDA-like) using OpenCL's > flag for denormals instead of CUDA's one. > You could change AMDGPUTargetInfo::adjustTargetOptions() to use > CGOpts.getLangOpts().CUDADeviceFlushDenormalsToZero instead. That would at > least make HIP and CUDA do the same thing. > > I think it would work better if we could coalesce > CUDADeviceFlushDenormalsToZero and CodeGenOpts.FlushDenorm and, maybe move > the flag to LangOpts , so we could use LangOpts.CUDAIsDevice. Sorry for the delay. CGOpts does not have member function getLangOpts(). It seems whereever we need to refer to LangOpts.CUDADeviceFlushDenormalsToZero, we can use Opts.FlushDenorm, but not true vice versa. Therefore if we want a unified option, Opts.FlushDenorm is a better choice. https://reviews.llvm.org/D48287 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D48287: [HIP] Support -fcuda-flush-denormals-to-zero for amdgcn
tra added a comment. Using OpenCL's flag for the purpose adds a *third* way we handle denormals flushing in clang. Now it would be HIP (which is CUDA-like) using OpenCL's flag for denormals instead of CUDA's one. You could change AMDGPUTargetInfo::adjustTargetOptions() to use CGOpts.getLangOpts().CUDADeviceFlushDenormalsToZero instead. That would at least make HIP and CUDA do the same thing. I think it would work better if we could coalesce CUDADeviceFlushDenormalsToZero and CodeGenOpts.FlushDenorm and, maybe move the flag to LangOpts , so we could use LangOpts.CUDAIsDevice. https://reviews.llvm.org/D48287 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D48287: [HIP] Support -fcuda-flush-denormals-to-zero for amdgcn
yaxunl updated this revision to Diff 151748. yaxunl added a comment. Correct comments in test. https://reviews.llvm.org/D48287 Files: lib/Frontend/CompilerInvocation.cpp test/CodeGenCUDA/flush-denormals.cu Index: test/CodeGenCUDA/flush-denormals.cu === --- test/CodeGenCUDA/flush-denormals.cu +++ test/CodeGenCUDA/flush-denormals.cu @@ -5,18 +5,33 @@ // RUN: -triple nvptx-nvidia-cuda -emit-llvm -o - %s | \ // RUN: FileCheck %s -check-prefix CHECK -check-prefix FTZ +// RUN: %clang_cc1 -fcuda-is-device -x hip \ +// RUN: -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm -o - %s | \ +// RUN: FileCheck %s -check-prefix CHECK -check-prefix AMDNOFTZ +// RUN: %clang_cc1 -fcuda-is-device -x hip -fcuda-flush-denormals-to-zero \ +// RUN: -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm -o - %s | \ +// RUN: FileCheck %s -check-prefix CHECK -check-prefix AMDFTZ + #include "Inputs/cuda.h" // Checks that device function calls get emitted with the "ntpvx-f32ftz" // attribute set to "true" when we compile CUDA device code with // -fcuda-flush-denormals-to-zero. Further, check that we reflect the presence // or absence of -fcuda-flush-denormals-to-zero in a module flag. +// AMDGCN targets always have +fp64-fp16-denormals. +// AMDGCN targets without fast FMAF (e.g. gfx803) always have +fp32-denormals. +// For AMDGCN target with fast FMAF (e.g. gfx900), it has +fp32-denormals +// by default and -fp32-denormals when there is option +// -fcuda-flush-denormals-to-zero. + // CHECK-LABEL: define void @foo() #0 extern "C" __device__ void foo() {} // FTZ: attributes #0 = {{.*}} "nvptx-f32ftz"="true" // NOFTZ-NOT: attributes #0 = {{.*}} "nvptx-f32ftz" +// AMDNOFTZ: attributes #0 = {{.*}}+fp32-denormals{{.*}}+fp64-fp16-denormals +// AMDFTZ: attributes #0 = {{.*}}+fp64-fp16-denormals{{.*}}-fp32-denormals // FTZ:!llvm.module.flags = !{{{.*}}[[MODFLAG:![0-9]+]]} // FTZ:[[MODFLAG]] = !{i32 4, !"nvvm-reflect-ftz", i32 1} Index: lib/Frontend/CompilerInvocation.cpp === --- lib/Frontend/CompilerInvocation.cpp +++ lib/Frontend/CompilerInvocation.cpp @@ -690,7 +690,9 @@ Args.hasArg(OPT_cl_unsafe_math_optimizations) || Args.hasArg(OPT_cl_fast_relaxed_math)); Opts.Reassociate = Args.hasArg(OPT_mreassociate); - Opts.FlushDenorm = Args.hasArg(OPT_cl_denorms_are_zero); + Opts.FlushDenorm = Args.hasArg(OPT_cl_denorms_are_zero) || + (Args.hasArg(OPT_fcuda_is_device) && + Args.hasArg(OPT_fcuda_flush_denormals_to_zero)); Opts.CorrectlyRoundedDivSqrt = Args.hasArg(OPT_cl_fp32_correctly_rounded_divide_sqrt); Opts.UniformWGSize = Index: test/CodeGenCUDA/flush-denormals.cu === --- test/CodeGenCUDA/flush-denormals.cu +++ test/CodeGenCUDA/flush-denormals.cu @@ -5,18 +5,33 @@ // RUN: -triple nvptx-nvidia-cuda -emit-llvm -o - %s | \ // RUN: FileCheck %s -check-prefix CHECK -check-prefix FTZ +// RUN: %clang_cc1 -fcuda-is-device -x hip \ +// RUN: -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm -o - %s | \ +// RUN: FileCheck %s -check-prefix CHECK -check-prefix AMDNOFTZ +// RUN: %clang_cc1 -fcuda-is-device -x hip -fcuda-flush-denormals-to-zero \ +// RUN: -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm -o - %s | \ +// RUN: FileCheck %s -check-prefix CHECK -check-prefix AMDFTZ + #include "Inputs/cuda.h" // Checks that device function calls get emitted with the "ntpvx-f32ftz" // attribute set to "true" when we compile CUDA device code with // -fcuda-flush-denormals-to-zero. Further, check that we reflect the presence // or absence of -fcuda-flush-denormals-to-zero in a module flag. +// AMDGCN targets always have +fp64-fp16-denormals. +// AMDGCN targets without fast FMAF (e.g. gfx803) always have +fp32-denormals. +// For AMDGCN target with fast FMAF (e.g. gfx900), it has +fp32-denormals +// by default and -fp32-denormals when there is option +// -fcuda-flush-denormals-to-zero. + // CHECK-LABEL: define void @foo() #0 extern "C" __device__ void foo() {} // FTZ: attributes #0 = {{.*}} "nvptx-f32ftz"="true" // NOFTZ-NOT: attributes #0 = {{.*}} "nvptx-f32ftz" +// AMDNOFTZ: attributes #0 = {{.*}}+fp32-denormals{{.*}}+fp64-fp16-denormals +// AMDFTZ: attributes #0 = {{.*}}+fp64-fp16-denormals{{.*}}-fp32-denormals // FTZ:!llvm.module.flags = !{{{.*}}[[MODFLAG:![0-9]+]]} // FTZ:[[MODFLAG]] = !{i32 4, !"nvvm-reflect-ftz", i32 1} Index: lib/Frontend/CompilerInvocation.cpp === --- lib/Frontend/CompilerInvocation.cpp +++ lib/Frontend/CompilerInvocation.cpp @@ -690,7 +690,9 @@ Args.hasArg(OPT_cl_unsafe_math_optimizations) || Args.hasArg(OPT_cl_fast_relaxed_math));
[PATCH] D48287: [HIP] Support -fcuda-flush-denormals-to-zero for amdgcn
yaxunl created this revision. yaxunl added reviewers: b-sumner, tra. yaxunl added a reviewer: scchan. https://reviews.llvm.org/D48287 Files: lib/Frontend/CompilerInvocation.cpp test/CodeGenCUDA/flush-denormals.cu Index: test/CodeGenCUDA/flush-denormals.cu === --- test/CodeGenCUDA/flush-denormals.cu +++ test/CodeGenCUDA/flush-denormals.cu @@ -5,6 +5,13 @@ // RUN: -triple nvptx-nvidia-cuda -emit-llvm -o - %s | \ // RUN: FileCheck %s -check-prefix CHECK -check-prefix FTZ +// RUN: %clang_cc1 -fcuda-is-device -x hip \ +// RUN: -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm -o - %s | \ +// RUN: FileCheck %s -check-prefix CHECK -check-prefix AMDNOFTZ +// RUN: %clang_cc1 -fcuda-is-device -x hip -fcuda-flush-denormals-to-zero \ +// RUN: -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm -o - %s | \ +// RUN: FileCheck %s -check-prefix CHECK -check-prefix AMDFTZ + #include "Inputs/cuda.h" // Checks that device function calls get emitted with the "ntpvx-f32ftz" @@ -12,11 +19,17 @@ // -fcuda-flush-denormals-to-zero. Further, check that we reflect the presence // or absence of -fcuda-flush-denormals-to-zero in a module flag. +// AMDGCN targets always have +fp64-fp16-denormals. +// For AMDGCN target with fast FMAF (e.g. gfx900), it has +fp32-denormals +// by default and -fp32-denormals when there is option -fcuda-is-device. + // CHECK-LABEL: define void @foo() #0 extern "C" __device__ void foo() {} // FTZ: attributes #0 = {{.*}} "nvptx-f32ftz"="true" // NOFTZ-NOT: attributes #0 = {{.*}} "nvptx-f32ftz" +// AMDNOFTZ: attributes #0 = {{.*}}+fp32-denormals{{.*}}+fp64-fp16-denormals +// AMDFTZ: attributes #0 = {{.*}}+fp64-fp16-denormals{{.*}}-fp32-denormals // FTZ:!llvm.module.flags = !{{{.*}}[[MODFLAG:![0-9]+]]} // FTZ:[[MODFLAG]] = !{i32 4, !"nvvm-reflect-ftz", i32 1} Index: lib/Frontend/CompilerInvocation.cpp === --- lib/Frontend/CompilerInvocation.cpp +++ lib/Frontend/CompilerInvocation.cpp @@ -690,7 +690,9 @@ Args.hasArg(OPT_cl_unsafe_math_optimizations) || Args.hasArg(OPT_cl_fast_relaxed_math)); Opts.Reassociate = Args.hasArg(OPT_mreassociate); - Opts.FlushDenorm = Args.hasArg(OPT_cl_denorms_are_zero); + Opts.FlushDenorm = Args.hasArg(OPT_cl_denorms_are_zero) || + (Args.hasArg(OPT_fcuda_is_device) && + Args.hasArg(OPT_fcuda_flush_denormals_to_zero)); Opts.CorrectlyRoundedDivSqrt = Args.hasArg(OPT_cl_fp32_correctly_rounded_divide_sqrt); Opts.UniformWGSize = Index: test/CodeGenCUDA/flush-denormals.cu === --- test/CodeGenCUDA/flush-denormals.cu +++ test/CodeGenCUDA/flush-denormals.cu @@ -5,6 +5,13 @@ // RUN: -triple nvptx-nvidia-cuda -emit-llvm -o - %s | \ // RUN: FileCheck %s -check-prefix CHECK -check-prefix FTZ +// RUN: %clang_cc1 -fcuda-is-device -x hip \ +// RUN: -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm -o - %s | \ +// RUN: FileCheck %s -check-prefix CHECK -check-prefix AMDNOFTZ +// RUN: %clang_cc1 -fcuda-is-device -x hip -fcuda-flush-denormals-to-zero \ +// RUN: -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm -o - %s | \ +// RUN: FileCheck %s -check-prefix CHECK -check-prefix AMDFTZ + #include "Inputs/cuda.h" // Checks that device function calls get emitted with the "ntpvx-f32ftz" @@ -12,11 +19,17 @@ // -fcuda-flush-denormals-to-zero. Further, check that we reflect the presence // or absence of -fcuda-flush-denormals-to-zero in a module flag. +// AMDGCN targets always have +fp64-fp16-denormals. +// For AMDGCN target with fast FMAF (e.g. gfx900), it has +fp32-denormals +// by default and -fp32-denormals when there is option -fcuda-is-device. + // CHECK-LABEL: define void @foo() #0 extern "C" __device__ void foo() {} // FTZ: attributes #0 = {{.*}} "nvptx-f32ftz"="true" // NOFTZ-NOT: attributes #0 = {{.*}} "nvptx-f32ftz" +// AMDNOFTZ: attributes #0 = {{.*}}+fp32-denormals{{.*}}+fp64-fp16-denormals +// AMDFTZ: attributes #0 = {{.*}}+fp64-fp16-denormals{{.*}}-fp32-denormals // FTZ:!llvm.module.flags = !{{{.*}}[[MODFLAG:![0-9]+]]} // FTZ:[[MODFLAG]] = !{i32 4, !"nvvm-reflect-ftz", i32 1} Index: lib/Frontend/CompilerInvocation.cpp === --- lib/Frontend/CompilerInvocation.cpp +++ lib/Frontend/CompilerInvocation.cpp @@ -690,7 +690,9 @@ Args.hasArg(OPT_cl_unsafe_math_optimizations) || Args.hasArg(OPT_cl_fast_relaxed_math)); Opts.Reassociate = Args.hasArg(OPT_mreassociate); - Opts.FlushDenorm = Args.hasArg(OPT_cl_denorms_are_zero); + Opts.FlushDenorm = Args.hasArg(OPT_cl_denorms_are_zero) || + (Args.hasArg(OPT_fcuda_is_device) && +