[PATCH] D48287: [HIP] Support -fcuda-flush-denormals-to-zero for amdgcn

2018-07-20 Thread Yaxun Liu via Phabricator via cfe-commits
This revision was automatically updated to reflect the committed changes.
Closed by commit rC337639: [HIP] Support -fcuda-flush-denormals-to-zero for 
amdgcn (authored by yaxunl, committed by ).

Repository:
  rC Clang

https://reviews.llvm.org/D48287

Files:
  include/clang/Basic/LangOptions.def
  lib/CodeGen/CGCall.cpp
  lib/CodeGen/CodeGenModule.cpp
  lib/Frontend/CompilerInvocation.cpp
  test/CodeGenCUDA/flush-denormals.cu

Index: include/clang/Basic/LangOptions.def
===
--- include/clang/Basic/LangOptions.def
+++ include/clang/Basic/LangOptions.def
@@ -209,7 +209,6 @@
 LANGOPT(CUDAIsDevice  , 1, 0, "compiling for CUDA device")
 LANGOPT(CUDAAllowVariadicFunctions, 1, 0, "allowing variadic functions in CUDA device code")
 LANGOPT(CUDAHostDeviceConstexpr, 1, 1, "treating unattributed constexpr functions as __host__ __device__")
-LANGOPT(CUDADeviceFlushDenormalsToZero, 1, 0, "flushing denormals to zero")
 LANGOPT(CUDADeviceApproxTranscendentals, 1, 0, "using approximate transcendental functions")
 LANGOPT(CUDARelocatableDeviceCode, 1, 0, "generate relocatable device code")
 
Index: test/CodeGenCUDA/flush-denormals.cu
===
--- test/CodeGenCUDA/flush-denormals.cu
+++ test/CodeGenCUDA/flush-denormals.cu
@@ -5,18 +5,33 @@
 // RUN:   -triple nvptx-nvidia-cuda -emit-llvm -o - %s | \
 // RUN:   FileCheck %s -check-prefix CHECK -check-prefix FTZ
 
+// RUN: %clang_cc1 -fcuda-is-device -x hip \
+// RUN:   -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm -o - %s | \
+// RUN:   FileCheck %s -check-prefix CHECK -check-prefix AMDNOFTZ
+// RUN: %clang_cc1 -fcuda-is-device -x hip -fcuda-flush-denormals-to-zero \
+// RUN:   -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm -o - %s | \
+// RUN:   FileCheck %s -check-prefix CHECK -check-prefix AMDFTZ
+
 #include "Inputs/cuda.h"
 
 // Checks that device function calls get emitted with the "ntpvx-f32ftz"
 // attribute set to "true" when we compile CUDA device code with
 // -fcuda-flush-denormals-to-zero.  Further, check that we reflect the presence
 // or absence of -fcuda-flush-denormals-to-zero in a module flag.
 
+// AMDGCN targets always have +fp64-fp16-denormals.
+// AMDGCN targets without fast FMAF (e.g. gfx803) always have +fp32-denormals.
+// For AMDGCN target with fast FMAF (e.g. gfx900), it has +fp32-denormals
+// by default and -fp32-denormals when there is option
+// -fcuda-flush-denormals-to-zero.
+
 // CHECK-LABEL: define void @foo() #0
 extern "C" __device__ void foo() {}
 
 // FTZ: attributes #0 = {{.*}} "nvptx-f32ftz"="true"
 // NOFTZ-NOT: attributes #0 = {{.*}} "nvptx-f32ftz"
+// AMDNOFTZ: attributes #0 = {{.*}}+fp32-denormals{{.*}}+fp64-fp16-denormals
+// AMDFTZ: attributes #0 = {{.*}}+fp64-fp16-denormals{{.*}}-fp32-denormals
 
 // FTZ:!llvm.module.flags = !{{{.*}}[[MODFLAG:![0-9]+]]}
 // FTZ:[[MODFLAG]] = !{i32 4, !"nvvm-reflect-ftz", i32 1}
Index: lib/Frontend/CompilerInvocation.cpp
===
--- lib/Frontend/CompilerInvocation.cpp
+++ lib/Frontend/CompilerInvocation.cpp
@@ -690,7 +690,9 @@
 Args.hasArg(OPT_cl_unsafe_math_optimizations) ||
 Args.hasArg(OPT_cl_fast_relaxed_math));
   Opts.Reassociate = Args.hasArg(OPT_mreassociate);
-  Opts.FlushDenorm = Args.hasArg(OPT_cl_denorms_are_zero);
+  Opts.FlushDenorm = Args.hasArg(OPT_cl_denorms_are_zero) ||
+ (Args.hasArg(OPT_fcuda_is_device) &&
+  Args.hasArg(OPT_fcuda_flush_denormals_to_zero));
   Opts.CorrectlyRoundedDivSqrt =
   Args.hasArg(OPT_cl_fp32_correctly_rounded_divide_sqrt);
   Opts.UniformWGSize =
@@ -2191,9 +2193,6 @@
   if (Args.hasArg(OPT_fno_cuda_host_device_constexpr))
 Opts.CUDAHostDeviceConstexpr = 0;
 
-  if (Opts.CUDAIsDevice && Args.hasArg(OPT_fcuda_flush_denormals_to_zero))
-Opts.CUDADeviceFlushDenormalsToZero = 1;
-
   if (Opts.CUDAIsDevice && Args.hasArg(OPT_fcuda_approx_transcendentals))
 Opts.CUDADeviceApproxTranscendentals = 1;
 
Index: lib/CodeGen/CodeGenModule.cpp
===
--- lib/CodeGen/CodeGenModule.cpp
+++ lib/CodeGen/CodeGenModule.cpp
@@ -526,7 +526,7 @@
 // floating point values to 0.  (This corresponds to its "__CUDA_FTZ"
 // property.)
 getModule().addModuleFlag(llvm::Module::Override, "nvvm-reflect-ftz",
-  LangOpts.CUDADeviceFlushDenormalsToZero ? 1 : 0);
+  CodeGenOpts.FlushDenorm ? 1 : 0);
   }
 
   // Emit OpenCL specific module metadata: OpenCL/SPIR version.
Index: lib/CodeGen/CGCall.cpp
===
--- lib/CodeGen/CGCall.cpp
+++ lib/CodeGen/CGCall.cpp
@@ -1800,7 +1800,7 @@
 FuncAttrs.addAttribute(llvm::Attribute::NoUnwind);
 
 // Respect -fcuda-flush-denormals-to-zero.
-if 

[PATCH] D48287: [HIP] Support -fcuda-flush-denormals-to-zero for amdgcn

2018-07-20 Thread Artem Belevich via Phabricator via cfe-commits
tra accepted this revision.
tra added a comment.
This revision is now accepted and ready to land.

Thank you. That should work.


https://reviews.llvm.org/D48287



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D48287: [HIP] Support -fcuda-flush-denormals-to-zero for amdgcn

2018-07-14 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl updated this revision to Diff 155580.
yaxunl added a comment.

Replace LangOpts.CUDADeviceFlushDenormalsToZero with CodeGenOpts.FlushDenorm.


https://reviews.llvm.org/D48287

Files:
  include/clang/Basic/LangOptions.def
  lib/CodeGen/CGCall.cpp
  lib/CodeGen/CodeGenModule.cpp
  lib/Frontend/CompilerInvocation.cpp
  test/CodeGenCUDA/flush-denormals.cu

Index: test/CodeGenCUDA/flush-denormals.cu
===
--- test/CodeGenCUDA/flush-denormals.cu
+++ test/CodeGenCUDA/flush-denormals.cu
@@ -5,18 +5,33 @@
 // RUN:   -triple nvptx-nvidia-cuda -emit-llvm -o - %s | \
 // RUN:   FileCheck %s -check-prefix CHECK -check-prefix FTZ
 
+// RUN: %clang_cc1 -fcuda-is-device -x hip \
+// RUN:   -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm -o - %s | \
+// RUN:   FileCheck %s -check-prefix CHECK -check-prefix AMDNOFTZ
+// RUN: %clang_cc1 -fcuda-is-device -x hip -fcuda-flush-denormals-to-zero \
+// RUN:   -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm -o - %s | \
+// RUN:   FileCheck %s -check-prefix CHECK -check-prefix AMDFTZ
+
 #include "Inputs/cuda.h"
 
 // Checks that device function calls get emitted with the "ntpvx-f32ftz"
 // attribute set to "true" when we compile CUDA device code with
 // -fcuda-flush-denormals-to-zero.  Further, check that we reflect the presence
 // or absence of -fcuda-flush-denormals-to-zero in a module flag.
 
+// AMDGCN targets always have +fp64-fp16-denormals.
+// AMDGCN targets without fast FMAF (e.g. gfx803) always have +fp32-denormals.
+// For AMDGCN target with fast FMAF (e.g. gfx900), it has +fp32-denormals
+// by default and -fp32-denormals when there is option
+// -fcuda-flush-denormals-to-zero.
+
 // CHECK-LABEL: define void @foo() #0
 extern "C" __device__ void foo() {}
 
 // FTZ: attributes #0 = {{.*}} "nvptx-f32ftz"="true"
 // NOFTZ-NOT: attributes #0 = {{.*}} "nvptx-f32ftz"
+// AMDNOFTZ: attributes #0 = {{.*}}+fp32-denormals{{.*}}+fp64-fp16-denormals
+// AMDFTZ: attributes #0 = {{.*}}+fp64-fp16-denormals{{.*}}-fp32-denormals
 
 // FTZ:!llvm.module.flags = !{{{.*}}[[MODFLAG:![0-9]+]]}
 // FTZ:[[MODFLAG]] = !{i32 4, !"nvvm-reflect-ftz", i32 1}
Index: lib/Frontend/CompilerInvocation.cpp
===
--- lib/Frontend/CompilerInvocation.cpp
+++ lib/Frontend/CompilerInvocation.cpp
@@ -690,7 +690,9 @@
 Args.hasArg(OPT_cl_unsafe_math_optimizations) ||
 Args.hasArg(OPT_cl_fast_relaxed_math));
   Opts.Reassociate = Args.hasArg(OPT_mreassociate);
-  Opts.FlushDenorm = Args.hasArg(OPT_cl_denorms_are_zero);
+  Opts.FlushDenorm = Args.hasArg(OPT_cl_denorms_are_zero) ||
+ (Args.hasArg(OPT_fcuda_is_device) &&
+  Args.hasArg(OPT_fcuda_flush_denormals_to_zero));
   Opts.CorrectlyRoundedDivSqrt =
   Args.hasArg(OPT_cl_fp32_correctly_rounded_divide_sqrt);
   Opts.UniformWGSize =
@@ -2186,9 +2188,6 @@
   if (Args.hasArg(OPT_fno_cuda_host_device_constexpr))
 Opts.CUDAHostDeviceConstexpr = 0;
 
-  if (Opts.CUDAIsDevice && Args.hasArg(OPT_fcuda_flush_denormals_to_zero))
-Opts.CUDADeviceFlushDenormalsToZero = 1;
-
   if (Opts.CUDAIsDevice && Args.hasArg(OPT_fcuda_approx_transcendentals))
 Opts.CUDADeviceApproxTranscendentals = 1;
 
Index: lib/CodeGen/CodeGenModule.cpp
===
--- lib/CodeGen/CodeGenModule.cpp
+++ lib/CodeGen/CodeGenModule.cpp
@@ -526,7 +526,7 @@
 // floating point values to 0.  (This corresponds to its "__CUDA_FTZ"
 // property.)
 getModule().addModuleFlag(llvm::Module::Override, "nvvm-reflect-ftz",
-  LangOpts.CUDADeviceFlushDenormalsToZero ? 1 : 0);
+  CodeGenOpts.FlushDenorm ? 1 : 0);
   }
 
   // Emit OpenCL specific module metadata: OpenCL/SPIR version.
Index: lib/CodeGen/CGCall.cpp
===
--- lib/CodeGen/CGCall.cpp
+++ lib/CodeGen/CGCall.cpp
@@ -1798,7 +1798,7 @@
 FuncAttrs.addAttribute(llvm::Attribute::NoUnwind);
 
 // Respect -fcuda-flush-denormals-to-zero.
-if (getLangOpts().CUDADeviceFlushDenormalsToZero)
+if (CodeGenOpts.FlushDenorm)
   FuncAttrs.addAttribute("nvptx-f32ftz", "true");
   }
 }
Index: include/clang/Basic/LangOptions.def
===
--- include/clang/Basic/LangOptions.def
+++ include/clang/Basic/LangOptions.def
@@ -209,7 +209,6 @@
 LANGOPT(CUDAIsDevice  , 1, 0, "compiling for CUDA device")
 LANGOPT(CUDAAllowVariadicFunctions, 1, 0, "allowing variadic functions in CUDA device code")
 LANGOPT(CUDAHostDeviceConstexpr, 1, 1, "treating unattributed constexpr functions as __host__ __device__")
-LANGOPT(CUDADeviceFlushDenormalsToZero, 1, 0, "flushing denormals to zero")
 LANGOPT(CUDADeviceApproxTranscendentals, 1, 0, "using approximate transcendental functions")
 

[PATCH] D48287: [HIP] Support -fcuda-flush-denormals-to-zero for amdgcn

2018-07-13 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl added a comment.

In https://reviews.llvm.org/D48287#1138262, @tra wrote:

> Using OpenCL's flag for the purpose adds a *third* way we handle denormals 
> flushing in clang. Now it would be HIP (which is CUDA-like) using OpenCL's 
> flag for denormals instead of CUDA's one.
>  You could change AMDGPUTargetInfo::adjustTargetOptions() to use 
> CGOpts.getLangOpts().CUDADeviceFlushDenormalsToZero instead. That would at 
> least make HIP and CUDA do the same thing.
>
> I think it would work better if we could coalesce 
> CUDADeviceFlushDenormalsToZero and CodeGenOpts.FlushDenorm and, maybe move 
> the flag to LangOpts , so we could use LangOpts.CUDAIsDevice.


Sorry for the delay.

CGOpts does not have member function getLangOpts(). It seems whereever we need 
to refer to LangOpts.CUDADeviceFlushDenormalsToZero, we can use 
Opts.FlushDenorm, but not true vice versa. Therefore if we want a unified 
option, Opts.FlushDenorm is a better choice.


https://reviews.llvm.org/D48287



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D48287: [HIP] Support -fcuda-flush-denormals-to-zero for amdgcn

2018-06-20 Thread Artem Belevich via Phabricator via cfe-commits
tra added a comment.

Using OpenCL's flag for the purpose adds a *third* way we handle denormals 
flushing in clang. Now it would be HIP (which is CUDA-like) using OpenCL's flag 
for denormals instead of CUDA's one.
You could change AMDGPUTargetInfo::adjustTargetOptions() to use 
CGOpts.getLangOpts().CUDADeviceFlushDenormalsToZero instead. That would at 
least make HIP and CUDA do the same thing.

I think it would work better if we could coalesce 
CUDADeviceFlushDenormalsToZero and CodeGenOpts.FlushDenorm and, maybe move the 
flag to LangOpts , so we could use LangOpts.CUDAIsDevice.


https://reviews.llvm.org/D48287



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D48287: [HIP] Support -fcuda-flush-denormals-to-zero for amdgcn

2018-06-18 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl updated this revision to Diff 151748.
yaxunl added a comment.

Correct comments in test.


https://reviews.llvm.org/D48287

Files:
  lib/Frontend/CompilerInvocation.cpp
  test/CodeGenCUDA/flush-denormals.cu


Index: test/CodeGenCUDA/flush-denormals.cu
===
--- test/CodeGenCUDA/flush-denormals.cu
+++ test/CodeGenCUDA/flush-denormals.cu
@@ -5,18 +5,33 @@
 // RUN:   -triple nvptx-nvidia-cuda -emit-llvm -o - %s | \
 // RUN:   FileCheck %s -check-prefix CHECK -check-prefix FTZ
 
+// RUN: %clang_cc1 -fcuda-is-device -x hip \
+// RUN:   -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm -o - %s | \
+// RUN:   FileCheck %s -check-prefix CHECK -check-prefix AMDNOFTZ
+// RUN: %clang_cc1 -fcuda-is-device -x hip -fcuda-flush-denormals-to-zero \
+// RUN:   -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm -o - %s | \
+// RUN:   FileCheck %s -check-prefix CHECK -check-prefix AMDFTZ
+
 #include "Inputs/cuda.h"
 
 // Checks that device function calls get emitted with the "ntpvx-f32ftz"
 // attribute set to "true" when we compile CUDA device code with
 // -fcuda-flush-denormals-to-zero.  Further, check that we reflect the presence
 // or absence of -fcuda-flush-denormals-to-zero in a module flag.
 
+// AMDGCN targets always have +fp64-fp16-denormals.
+// AMDGCN targets without fast FMAF (e.g. gfx803) always have +fp32-denormals.
+// For AMDGCN target with fast FMAF (e.g. gfx900), it has +fp32-denormals
+// by default and -fp32-denormals when there is option
+// -fcuda-flush-denormals-to-zero.
+
 // CHECK-LABEL: define void @foo() #0
 extern "C" __device__ void foo() {}
 
 // FTZ: attributes #0 = {{.*}} "nvptx-f32ftz"="true"
 // NOFTZ-NOT: attributes #0 = {{.*}} "nvptx-f32ftz"
+// AMDNOFTZ: attributes #0 = {{.*}}+fp32-denormals{{.*}}+fp64-fp16-denormals
+// AMDFTZ: attributes #0 = {{.*}}+fp64-fp16-denormals{{.*}}-fp32-denormals
 
 // FTZ:!llvm.module.flags = !{{{.*}}[[MODFLAG:![0-9]+]]}
 // FTZ:[[MODFLAG]] = !{i32 4, !"nvvm-reflect-ftz", i32 1}
Index: lib/Frontend/CompilerInvocation.cpp
===
--- lib/Frontend/CompilerInvocation.cpp
+++ lib/Frontend/CompilerInvocation.cpp
@@ -690,7 +690,9 @@
 Args.hasArg(OPT_cl_unsafe_math_optimizations) ||
 Args.hasArg(OPT_cl_fast_relaxed_math));
   Opts.Reassociate = Args.hasArg(OPT_mreassociate);
-  Opts.FlushDenorm = Args.hasArg(OPT_cl_denorms_are_zero);
+  Opts.FlushDenorm = Args.hasArg(OPT_cl_denorms_are_zero) ||
+ (Args.hasArg(OPT_fcuda_is_device) &&
+  Args.hasArg(OPT_fcuda_flush_denormals_to_zero));
   Opts.CorrectlyRoundedDivSqrt =
   Args.hasArg(OPT_cl_fp32_correctly_rounded_divide_sqrt);
   Opts.UniformWGSize =


Index: test/CodeGenCUDA/flush-denormals.cu
===
--- test/CodeGenCUDA/flush-denormals.cu
+++ test/CodeGenCUDA/flush-denormals.cu
@@ -5,18 +5,33 @@
 // RUN:   -triple nvptx-nvidia-cuda -emit-llvm -o - %s | \
 // RUN:   FileCheck %s -check-prefix CHECK -check-prefix FTZ
 
+// RUN: %clang_cc1 -fcuda-is-device -x hip \
+// RUN:   -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm -o - %s | \
+// RUN:   FileCheck %s -check-prefix CHECK -check-prefix AMDNOFTZ
+// RUN: %clang_cc1 -fcuda-is-device -x hip -fcuda-flush-denormals-to-zero \
+// RUN:   -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm -o - %s | \
+// RUN:   FileCheck %s -check-prefix CHECK -check-prefix AMDFTZ
+
 #include "Inputs/cuda.h"
 
 // Checks that device function calls get emitted with the "ntpvx-f32ftz"
 // attribute set to "true" when we compile CUDA device code with
 // -fcuda-flush-denormals-to-zero.  Further, check that we reflect the presence
 // or absence of -fcuda-flush-denormals-to-zero in a module flag.
 
+// AMDGCN targets always have +fp64-fp16-denormals.
+// AMDGCN targets without fast FMAF (e.g. gfx803) always have +fp32-denormals.
+// For AMDGCN target with fast FMAF (e.g. gfx900), it has +fp32-denormals
+// by default and -fp32-denormals when there is option
+// -fcuda-flush-denormals-to-zero.
+
 // CHECK-LABEL: define void @foo() #0
 extern "C" __device__ void foo() {}
 
 // FTZ: attributes #0 = {{.*}} "nvptx-f32ftz"="true"
 // NOFTZ-NOT: attributes #0 = {{.*}} "nvptx-f32ftz"
+// AMDNOFTZ: attributes #0 = {{.*}}+fp32-denormals{{.*}}+fp64-fp16-denormals
+// AMDFTZ: attributes #0 = {{.*}}+fp64-fp16-denormals{{.*}}-fp32-denormals
 
 // FTZ:!llvm.module.flags = !{{{.*}}[[MODFLAG:![0-9]+]]}
 // FTZ:[[MODFLAG]] = !{i32 4, !"nvvm-reflect-ftz", i32 1}
Index: lib/Frontend/CompilerInvocation.cpp
===
--- lib/Frontend/CompilerInvocation.cpp
+++ lib/Frontend/CompilerInvocation.cpp
@@ -690,7 +690,9 @@
 Args.hasArg(OPT_cl_unsafe_math_optimizations) ||
 Args.hasArg(OPT_cl_fast_relaxed_math));
  

[PATCH] D48287: [HIP] Support -fcuda-flush-denormals-to-zero for amdgcn

2018-06-18 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl created this revision.
yaxunl added reviewers: b-sumner, tra.
yaxunl added a reviewer: scchan.

https://reviews.llvm.org/D48287

Files:
  lib/Frontend/CompilerInvocation.cpp
  test/CodeGenCUDA/flush-denormals.cu


Index: test/CodeGenCUDA/flush-denormals.cu
===
--- test/CodeGenCUDA/flush-denormals.cu
+++ test/CodeGenCUDA/flush-denormals.cu
@@ -5,6 +5,13 @@
 // RUN:   -triple nvptx-nvidia-cuda -emit-llvm -o - %s | \
 // RUN:   FileCheck %s -check-prefix CHECK -check-prefix FTZ
 
+// RUN: %clang_cc1 -fcuda-is-device -x hip \
+// RUN:   -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm -o - %s | \
+// RUN:   FileCheck %s -check-prefix CHECK -check-prefix AMDNOFTZ
+// RUN: %clang_cc1 -fcuda-is-device -x hip -fcuda-flush-denormals-to-zero \
+// RUN:   -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm -o - %s | \
+// RUN:   FileCheck %s -check-prefix CHECK -check-prefix AMDFTZ
+
 #include "Inputs/cuda.h"
 
 // Checks that device function calls get emitted with the "ntpvx-f32ftz"
@@ -12,11 +19,17 @@
 // -fcuda-flush-denormals-to-zero.  Further, check that we reflect the presence
 // or absence of -fcuda-flush-denormals-to-zero in a module flag.
 
+// AMDGCN targets always have +fp64-fp16-denormals.
+// For AMDGCN target with fast FMAF (e.g. gfx900), it has +fp32-denormals
+// by default and -fp32-denormals when there is option -fcuda-is-device.
+
 // CHECK-LABEL: define void @foo() #0
 extern "C" __device__ void foo() {}
 
 // FTZ: attributes #0 = {{.*}} "nvptx-f32ftz"="true"
 // NOFTZ-NOT: attributes #0 = {{.*}} "nvptx-f32ftz"
+// AMDNOFTZ: attributes #0 = {{.*}}+fp32-denormals{{.*}}+fp64-fp16-denormals
+// AMDFTZ: attributes #0 = {{.*}}+fp64-fp16-denormals{{.*}}-fp32-denormals
 
 // FTZ:!llvm.module.flags = !{{{.*}}[[MODFLAG:![0-9]+]]}
 // FTZ:[[MODFLAG]] = !{i32 4, !"nvvm-reflect-ftz", i32 1}
Index: lib/Frontend/CompilerInvocation.cpp
===
--- lib/Frontend/CompilerInvocation.cpp
+++ lib/Frontend/CompilerInvocation.cpp
@@ -690,7 +690,9 @@
 Args.hasArg(OPT_cl_unsafe_math_optimizations) ||
 Args.hasArg(OPT_cl_fast_relaxed_math));
   Opts.Reassociate = Args.hasArg(OPT_mreassociate);
-  Opts.FlushDenorm = Args.hasArg(OPT_cl_denorms_are_zero);
+  Opts.FlushDenorm = Args.hasArg(OPT_cl_denorms_are_zero) ||
+ (Args.hasArg(OPT_fcuda_is_device) &&
+  Args.hasArg(OPT_fcuda_flush_denormals_to_zero));
   Opts.CorrectlyRoundedDivSqrt =
   Args.hasArg(OPT_cl_fp32_correctly_rounded_divide_sqrt);
   Opts.UniformWGSize =


Index: test/CodeGenCUDA/flush-denormals.cu
===
--- test/CodeGenCUDA/flush-denormals.cu
+++ test/CodeGenCUDA/flush-denormals.cu
@@ -5,6 +5,13 @@
 // RUN:   -triple nvptx-nvidia-cuda -emit-llvm -o - %s | \
 // RUN:   FileCheck %s -check-prefix CHECK -check-prefix FTZ
 
+// RUN: %clang_cc1 -fcuda-is-device -x hip \
+// RUN:   -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm -o - %s | \
+// RUN:   FileCheck %s -check-prefix CHECK -check-prefix AMDNOFTZ
+// RUN: %clang_cc1 -fcuda-is-device -x hip -fcuda-flush-denormals-to-zero \
+// RUN:   -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm -o - %s | \
+// RUN:   FileCheck %s -check-prefix CHECK -check-prefix AMDFTZ
+
 #include "Inputs/cuda.h"
 
 // Checks that device function calls get emitted with the "ntpvx-f32ftz"
@@ -12,11 +19,17 @@
 // -fcuda-flush-denormals-to-zero.  Further, check that we reflect the presence
 // or absence of -fcuda-flush-denormals-to-zero in a module flag.
 
+// AMDGCN targets always have +fp64-fp16-denormals.
+// For AMDGCN target with fast FMAF (e.g. gfx900), it has +fp32-denormals
+// by default and -fp32-denormals when there is option -fcuda-is-device.
+
 // CHECK-LABEL: define void @foo() #0
 extern "C" __device__ void foo() {}
 
 // FTZ: attributes #0 = {{.*}} "nvptx-f32ftz"="true"
 // NOFTZ-NOT: attributes #0 = {{.*}} "nvptx-f32ftz"
+// AMDNOFTZ: attributes #0 = {{.*}}+fp32-denormals{{.*}}+fp64-fp16-denormals
+// AMDFTZ: attributes #0 = {{.*}}+fp64-fp16-denormals{{.*}}-fp32-denormals
 
 // FTZ:!llvm.module.flags = !{{{.*}}[[MODFLAG:![0-9]+]]}
 // FTZ:[[MODFLAG]] = !{i32 4, !"nvvm-reflect-ftz", i32 1}
Index: lib/Frontend/CompilerInvocation.cpp
===
--- lib/Frontend/CompilerInvocation.cpp
+++ lib/Frontend/CompilerInvocation.cpp
@@ -690,7 +690,9 @@
 Args.hasArg(OPT_cl_unsafe_math_optimizations) ||
 Args.hasArg(OPT_cl_fast_relaxed_math));
   Opts.Reassociate = Args.hasArg(OPT_mreassociate);
-  Opts.FlushDenorm = Args.hasArg(OPT_cl_denorms_are_zero);
+  Opts.FlushDenorm = Args.hasArg(OPT_cl_denorms_are_zero) ||
+ (Args.hasArg(OPT_fcuda_is_device) &&
+