https://github.com/robertvirany created https://github.com/llvm/llvm-project/pull/205661
Lower CUDA device `sqrtf` through `__builtin_sqrtf` instead of the libdevice `__nv_sqrtf` wrapper. This lets the existing NVPTX lowering for `llvm.sqrt.f32` choose between `sqrt.rn.f32` by default and `sqrt.approx.f32` under `-fapprox-func`.

Fixes #131749.

Includes tests in clang/test/CodeGenCUDA/sqrtf-precise.cu

>From 9f91ec07ee572d62a10d277ec96eee7e391fd1f5 Mon Sep 17 00:00:00 2001
From: Robert Virany <[email protected]>
Date: Wed, 24 Jun 2026 15:49:37 -0400
Subject: [PATCH] [CUDA] Lower device sqrtf through builtin sqrt

Lower CUDA device sqrtf through __builtin_sqrtf instead of the libdevice __nv_sqrtf wrapper. This lets NVPTX select precise or approximate sqrt from the LLVM sqrt intrinsic and fast-math flags.

Fixes #131749
---
 clang/lib/Headers/__clang_cuda_math.h   |  2 +-
 clang/test/CodeGenCUDA/sqrtf-precise.cu | 27 +++++++++++++++++++++++++
 2 files changed, 28 insertions(+), 1 deletion(-)
 create mode 100644 clang/test/CodeGenCUDA/sqrtf-precise.cu

diff --git a/clang/lib/Headers/__clang_cuda_math.h b/clang/lib/Headers/__clang_cuda_math.h
index 44c6e9a4e48d1..cf361e6109194 100644
--- a/clang/lib/Headers/__clang_cuda_math.h
+++ b/clang/lib/Headers/__clang_cuda_math.h
@@ -315,7 +315,7 @@ __DEVICE__ float sinhf(float __a) { return __nv_sinhf(__a); }
 __DEVICE__ double sinpi(double __a) { return __nv_sinpi(__a); }
 __DEVICE__ float sinpif(float __a) { return __nv_sinpif(__a); }
 __DEVICE__ double sqrt(double __a) { return __nv_sqrt(__a); }
-__DEVICE__ float sqrtf(float __a) { return __nv_sqrtf(__a); }
+__DEVICE__ float sqrtf(float __a) { return __builtin_sqrtf(__a); }
 __DEVICE__ double tan(double __a) { return __nv_tan(__a); }
 __DEVICE__ float tanf(float __a) { return __nv_tanf(__a); }
 __DEVICE__ double tanh(double __a) { return __nv_tanh(__a); }
diff --git a/clang/test/CodeGenCUDA/sqrtf-precise.cu b/clang/test/CodeGenCUDA/sqrtf-precise.cu
new file mode 100644
index 0000000000000..5ba72c376b71a
--- /dev/null
+++ b/clang/test/CodeGenCUDA/sqrtf-precise.cu
@@ -0,0 +1,27 @@
+// REQUIRES: nvptx-registered-target
+
+// RUN: %clang_cc1 -fcuda-is-device -triple nvptx64-nvidia-cuda \
+// RUN: -target-cpu sm_75 -O2 -S -o - %s \
+// RUN: -include __clang_cuda_runtime_wrapper.h \
+// RUN: -internal-isystem %S/../../lib/Headers/cuda_wrappers \
+// RUN: -internal-isystem %S/../Headers/Inputs/include \
+// RUN: | FileCheck --check-prefix=RN %s
+
+// RUN: %clang_cc1 -fcuda-is-device -triple nvptx64-nvidia-cuda \
+// RUN: -target-cpu sm_75 -O2 -fapprox-func -S -o - %s \
+// RUN: -include __clang_cuda_runtime_wrapper.h \
+// RUN: -internal-isystem %S/../../lib/Headers/cuda_wrappers \
+// RUN: -internal-isystem %S/../Headers/Inputs/include \
+// RUN: | FileCheck --check-prefix=APPROX %s
+
+#include <math.h>
+
+// RN-LABEL: .func{{.*}}_Z1ff
+// RN: sqrt.rn.f32
+// RN-NOT: sqrt.approx.f32
+//
+// APPROX-LABEL: .func{{.*}}_Z1ff
+// APPROX: sqrt.approx.f32
+__device__ float f(float x) {
+  return sqrtf(x);
+}
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
