[PATCH] D80897: [OpenMP] Initial support for std::complex in target regions

2020-07-08 Thread Johannes Doerfert via Phabricator via cfe-commits
This revision was automatically updated to reflect the committed changes.
Closed by commit rGd999cbc98832: [OpenMP] Initial support for std::complex in 
target regions (authored by jdoerfert).

Changed prior to commit:
  https://reviews.llvm.org/D80897?vs=276053&id=276584#toc

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D80897/new/

https://reviews.llvm.org/D80897

Files:
  clang/lib/Headers/CMakeLists.txt
  clang/lib/Headers/__clang_cuda_complex_builtins.h
  clang/lib/Headers/__clang_cuda_math.h
  clang/lib/Headers/openmp_wrappers/complex
  clang/lib/Headers/openmp_wrappers/complex.h
  clang/test/Headers/Inputs/include/cmath
  clang/test/Headers/Inputs/include/complex
  clang/test/Headers/Inputs/include/cstdlib
  clang/test/Headers/nvptx_device_math_complex.c
  clang/test/Headers/nvptx_device_math_complex.cpp

Index: clang/test/Headers/nvptx_device_math_complex.cpp
===
--- /dev/null
+++ clang/test/Headers/nvptx_device_math_complex.cpp
@@ -0,0 +1,27 @@
+// REQUIRES: nvptx-registered-target
+// RUN: %clang_cc1 -verify -internal-isystem %S/Inputs/include -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -internal-isystem %S/Inputs/include -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -aux-triple powerpc64le-unknown-unknown -o - | FileCheck %s
+// expected-no-diagnostics
+
+#include <complex>
+
+// CHECK-DAG: define {{.*}} @__mulsc3
+// CHECK-DAG: define {{.*}} @__muldc3
+// CHECK-DAG: define {{.*}} @__divsc3
+// CHECK-DAG: define {{.*}} @__divdc3
+
+// CHECK-DAG: call float @__nv_scalbnf(
+void test_scmplx(std::complex<float> a) {
+#pragma omp target
+  {
+(void)(a * (a / a));
+  }
+}
+
+// CHECK-DAG: call double @__nv_scalbn(
+void test_dcmplx(std::complex<double> a) {
+#pragma omp target
+  {
+(void)(a * (a / a));
+  }
+}
Index: clang/test/Headers/nvptx_device_math_complex.c
===
--- clang/test/Headers/nvptx_device_math_complex.c
+++ clang/test/Headers/nvptx_device_math_complex.c
@@ -1,10 +1,22 @@
 // REQUIRES: nvptx-registered-target
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -internal-isystem %S/Inputs/include -fopenmp -x c -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -internal-isystem %S/Inputs/include -fopenmp -x c -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -aux-triple powerpc64le-unknown-unknown -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -internal-isystem %S/Inputs/include -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -internal-isystem %S/Inputs/include -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -aux-triple powerpc64le-unknown-unknown -o - | FileCheck %s
 // expected-no-diagnostics
 
-// CHECK-DAG: call { float, float } @__divsc3(
-// CHECK-DAG: call { float, float } @__mulsc3(
+#ifdef __cplusplus
+#include <complex>
+#else
+#include <complex.h>
+#endif
+
+// CHECK-DAG: define {{.*}} @__mulsc3
+// CHECK-DAG: define {{.*}} @__muldc3
+// CHECK-DAG: define {{.*}} @__divsc3
+// CHECK-DAG: define {{.*}} @__divdc3
+
+// CHECK-DAG: call float @__nv_scalbnf(
 void test_scmplx(float _Complex a) {
 #pragma omp target
   {
@@ -12,9 +24,7 @@
   }
 }
 
-
-// CHECK-DAG: call { double, double } @__divdc3(
-// CHECK-DAG: call { double, double } @__muldc3(
+// CHECK-DAG: call double @__nv_scalbn(
 void test_dcmplx(double _Complex a) {
 #pragma omp target
   {
Index: clang/test/Headers/Inputs/include/cstdlib
===
--- clang/test/Headers/Inputs/include/cstdlib
+++ clang/test/Headers/Inputs/include/cstdlib
@@ -24,4 +24,8 @@
 abs(long long __x) { return __builtin_llabs (__x); }
 
 float fabs(float __x) { return __builtin_fabs(__x); }
+
+float abs(float __x) { return fabs(__x); }

[PATCH] D80897: [OpenMP] Initial support for std::complex in target regions

2020-07-07 Thread Artem Belevich via Phabricator via cfe-commits
tra accepted this revision.
tra added a comment.

LGTM.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D80897/new/

https://reviews.llvm.org/D80897



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D80897: [OpenMP] Initial support for std::complex in target regions

2020-07-07 Thread Johannes Doerfert via Phabricator via cfe-commits
jdoerfert updated this revision to Diff 276053.
jdoerfert marked an inline comment as done.
jdoerfert added a comment.

Addressed comments


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D80897/new/

https://reviews.llvm.org/D80897

Files:
  clang/lib/Headers/CMakeLists.txt
  clang/lib/Headers/__clang_cuda_complex_builtins.h
  clang/lib/Headers/__clang_cuda_math.h
  clang/lib/Headers/openmp_wrappers/complex
  clang/lib/Headers/openmp_wrappers/complex.h
  clang/test/Headers/Inputs/include/cmath
  clang/test/Headers/Inputs/include/complex
  clang/test/Headers/Inputs/include/cstdlib
  clang/test/Headers/nvptx_device_math_complex.c
  clang/test/Headers/nvptx_device_math_complex.cpp

Index: clang/test/Headers/nvptx_device_math_complex.cpp
===
--- /dev/null
+++ clang/test/Headers/nvptx_device_math_complex.cpp
@@ -0,0 +1,27 @@
+// REQUIRES: nvptx-registered-target
+// RUN: %clang_cc1 -verify -internal-isystem %S/Inputs/include -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -internal-isystem %S/Inputs/include -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -aux-triple powerpc64le-unknown-unknown -o - | FileCheck %s
+// expected-no-diagnostics
+
+#include <complex>
+
+// CHECK-DAG: define {{.*}} @__mulsc3
+// CHECK-DAG: define {{.*}} @__muldc3
+// CHECK-DAG: define {{.*}} @__divsc3
+// CHECK-DAG: define {{.*}} @__divdc3
+
+// CHECK-DAG: call float @__nv_scalbnf(
+void test_scmplx(std::complex<float> a) {
+#pragma omp target
+  {
+(void)(a * (a / a));
+  }
+}
+
+// CHECK-DAG: call double @__nv_scalbn(
+void test_dcmplx(std::complex<double> a) {
+#pragma omp target
+  {
+(void)(a * (a / a));
+  }
+}
Index: clang/test/Headers/nvptx_device_math_complex.c
===
--- clang/test/Headers/nvptx_device_math_complex.c
+++ clang/test/Headers/nvptx_device_math_complex.c
@@ -1,10 +1,22 @@
 // REQUIRES: nvptx-registered-target
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -internal-isystem %S/Inputs/include -fopenmp -x c -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -internal-isystem %S/Inputs/include -fopenmp -x c -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -aux-triple powerpc64le-unknown-unknown -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -internal-isystem %S/Inputs/include -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -internal-isystem %S/Inputs/include -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -aux-triple powerpc64le-unknown-unknown -o - | FileCheck %s
 // expected-no-diagnostics
 
-// CHECK-DAG: call { float, float } @__divsc3(
-// CHECK-DAG: call { float, float } @__mulsc3(
+#ifdef __cplusplus
+#include <complex>
+#else
+#include <complex.h>
+#endif
+
+// CHECK-DAG: define {{.*}} @__mulsc3
+// CHECK-DAG: define {{.*}} @__muldc3
+// CHECK-DAG: define {{.*}} @__divsc3
+// CHECK-DAG: define {{.*}} @__divdc3
+
+// CHECK-DAG: call float @__nv_scalbnf(
 void test_scmplx(float _Complex a) {
 #pragma omp target
   {
@@ -12,9 +24,7 @@
   }
 }
 
-
-// CHECK-DAG: call { double, double } @__divdc3(
-// CHECK-DAG: call { double, double } @__muldc3(
+// CHECK-DAG: call double @__nv_scalbn(
 void test_dcmplx(double _Complex a) {
 #pragma omp target
   {
Index: clang/test/Headers/Inputs/include/cstdlib
===
--- clang/test/Headers/Inputs/include/cstdlib
+++ clang/test/Headers/Inputs/include/cstdlib
@@ -24,4 +24,8 @@
 abs(long long __x) { return __builtin_llabs (__x); }
 
 float fabs(float __x) { return __builtin_fabs(__x); }
+
+float abs(float __x) { return fabs(__x); }
+double abs(double __x) { return fabs(__x); }
+
 }
Index: clang/test/Headers/Inputs/include/complex

[PATCH] D80897: [OpenMP] Initial support for std::complex in target regions

2020-07-02 Thread Johannes Doerfert via Phabricator via cfe-commits
jdoerfert added a comment.

@JonChesterfield @hfinkel @tra ping

I would really like to land this before the release branches off to allow 
people to use complex in target regions.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D80897/new/

https://reviews.llvm.org/D80897



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D80897: [OpenMP] Initial support for std::complex in target regions

2020-07-02 Thread Jon Chesterfield via Phabricator via cfe-commits
JonChesterfield accepted this revision.
JonChesterfield added a comment.
This revision is now accepted and ready to land.

I think this change is good. The library story is a bit difficult, but 
fundamentally openmp needs a shim of some sort to map target math functions 
onto the libm of the underlying device.

For nvptx, that's the cuda library. Amdgcn has math functions and may need 
another shim to map them to libm.

include_next is nasty, but that's the existing pattern for some library headers.




Comment at: clang/test/Headers/Inputs/include/complex:10
+// Taken from libc++
+template <class _Tp>
+class complex {

Can we #include from libc++ instead? Needs some cmake to skip the test if the 
library is unavailable but spares duplicating this class


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D80897/new/

https://reviews.llvm.org/D80897



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D80897: [OpenMP] Initial support for std::complex in target regions

2020-06-24 Thread Johannes Doerfert via Phabricator via cfe-commits
jdoerfert added a comment.

@tra  After chatting with @hfinkel I know now why we don't see the calls in the 
libc++ case. libc++ implements std::complex without `_Complex` types, stdlib++ 
does. If the user uses `_Complex` directly we need these functions for sure as 
the standard defines them (I think): https://godbolt.org/z/jcXgnH

So we need them and I would like to reuse them in the OpenMP offload path :)


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D80897/new/

https://reviews.llvm.org/D80897



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D80897: [OpenMP] Initial support for std::complex in target regions

2020-06-03 Thread Johannes Doerfert via Phabricator via cfe-commits
jdoerfert marked an inline comment as done.
jdoerfert added inline comments.



Comment at: clang/lib/Headers/__clang_cuda_complex_builtins.h:42
+#define _LOGBf _LOGBd
+#else
+#define _ISNANd isnan

This will actually not work right now as we do not overload 
isinf/isnan/isfinite properly in C++ mode. I first have to find a solution for 
that mess.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D80897/new/

https://reviews.llvm.org/D80897



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D80897: [OpenMP] Initial support for std::complex in target regions

2020-06-02 Thread Johannes Doerfert via Phabricator via cfe-commits
jdoerfert marked an inline comment as done.
jdoerfert added a comment.

I tried to determine why we don't emit such calls for c++11 and stdc++ but I 
was not successful :( Tracking back from the emission lead to the generic 
expression codegen without any (obvious) check of the runtime library or std 
versions.




Comment at: clang/lib/Headers/__clang_cuda_complex_builtins.h:136-137
+  __d = _COPYSIGNf(_ISINFf(__d) ? 1 : 0, __d);
+  if (_ISNANf(__a))
+__a = _COPYSIGNf(0, __a);
+  if (_ISNANf(__b))

arsenm wrote:
> Why does this try to preserve the sign of a nan? They are meaningless
Idk [I only work here... ;)]

I guess the algorithm was once copied from libc++, unclear if the one in there 
is still the same, we could check.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D80897/new/

https://reviews.llvm.org/D80897



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D80897: [OpenMP] Initial support for std::complex in target regions

2020-06-01 Thread Artem Belevich via Phabricator via cfe-commits
tra added a comment.

In D80897#2066952 , @jdoerfert wrote:

> In D80897#2066723 , @tra wrote:
>
> > Hmm. I'm pretty sure tensorflow is using std::complex for various types. 
> > I'm surprised that we haven't seen these functions missing.
>
>
> Which functions and missing from where? In CUDA-mode we did provide 
> `__c3` already.


I mean the `__c3` functions added by the patch. I've tried with clang as it 
is now, before your patch.

> 
> 
>> Plain CUDA (e.g. https://godbolt.org/z/Us6oXC) code appears to have no 
>> references to `__mul*` or `__div*`, at least for optimized builds, but they 
>> do popup in unoptimized ones. Curiously enough, unoptimized code compiled 
>> with `-stdlib=libc++ --std=c++11` does not need the soft-float functions. 
>> That would explain why we don't see the build breaks.
> 
> Its not that simple, and tbh, I don't have the full picture yet. Plain 
> (clang) CUDA uses these functions (https://godbolt.org/z/dp_FY2), they just 
> disappear after inlining because of the linkage. If you however enable 
> `-fast-math` they are not used (https://godbolt.org/z/_N-STh). I couldn't run 
> with stdlib=libc++ locally and godbolt cuts off the output so I'm not sure if 
> they are used and inlined or not used.

I've checked it locally and verified that adding `--stdlib=libc++ -std=c++11` 
to your first example shows that `__*c3` functions do not appear in IR 
regardless of inlining or opt level.
I wonder what is that that libstdc++ does that makes those functions show up in 
IR. AFAICT, it's not invoked directly by the library, so it must be something 
clang has generated. Perhaps something should be fixed there.

>> These differences suggest that these changes may need to be more nuanced 
>> with regard to the standard c++ library version and, possibly, the C++ 
>> standard used.
>>  If possible, I would prefer to limit interference with the standard 
>> libraries only to the cases where it's necessary.
> 
> The way I understand this is that we can always provide correct weak versions 
> of `__c3` without any correctness issues. They will be stripped if they 
> are not needed anyway. That said, this patch should not modify the CUDA 
> behavior (except minor float vs double corrections in the `__c3` 
> methods). Could you elaborate what interference you expect?

One example would be if/when we grow a better libm support for GPUs. Granted, 
it's just few functions and we could just remove these instances then.
I agree that adding these functions now will probably not interfere with 
anything we have now -- they are device-side overloads and nobody calls them 
directly.
The suggestion was based on a general principle of minimizing the changes that 
overlap with the standard libraries -- there are quite a few versions out there 
and I can't predict what quirks of theirs I'm not aware of. I've been burned 
too many times by that to be wary.




Comment at: clang/lib/Headers/__clang_cuda_complex_builtins.h:63
+
+__DEVICE__ double _Complex __muldc3(double __a, double __b, double __c,
+double __d) {

jdoerfert wrote:
> tra wrote:
> > Soft-float library has bunch of other functions. 
> > https://gcc.gnu.org/onlinedocs/gccint/Soft-float-library-routines.html
> > 
> > I wonder why only the complex variants of the soft-float support functions 
> > are missing. 
> > Does it mean that x86 code also does rely on the library to do complex 
> > multiplication?
> > If x86 can do complex ops, why can't nvptx?
> > If x86 can't, would make sense to teach it?
> > I wonder why only the complex variants of the soft-float support functions 
> > are missing.
> 
> I would guess others are conceptually missing too, the question is if we need 
> them. I did grep the clang source for 7 non-complex soft-float support 
> functions from the different categories listed in the gcc docs, none was 
> found.
> 
> > Does it mean that x86 code also does rely on the library to do complex 
> > multiplication?
> 
> I think so, yes. Some system library will provide the implementation of 
> `__muldc3` for the slow path of a complex multiplication.
> 
> > If x86 can do complex ops, why can't nvptx?
> > If x86 can't, would make sense to teach it?
> 
> I think I don't understand this (and maybe the question above). What we do in 
> CUDA right now, and with this patch in OpenMP, is to provide the `__c3` 
> functions on the device. Usually they are in some system library that we just 
> not have on the device so we have to add them somehow. 
I'm OK with providing device-side equivalents of the host standard library.

What' I'm trying to figure out if why we don't need to do it in some cases.
In case whe we do rely on these functions, but don't have them, we have at 
least two choices -- provide the missing functions (this patch) or ensure we 
never need these functions 

[PATCH] D80897: [OpenMP] Initial support for std::complex in target regions

2020-06-01 Thread Matt Arsenault via Phabricator via cfe-commits
arsenm added inline comments.



Comment at: clang/lib/Headers/__clang_cuda_complex_builtins.h:136-137
+  __d = _COPYSIGNf(_ISINFf(__d) ? 1 : 0, __d);
+  if (_ISNANf(__a))
+__a = _COPYSIGNf(0, __a);
+  if (_ISNANf(__b))

Why does this try to preserve the sign of a nan? They are meaningless


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D80897/new/

https://reviews.llvm.org/D80897



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D80897: [OpenMP] Initial support for std::complex in target regions

2020-06-01 Thread Johannes Doerfert via Phabricator via cfe-commits
jdoerfert marked 2 inline comments as done.
jdoerfert added a comment.

In D80897#2066723 , @tra wrote:

> Hmm. I'm pretty sure tensorflow is using std::complex for various types. I'm 
> surprised that we haven't seen these functions missing.


Which functions and missing from where? In CUDA-mode we did provide `__c3` 
already.

> Plain CUDA (e.g. https://godbolt.org/z/Us6oXC) code appears to have no 
> references to `__mul*` or `__div*`, at least for optimized builds, but they 
> do popup in unoptimized ones. Curiously enough, unoptimized code compiled 
> with `-stdlib=libc++ --std=c++11` does not need the soft-float functions. 
> That would explain why we don't see the build breaks.

Its not that simple, and tbh, I don't have the full picture yet. Plain (clang) 
CUDA uses these functions (https://godbolt.org/z/dp_FY2), they just disappear 
after inlining because of the linkage. If you however enable `-fast-math` they 
are not used (https://godbolt.org/z/_N-STh). I couldn't run with stdlib=libc++ 
locally and godbolt cuts off the output so I'm not sure if they are used and 
inlined or not used.

> These differences suggest that these changes may need to be more nuanced with 
> regard to the standard c++ library version and, possibly, the C++ standard 
> used.
>  If possible, I would prefer to limit interference with the standard 
> libraries only to the cases where it's necessary.

The way I understand this is that we can always provide correct weak versions 
of `__c3` without any correctness issues. They will be stripped if they are 
not needed anyway. That said, this patch should not modify the CUDA behavior 
(except minor float vs double corrections in the `__c3` methods). Could you 
elaborate what interference you expect?




Comment at: clang/lib/Headers/__clang_cuda_complex_builtins.h:29
+#define _ISNANd std::isnan
+#define _ISNANf _ISNANd
+#define _ISINFd std::isinf

tra wrote:
> Nit: this creates impression that we fall back on `double` variant of the 
> function, while in reality we'll end up using `std::isnan`.
> Perhaps it would be better to use fully specialized function template name in 
> all these macros. It would also avoid potential issues if someone/somewhere 
adds other overloads. E.g. we may end up facing `std::complex<long double>` which 
may make overload resolution ambiguous in some cases. 
No problem. I'll just use std::NAME for all of them.



Comment at: clang/lib/Headers/__clang_cuda_complex_builtins.h:63
+
+__DEVICE__ double _Complex __muldc3(double __a, double __b, double __c,
+double __d) {

tra wrote:
> Soft-float library has bunch of other functions. 
> https://gcc.gnu.org/onlinedocs/gccint/Soft-float-library-routines.html
> 
> I wonder why only the complex variants of the soft-float support functions 
> are missing. 
> Does it mean that x86 code also does rely on the library to do complex 
> multiplication?
> If x86 can do complex ops, why can't nvptx?
> If x86 can't, would make sense to teach it?
> I wonder why only the complex variants of the soft-float support functions 
> are missing.

I would guess others are conceptually missing too, the question is if we need 
them. I did grep the clang source for 7 non-complex soft-float support 
functions from the different categories listed in the gcc docs, none was found.

> Does it mean that x86 code also does rely on the library to do complex 
> multiplication?

I think so, yes. Some system library will provide the implementation of 
`__muldc3` for the slow path of a complex multiplication.

> If x86 can do complex ops, why can't nvptx?
> If x86 can't, would make sense to teach it?

I think I don't understand this (and maybe the question above). What we do in 
CUDA right now, and with this patch in OpenMP, is to provide the `__c3` 
functions on the device. Usually they are in some system library that we just 
not have on the device so we have to add them somehow. 


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D80897/new/

https://reviews.llvm.org/D80897



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D80897: [OpenMP] Initial support for std::complex in target regions

2020-06-01 Thread Artem Belevich via Phabricator via cfe-commits
tra added a comment.

Hmm. I'm pretty sure tensorflow is using std::complex for various types. I'm 
surprised that we haven't seen these functions missing.
Plain CUDA (e.g. https://godbolt.org/z/Us6oXC) code appears to have no 
references to `__mul*` or `__div*`, at least for optimized builds, but they do 
popup in unoptimized ones. Curiously enough, unoptimized code compiled with 
`-stdlib=libc++ --std=c++11` does not need the soft-float functions. That would 
explain why we don't see the build breaks.

These differences suggest that these changes may need to be more nuanced with 
regard to the standard c++ library version and, possibly, the C++ standard used.
If possible, I would prefer to limit interference with the standard libraries 
only to the cases where it's necessary.




Comment at: clang/lib/Headers/__clang_cuda_complex_builtins.h:29
+#define _ISNANd std::isnan
+#define _ISNANf _ISNANd
+#define _ISINFd std::isinf

Nit: this creates impression that we fall back on `double` variant of the 
function, while in reality we'll end up using `std::isnan`.
Perhaps it would be better to use fully specialized function template name in 
all these macros. It would also avoid potential issues if someone/somewhere 
adds other overloads. E.g. we may end up facing `std::complex<long double>` which may 
make overload resolution ambiguous in some cases. 



Comment at: clang/lib/Headers/__clang_cuda_complex_builtins.h:63
+
+__DEVICE__ double _Complex __muldc3(double __a, double __b, double __c,
+double __d) {

Soft-float library has bunch of other functions. 
https://gcc.gnu.org/onlinedocs/gccint/Soft-float-library-routines.html

I wonder why only the complex variants of the soft-float support functions are 
missing. 
Does it mean that x86 code also does rely on the library to do complex 
multiplication?
If x86 can do complex ops, why can't nvptx?
If x86 can't, would make sense to teach it?


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D80897/new/

https://reviews.llvm.org/D80897



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D80897: [OpenMP] Initial support for std::complex in target regions

2020-05-31 Thread Johannes Doerfert via Phabricator via cfe-commits
jdoerfert updated this revision to Diff 267531.
jdoerfert added a comment.

Fix tests, add C support


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D80897/new/

https://reviews.llvm.org/D80897

Files:
  clang/lib/Headers/CMakeLists.txt
  clang/lib/Headers/__clang_cuda_complex_builtins.h
  clang/lib/Headers/__clang_cuda_math.h
  clang/lib/Headers/openmp_wrappers/complex
  clang/lib/Headers/openmp_wrappers/complex.h
  clang/test/Headers/Inputs/include/cmath
  clang/test/Headers/Inputs/include/complex
  clang/test/Headers/Inputs/include/cstdlib
  clang/test/Headers/nvptx_device_math_complex.c
  clang/test/Headers/nvptx_device_math_complex.cpp

Index: clang/test/Headers/nvptx_device_math_complex.cpp
===
--- /dev/null
+++ clang/test/Headers/nvptx_device_math_complex.cpp
@@ -0,0 +1,27 @@
+// REQUIRES: nvptx-registered-target
+// RUN: %clang_cc1 -verify -internal-isystem %S/Inputs/include -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -internal-isystem %S/Inputs/include -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -aux-triple powerpc64le-unknown-unknown -o - | FileCheck %s
+// expected-no-diagnostics
+
+#include <complex>
+
+// CHECK-DAG: define {{.*}} @__mulsc3
+// CHECK-DAG: define {{.*}} @__muldc3
+// CHECK-DAG: define {{.*}} @__divsc3
+// CHECK-DAG: define {{.*}} @__divdc3
+
+// CHECK-DAG: call float @__nv_scalbnf(
+void test_scmplx(std::complex<float> a) {
+#pragma omp target
+  {
+(void)(a * (a / a));
+  }
+}
+
+// CHECK-DAG: call double @__nv_scalbn(
+void test_dcmplx(std::complex<double> a) {
+#pragma omp target
+  {
+(void)(a * (a / a));
+  }
+}
Index: clang/test/Headers/nvptx_device_math_complex.c
===
--- clang/test/Headers/nvptx_device_math_complex.c
+++ clang/test/Headers/nvptx_device_math_complex.c
@@ -1,10 +1,22 @@
 // REQUIRES: nvptx-registered-target
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -internal-isystem %S/Inputs/include -fopenmp -x c -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -internal-isystem %S/Inputs/include -fopenmp -x c -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -aux-triple powerpc64le-unknown-unknown -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -internal-isystem %S/Inputs/include -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -internal-isystem %S/Inputs/include -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -aux-triple powerpc64le-unknown-unknown -o - | FileCheck %s
 // expected-no-diagnostics
 
-// CHECK-DAG: call { float, float } @__divsc3(
-// CHECK-DAG: call { float, float } @__mulsc3(
+#ifdef __cplusplus
+#include <complex>
+#else
+#include <complex.h>
+#endif
+
+// CHECK-DAG: define {{.*}} @__mulsc3
+// CHECK-DAG: define {{.*}} @__muldc3
+// CHECK-DAG: define {{.*}} @__divsc3
+// CHECK-DAG: define {{.*}} @__divdc3
+
+// CHECK-DAG: call float @__nv_scalbnf(
 void test_scmplx(float _Complex a) {
 #pragma omp target
   {
@@ -12,9 +24,7 @@
   }
 }
 
-
-// CHECK-DAG: call { double, double } @__divdc3(
-// CHECK-DAG: call { double, double } @__muldc3(
+// CHECK-DAG: call double @__nv_scalbn(
 void test_dcmplx(double _Complex a) {
 #pragma omp target
   {
Index: clang/test/Headers/Inputs/include/cstdlib
===
--- clang/test/Headers/Inputs/include/cstdlib
+++ clang/test/Headers/Inputs/include/cstdlib
@@ -24,4 +24,8 @@
 abs(long long __x) { return __builtin_llabs (__x); }
 
 float fabs(float __x) { return __builtin_fabs(__x); }
+
+float abs(float __x) { return fabs(__x); }
+double abs(double __x) { return fabs(__x); }
+
 }
Index: clang/test/Headers/Inputs/include/complex
===

[PATCH] D80897: [OpenMP] Initial support for std::complex in target regions

2020-05-31 Thread Johannes Doerfert via Phabricator via cfe-commits
jdoerfert created this revision.
jdoerfert added reviewers: tra, hfinkel, ABataev, JonChesterfield.
Herald added subscribers: sstefan1, guansong, bollu, yaxunl, mgorny.
Herald added a project: clang.

This simply follows the scheme we have for other wrappers. It resolves
the current link problem, e.g., `__muldc3 not found`, when std::complex
operations are used on a device.

In "CUDA mode" this should allow simple complex operations to work in
target regions. Normal mode doesn't work because the globalization in
the std::complex operators is somehow broken. This will most likely not
make complex math function calls work properly, e.g., sin, but
that is more complex (pun intended) anyway.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D80897

Files:
  clang/lib/Headers/CMakeLists.txt
  clang/lib/Headers/__clang_cuda_complex_builtins.h
  clang/lib/Headers/openmp_wrappers/complex
  clang/test/Headers/Inputs/include/complex
  clang/test/Headers/Inputs/include/cstdlib
  clang/test/Headers/Inputs/include/math.h
  clang/test/Headers/nvptx_device_math_complex.cpp

Index: clang/test/Headers/nvptx_device_math_complex.cpp
===
--- /dev/null
+++ clang/test/Headers/nvptx_device_math_complex.cpp
@@ -0,0 +1,25 @@
+// REQUIRES: nvptx-registered-target
+// RUN: %clang_cc1 -verify -internal-isystem %S/Inputs/include -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -internal-isystem %S/Inputs/include -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -aux-triple powerpc64le-unknown-unknown -o - | FileCheck %s
+// expected-no-diagnostics
+
+#include <complex>
+
+// CHECK-DAG: define {{.*}} @__mulsc3
+// CHECK-DAG: define {{.*}} @__muldc3
+// CHECK-DAG: define {{.*}} @__divsc3
+// CHECK-DAG: define {{.*}} @__divdc3
+
+void test_scmplx(std::complex<float> a) {
+#pragma omp target
+  {
+(void)(a * (a / a));
+  }
+}
+
+void test_dcmplx(std::complex<double> a) {
+#pragma omp target
+  {
+(void)(a * (a / a));
+  }
+}
Index: clang/test/Headers/Inputs/include/math.h
===
--- clang/test/Headers/Inputs/include/math.h
+++ clang/test/Headers/Inputs/include/math.h
@@ -107,6 +107,10 @@
 long lroundf(float __a);
 int max(int __a, int __b);
 int min(int __a, int __b);
+float max(float __a, float __b);
+float min(float __a, float __b);
+double max(double __a, double __b);
+double min(double __a, double __b);
 double modf(double __a, double *__b);
 float modff(float __a, float *__b);
 double nearbyint(double __a);
Index: clang/test/Headers/Inputs/include/cstdlib
===
--- clang/test/Headers/Inputs/include/cstdlib
+++ clang/test/Headers/Inputs/include/cstdlib
@@ -24,4 +24,8 @@
 abs(long long __x) { return __builtin_llabs (__x); }
 
 float fabs(float __x) { return __builtin_fabs(__x); }
+
+float abs(float __x) { return fabs(__x); }
+double abs(double __x) { return fabs(__x); }
+
 }
Index: clang/test/Headers/Inputs/include/complex
===
--- /dev/null
+++ clang/test/Headers/Inputs/include/complex
@@ -0,0 +1,301 @@
+#pragma once
+
+#include <cmath>
+
+#define INFINITY (__builtin_inff())
+
+namespace std {
+
+// Taken from libc++
+template <class _Tp>
+class complex {
+public:
+  typedef _Tp value_type;
+
+private:
+  value_type __re_;
+  value_type __im_;
+
+public:
+  complex(const value_type &__re = value_type(), const value_type &__im = value_type())
+  : __re_(__re), __im_(__im) {}
+  template <class _Xp>
+  complex(const complex<_Xp> &__c)
+  : __re_(__c.real()), __im_(__c.imag()) {}
+
+  value_type real() const { return __re_; }
+  value_type imag() const { return __im_; }
+
+  void real(value_type __re) { __re_ = __re; }
+  void imag(value_type __im) { __im_ = __im; }
+
+  complex &operator=(const value_type &__re) {
+    __re_ = __re;
+    __im_ = value_type();
+    return *this;
+  }
+  complex &operator+=(const value_type &__re) {
+    __re_ += __re;
+    return *this;
+  }
+  complex &operator-=(const value_type &__re) {
+    __re_ -= __re;
+    return *this;
+  }
+  complex &operator*=(const value_type &__re) {
+    __re_ *= __re;
+    __im_ *= __re;
+    return *this;
+  }
+  complex &operator/=(const value_type &__re) {
+    __re_ /= __re;
+    __im_ /= __re;
+    return *this;
+  }
+
+  template <class _Xp>
+  complex &operator=(const complex<_Xp> &__c) {
+    __re_ = __c.real();
+    __im_ = __c.imag();
+    return *this;
+  }
+  template <class _Xp>
+  complex &operator+=(const complex<_Xp> &__c) {
+    __re_ += __c.real();
+    __im_ += __c.imag();
+    return *this;
+  }
+  template 
+  complex =(const complex<_Xp> &__c) {
+__re_ -= __c.real();
+__im_ -=