[PATCH] D112504: [OpenMP] Wrap (v)printf in the new RT and use same handling for AMD

2021-11-08 Thread Johannes Doerfert via Phabricator via cfe-commits
jdoerfert closed this revision.
jdoerfert added a comment.

Committed as part of D112680 .


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D112504/new/

https://reviews.llvm.org/D112504

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D112504: [OpenMP] Wrap (v)printf in the new RT and use same handling for AMD

2021-10-27 Thread Jon Chesterfield via Phabricator via cfe-commits
JonChesterfield updated this revision to Diff 382747.
JonChesterfield added a comment.

-rebase on main


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D112504/new/

https://reviews.llvm.org/D112504

Files:
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/lib/CodeGen/CGGPUBuiltin.cpp
  openmp/libomptarget/DeviceRTL/include/Debug.h
  openmp/libomptarget/DeviceRTL/include/Interface.h
  openmp/libomptarget/DeviceRTL/src/Debug.cpp

Index: openmp/libomptarget/DeviceRTL/src/Debug.cpp
===
--- openmp/libomptarget/DeviceRTL/src/Debug.cpp
+++ openmp/libomptarget/DeviceRTL/src/Debug.cpp
@@ -35,6 +35,15 @@
  assertion);
   __builtin_trap();
 }
+
+// We do not have a vprintf implementation for AMD GPU yet so we use a stub.
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+int32_t vprintf(const char *, void *) { return 0; }
+#pragma omp end declare variant
+
+int32_t __llvm_omp_vprintf(const char *Format, void *Arguments) {
+  return vprintf(Format, Arguments);
+}
 }
 
 /// Current indentation level for the function trace. Only accessed by thread 0.
Index: openmp/libomptarget/DeviceRTL/include/Interface.h
===
--- openmp/libomptarget/DeviceRTL/include/Interface.h
+++ openmp/libomptarget/DeviceRTL/include/Interface.h
@@ -352,6 +352,9 @@
 int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size);
 int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size);
 ///}
+
+/// Printf
+int32_t __llvm_omp_vprintf(const char *Format, void *Arguments);
 }
 
 #endif
Index: openmp/libomptarget/DeviceRTL/include/Debug.h
===
--- openmp/libomptarget/DeviceRTL/include/Debug.h
+++ openmp/libomptarget/DeviceRTL/include/Debug.h
@@ -32,17 +32,12 @@
 /// macro.
 /// {
 
-#ifndef __AMDGCN__
 extern "C" {
 int printf(const char *format, ...);
 }
 
 #define PRINTF(fmt, ...) (void)printf(fmt, __VA_ARGS__);
 #define PRINT(str) PRINTF("%s", str)
-#else
-#define PRINTF(fmt, ...)
-#define PRINT(str)
-#endif
 
 ///}
 
Index: clang/lib/CodeGen/CGGPUBuiltin.cpp
===
--- clang/lib/CodeGen/CGGPUBuiltin.cpp
+++ clang/lib/CodeGen/CGGPUBuiltin.cpp
@@ -21,24 +21,30 @@
 using namespace clang;
 using namespace CodeGen;
 
-static llvm::Function *GetVprintfDeclaration(llvm::Module &M) {
+static llvm::Function *GetVprintfDeclaration(CodeGenModule &CGM) {
+  bool UsesNewOpenMPDeviceRuntime = CGM.getLangOpts().OpenMPIsDevice &&
+CGM.getLangOpts().OpenMPTargetNewRuntime;
+  const char *Name =
+  UsesNewOpenMPDeviceRuntime ? "__llvm_omp_vprintf" : "vprintf";
+  llvm::Module &M = CGM.getModule();
   llvm::Type *ArgTypes[] = {llvm::Type::getInt8PtrTy(M.getContext()),
 llvm::Type::getInt8PtrTy(M.getContext())};
   llvm::FunctionType *VprintfFuncType = llvm::FunctionType::get(
   llvm::Type::getInt32Ty(M.getContext()), ArgTypes, false);
 
-  if (auto* F = M.getFunction("vprintf")) {
+  if (auto *F = M.getFunction(Name)) {
 // Our CUDA system header declares vprintf with the right signature, so
 // nobody else should have been able to declare vprintf with a bogus
-// signature.
+// signature. The OpenMP device runtime provides a wrapper around vprintf
+// which we use here. The signature should match though.
 assert(F->getFunctionType() == VprintfFuncType);
 return F;
   }
 
-  // vprintf doesn't already exist; create a declaration and insert it into the
-  // module.
+  // vprintf, or for OpenMP device offloading the vprintf wrapper, doesn't
+  // already exist; create a declaration and insert it into the module.
   return llvm::Function::Create(
-  VprintfFuncType, llvm::GlobalVariable::ExternalLinkage, "vprintf", &M);
+  VprintfFuncType, llvm::GlobalVariable::ExternalLinkage, Name, &M);
 }
 
 // Transforms a call to printf into a call to the NVPTX vprintf syscall (which
@@ -117,7 +123,7 @@
   }
 
   // Invoke vprintf and return.
-  llvm::Function* VprintfFunc = GetVprintfDeclaration(CGM.getModule());
+  llvm::Function *VprintfFunc = GetVprintfDeclaration(CGM);
   return RValue::get(Builder.CreateCall(
   VprintfFunc, {Args[0].getRValue(*this).getScalarVal(), BufferPtr}));
 }
@@ -130,6 +136,12 @@
  E->getBuiltinCallee() == Builtin::BI__builtin_printf);
   assert(E->getNumArgs() >= 1); // printf always has at least one arg.
 
+  // For OpenMP target offloading we go with a modified nvptx printf method.
+  // Basically creating calls to __llvm_omp_vprintf with the arguments and
+  // dealing with the details in the device runtime itself.
+  if (getLangOpts().OpenMPIsDevice && getLangOpts().OpenMPTargetNewRuntime)
+return EmitNVPTXDevicePrintfCallExpr(E, ReturnValue);
+
   CallArgList CallArgs;
   EmitCallArgs(CallArg

[PATCH] D112504: [OpenMP] Wrap (v)printf in the new RT and use same handling for AMD

2021-10-27 Thread Jon Chesterfield via Phabricator via cfe-commits
JonChesterfield added a comment.

This doesn't apply against main, diff relative to something that isn't in main


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D112504/new/

https://reviews.llvm.org/D112504

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D112504: [OpenMP] Wrap (v)printf in the new RT and use same handling for AMD

2021-10-26 Thread Jon Chesterfield via Phabricator via cfe-commits
JonChesterfield added a comment.

Nice! Yep, can add a size argument later. Will want it to control copying the 
payload over to the host


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D112504/new/

https://reviews.llvm.org/D112504

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D112504: [OpenMP] Wrap (v)printf in the new RT and use same handling for AMD

2021-10-26 Thread Johannes Doerfert via Phabricator via cfe-commits
jdoerfert added inline comments.



Comment at: clang/lib/CodeGen/CGGPUBuiltin.cpp:128
   return RValue::get(Builder.CreateCall(
   VprintfFunc, {Args[0].getRValue(*this).getScalarVal(), BufferPtr}));
 }

>>! In D112504#3086474, @JonChesterfield wrote:
> That's an interesting approach.
> 
> Do you happen to know where I can find details of the data format behind that 
> void*? Have been meaning to look at writing printf for amdgpu as host side 
> decoding of that buffer. If the compiler knows how long it is, that would be 
> a useful third argument.

We actually do know. Above we allocate and fill the buffer. For the OpenMP 
wrapper you could easily add a third argument later in order to facilitate an 
OpenMP runtime printf impl. I would even like it to be target agnostic (e.g., 
replace the default CUDA route on request). That said, we should tackle that 
separately, wdyt?


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D112504/new/

https://reviews.llvm.org/D112504

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D112504: [OpenMP] Wrap (v)printf in the new RT and use same handling for AMD

2021-10-25 Thread Jon Chesterfield via Phabricator via cfe-commits
JonChesterfield added a comment.

That's an interesting approach.

Do you happen to know where I can find details of the data format behind that 
void*? Have been meaning to look at writing printf for amdgpu as host side 
decoding of that buffer. If the compiler knows how long it is, that would be a 
useful third argument.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D112504/new/

https://reviews.llvm.org/D112504

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D112504: [OpenMP] Wrap (v)printf in the new RT and use same handling for AMD

2021-10-25 Thread Johannes Doerfert via Phabricator via cfe-commits
jdoerfert updated this revision to Diff 382171.
jdoerfert added a comment.

Actually use the new wrapper for OpenMP offload targeting AMD (and the new RT)


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D112504/new/

https://reviews.llvm.org/D112504

Files:
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/lib/CodeGen/CGGPUBuiltin.cpp
  openmp/libomptarget/DeviceRTL/include/Debug.h
  openmp/libomptarget/DeviceRTL/include/Interface.h
  openmp/libomptarget/DeviceRTL/src/Debug.cpp

Index: openmp/libomptarget/DeviceRTL/src/Debug.cpp
===
--- openmp/libomptarget/DeviceRTL/src/Debug.cpp
+++ openmp/libomptarget/DeviceRTL/src/Debug.cpp
@@ -38,6 +38,15 @@
  assertion);
   __builtin_trap();
 }
+
+// We do not have a vprintf implementation for AMD GPU yet so we use a stub.
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+int32_t vprintf(const char *, void *) { return 0; }
+#pragma omp end declare variant
+
+int32_t __llvm_omp_vprintf(const char *Format, void *Arguments) {
+  return vprintf(Format, Arguments);
+}
 }
 
 /// Current indentation level for the function trace. Only accessed by thread 0.
Index: openmp/libomptarget/DeviceRTL/include/Interface.h
===
--- openmp/libomptarget/DeviceRTL/include/Interface.h
+++ openmp/libomptarget/DeviceRTL/include/Interface.h
@@ -340,6 +340,9 @@
 int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size);
 int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size);
 ///}
+
+/// Printf
+int32_t __llvm_omp_vprintf(const char *Format, void *Arguments);
 }
 
 #endif
Index: openmp/libomptarget/DeviceRTL/include/Debug.h
===
--- openmp/libomptarget/DeviceRTL/include/Debug.h
+++ openmp/libomptarget/DeviceRTL/include/Debug.h
@@ -46,17 +46,12 @@
 /// macro.
 /// {
 
-#ifndef __AMDGCN__
 extern "C" {
 int printf(const char *format, ...);
 }
 
 #define PRINTF(fmt, ...) (void)printf(fmt, __VA_ARGS__)
 #define PRINT(str) PRINTF("%s", str)
-#else
-#define PRINTF(fmt, ...)
-#define PRINT(str)
-#endif
 
 #define WARN(fmt, ...) PRINTF("WARNING: " #fmt, __VA_ARGS__)
 
Index: clang/lib/CodeGen/CGGPUBuiltin.cpp
===
--- clang/lib/CodeGen/CGGPUBuiltin.cpp
+++ clang/lib/CodeGen/CGGPUBuiltin.cpp
@@ -21,24 +21,30 @@
 using namespace clang;
 using namespace CodeGen;
 
-static llvm::Function *GetVprintfDeclaration(llvm::Module &M) {
+static llvm::Function *GetVprintfDeclaration(CodeGenModule &CGM) {
+  bool UsesNewOpenMPDeviceRuntime = CGM.getLangOpts().OpenMPIsDevice &&
+CGM.getLangOpts().OpenMPTargetNewRuntime;
+  const char *Name =
+  UsesNewOpenMPDeviceRuntime ? "__llvm_omp_vprintf" : "vprintf";
+  llvm::Module &M = CGM.getModule();
   llvm::Type *ArgTypes[] = {llvm::Type::getInt8PtrTy(M.getContext()),
 llvm::Type::getInt8PtrTy(M.getContext())};
   llvm::FunctionType *VprintfFuncType = llvm::FunctionType::get(
   llvm::Type::getInt32Ty(M.getContext()), ArgTypes, false);
 
-  if (auto* F = M.getFunction("vprintf")) {
+  if (auto *F = M.getFunction(Name)) {
 // Our CUDA system header declares vprintf with the right signature, so
 // nobody else should have been able to declare vprintf with a bogus
-// signature.
+// signature. The OpenMP device runtime provides a wrapper around vprintf
+// which we use here. The signature should match though.
 assert(F->getFunctionType() == VprintfFuncType);
 return F;
   }
 
-  // vprintf doesn't already exist; create a declaration and insert it into the
-  // module.
+  // vprintf, or for OpenMP device offloading the vprintf wrapper, doesn't
+  // already exist; create a declaration and insert it into the module.
   return llvm::Function::Create(
-  VprintfFuncType, llvm::GlobalVariable::ExternalLinkage, "vprintf", &M);
+  VprintfFuncType, llvm::GlobalVariable::ExternalLinkage, Name, &M);
 }
 
 // Transforms a call to printf into a call to the NVPTX vprintf syscall (which
@@ -117,7 +123,7 @@
   }
 
   // Invoke vprintf and return.
-  llvm::Function* VprintfFunc = GetVprintfDeclaration(CGM.getModule());
+  llvm::Function *VprintfFunc = GetVprintfDeclaration(CGM);
   return RValue::get(Builder.CreateCall(
   VprintfFunc, {Args[0].getRValue(*this).getScalarVal(), BufferPtr}));
 }
@@ -130,6 +136,12 @@
  E->getBuiltinCallee() == Builtin::BI__builtin_printf);
   assert(E->getNumArgs() >= 1); // printf always has at least one arg.
 
+  // For OpenMP target offloading we go with a modified nvptx printf method.
+  // Basically creating calls to __llvm_omp_vprintf with the arguments and
+  // dealing with the details in the device runtime itself.
+  if (getLangOpts().OpenMPIsDevice && getLangOpts().OpenMPTargetNewRuntime)
+  

[PATCH] D112504: [OpenMP] Wrap (v)printf in the new RT and use same handling for AMD

2021-10-25 Thread Johannes Doerfert via Phabricator via cfe-commits
jdoerfert created this revision.
jdoerfert added reviewers: JonChesterfield, tianshilei1992, jhuber6.
Herald added subscribers: guansong, bollu, yaxunl.
jdoerfert requested review of this revision.
Herald added subscribers: cfe-commits, sstefan1.
Herald added projects: clang, OpenMP.

To support `printf` NVPTX and AMD targets are handled differently. The
latter performs host calls, which we don't really want in OpenMP, the
former translates `printf` calls to `vprintf` calls as the NVIDIA
runtime provides an implementation for the device of `vprintf`. This
patch unifies the AMD and NVPTX handling and emits for both calls to the
`vprintf` wrapper `__llvm_omp_vprintf` which we define in our new device
runtime. The main benefit of this wrapper is that we can more easily
control (and profile) the emission of `printf` calls in device code.

Note: Tests are coming.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D112504

Files:
  clang/lib/CodeGen/CGGPUBuiltin.cpp
  openmp/libomptarget/DeviceRTL/include/Debug.h
  openmp/libomptarget/DeviceRTL/include/Interface.h
  openmp/libomptarget/DeviceRTL/src/Debug.cpp

Index: openmp/libomptarget/DeviceRTL/src/Debug.cpp
===
--- openmp/libomptarget/DeviceRTL/src/Debug.cpp
+++ openmp/libomptarget/DeviceRTL/src/Debug.cpp
@@ -38,6 +38,15 @@
  assertion);
   __builtin_trap();
 }
+
+// We do not have a vprintf implementation for AMD GPU yet so we use a stub.
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+int32_t vprintf(const char *, void *) { return 0; }
+#pragma omp end declare variant
+
+int32_t __llvm_omp_vprintf(const char *Format, void *Arguments) {
+  return vprintf(Format, Arguments);
+}
 }
 
 /// Current indentation level for the function trace. Only accessed by thread 0.
Index: openmp/libomptarget/DeviceRTL/include/Interface.h
===
--- openmp/libomptarget/DeviceRTL/include/Interface.h
+++ openmp/libomptarget/DeviceRTL/include/Interface.h
@@ -340,6 +340,9 @@
 int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size);
 int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size);
 ///}
+
+/// Printf
+int32_t __llvm_omp_vprintf(const char *Format, void *Arguments);
 }
 
 #endif
Index: openmp/libomptarget/DeviceRTL/include/Debug.h
===
--- openmp/libomptarget/DeviceRTL/include/Debug.h
+++ openmp/libomptarget/DeviceRTL/include/Debug.h
@@ -46,17 +46,12 @@
 /// macro.
 /// {
 
-#ifndef __AMDGCN__
 extern "C" {
 int printf(const char *format, ...);
 }
 
 #define PRINTF(fmt, ...) (void)printf(fmt, __VA_ARGS__)
 #define PRINT(str) PRINTF("%s", str)
-#else
-#define PRINTF(fmt, ...)
-#define PRINT(str)
-#endif
 
 #define WARN(fmt, ...) PRINTF("WARNING: " #fmt, __VA_ARGS__)
 
Index: clang/lib/CodeGen/CGGPUBuiltin.cpp
===
--- clang/lib/CodeGen/CGGPUBuiltin.cpp
+++ clang/lib/CodeGen/CGGPUBuiltin.cpp
@@ -21,24 +21,30 @@
 using namespace clang;
 using namespace CodeGen;
 
-static llvm::Function *GetVprintfDeclaration(llvm::Module &M) {
+static llvm::Function *GetVprintfDeclaration(CodeGenModule &CGM) {
+  bool UsesNewOpenMPDeviceRuntime = CGM.getLangOpts().OpenMPIsDevice &&
+CGM.getLangOpts().OpenMPTargetNewRuntime;
+  const char *Name =
+  UsesNewOpenMPDeviceRuntime ? "__llvm_omp_vprintf" : "vprintf";
+  llvm::Module &M = CGM.getModule();
   llvm::Type *ArgTypes[] = {llvm::Type::getInt8PtrTy(M.getContext()),
 llvm::Type::getInt8PtrTy(M.getContext())};
   llvm::FunctionType *VprintfFuncType = llvm::FunctionType::get(
   llvm::Type::getInt32Ty(M.getContext()), ArgTypes, false);
 
-  if (auto* F = M.getFunction("vprintf")) {
+  if (auto *F = M.getFunction(Name)) {
 // Our CUDA system header declares vprintf with the right signature, so
 // nobody else should have been able to declare vprintf with a bogus
-// signature.
+// signature. The OpenMP device runtime provides a wrapper around vprintf
+// which we use here. The signature should match though.
 assert(F->getFunctionType() == VprintfFuncType);
 return F;
   }
 
-  // vprintf doesn't already exist; create a declaration and insert it into the
-  // module.
+  // vprintf, or for OpenMP device offloading the vprintf wrapper, doesn't
+  // already exist; create a declaration and insert it into the module.
   return llvm::Function::Create(
-  VprintfFuncType, llvm::GlobalVariable::ExternalLinkage, "vprintf", &M);
+  VprintfFuncType, llvm::GlobalVariable::ExternalLinkage, Name, &M);
 }
 
 // Transforms a call to printf into a call to the NVPTX vprintf syscall (which
@@ -117,7 +123,7 @@
   }
 
   // Invoke vprintf and return.
-  llvm::Function* VprintfFunc = GetVprintfDeclaration(CGM.getModule());
+  llvm::Function *Vpr