[PATCH] D57488: [CUDA] add support for the new kernel launch API in CUDA-9.2+.

2019-01-31 Thread Artem Belevich via Phabricator via cfe-commits
This revision was automatically updated to reflect the committed changes.
Closed by commit rC352799: [CUDA] add support for the new kernel launch API in 
CUDA-9.2+. (authored by tra, committed by ).

Changed prior to commit:
  https://reviews.llvm.org/D57488?vs=184592=184598#toc

Repository:
  rC Clang

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D57488/new/

https://reviews.llvm.org/D57488

Files:
  include/clang/Basic/DiagnosticSemaKinds.td
  include/clang/Sema/Sema.h
  lib/CodeGen/CGCUDANV.cpp
  lib/Headers/__clang_cuda_runtime_wrapper.h
  lib/Sema/SemaCUDA.cpp
  lib/Sema/SemaDecl.cpp
  test/CodeGenCUDA/Inputs/cuda.h
  test/CodeGenCUDA/device-stub.cu
  test/CodeGenCUDA/kernel-args-alignment.cu
  test/CodeGenCUDA/kernel-call.cu
  test/Driver/cuda-simple.cu
  test/SemaCUDA/Inputs/cuda.h
  test/SemaCUDA/config-type.cu
  unittests/ASTMatchers/ASTMatchersTest.h

Index: lib/Headers/__clang_cuda_runtime_wrapper.h
===
--- lib/Headers/__clang_cuda_runtime_wrapper.h
+++ lib/Headers/__clang_cuda_runtime_wrapper.h
@@ -426,5 +426,15 @@
 #pragma pop_macro("__USE_FAST_MATH__")
 #pragma pop_macro("__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__")
 
+// CUDA runtime uses this undocumented function to access kernel launch
+// configuration. The declaration is in crt/device_functions.h but that file
+// includes a lot of other stuff we don't want. Instead, we'll provide our own
+// declaration for it here.
+#if CUDA_VERSION >= 9020
+extern "C" unsigned __cudaPushCallConfiguration(dim3 gridDim, dim3 blockDim,
+size_t sharedMem = 0,
+void *stream = 0);
+#endif
+
 #endif // __CUDA__
 #endif // __CLANG_CUDA_RUNTIME_WRAPPER_H__
Index: lib/CodeGen/CGCUDANV.cpp
===
--- lib/CodeGen/CGCUDANV.cpp
+++ lib/CodeGen/CGCUDANV.cpp
@@ -15,6 +15,8 @@
 #include "CodeGenFunction.h"
 #include "CodeGenModule.h"
 #include "clang/AST/Decl.h"
+#include "clang/Basic/Cuda.h"
+#include "clang/CodeGen/CodeGenABITypes.h"
 #include "clang/CodeGen/ConstantInitBuilder.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constants.h"
@@ -102,7 +104,8 @@
 return DummyFunc;
   }
 
-  void emitDeviceStubBody(CodeGenFunction , FunctionArgList );
+  void emitDeviceStubBodyLegacy(CodeGenFunction , FunctionArgList );
+  void emitDeviceStubBodyNew(CodeGenFunction , FunctionArgList );
 
 public:
   CGNVCUDARuntime(CodeGenModule );
@@ -187,11 +190,110 @@
 void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction ,
  FunctionArgList ) {
   EmittedKernels.push_back(CGF.CurFn);
-  emitDeviceStubBody(CGF, Args);
+  if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
+ CudaFeature::CUDA_USES_NEW_LAUNCH))
+emitDeviceStubBodyNew(CGF, Args);
+  else
+emitDeviceStubBodyLegacy(CGF, Args);
+}
+
+// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local
+// array and kernels are launched using cudaLaunchKernel().
+void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction ,
+FunctionArgList ) {
+  // Build the shadow stack entry at the very start of the function.
+
+  // Calculate amount of space we will need for all arguments.  If we have no
+  // args, allocate a single pointer so we still have a valid pointer to the
+  // argument array that we can pass to runtime, even if it will be unused.
+  Address KernelArgs = CGF.CreateTempAlloca(
+  VoidPtrTy, CharUnits::fromQuantity(16), "kernel_args",
+  llvm::ConstantInt::get(SizeTy, std::max(1, Args.size(;
+  // Store pointers to the arguments in a locally allocated launch_args.
+  for (unsigned i = 0; i < Args.size(); ++i) {
+llvm::Value* VarPtr = CGF.GetAddrOfLocalVar(Args[i]).getPointer();
+llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, VoidPtrTy);
+CGF.Builder.CreateDefaultAlignedStore(
+VoidVarPtr, CGF.Builder.CreateConstGEP1_32(KernelArgs.getPointer(), i));
+  }
+
+  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
+
+  // Lookup cudaLaunchKernel function.
+  // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
+  //  void **args, size_t sharedMem,
+  //  cudaStream_t stream);
+  TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
+  DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
+  IdentifierInfo  =
+  CGM.getContext().Idents.get("cudaLaunchKernel");
+  FunctionDecl *cudaLaunchKernelFD = nullptr;
+  for (const auto  : DC->lookup()) {
+if (FunctionDecl *FD = dyn_cast(Result))
+  cudaLaunchKernelFD = FD;
+  }
+
+  if (cudaLaunchKernelFD == nullptr) {
+CGM.Error(CGF.CurFuncDecl->getLocation(),
+  "Can't find declaration for cudaLaunchKernel()");
+

[PATCH] D57488: [CUDA] add support for the new kernel launch API in CUDA-9.2+.

2019-01-31 Thread Artem Belevich via Phabricator via cfe-commits
tra updated this revision to Diff 184592.
tra added a comment.

Updated ASTMatchers unit test.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D57488/new/

https://reviews.llvm.org/D57488

Files:
  clang/include/clang/Basic/DiagnosticSemaKinds.td
  clang/include/clang/Sema/Sema.h
  clang/lib/CodeGen/CGCUDANV.cpp
  clang/lib/Headers/__clang_cuda_runtime_wrapper.h
  clang/lib/Sema/SemaCUDA.cpp
  clang/lib/Sema/SemaDecl.cpp
  clang/test/CodeGenCUDA/Inputs/cuda.h
  clang/test/CodeGenCUDA/device-stub.cu
  clang/test/CodeGenCUDA/kernel-args-alignment.cu
  clang/test/CodeGenCUDA/kernel-call.cu
  clang/test/Driver/cuda-simple.cu
  clang/test/SemaCUDA/Inputs/cuda.h
  clang/test/SemaCUDA/config-type.cu
  clang/unittests/ASTMatchers/ASTMatchersTest.h

Index: clang/unittests/ASTMatchers/ASTMatchersTest.h
===
--- clang/unittests/ASTMatchers/ASTMatchersTest.h
+++ clang/unittests/ASTMatchers/ASTMatchersTest.h
@@ -183,7 +183,9 @@
   "typedef struct cudaStream *cudaStream_t;"
   "int cudaConfigureCall(dim3 gridSize, dim3 blockSize,"
   "  size_t sharedSize = 0,"
-  "  cudaStream_t stream = 0);";
+  "  cudaStream_t stream = 0);"
+  "extern \"C\" unsigned __cudaPushCallConfiguration("
+  "dim3 gridDim, dim3 blockDim, size_t sharedMem = 0, void *stream = 0);";
 
   bool Found = false, DynamicFound = false;
   MatchFinder Finder;
Index: clang/test/SemaCUDA/config-type.cu
===
--- clang/test/SemaCUDA/config-type.cu
+++ clang/test/SemaCUDA/config-type.cu
@@ -1,3 +1,7 @@
-// RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -target-sdk-version=8.0 -fsyntax-only -verify=legacy-launch %s
+// RUN: %clang_cc1 -target-sdk-version=9.2 -fsyntax-only -verify=new-launch %s
 
-void cudaConfigureCall(unsigned gridSize, unsigned blockSize); // expected-error {{must have scalar return type}}
+// legacy-launch-error@+1 {{must have scalar return type}}
+void cudaConfigureCall(unsigned gridSize, unsigned blockSize);
+// new-launch-error@+1 {{must have scalar return type}}
+void __cudaPushCallConfiguration(unsigned gridSize, unsigned blockSize);
Index: clang/test/SemaCUDA/Inputs/cuda.h
===
--- clang/test/SemaCUDA/Inputs/cuda.h
+++ clang/test/SemaCUDA/Inputs/cuda.h
@@ -18,9 +18,17 @@
 };
 
 typedef struct cudaStream *cudaStream_t;
+typedef enum cudaError {} cudaError_t;
 
-int cudaConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0,
-  cudaStream_t stream = 0);
+extern "C" int cudaConfigureCall(dim3 gridSize, dim3 blockSize,
+ size_t sharedSize = 0,
+ cudaStream_t stream = 0);
+extern "C" int __cudaPushCallConfiguration(dim3 gridSize, dim3 blockSize,
+   size_t sharedSize = 0,
+   cudaStream_t stream = 0);
+extern "C" cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim,
+dim3 blockDim, void **args,
+size_t sharedMem, cudaStream_t stream);
 
 // Host- and device-side placement new overloads.
 void *operator new(__SIZE_TYPE__, void *p) { return p; }
Index: clang/test/Driver/cuda-simple.cu
===
--- clang/test/Driver/cuda-simple.cu
+++ clang/test/Driver/cuda-simple.cu
@@ -2,7 +2,7 @@
 // http://llvm.org/PR22936
 // RUN: %clang -nocudainc -nocudalib -Werror -fsyntax-only -c %s
 //
-// Verify that we pass -x cuda-cpp-output to compiler after 
+// Verify that we pass -x cuda-cpp-output to compiler after
 // preprocessing a CUDA file
 // RUN: %clang  -Werror -### -save-temps -c %s 2>&1 | FileCheck %s
 // CHECK: "-cc1"
@@ -14,7 +14,9 @@
 // Verify that compiler accepts CUDA syntax with "-x cuda-cpp-output".
 // RUN: %clang -Werror -fsyntax-only -x cuda-cpp-output -c %s
 
-int cudaConfigureCall(int, int);
+extern "C" int cudaConfigureCall(int, int);
+extern "C" int __cudaPushCallConfiguration(int, int);
+
 __attribute__((global)) void kernel() {}
 
 void func() {
Index: clang/test/CodeGenCUDA/kernel-call.cu
===
--- clang/test/CodeGenCUDA/kernel-call.cu
+++ clang/test/CodeGenCUDA/kernel-call.cu
@@ -1,5 +1,9 @@
-// RUN: %clang_cc1 -emit-llvm %s -o - | FileCheck %s --check-prefixes=CUDA,CHECK
-// RUN: %clang_cc1 -x hip -emit-llvm %s -o - | FileCheck %s --check-prefixes=HIP,CHECK
+// RUN: %clang_cc1 -target-sdk-version=8.0 -emit-llvm %s -o - \
+// RUN: | FileCheck %s --check-prefixes=CUDA-OLD,CHECK
+// RUN: %clang_cc1 -target-sdk-version=9.2  -emit-llvm %s -o - \
+// RUN: | FileCheck %s --check-prefixes=CUDA-NEW,CHECK
+// RUN: %clang_cc1 -x hip -emit-llvm %s -o - \
+// RUN: | 

[PATCH] D57488: [CUDA] add support for the new kernel launch API in CUDA-9.2+.

2019-01-31 Thread Artem Belevich via Phabricator via cfe-commits
tra added inline comments.



Comment at: clang/lib/CodeGen/CGCUDANV.cpp:239
+CGM.Error(CGF.CurFuncDecl->getLocation(),
+  "Can't find declaration for cudaLaunchKernel()"); // FIXME.
+return;

jlebar wrote:
> Unfixed FIXME?
Fixed the comment. :-)
There's not much we can do if we have no declaration for cudaLaunchKernel, so 
throwing the error here is the best we can do.



Comment at: clang/lib/CodeGen/CGCUDANV.cpp:260
+  /*isVarArg=*/false),
+  "__cudaPopCallConfiguration");
+

jlebar wrote:
> I see lots of references to `__cudaPushCallConfiguration`, but this is the 
> only reference I see to `__cudaPopCallConfiguration`.  Is this a typo?  Also 
> are we supposed to emit matching push and pop function calls?  Kind of weird 
> to do one without the other...
the `pop` part is indeed used only here. 
`Push` is something that takes user-specified parameters, so we get Sema to 
check them.
`Pop` is much simpler and does not have any direct user exposure, so we can 
just create and use it here.

As for matching, it is balanced. `Push` is called at the kernel launch site 
with the parameters of `<<<>>>` .`Pop` is done  in the host-side kernel stub 
where we retrieve those parameters and pass them to the CUDA runtime.

Essentially, push/pop are poor names for these functions are the nesting is 
never more than one level deep. We could've just stashed the arguments in a 
fixed buffer somewhere.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D57488/new/

https://reviews.llvm.org/D57488



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D57488: [CUDA] add support for the new kernel launch API in CUDA-9.2+.

2019-01-31 Thread Artem Belevich via Phabricator via cfe-commits
tra updated this revision to Diff 184543.
tra marked 8 inline comments as done.
tra edited the summary of this revision.
tra added a comment.

Addressed Justin's comments.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D57488/new/

https://reviews.llvm.org/D57488

Files:
  clang/include/clang/Basic/DiagnosticSemaKinds.td
  clang/include/clang/Sema/Sema.h
  clang/lib/CodeGen/CGCUDANV.cpp
  clang/lib/Headers/__clang_cuda_runtime_wrapper.h
  clang/lib/Sema/SemaCUDA.cpp
  clang/lib/Sema/SemaDecl.cpp
  clang/test/CodeGenCUDA/Inputs/cuda.h
  clang/test/CodeGenCUDA/device-stub.cu
  clang/test/CodeGenCUDA/kernel-args-alignment.cu
  clang/test/CodeGenCUDA/kernel-call.cu
  clang/test/Driver/cuda-simple.cu
  clang/test/SemaCUDA/Inputs/cuda.h
  clang/test/SemaCUDA/config-type.cu

Index: clang/test/SemaCUDA/config-type.cu
===
--- clang/test/SemaCUDA/config-type.cu
+++ clang/test/SemaCUDA/config-type.cu
@@ -1,3 +1,7 @@
-// RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fno-cuda-new-launch -fsyntax-only -verify=legacy-launch %s
+// RUN: %clang_cc1 -fcuda-new-launch -fsyntax-only -verify=new-launch %s
 
-void cudaConfigureCall(unsigned gridSize, unsigned blockSize); // expected-error {{must have scalar return type}}
+// legacy-launch-error@+1 {{must have scalar return type}}
+void cudaConfigureCall(unsigned gridSize, unsigned blockSize);
+// new-launch-error@+1 {{must have scalar return type}}
+void __cudaPushCallConfiguration(unsigned gridSize, unsigned blockSize);
Index: clang/test/SemaCUDA/Inputs/cuda.h
===
--- clang/test/SemaCUDA/Inputs/cuda.h
+++ clang/test/SemaCUDA/Inputs/cuda.h
@@ -18,9 +18,17 @@
 };
 
 typedef struct cudaStream *cudaStream_t;
+typedef enum cudaError {} cudaError_t;
 
-int cudaConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0,
-  cudaStream_t stream = 0);
+extern "C" int cudaConfigureCall(dim3 gridSize, dim3 blockSize,
+ size_t sharedSize = 0,
+ cudaStream_t stream = 0);
+extern "C" int __cudaPushCallConfiguration(dim3 gridSize, dim3 blockSize,
+   size_t sharedSize = 0,
+   cudaStream_t stream = 0);
+extern "C" cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim,
+dim3 blockDim, void **args,
+size_t sharedMem, cudaStream_t stream);
 
 // Host- and device-side placement new overloads.
 void *operator new(__SIZE_TYPE__, void *p) { return p; }
Index: clang/test/Driver/cuda-simple.cu
===
--- clang/test/Driver/cuda-simple.cu
+++ clang/test/Driver/cuda-simple.cu
@@ -2,7 +2,7 @@
 // http://llvm.org/PR22936
 // RUN: %clang -nocudainc -nocudalib -Werror -fsyntax-only -c %s
 //
-// Verify that we pass -x cuda-cpp-output to compiler after 
+// Verify that we pass -x cuda-cpp-output to compiler after
 // preprocessing a CUDA file
 // RUN: %clang  -Werror -### -save-temps -c %s 2>&1 | FileCheck %s
 // CHECK: "-cc1"
@@ -14,7 +14,9 @@
 // Verify that compiler accepts CUDA syntax with "-x cuda-cpp-output".
 // RUN: %clang -Werror -fsyntax-only -x cuda-cpp-output -c %s
 
-int cudaConfigureCall(int, int);
+extern "C" int cudaConfigureCall(int, int);
+extern "C" int __cudaPushCallConfiguration(int, int);
+
 __attribute__((global)) void kernel() {}
 
 void func() {
Index: clang/test/CodeGenCUDA/kernel-call.cu
===
--- clang/test/CodeGenCUDA/kernel-call.cu
+++ clang/test/CodeGenCUDA/kernel-call.cu
@@ -1,5 +1,9 @@
-// RUN: %clang_cc1 -emit-llvm %s -o - | FileCheck %s --check-prefixes=CUDA,CHECK
-// RUN: %clang_cc1 -x hip -emit-llvm %s -o - | FileCheck %s --check-prefixes=HIP,CHECK
+// RUN: %clang_cc1 -target-sdk-version=8.0 -emit-llvm %s -o - \
+// RUN: | FileCheck %s --check-prefixes=CUDA-OLD,CHECK
+// RUN: %clang_cc1 -target-sdk-version=9.2  -emit-llvm %s -o - \
+// RUN: | FileCheck %s --check-prefixes=CUDA-NEW,CHECK
+// RUN: %clang_cc1 -x hip -emit-llvm %s -o - \
+// RUN: | FileCheck %s --check-prefixes=HIP,CHECK
 
 
 #include "Inputs/cuda.h"
@@ -7,14 +11,17 @@
 // CHECK-LABEL: define{{.*}}g1
 // HIP: call{{.*}}hipSetupArgument
 // HIP: call{{.*}}hipLaunchByPtr
-// CUDA: call{{.*}}cudaSetupArgument
-// CUDA: call{{.*}}cudaLaunch
+// CUDA-OLD: call{{.*}}cudaSetupArgument
+// CUDA-OLD: call{{.*}}cudaLaunch
+// CUDA-NEW: call{{.*}}__cudaPopCallConfiguration
+// CUDA-NEW: call{{.*}}cudaLaunchKernel
 __global__ void g1(int x) {}
 
 // CHECK-LABEL: define{{.*}}main
 int main(void) {
   // HIP: call{{.*}}hipConfigureCall
-  // CUDA: call{{.*}}cudaConfigureCall
+  // CUDA-OLD: call{{.*}}cudaConfigureCall
+  // CUDA-NEW: call{{.*}}__cudaPushCallConfiguration
   // CHECK: icmp
   // 

[PATCH] D57488: [CUDA] add support for the new kernel launch API in CUDA-9.2+.

2019-01-30 Thread Justin Lebar via Phabricator via cfe-commits
jlebar accepted this revision.
jlebar added a comment.
This revision is now accepted and ready to land.

LGTM, mostly nits.




Comment at: clang/include/clang/Sema/Sema.h:10316
 
+  /// Returns the name of the launch configuration function.
+  std::string getCudaConfigureFuncName() const;

Could we be a little less vague, what exactly is the launch-configuration 
function?  (Could be as simple as adding `e.g. cudaFooBar()`.)



Comment at: clang/lib/CodeGen/CGCUDANV.cpp:201
 
-void CGNVCUDARuntime::emitDeviceStubBody(CodeGenFunction ,
- FunctionArgList ) {
+// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in local
+// array and kernels are launched using cudaLaunchKernel().

nit `in a local array`



Comment at: clang/lib/CodeGen/CGCUDANV.cpp:212
+  VoidPtrTy, CharUnits::fromQuantity(16), "kernel_args",
+  llvm::ConstantInt::get(SizeTy, std::max(1UL, Args.size(;
+  // Store pointers to the arguments in a locally allocated launch_args.

Nit, s/`1UL`/`uint64{1}`/ or size_t, whatever this function takes.  As-is we're 
baking in the assumption that unsigned long is the same as the type returned by 
Args.size(), which isn't necessarily true.

As an alternative, you could do `std::max(1, Args.size())` or whatever 
the appropriate type is.



Comment at: clang/lib/CodeGen/CGCUDANV.cpp:239
+CGM.Error(CGF.CurFuncDecl->getLocation(),
+  "Can't find declaration for cudaLaunchKernel()"); // FIXME.
+return;

Unfixed FIXME?



Comment at: clang/lib/CodeGen/CGCUDANV.cpp:260
+  /*isVarArg=*/false),
+  "__cudaPopCallConfiguration");
+

I see lots of references to `__cudaPushCallConfiguration`, but this is the only 
reference I see to `__cudaPopCallConfiguration`.  Is this a typo?  Also are we 
supposed to emit matching push and pop function calls?  Kind of weird to do one 
without the other...



Comment at: clang/lib/CodeGen/CGCUDANV.cpp:266
+  // Emit the call to cudaLaunch
+
+  llvm::Value *Kernel = CGF.Builder.CreatePointerCast(CGF.CurFn, VoidPtrTy);

Whitespace nit, maybe move this whitespace line before the comment?



Comment at: clang/lib/Headers/__clang_cuda_runtime_wrapper.h:429
 
+// CUDA runtime uses undocumented function to access kernel launch
+// configuration. We need to provide our own declaration for it here.

s/undocumented function/this undocumented function/?


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D57488/new/

https://reviews.llvm.org/D57488



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D57488: [CUDA] add support for the new kernel launch API in CUDA-9.2+.

2019-01-30 Thread Artem Belevich via Phabricator via cfe-commits
tra created this revision.
tra added a reviewer: jlebar.
Herald added subscribers: bixia, sanjoy.

Instead of calling CUDA runtime to arrange function arguments, 
the new API constructs arguments in a local array and the kernels 
are launched with __cudaLaunchKernel().

The old API has been deprecated and is expected to go away
in the next CUDA release.


https://reviews.llvm.org/D57488

Files:
  clang/include/clang/Basic/DiagnosticSemaKinds.td
  clang/include/clang/Sema/Sema.h
  clang/lib/CodeGen/CGCUDANV.cpp
  clang/lib/Headers/__clang_cuda_runtime_wrapper.h
  clang/lib/Sema/SemaCUDA.cpp
  clang/lib/Sema/SemaDecl.cpp
  clang/test/CodeGenCUDA/Inputs/cuda.h
  clang/test/CodeGenCUDA/device-stub.cu
  clang/test/CodeGenCUDA/kernel-args-alignment.cu
  clang/test/CodeGenCUDA/kernel-call.cu
  clang/test/Driver/cuda-simple.cu
  clang/test/SemaCUDA/Inputs/cuda.h
  clang/test/SemaCUDA/config-type.cu

Index: clang/test/SemaCUDA/config-type.cu
===
--- clang/test/SemaCUDA/config-type.cu
+++ clang/test/SemaCUDA/config-type.cu
@@ -1,3 +1,7 @@
-// RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fno-cuda-new-launch -fsyntax-only -verify=legacy-launch %s
+// RUN: %clang_cc1 -fcuda-new-launch -fsyntax-only -verify=new-launch %s
 
-void cudaConfigureCall(unsigned gridSize, unsigned blockSize); // expected-error {{must have scalar return type}}
+// legacy-launch-error@+1 {{must have scalar return type}}
+void cudaConfigureCall(unsigned gridSize, unsigned blockSize);
+// new-launch-error@+1 {{must have scalar return type}}
+void __cudaPushCallConfiguration(unsigned gridSize, unsigned blockSize);
Index: clang/test/SemaCUDA/Inputs/cuda.h
===
--- clang/test/SemaCUDA/Inputs/cuda.h
+++ clang/test/SemaCUDA/Inputs/cuda.h
@@ -18,9 +18,17 @@
 };
 
 typedef struct cudaStream *cudaStream_t;
+typedef enum cudaError {} cudaError_t;
 
-int cudaConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0,
-  cudaStream_t stream = 0);
+extern "C" int cudaConfigureCall(dim3 gridSize, dim3 blockSize,
+ size_t sharedSize = 0,
+ cudaStream_t stream = 0);
+extern "C" int __cudaPushCallConfiguration(dim3 gridSize, dim3 blockSize,
+   size_t sharedSize = 0,
+   cudaStream_t stream = 0);
+extern "C" cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim,
+dim3 blockDim, void **args,
+size_t sharedMem, cudaStream_t stream);
 
 // Host- and device-side placement new overloads.
 void *operator new(__SIZE_TYPE__, void *p) { return p; }
Index: clang/test/Driver/cuda-simple.cu
===
--- clang/test/Driver/cuda-simple.cu
+++ clang/test/Driver/cuda-simple.cu
@@ -2,7 +2,7 @@
 // http://llvm.org/PR22936
 // RUN: %clang -nocudainc -nocudalib -Werror -fsyntax-only -c %s
 //
-// Verify that we pass -x cuda-cpp-output to compiler after 
+// Verify that we pass -x cuda-cpp-output to compiler after
 // preprocessing a CUDA file
 // RUN: %clang  -Werror -### -save-temps -c %s 2>&1 | FileCheck %s
 // CHECK: "-cc1"
@@ -14,7 +14,9 @@
 // Verify that compiler accepts CUDA syntax with "-x cuda-cpp-output".
 // RUN: %clang -Werror -fsyntax-only -x cuda-cpp-output -c %s
 
-int cudaConfigureCall(int, int);
+extern "C" int cudaConfigureCall(int, int);
+extern "C" int __cudaPushCallConfiguration(int, int);
+
 __attribute__((global)) void kernel() {}
 
 void func() {
Index: clang/test/CodeGenCUDA/kernel-call.cu
===
--- clang/test/CodeGenCUDA/kernel-call.cu
+++ clang/test/CodeGenCUDA/kernel-call.cu
@@ -1,5 +1,9 @@
-// RUN: %clang_cc1 -emit-llvm %s -o - | FileCheck %s --check-prefixes=CUDA,CHECK
-// RUN: %clang_cc1 -x hip -emit-llvm %s -o - | FileCheck %s --check-prefixes=HIP,CHECK
+// RUN: %clang_cc1 -target-sdk-version=8.0 -emit-llvm %s -o - \
+// RUN: | FileCheck %s --check-prefixes=CUDA-OLD,CHECK
+// RUN: %clang_cc1 -target-sdk-version=9.2  -emit-llvm %s -o - \
+// RUN: | FileCheck %s --check-prefixes=CUDA-NEW,CHECK
+// RUN: %clang_cc1 -x hip -emit-llvm %s -o - \
+// RUN: | FileCheck %s --check-prefixes=HIP,CHECK
 
 
 #include "Inputs/cuda.h"
@@ -7,14 +11,17 @@
 // CHECK-LABEL: define{{.*}}g1
 // HIP: call{{.*}}hipSetupArgument
 // HIP: call{{.*}}hipLaunchByPtr
-// CUDA: call{{.*}}cudaSetupArgument
-// CUDA: call{{.*}}cudaLaunch
+// CUDA-OLD: call{{.*}}cudaSetupArgument
+// CUDA-OLD: call{{.*}}cudaLaunch
+// CUDA-NEW: call{{.*}}__cudaPopCallConfiguration
+// CUDA-NEW: call{{.*}}cudaLaunchKernel
 __global__ void g1(int x) {}
 
 // CHECK-LABEL: define{{.*}}main
 int main(void) {
   // HIP: call{{.*}}hipConfigureCall
-  // CUDA: call{{.*}}cudaConfigureCall
+  //