[PATCH] D124189: [CUDA][HIP] Externalize kernels with internal linkage

2022-04-22 Thread Yaxun Liu via Phabricator via cfe-commits
This revision was landed with ongoing or failed builds.
This revision was automatically updated to reflect the committed changes.
yaxunl marked 3 inline comments as done.
Closed by commit rG04fb81674ed7: [CUDA][HIP] Externalize kernels with internal 
linkage (authored by yaxunl).
Herald added a project: clang.

Changed prior to commit:
  https://reviews.llvm.org/D124189?vs=424364=424600#toc

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D124189/new/

https://reviews.llvm.org/D124189

Files:
  clang/lib/AST/ASTContext.cpp
  clang/lib/CodeGen/CodeGenModule.cpp
  clang/test/CodeGenCUDA/device-var-linkage.cu
  clang/test/CodeGenCUDA/kernel-in-anon-ns.cu
  clang/test/CodeGenCUDA/managed-var.cu
  clang/test/CodeGenCUDA/static-device-var-rdc.cu

Index: clang/test/CodeGenCUDA/static-device-var-rdc.cu
===
--- clang/test/CodeGenCUDA/static-device-var-rdc.cu
+++ clang/test/CodeGenCUDA/static-device-var-rdc.cu
@@ -40,6 +40,11 @@
 // RUN:   -std=c++11 -fgpu-rdc -emit-llvm -o - -x hip %s > %t.host
 // RUN: cat %t.host | FileCheck -check-prefix=HOST-NEG %s
 
+// Check postfix for CUDA.
+
+// RUN: %clang_cc1 -no-opaque-pointers -triple nvptx -fcuda-is-device -cuid=abc \
+// RUN:   -std=c++11 -fgpu-rdc -emit-llvm -o - %s | FileCheck \
+// RUN:   -check-prefixes=CUDA %s
 
 #include "Inputs/cuda.h"
 
@@ -55,11 +60,12 @@
 // INT-HOST-DAG: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x\00"
 
 // Test externalized static device variables
-// EXT-DEV-DAG: @_ZL1x__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
-// EXT-HOST-DAG: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x__static__[[HASH:.*]]\00"
+// EXT-DEV-DAG: @_ZL1x.static.[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
+// EXT-HOST-DAG: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x.static.[[HASH:.*]]\00"
+// CUDA-DAG: @_ZL1x__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
 
-// POSTFIX: @_ZL1x__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
-// POSTFIX: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x__static__[[HASH]]\00"
+// POSTFIX: @_ZL1x.static.[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
+// POSTFIX: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x.static.[[HASH]]\00"
 
 static __device__ int x;
 
@@ -73,8 +79,8 @@
 // INT-HOST-DAG: @[[DEVNAMEY:[0-9]+]] = {{.*}}c"_ZL1y\00"
 
 // Test externalized static device variables
-// EXT-DEV-DAG: @_ZL1y__static__[[HASH]] = addrspace(4) externally_initialized global i32 0
-// EXT-HOST-DAG: @[[DEVNAMEY:[0-9]+]] = {{.*}}c"_ZL1y__static__[[HASH]]\00"
+// EXT-DEV-DAG: @_ZL1y.static.[[HASH]] = addrspace(4) externally_initialized global i32 0
+// EXT-HOST-DAG: @[[DEVNAMEY:[0-9]+]] = {{.*}}c"_ZL1y.static.[[HASH]]\00"
 
 static __constant__ int y;
 
Index: clang/test/CodeGenCUDA/managed-var.cu
===
--- clang/test/CodeGenCUDA/managed-var.cu
+++ clang/test/CodeGenCUDA/managed-var.cu
@@ -1,5 +1,3 @@
-// REQUIRES: x86-registered-target, amdgpu-registered-target
-
 // RUN: %clang_cc1 -no-opaque-pointers -triple amdgcn-amd-amdhsa -fcuda-is-device -std=c++11 \
 // RUN:   -emit-llvm -o - -x hip %s | FileCheck \
 // RUN:   -check-prefixes=COMMON,DEV,NORDC-D %s
@@ -52,15 +50,15 @@
 
 // NORDC-D-DAG: @_ZL2sx.managed = addrspace(1) externally_initialized global i32 1, align 4
 // NORDC-D-DAG: @_ZL2sx = addrspace(1) externally_initialized global i32 addrspace(1)* null
-// RDC-D-DAG: @_ZL2sx__static__[[HASH:.*]].managed = addrspace(1) externally_initialized global i32 1, align 4
-// RDC-D-DAG: @_ZL2sx__static__[[HASH]] = addrspace(1) externally_initialized global i32 addrspace(1)* null
+// RDC-D-DAG: @_ZL2sx.static.[[HASH:.*]].managed = addrspace(1) externally_initialized global i32 1, align 4
+// RDC-D-DAG: @_ZL2sx.static.[[HASH]] = addrspace(1) externally_initialized global i32 addrspace(1)* null
 // HOST-DAG: @_ZL2sx.managed = internal global i32 1
 // HOST-DAG: @_ZL2sx = internal externally_initialized global i32* null
 // NORDC-DAG: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx\00"
-// RDC-DAG: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx__static__[[HASH:.*]]\00"
+// RDC-DAG: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx.static.[[HASH:.*]]\00"
 
-// POSTFIX:  @_ZL2sx__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 addrspace(1)* null
-// POSTFIX: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx__static__[[HASH]]\00"
+// POSTFIX:  @_ZL2sx.static.[[HASH:.*]] = addrspace(1) externally_initialized global i32 addrspace(1)* null
+// POSTFIX: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx.static.[[HASH]]\00"
 static __managed__ int sx = 1;
 
 // DEV-DAG: @llvm.compiler.used
Index: clang/test/CodeGenCUDA/kernel-in-anon-ns.cu
===
--- clang/test/CodeGenCUDA/kernel-in-anon-ns.cu
+++ clang/test/CodeGenCUDA/kernel-in-anon-ns.cu
@@ -6,19 +6,53 @@
 // RUN:   -aux-triple amdgcn-amd-amdhsa 

[PATCH] D124189: [CUDA][HIP] Externalize kernels with internal linkage

2022-04-22 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl marked 7 inline comments as done.
yaxunl added inline comments.



Comment at: clang/test/CodeGenCUDA/device-var-linkage.cu:1-2
 // RUN: %clang_cc1 -no-opaque-pointers -triple nvptx -fcuda-is-device \
 // RUN:   -emit-llvm -o - -x hip %s \
 // RUN:   | FileCheck -check-prefixes=DEV,NORDC %s

tra wrote:
> This is odd -- the tests use `-x hip` and  `-triple nvptx`.
> 
> I think we need to change them  into HIP+amdgpu and CUDA +nvptx variants ans 
> we now have language-dependent behavior here and are interested in the 
> language/triple combinations that we do use in practice.
will change them to amdgcn and add CUDA variant when committing.



Comment at: clang/test/CodeGenCUDA/kernel-in-anon-ns.cu:3
 // RUN:   -aux-triple x86_64-unknown-linux-gnu -std=c++11 -fgpu-rdc \
 // RUN:   -emit-llvm -o - -x hip %s > %t.dev
 

tra wrote:
> We should have CUDA test variants here, too.
will add CUDA test when committing.



Comment at: clang/test/CodeGenCUDA/managed-var.cu:1
 // REQUIRES: x86-registered-target, amdgpu-registered-target
 

tra wrote:
> Tests above do not have REQUIRED. Is it needed here?
> 
No. I will remove it when committing.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D124189/new/

https://reviews.llvm.org/D124189

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D124189: [CUDA][HIP] Externalize kernels with internal linkage

2022-04-22 Thread Artem Belevich via Phabricator via cfe-commits
tra accepted this revision.
tra added a comment.
This revision is now accepted and ready to land.

LGTM overal, with few test nits.




Comment at: clang/lib/AST/ASTContext.cpp:12300
+ (D->hasAttr() &&
+  GetGVALinkageForFunction(cast(D),
+   /*IgnoreCUDAGlobalAttr=*/true) ==

yaxunl wrote:
> tra wrote:
> > yaxunl wrote:
> > > tra wrote:
> > > > yaxunl wrote:
> > > > > tra wrote:
> > > > > > Perhaps we don't need to change the public AST API and plumb 
> > > > > > `IgnoreCUDAGlobalAttr` through.
> > > > > > We cold create CUDA-aware static version of 
> > > > > > `GetGVALinkageForCudaKernel` instead, which would call 
> > > > > > `adjustGVALinkageForExternalDefinitionKind(..., 
> > > > > > adjustGVALinkageForAttributes(IgnoreCUDAGlobalAttr=true))`.
> > > > > We could have a static function but it would be 
> > > > > GetGVALinkageForCUDAKernelWithoutGlobalAttr since we need to know the 
> > > > > linkage of the kernel assuming it has no `__global__` attribute.
> > > > > 
> > > > > If you think it is OK I can make the change.
> > > > No point making public what's of no use to anybody other than this 
> > > > particular instance.
> > > > 
> > > > To think of it, we don't even need a function and could just do 
> > > > ```
> > > >   if (D->hasAttr() ) {
> > > > bool OriginalKernelLinkage = 
> > > > adjustGVALinkageForExternalDefinitionKind(..., 
> > > > adjustGVALinkageForAttributes(IgnoreCUDAGlobalAttr=true));
> > > > return OriginalKernelLinkage == GVA_Internal;
> > > >   }
> > > >   return (IsStaticVar &&)
> > > > ```
> > > > 
> > > > 
> > > One disadvantage of this approach is that it duplicates the code in 
> > > GetGVALinkageForFunction. In the future, if GetGVALinkageForFunction 
> > > changes, the same change needs to be applied to the duplicated code, 
> > > which is error-prone.
> > Good point.  Looking at the code closer, it appears that what we're 
> > interested in is whether the kernel was internal and *became* externally 
> > visible due to it being a kernel. 
> > 
> > Right now we're checking if the function would normally be `GVA_Internal` 
> > (shouldn't we have considered GVA_DiscardableODR, too, BTW?)
> > This is a somewhat indirect way of figuring out what we really need.
> > 
> > The code that determines what we want is essentially this code in 
> > adjustGVALinkageForAttributes that we're trying to enable/disable with 
> > `ConsiderCudaGlobalAttr`. 
> > 
> > It can be easily extracted into a static function, which could then be used 
> > from both `adjustGVALinkageForAttributes`, (which would no longer need 
> > `ConsiderCudaGlobalAttr`) and from here. 
> > 
> > ```
> > bool isInternalKernel(ASTContext *Context, Decl *D) {
> >   L=basicGVALinkageForFunction(Context, D);
> >   return (D->hasAttr() &&
> >   (L == GVA_DiscardableODR || L == GVA_Internal));
> > }
> > ```
> > 
> > This would both avoid logic duplication and would better match our intent.
> > 
> > Does it make sense? Or did I miss something else?
> GVA_DiscardableODR  usually maps to linkonce_odr linkage in LLVM IR. It 
> follows the ODR, therefore we should not make them unique.
> 
> If we use isInternalKernel in adjustGVALinkageForAttributes, there will be 
> two calls of basicGVALinkageForFunction when GetGVALinkageForFunction is 
> called, which seems inefficient. I think we can keep GetGVALinkageForFunction 
> as it was, and use basicGVALinkageForFunction directly in mayExternalize.
SGTM.



Comment at: clang/test/CodeGenCUDA/device-var-linkage.cu:1-2
 // RUN: %clang_cc1 -no-opaque-pointers -triple nvptx -fcuda-is-device \
 // RUN:   -emit-llvm -o - -x hip %s \
 // RUN:   | FileCheck -check-prefixes=DEV,NORDC %s

This is odd -- the tests use `-x hip` and  `-triple nvptx`.

I think we need to change them  into HIP+amdgpu and CUDA +nvptx variants ans we 
now have language-dependent behavior here and are interested in the 
language/triple combinations that we do use in practice.



Comment at: clang/test/CodeGenCUDA/kernel-in-anon-ns.cu:3
 // RUN:   -aux-triple x86_64-unknown-linux-gnu -std=c++11 -fgpu-rdc \
 // RUN:   -emit-llvm -o - -x hip %s > %t.dev
 

We should have CUDA test variants here, too.



Comment at: clang/test/CodeGenCUDA/managed-var.cu:1
 // REQUIRES: x86-registered-target, amdgpu-registered-target
 

Tests above do not have REQUIRED. Is it needed here?



CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D124189/new/

https://reviews.llvm.org/D124189

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D124189: [CUDA][HIP] Externalize kernels with internal linkage

2022-04-21 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl updated this revision to Diff 424364.
yaxunl added a comment.

use basicGVALinkageForFunction


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D124189/new/

https://reviews.llvm.org/D124189

Files:
  clang/lib/AST/ASTContext.cpp
  clang/lib/CodeGen/CodeGenModule.cpp
  clang/test/CodeGenCUDA/device-var-linkage.cu
  clang/test/CodeGenCUDA/kernel-in-anon-ns.cu
  clang/test/CodeGenCUDA/managed-var.cu
  clang/test/CodeGenCUDA/static-device-var-rdc.cu

Index: clang/test/CodeGenCUDA/static-device-var-rdc.cu
===
--- clang/test/CodeGenCUDA/static-device-var-rdc.cu
+++ clang/test/CodeGenCUDA/static-device-var-rdc.cu
@@ -40,6 +40,11 @@
 // RUN:   -std=c++11 -fgpu-rdc -emit-llvm -o - -x hip %s > %t.host
 // RUN: cat %t.host | FileCheck -check-prefix=HOST-NEG %s
 
+// Check postfix for CUDA.
+
+// RUN: %clang_cc1 -no-opaque-pointers -triple nvptx -fcuda-is-device -cuid=abc \
+// RUN:   -std=c++11 -fgpu-rdc -emit-llvm -o - %s | FileCheck \
+// RUN:   -check-prefixes=CUDA %s
 
 #include "Inputs/cuda.h"
 
@@ -55,11 +60,12 @@
 // INT-HOST-DAG: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x\00"
 
 // Test externalized static device variables
-// EXT-DEV-DAG: @_ZL1x__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
-// EXT-HOST-DAG: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x__static__[[HASH:.*]]\00"
+// EXT-DEV-DAG: @_ZL1x.static.[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
+// EXT-HOST-DAG: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x.static.[[HASH:.*]]\00"
+// CUDA-DAG: @_ZL1x__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
 
-// POSTFIX: @_ZL1x__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
-// POSTFIX: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x__static__[[HASH]]\00"
+// POSTFIX: @_ZL1x.static.[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
+// POSTFIX: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x.static.[[HASH]]\00"
 
 static __device__ int x;
 
@@ -73,8 +79,8 @@
 // INT-HOST-DAG: @[[DEVNAMEY:[0-9]+]] = {{.*}}c"_ZL1y\00"
 
 // Test externalized static device variables
-// EXT-DEV-DAG: @_ZL1y__static__[[HASH]] = addrspace(4) externally_initialized global i32 0
-// EXT-HOST-DAG: @[[DEVNAMEY:[0-9]+]] = {{.*}}c"_ZL1y__static__[[HASH]]\00"
+// EXT-DEV-DAG: @_ZL1y.static.[[HASH]] = addrspace(4) externally_initialized global i32 0
+// EXT-HOST-DAG: @[[DEVNAMEY:[0-9]+]] = {{.*}}c"_ZL1y.static.[[HASH]]\00"
 
 static __constant__ int y;
 
Index: clang/test/CodeGenCUDA/managed-var.cu
===
--- clang/test/CodeGenCUDA/managed-var.cu
+++ clang/test/CodeGenCUDA/managed-var.cu
@@ -52,15 +52,15 @@
 
 // NORDC-D-DAG: @_ZL2sx.managed = addrspace(1) externally_initialized global i32 1, align 4
 // NORDC-D-DAG: @_ZL2sx = addrspace(1) externally_initialized global i32 addrspace(1)* null
-// RDC-D-DAG: @_ZL2sx__static__[[HASH:.*]].managed = addrspace(1) externally_initialized global i32 1, align 4
-// RDC-D-DAG: @_ZL2sx__static__[[HASH]] = addrspace(1) externally_initialized global i32 addrspace(1)* null
+// RDC-D-DAG: @_ZL2sx.static.[[HASH:.*]].managed = addrspace(1) externally_initialized global i32 1, align 4
+// RDC-D-DAG: @_ZL2sx.static.[[HASH]] = addrspace(1) externally_initialized global i32 addrspace(1)* null
 // HOST-DAG: @_ZL2sx.managed = internal global i32 1
 // HOST-DAG: @_ZL2sx = internal externally_initialized global i32* null
 // NORDC-DAG: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx\00"
-// RDC-DAG: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx__static__[[HASH:.*]]\00"
+// RDC-DAG: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx.static.[[HASH:.*]]\00"
 
-// POSTFIX:  @_ZL2sx__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 addrspace(1)* null
-// POSTFIX: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx__static__[[HASH]]\00"
+// POSTFIX:  @_ZL2sx.static.[[HASH:.*]] = addrspace(1) externally_initialized global i32 addrspace(1)* null
+// POSTFIX: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx.static.[[HASH]]\00"
 static __managed__ int sx = 1;
 
 // DEV-DAG: @llvm.compiler.used
Index: clang/test/CodeGenCUDA/kernel-in-anon-ns.cu
===
--- clang/test/CodeGenCUDA/kernel-in-anon-ns.cu
+++ clang/test/CodeGenCUDA/kernel-in-anon-ns.cu
@@ -8,17 +8,38 @@
 
 // RUN: cat %t.dev %t.host | FileCheck %s
 
+// RUN: %clang_cc1 -triple nvptx -fcuda-is-device -cuid=abc \
+// RUN:   -aux-triple x86_64-unknown-linux-gnu -std=c++11 -fgpu-rdc \
+// RUN:   -emit-llvm -o - %s | FileCheck -check-prefix=CUDA %s
+
 #include "Inputs/cuda.h"
 
-// CHECK: define weak_odr {{.*}}void @[[KERN:_ZN12_GLOBAL__N_16kernelEv\.anon\.b04fd23c98500190]](
-// CHECK: @[[STR:.*]] = {{.*}} c"[[KERN]]\00"
-// CHECK: call i32 @__hipRegisterFunction({{.*}}@[[STR]]
+// CHECK-DAG: define weak_odr {{.*}}void @[[KERN1:_ZN12_GLOBAL__N_16kernelEv\.intern\.b04fd23c98500190]](
+// CHECK-DAG: define weak_odr {{.*}}void 

[PATCH] D124189: [CUDA][HIP] Externalize kernels with internal linkage

2022-04-21 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl added inline comments.



Comment at: clang/lib/AST/ASTContext.cpp:12300
+ (D->hasAttr() &&
+  GetGVALinkageForFunction(cast(D),
+   /*IgnoreCUDAGlobalAttr=*/true) ==

tra wrote:
> yaxunl wrote:
> > tra wrote:
> > > yaxunl wrote:
> > > > tra wrote:
> > > > > Perhaps we don't need to change the public AST API and plumb 
> > > > > `IgnoreCUDAGlobalAttr` through.
> > > > > We cold create CUDA-aware static version of 
> > > > > `GetGVALinkageForCudaKernel` instead, which would call 
> > > > > `adjustGVALinkageForExternalDefinitionKind(..., 
> > > > > adjustGVALinkageForAttributes(IgnoreCUDAGlobalAttr=true))`.
> > > > We could have a static function but it would be 
> > > > GetGVALinkageForCUDAKernelWithoutGlobalAttr since we need to know the 
> > > > linkage of the kernel assuming it has no `__global__` attribute.
> > > > 
> > > > If you think it is OK I can make the change.
> > > No point making public what's of no use to anybody other than this 
> > > particular instance.
> > > 
> > > To think of it, we don't even need a function and could just do 
> > > ```
> > >   if (D->hasAttr() ) {
> > > bool OriginalKernelLinkage = 
> > > adjustGVALinkageForExternalDefinitionKind(..., 
> > > adjustGVALinkageForAttributes(IgnoreCUDAGlobalAttr=true));
> > > return OriginalKernelLinkage == GVA_Internal;
> > >   }
> > >   return (IsStaticVar &&)
> > > ```
> > > 
> > > 
> > One disadvantage of this approach is that it duplicates the code in 
> > GetGVALinkageForFunction. In the future, if GetGVALinkageForFunction 
> > changes, the same change needs to be applied to the duplicated code, which 
> > is error-prone.
> Good point.  Looking at the code closer, it appears that what we're 
> interested in is whether the kernel was internal and *became* externally 
> visible due to it being a kernel. 
> 
> Right now we're checking if the function would normally be `GVA_Internal` 
> (shouldn't we have considered GVA_DiscardableODR, too, BTW?)
> This is a somewhat indirect way of figuring out what we really need.
> 
> The code that determines what we want is essentially this code in 
> adjustGVALinkageForAttributes that we're trying to enable/disable with 
> `ConsiderCudaGlobalAttr`. 
> 
> It can be easily extracted into a static function, which could then be used 
> from both `adjustGVALinkageForAttributes`, (which would no longer need 
> `ConsiderCudaGlobalAttr`) and from here. 
> 
> ```
> bool isInternalKernel(ASTContext *Context, Decl *D) {
>   L=basicGVALinkageForFunction(Context, D);
>   return (D->hasAttr() &&
>   (L == GVA_DiscardableODR || L == GVA_Internal));
> }
> ```
> 
> This would both avoid logic duplication and would better match our intent.
> 
> Does it make sense? Or did I miss something else?
GVA_DiscardableODR  usually maps to linkonce_odr linkage in LLVM IR. It follows 
the ODR, therefore we should not make them unique.

If we use isInternalKernel in adjustGVALinkageForAttributes, there will be two 
calls of basicGVALinkageForFunction when GetGVALinkageForFunction is called, 
which seems inefficient. I think we can keep GetGVALinkageForFunction as it 
was, and use basicGVALinkageForFunction directly in mayExternalize.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D124189/new/

https://reviews.llvm.org/D124189

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D124189: [CUDA][HIP] Externalize kernels with internal linkage

2022-04-21 Thread Artem Belevich via Phabricator via cfe-commits
tra added inline comments.



Comment at: clang/lib/AST/ASTContext.cpp:12300
+ (D->hasAttr() &&
+  GetGVALinkageForFunction(cast(D),
+   /*IgnoreCUDAGlobalAttr=*/true) ==

yaxunl wrote:
> tra wrote:
> > yaxunl wrote:
> > > tra wrote:
> > > > Perhaps we don't need to change the public AST API and plumb 
> > > > `IgnoreCUDAGlobalAttr` through.
> > > > We cold create CUDA-aware static version of 
> > > > `GetGVALinkageForCudaKernel` instead, which would call 
> > > > `adjustGVALinkageForExternalDefinitionKind(..., 
> > > > adjustGVALinkageForAttributes(IgnoreCUDAGlobalAttr=true))`.
> > > We could have a static function but it would be 
> > > GetGVALinkageForCUDAKernelWithoutGlobalAttr since we need to know the 
> > > linkage of the kernel assuming it has no `__global__` attribute.
> > > 
> > > If you think it is OK I can make the change.
> > No point making public what's of no use to anybody other than this 
> > particular instance.
> > 
> > To think of it, we don't even need a function and could just do 
> > ```
> >   if (D->hasAttr() ) {
> > bool OriginalKernelLinkage = 
> > adjustGVALinkageForExternalDefinitionKind(..., 
> > adjustGVALinkageForAttributes(IgnoreCUDAGlobalAttr=true));
> > return OriginalKernelLinkage == GVA_Internal;
> >   }
> >   return (IsStaticVar &&)
> > ```
> > 
> > 
> One disadvantage of this approach is that it duplicates the code in 
> GetGVALinkageForFunction. In the future, if GetGVALinkageForFunction changes, 
> the same change needs to be applied to the duplicated code, which is 
> error-prone.
Good point.  Looking at the code closer, it appears that what we're interested 
in is whether the kernel was internal and *became* externally visible due to it 
being a kernel. 

Right now we're checking if the function would normally be `GVA_Internal` 
(shouldn't we have considered GVA_DiscardableODR, too, BTW?)
This is a somewhat indirect way of figuring out what we really need.

The code that determines what we want is essentially this code in 
adjustGVALinkageForAttributes that we're trying to enable/disable with 
`ConsiderCudaGlobalAttr`. 

It can be easily extracted into a static function, which could then be used 
from both `adjustGVALinkageForAttributes`, (which would no longer need 
`ConsiderCudaGlobalAttr`) and from here. 

```
bool isInternalKernel(ASTContext *Context, Decl *D) {
  L=basicGVALinkageForFunction(Context, D);
  return (D->hasAttr() &&
  (L == GVA_DiscardableODR || L == GVA_Internal));
}
```

This would both avoid logic duplication and would better match our intent.

Does it make sense? Or did I miss something else?


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D124189/new/

https://reviews.llvm.org/D124189

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D124189: [CUDA][HIP] Externalize kernels with internal linkage

2022-04-21 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl added inline comments.



Comment at: clang/lib/AST/ASTContext.cpp:12300
+ (D->hasAttr() &&
+  GetGVALinkageForFunction(cast(D),
+   /*IgnoreCUDAGlobalAttr=*/true) ==

tra wrote:
> yaxunl wrote:
> > tra wrote:
> > > Perhaps we don't need to change the public AST API and plumb 
> > > `IgnoreCUDAGlobalAttr` through.
> > > We cold create CUDA-aware static version of `GetGVALinkageForCudaKernel` 
> > > instead, which would call `adjustGVALinkageForExternalDefinitionKind(..., 
> > > adjustGVALinkageForAttributes(IgnoreCUDAGlobalAttr=true))`.
> > We could have a static function but it would be 
> > GetGVALinkageForCUDAKernelWithoutGlobalAttr since we need to know the 
> > linkage of the kernel assuming it has no `__global__` attribute.
> > 
> > If you think it is OK I can make the change.
> No point making public what's of no use to anybody other than this particular 
> instance.
> 
> To think of it, we don't even need a function and could just do 
> ```
>   if (D->hasAttr() ) {
> bool OriginalKernelLinkage = 
> adjustGVALinkageForExternalDefinitionKind(..., 
> adjustGVALinkageForAttributes(IgnoreCUDAGlobalAttr=true));
> return OriginalKernelLinkage == GVA_Internal;
>   }
>   return (IsStaticVar &&)
> ```
> 
> 
One disadvantage of this approach is that it duplicates the code in 
GetGVALinkageForFunction. In the future, if GetGVALinkageForFunction changes, 
the same change needs to be applied to the duplicated code, which is 
error-prone.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D124189/new/

https://reviews.llvm.org/D124189

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D124189: [CUDA][HIP] Externalize kernels with internal linkage

2022-04-21 Thread Artem Belevich via Phabricator via cfe-commits
tra added inline comments.



Comment at: clang/lib/AST/ASTContext.cpp:12300
+ (D->hasAttr() &&
+  GetGVALinkageForFunction(cast(D),
+   /*IgnoreCUDAGlobalAttr=*/true) ==

yaxunl wrote:
> tra wrote:
> > Perhaps we don't need to change the public AST API and plumb 
> > `IgnoreCUDAGlobalAttr` through.
> > We cold create CUDA-aware static version of `GetGVALinkageForCudaKernel` 
> > instead, which would call `adjustGVALinkageForExternalDefinitionKind(..., 
> > adjustGVALinkageForAttributes(IgnoreCUDAGlobalAttr=true))`.
> We could have a static function but it would be 
> GetGVALinkageForCUDAKernelWithoutGlobalAttr since we need to know the linkage 
> of the kernel assuming it has no `__global__` attribute.
> 
> If you think it is OK I can make the change.
No point making public what's of no use to anybody other than this particular 
instance.

To think of it, we don't even need a function and could just do 
```
  if (D->hasAttr() ) {
bool OriginalKernelLinkage = adjustGVALinkageForExternalDefinitionKind(..., 
adjustGVALinkageForAttributes(IgnoreCUDAGlobalAttr=true));
return OriginalKernelLinkage == GVA_Internal;
  }
  return (IsStaticVar &&)
```




CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D124189/new/

https://reviews.llvm.org/D124189

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D124189: [CUDA][HIP] Externalize kernels with internal linkage

2022-04-21 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl updated this revision to Diff 424280.
yaxunl marked an inline comment as done.
yaxunl added a comment.

use static function


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D124189/new/

https://reviews.llvm.org/D124189

Files:
  clang/lib/AST/ASTContext.cpp
  clang/lib/CodeGen/CodeGenModule.cpp
  clang/test/CodeGenCUDA/device-var-linkage.cu
  clang/test/CodeGenCUDA/kernel-in-anon-ns.cu
  clang/test/CodeGenCUDA/managed-var.cu
  clang/test/CodeGenCUDA/static-device-var-rdc.cu

Index: clang/test/CodeGenCUDA/static-device-var-rdc.cu
===
--- clang/test/CodeGenCUDA/static-device-var-rdc.cu
+++ clang/test/CodeGenCUDA/static-device-var-rdc.cu
@@ -40,6 +40,11 @@
 // RUN:   -std=c++11 -fgpu-rdc -emit-llvm -o - -x hip %s > %t.host
 // RUN: cat %t.host | FileCheck -check-prefix=HOST-NEG %s
 
+// Check postfix for CUDA.
+
+// RUN: %clang_cc1 -no-opaque-pointers -triple nvptx -fcuda-is-device -cuid=abc \
+// RUN:   -std=c++11 -fgpu-rdc -emit-llvm -o - %s | FileCheck \
+// RUN:   -check-prefixes=CUDA %s
 
 #include "Inputs/cuda.h"
 
@@ -55,11 +60,12 @@
 // INT-HOST-DAG: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x\00"
 
 // Test externalized static device variables
-// EXT-DEV-DAG: @_ZL1x__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
-// EXT-HOST-DAG: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x__static__[[HASH:.*]]\00"
+// EXT-DEV-DAG: @_ZL1x.static.[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
+// EXT-HOST-DAG: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x.static.[[HASH:.*]]\00"
+// CUDA-DAG: @_ZL1x__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
 
-// POSTFIX: @_ZL1x__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
-// POSTFIX: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x__static__[[HASH]]\00"
+// POSTFIX: @_ZL1x.static.[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
+// POSTFIX: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x.static.[[HASH]]\00"
 
 static __device__ int x;
 
@@ -73,8 +79,8 @@
 // INT-HOST-DAG: @[[DEVNAMEY:[0-9]+]] = {{.*}}c"_ZL1y\00"
 
 // Test externalized static device variables
-// EXT-DEV-DAG: @_ZL1y__static__[[HASH]] = addrspace(4) externally_initialized global i32 0
-// EXT-HOST-DAG: @[[DEVNAMEY:[0-9]+]] = {{.*}}c"_ZL1y__static__[[HASH]]\00"
+// EXT-DEV-DAG: @_ZL1y.static.[[HASH]] = addrspace(4) externally_initialized global i32 0
+// EXT-HOST-DAG: @[[DEVNAMEY:[0-9]+]] = {{.*}}c"_ZL1y.static.[[HASH]]\00"
 
 static __constant__ int y;
 
Index: clang/test/CodeGenCUDA/managed-var.cu
===
--- clang/test/CodeGenCUDA/managed-var.cu
+++ clang/test/CodeGenCUDA/managed-var.cu
@@ -52,15 +52,15 @@
 
 // NORDC-D-DAG: @_ZL2sx.managed = addrspace(1) externally_initialized global i32 1, align 4
 // NORDC-D-DAG: @_ZL2sx = addrspace(1) externally_initialized global i32 addrspace(1)* null
-// RDC-D-DAG: @_ZL2sx__static__[[HASH:.*]].managed = addrspace(1) externally_initialized global i32 1, align 4
-// RDC-D-DAG: @_ZL2sx__static__[[HASH]] = addrspace(1) externally_initialized global i32 addrspace(1)* null
+// RDC-D-DAG: @_ZL2sx.static.[[HASH:.*]].managed = addrspace(1) externally_initialized global i32 1, align 4
+// RDC-D-DAG: @_ZL2sx.static.[[HASH]] = addrspace(1) externally_initialized global i32 addrspace(1)* null
 // HOST-DAG: @_ZL2sx.managed = internal global i32 1
 // HOST-DAG: @_ZL2sx = internal externally_initialized global i32* null
 // NORDC-DAG: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx\00"
-// RDC-DAG: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx__static__[[HASH:.*]]\00"
+// RDC-DAG: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx.static.[[HASH:.*]]\00"
 
-// POSTFIX:  @_ZL2sx__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 addrspace(1)* null
-// POSTFIX: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx__static__[[HASH]]\00"
+// POSTFIX:  @_ZL2sx.static.[[HASH:.*]] = addrspace(1) externally_initialized global i32 addrspace(1)* null
+// POSTFIX: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx.static.[[HASH]]\00"
 static __managed__ int sx = 1;
 
 // DEV-DAG: @llvm.compiler.used
Index: clang/test/CodeGenCUDA/kernel-in-anon-ns.cu
===
--- clang/test/CodeGenCUDA/kernel-in-anon-ns.cu
+++ clang/test/CodeGenCUDA/kernel-in-anon-ns.cu
@@ -8,17 +8,38 @@
 
 // RUN: cat %t.dev %t.host | FileCheck %s
 
+// RUN: %clang_cc1 -triple nvptx -fcuda-is-device -cuid=abc \
+// RUN:   -aux-triple x86_64-unknown-linux-gnu -std=c++11 -fgpu-rdc \
+// RUN:   -emit-llvm -o - %s | FileCheck -check-prefix=CUDA %s
+
 #include "Inputs/cuda.h"
 
-// CHECK: define weak_odr {{.*}}void @[[KERN:_ZN12_GLOBAL__N_16kernelEv\.anon\.b04fd23c98500190]](
-// CHECK: @[[STR:.*]] = {{.*}} c"[[KERN]]\00"
-// CHECK: call i32 @__hipRegisterFunction({{.*}}@[[STR]]
+// CHECK-DAG: define weak_odr {{.*}}void @[[KERN1:_ZN12_GLOBAL__N_16kernelEv\.intern\.b04fd23c98500190]](
+// CHECK-DAG: define weak_odr 

[PATCH] D124189: [CUDA][HIP] Externalize kernels with internal linkage

2022-04-21 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl marked an inline comment as done.
yaxunl added inline comments.



Comment at: clang/lib/AST/ASTContext.cpp:11322
+  } else if (Context.getLangOpts().CUDA && Context.getLangOpts().CUDAIsDevice 
&&
+ !IgnoreCUDAGlobalAttr) {
 // Device-side functions with __global__ attribute must always be

tra wrote:
> Nit: I'd phrase it as a positive assertion `ConsiderCudaGlobalAttr` and 
> default it to true.
> 
> `DontDoX` always trips me and gets me to question it -- "what *are* we doing 
> then? what else is there besides X?".
> With  a `DoX` things are usually simpler and limited to `X` -- we're either 
> doing X or not.
> 
will do



Comment at: clang/lib/AST/ASTContext.cpp:12300
+ (D->hasAttr() &&
+  GetGVALinkageForFunction(cast(D),
+   /*IgnoreCUDAGlobalAttr=*/true) ==

tra wrote:
> Perhaps we don't need to change the public AST API and plumb 
> `IgnoreCUDAGlobalAttr` through.
> We cold create CUDA-aware static version of `GetGVALinkageForCudaKernel` 
> instead, which would call `adjustGVALinkageForExternalDefinitionKind(..., 
> adjustGVALinkageForAttributes(IgnoreCUDAGlobalAttr=true))`.
We could have a static function but it would be 
GetGVALinkageForCUDAKernelWithoutGlobalAttr since we need to know the linkage 
of the kernel assuming it has no `__global__` attribute.

If you think it is OK I can make the change.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D124189/new/

https://reviews.llvm.org/D124189

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D124189: [CUDA][HIP] Externalize kernels with internal linkage

2022-04-21 Thread Artem Belevich via Phabricator via cfe-commits
tra added inline comments.



Comment at: clang/lib/AST/ASTContext.cpp:11322
+  } else if (Context.getLangOpts().CUDA && Context.getLangOpts().CUDAIsDevice 
&&
+ !IgnoreCUDAGlobalAttr) {
 // Device-side functions with __global__ attribute must always be

Nit: I'd phrase it as a positive assertion `ConsiderCudaGlobalAttr` and default 
it to true.

`DontDoX` always trips me and gets me to question it -- "what *are* we doing 
then? what else is there besides X?".
With  a `DoX` things are usually simpler and limited to `X` -- we're either 
doing X or not.




Comment at: clang/lib/AST/ASTContext.cpp:12300
+ (D->hasAttr() &&
+  GetGVALinkageForFunction(cast(D),
+   /*IgnoreCUDAGlobalAttr=*/true) ==

Perhaps we don't need to change the public AST API and plumb 
`IgnoreCUDAGlobalAttr` through.
We cold create CUDA-aware static version of `GetGVALinkageForCudaKernel` 
instead, which would call `adjustGVALinkageForExternalDefinitionKind(..., 
adjustGVALinkageForAttributes(IgnoreCUDAGlobalAttr=true))`.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D124189/new/

https://reviews.llvm.org/D124189

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D124189: [CUDA][HIP] Externalize kernels with internal linkage

2022-04-21 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl created this revision.
yaxunl added a reviewer: tra.
Herald added a subscriber: mattd.
Herald added a project: All.
yaxunl requested review of this revision.

This patch is a continuation of https://reviews.llvm.org/D123353.

Not only kernels in anonymous namespace, but also template
kernels with template arguments in anonymous namespace
need to be externalized.

To be more generic, this patch checks the linkage of a kernel
assuming the kernel does not have `__global__` attribute. If
the linkage is internal then clang will externalize it.


https://reviews.llvm.org/D124189

Files:
  clang/include/clang/AST/ASTContext.h
  clang/lib/AST/ASTContext.cpp
  clang/lib/CodeGen/CodeGenModule.cpp
  clang/test/CodeGenCUDA/device-var-linkage.cu
  clang/test/CodeGenCUDA/kernel-in-anon-ns.cu
  clang/test/CodeGenCUDA/managed-var.cu
  clang/test/CodeGenCUDA/static-device-var-rdc.cu

Index: clang/test/CodeGenCUDA/static-device-var-rdc.cu
===
--- clang/test/CodeGenCUDA/static-device-var-rdc.cu
+++ clang/test/CodeGenCUDA/static-device-var-rdc.cu
@@ -40,6 +40,11 @@
 // RUN:   -std=c++11 -fgpu-rdc -emit-llvm -o - -x hip %s > %t.host
 // RUN: cat %t.host | FileCheck -check-prefix=HOST-NEG %s
 
+// Check postfix for CUDA.
+
+// RUN: %clang_cc1 -no-opaque-pointers -triple nvptx -fcuda-is-device -cuid=abc \
+// RUN:   -std=c++11 -fgpu-rdc -emit-llvm -o - %s | FileCheck \
+// RUN:   -check-prefixes=CUDA %s
 
 #include "Inputs/cuda.h"
 
@@ -55,11 +60,12 @@
 // INT-HOST-DAG: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x\00"
 
 // Test externalized static device variables
-// EXT-DEV-DAG: @_ZL1x__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
-// EXT-HOST-DAG: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x__static__[[HASH:.*]]\00"
+// EXT-DEV-DAG: @_ZL1x.static.[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
+// EXT-HOST-DAG: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x.static.[[HASH:.*]]\00"
+// CUDA-DAG: @_ZL1x__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
 
-// POSTFIX: @_ZL1x__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
-// POSTFIX: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x__static__[[HASH]]\00"
+// POSTFIX: @_ZL1x.static.[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
+// POSTFIX: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x.static.[[HASH]]\00"
 
 static __device__ int x;
 
@@ -73,8 +79,8 @@
 // INT-HOST-DAG: @[[DEVNAMEY:[0-9]+]] = {{.*}}c"_ZL1y\00"
 
 // Test externalized static device variables
-// EXT-DEV-DAG: @_ZL1y__static__[[HASH]] = addrspace(4) externally_initialized global i32 0
-// EXT-HOST-DAG: @[[DEVNAMEY:[0-9]+]] = {{.*}}c"_ZL1y__static__[[HASH]]\00"
+// EXT-DEV-DAG: @_ZL1y.static.[[HASH]] = addrspace(4) externally_initialized global i32 0
+// EXT-HOST-DAG: @[[DEVNAMEY:[0-9]+]] = {{.*}}c"_ZL1y.static.[[HASH]]\00"
 
 static __constant__ int y;
 
Index: clang/test/CodeGenCUDA/managed-var.cu
===
--- clang/test/CodeGenCUDA/managed-var.cu
+++ clang/test/CodeGenCUDA/managed-var.cu
@@ -52,15 +52,15 @@
 
 // NORDC-D-DAG: @_ZL2sx.managed = addrspace(1) externally_initialized global i32 1, align 4
 // NORDC-D-DAG: @_ZL2sx = addrspace(1) externally_initialized global i32 addrspace(1)* null
-// RDC-D-DAG: @_ZL2sx__static__[[HASH:.*]].managed = addrspace(1) externally_initialized global i32 1, align 4
-// RDC-D-DAG: @_ZL2sx__static__[[HASH]] = addrspace(1) externally_initialized global i32 addrspace(1)* null
+// RDC-D-DAG: @_ZL2sx.static.[[HASH:.*]].managed = addrspace(1) externally_initialized global i32 1, align 4
+// RDC-D-DAG: @_ZL2sx.static.[[HASH]] = addrspace(1) externally_initialized global i32 addrspace(1)* null
 // HOST-DAG: @_ZL2sx.managed = internal global i32 1
 // HOST-DAG: @_ZL2sx = internal externally_initialized global i32* null
 // NORDC-DAG: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx\00"
-// RDC-DAG: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx__static__[[HASH:.*]]\00"
+// RDC-DAG: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx.static.[[HASH:.*]]\00"
 
-// POSTFIX:  @_ZL2sx__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 addrspace(1)* null
-// POSTFIX: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx__static__[[HASH]]\00"
+// POSTFIX:  @_ZL2sx.static.[[HASH:.*]] = addrspace(1) externally_initialized global i32 addrspace(1)* null
+// POSTFIX: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx.static.[[HASH]]\00"
 static __managed__ int sx = 1;
 
 // DEV-DAG: @llvm.compiler.used
Index: clang/test/CodeGenCUDA/kernel-in-anon-ns.cu
===
--- clang/test/CodeGenCUDA/kernel-in-anon-ns.cu
+++ clang/test/CodeGenCUDA/kernel-in-anon-ns.cu
@@ -8,17 +8,38 @@
 
 // RUN: cat %t.dev %t.host | FileCheck %s
 
+// RUN: %clang_cc1 -triple nvptx -fcuda-is-device -cuid=abc \
+// RUN:   -aux-triple x86_64-unknown-linux-gnu -std=c++11 -fgpu-rdc \
+// RUN:   -emit-llvm -o - %s | FileCheck