[PATCH] D124189: [CUDA][HIP] Externalize kernels with internal linkage
This revision was landed with ongoing or failed builds. This revision was automatically updated to reflect the committed changes. yaxunl marked 3 inline comments as done. Closed by commit rG04fb81674ed7: [CUDA][HIP] Externalize kernels with internal linkage (authored by yaxunl). Herald added a project: clang. Changed prior to commit: https://reviews.llvm.org/D124189?vs=424364=424600#toc Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D124189/new/ https://reviews.llvm.org/D124189 Files: clang/lib/AST/ASTContext.cpp clang/lib/CodeGen/CodeGenModule.cpp clang/test/CodeGenCUDA/device-var-linkage.cu clang/test/CodeGenCUDA/kernel-in-anon-ns.cu clang/test/CodeGenCUDA/managed-var.cu clang/test/CodeGenCUDA/static-device-var-rdc.cu Index: clang/test/CodeGenCUDA/static-device-var-rdc.cu === --- clang/test/CodeGenCUDA/static-device-var-rdc.cu +++ clang/test/CodeGenCUDA/static-device-var-rdc.cu @@ -40,6 +40,11 @@ // RUN: -std=c++11 -fgpu-rdc -emit-llvm -o - -x hip %s > %t.host // RUN: cat %t.host | FileCheck -check-prefix=HOST-NEG %s +// Check postfix for CUDA. + +// RUN: %clang_cc1 -no-opaque-pointers -triple nvptx -fcuda-is-device -cuid=abc \ +// RUN: -std=c++11 -fgpu-rdc -emit-llvm -o - %s | FileCheck \ +// RUN: -check-prefixes=CUDA %s #include "Inputs/cuda.h" @@ -55,11 +60,12 @@ // INT-HOST-DAG: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x\00" // Test externalized static device variables -// EXT-DEV-DAG: @_ZL1x__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 0 -// EXT-HOST-DAG: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x__static__[[HASH:.*]]\00" +// EXT-DEV-DAG: @_ZL1x.static.[[HASH:.*]] = addrspace(1) externally_initialized global i32 0 +// EXT-HOST-DAG: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x.static.[[HASH:.*]]\00" +// CUDA-DAG: @_ZL1x__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 0 -// POSTFIX: @_ZL1x__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 0 -// POSTFIX: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x__static__[[HASH]]\00" +// POSTFIX: @_ZL1x.static.[[HASH:.*]] = addrspace(1) externally_initialized global i32 0 +// POSTFIX: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x.static.[[HASH]]\00" static __device__ int x; @@ -73,8 +79,8 @@ // INT-HOST-DAG: @[[DEVNAMEY:[0-9]+]] = {{.*}}c"_ZL1y\00" // Test externalized static device variables -// EXT-DEV-DAG: @_ZL1y__static__[[HASH]] = addrspace(4) externally_initialized global i32 0 -// EXT-HOST-DAG: @[[DEVNAMEY:[0-9]+]] = {{.*}}c"_ZL1y__static__[[HASH]]\00" +// EXT-DEV-DAG: @_ZL1y.static.[[HASH]] = addrspace(4) externally_initialized global i32 0 +// EXT-HOST-DAG: @[[DEVNAMEY:[0-9]+]] = {{.*}}c"_ZL1y.static.[[HASH]]\00" static __constant__ int y; Index: clang/test/CodeGenCUDA/managed-var.cu === --- clang/test/CodeGenCUDA/managed-var.cu +++ clang/test/CodeGenCUDA/managed-var.cu @@ -1,5 +1,3 @@ -// REQUIRES: x86-registered-target, amdgpu-registered-target - // RUN: %clang_cc1 -no-opaque-pointers -triple amdgcn-amd-amdhsa -fcuda-is-device -std=c++11 \ // RUN: -emit-llvm -o - -x hip %s | FileCheck \ // RUN: -check-prefixes=COMMON,DEV,NORDC-D %s @@ -52,15 +50,15 @@ // NORDC-D-DAG: @_ZL2sx.managed = addrspace(1) externally_initialized global i32 1, align 4 // NORDC-D-DAG: @_ZL2sx = addrspace(1) externally_initialized global i32 addrspace(1)* null -// RDC-D-DAG: @_ZL2sx__static__[[HASH:.*]].managed = addrspace(1) externally_initialized global i32 1, align 4 -// RDC-D-DAG: @_ZL2sx__static__[[HASH]] = addrspace(1) externally_initialized global i32 addrspace(1)* null +// RDC-D-DAG: @_ZL2sx.static.[[HASH:.*]].managed = addrspace(1) externally_initialized global i32 1, align 4 +// RDC-D-DAG: @_ZL2sx.static.[[HASH]] = addrspace(1) externally_initialized global i32 addrspace(1)* null // HOST-DAG: @_ZL2sx.managed = internal global i32 1 // HOST-DAG: @_ZL2sx = internal externally_initialized global i32* null // NORDC-DAG: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx\00" -// RDC-DAG: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx__static__[[HASH:.*]]\00" +// RDC-DAG: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx.static.[[HASH:.*]]\00" -// POSTFIX: @_ZL2sx__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 addrspace(1)* null -// POSTFIX: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx__static__[[HASH]]\00" +// POSTFIX: @_ZL2sx.static.[[HASH:.*]] = addrspace(1) externally_initialized global i32 addrspace(1)* null +// POSTFIX: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx.static.[[HASH]]\00" static __managed__ int sx = 1; // DEV-DAG: @llvm.compiler.used Index: clang/test/CodeGenCUDA/kernel-in-anon-ns.cu === --- clang/test/CodeGenCUDA/kernel-in-anon-ns.cu +++ clang/test/CodeGenCUDA/kernel-in-anon-ns.cu @@ -6,19 +6,53 @@ // RUN: -aux-triple amdgcn-amd-amdhsa
[PATCH] D124189: [CUDA][HIP] Externalize kernels with internal linkage
yaxunl marked 7 inline comments as done. yaxunl added inline comments. Comment at: clang/test/CodeGenCUDA/device-var-linkage.cu:1-2 // RUN: %clang_cc1 -no-opaque-pointers -triple nvptx -fcuda-is-device \ // RUN: -emit-llvm -o - -x hip %s \ // RUN: | FileCheck -check-prefixes=DEV,NORDC %s tra wrote: > This is odd -- the tests use `-x hip` and `-triple nvptx`. > > I think we need to change them into HIP+amdgpu and CUDA +nvptx variants ans > we now have language-dependent behavior here and are interested in the > language/triple combinations that we do use in practice. will change them to amdgcn and add CUDA variant when committing. Comment at: clang/test/CodeGenCUDA/kernel-in-anon-ns.cu:3 // RUN: -aux-triple x86_64-unknown-linux-gnu -std=c++11 -fgpu-rdc \ // RUN: -emit-llvm -o - -x hip %s > %t.dev tra wrote: > We should have CUDA test variants here, too. will add CUDA test when committing. Comment at: clang/test/CodeGenCUDA/managed-var.cu:1 // REQUIRES: x86-registered-target, amdgpu-registered-target tra wrote: > Tests above do not have REQUIRED. Is it needed here? > No. I will remove it when committing. CHANGES SINCE LAST ACTION https://reviews.llvm.org/D124189/new/ https://reviews.llvm.org/D124189 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D124189: [CUDA][HIP] Externalize kernels with internal linkage
tra accepted this revision. tra added a comment. This revision is now accepted and ready to land. LGTM overal, with few test nits. Comment at: clang/lib/AST/ASTContext.cpp:12300 + (D->hasAttr() && + GetGVALinkageForFunction(cast(D), + /*IgnoreCUDAGlobalAttr=*/true) == yaxunl wrote: > tra wrote: > > yaxunl wrote: > > > tra wrote: > > > > yaxunl wrote: > > > > > tra wrote: > > > > > > Perhaps we don't need to change the public AST API and plumb > > > > > > `IgnoreCUDAGlobalAttr` through. > > > > > > We cold create CUDA-aware static version of > > > > > > `GetGVALinkageForCudaKernel` instead, which would call > > > > > > `adjustGVALinkageForExternalDefinitionKind(..., > > > > > > adjustGVALinkageForAttributes(IgnoreCUDAGlobalAttr=true))`. > > > > > We could have a static function but it would be > > > > > GetGVALinkageForCUDAKernelWithoutGlobalAttr since we need to know the > > > > > linkage of the kernel assuming it has no `__global__` attribute. > > > > > > > > > > If you think it is OK I can make the change. > > > > No point making public what's of no use to anybody other than this > > > > particular instance. > > > > > > > > To think of it, we don't even need a function and could just do > > > > ``` > > > > if (D->hasAttr() ) { > > > > bool OriginalKernelLinkage = > > > > adjustGVALinkageForExternalDefinitionKind(..., > > > > adjustGVALinkageForAttributes(IgnoreCUDAGlobalAttr=true)); > > > > return OriginalKernelLinkage == GVA_Internal; > > > > } > > > > return (IsStaticVar &&) > > > > ``` > > > > > > > > > > > One disadvantage of this approach is that it duplicates the code in > > > GetGVALinkageForFunction. In the future, if GetGVALinkageForFunction > > > changes, the same change needs to be applied to the duplicated code, > > > which is error-prone. > > Good point. Looking at the code closer, it appears that what we're > > interested in is whether the kernel was internal and *became* externally > > visible due to it being a kernel. > > > > Right now we're checking if the function would normally be `GVA_Internal` > > (shouldn't we have considered GVA_DiscardableODR, too, BTW?) > > This is a somewhat indirect way of figuring out what we really need. > > > > The code that determines what we want is essentially this code in > > adjustGVALinkageForAttributes that we're trying to enable/disable with > > `ConsiderCudaGlobalAttr`. > > > > It can be easily extracted into a static function, which could then be used > > from both `adjustGVALinkageForAttributes`, (which would no longer need > > `ConsiderCudaGlobalAttr`) and from here. > > > > ``` > > bool isInternalKernel(ASTContext *Context, Decl *D) { > > L=basicGVALinkageForFunction(Context, D); > > return (D->hasAttr() && > > (L == GVA_DiscardableODR || L == GVA_Internal)); > > } > > ``` > > > > This would both avoid logic duplication and would better match our intent. > > > > Does it make sense? Or did I miss something else? > GVA_DiscardableODR usually maps to linkonce_odr linkage in LLVM IR. It > follows the ODR, therefore we should not make them unique. > > If we use isInternalKernel in adjustGVALinkageForAttributes, there will be > two calls of basicGVALinkageForFunction when GetGVALinkageForFunction is > called, which seems inefficient. I think we can keep GetGVALinkageForFunction > as it was, and use basicGVALinkageForFunction directly in mayExternalize. SGTM. Comment at: clang/test/CodeGenCUDA/device-var-linkage.cu:1-2 // RUN: %clang_cc1 -no-opaque-pointers -triple nvptx -fcuda-is-device \ // RUN: -emit-llvm -o - -x hip %s \ // RUN: | FileCheck -check-prefixes=DEV,NORDC %s This is odd -- the tests use `-x hip` and `-triple nvptx`. I think we need to change them into HIP+amdgpu and CUDA +nvptx variants ans we now have language-dependent behavior here and are interested in the language/triple combinations that we do use in practice. Comment at: clang/test/CodeGenCUDA/kernel-in-anon-ns.cu:3 // RUN: -aux-triple x86_64-unknown-linux-gnu -std=c++11 -fgpu-rdc \ // RUN: -emit-llvm -o - -x hip %s > %t.dev We should have CUDA test variants here, too. Comment at: clang/test/CodeGenCUDA/managed-var.cu:1 // REQUIRES: x86-registered-target, amdgpu-registered-target Tests above do not have REQUIRED. Is it needed here? CHANGES SINCE LAST ACTION https://reviews.llvm.org/D124189/new/ https://reviews.llvm.org/D124189 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D124189: [CUDA][HIP] Externalize kernels with internal linkage
yaxunl updated this revision to Diff 424364. yaxunl added a comment. use basicGVALinkageForFunction CHANGES SINCE LAST ACTION https://reviews.llvm.org/D124189/new/ https://reviews.llvm.org/D124189 Files: clang/lib/AST/ASTContext.cpp clang/lib/CodeGen/CodeGenModule.cpp clang/test/CodeGenCUDA/device-var-linkage.cu clang/test/CodeGenCUDA/kernel-in-anon-ns.cu clang/test/CodeGenCUDA/managed-var.cu clang/test/CodeGenCUDA/static-device-var-rdc.cu Index: clang/test/CodeGenCUDA/static-device-var-rdc.cu === --- clang/test/CodeGenCUDA/static-device-var-rdc.cu +++ clang/test/CodeGenCUDA/static-device-var-rdc.cu @@ -40,6 +40,11 @@ // RUN: -std=c++11 -fgpu-rdc -emit-llvm -o - -x hip %s > %t.host // RUN: cat %t.host | FileCheck -check-prefix=HOST-NEG %s +// Check postfix for CUDA. + +// RUN: %clang_cc1 -no-opaque-pointers -triple nvptx -fcuda-is-device -cuid=abc \ +// RUN: -std=c++11 -fgpu-rdc -emit-llvm -o - %s | FileCheck \ +// RUN: -check-prefixes=CUDA %s #include "Inputs/cuda.h" @@ -55,11 +60,12 @@ // INT-HOST-DAG: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x\00" // Test externalized static device variables -// EXT-DEV-DAG: @_ZL1x__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 0 -// EXT-HOST-DAG: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x__static__[[HASH:.*]]\00" +// EXT-DEV-DAG: @_ZL1x.static.[[HASH:.*]] = addrspace(1) externally_initialized global i32 0 +// EXT-HOST-DAG: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x.static.[[HASH:.*]]\00" +// CUDA-DAG: @_ZL1x__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 0 -// POSTFIX: @_ZL1x__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 0 -// POSTFIX: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x__static__[[HASH]]\00" +// POSTFIX: @_ZL1x.static.[[HASH:.*]] = addrspace(1) externally_initialized global i32 0 +// POSTFIX: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x.static.[[HASH]]\00" static __device__ int x; @@ -73,8 +79,8 @@ // INT-HOST-DAG: @[[DEVNAMEY:[0-9]+]] = {{.*}}c"_ZL1y\00" // Test externalized static device variables -// EXT-DEV-DAG: @_ZL1y__static__[[HASH]] = addrspace(4) externally_initialized global i32 0 -// EXT-HOST-DAG: @[[DEVNAMEY:[0-9]+]] = {{.*}}c"_ZL1y__static__[[HASH]]\00" +// EXT-DEV-DAG: @_ZL1y.static.[[HASH]] = addrspace(4) externally_initialized global i32 0 +// EXT-HOST-DAG: @[[DEVNAMEY:[0-9]+]] = {{.*}}c"_ZL1y.static.[[HASH]]\00" static __constant__ int y; Index: clang/test/CodeGenCUDA/managed-var.cu === --- clang/test/CodeGenCUDA/managed-var.cu +++ clang/test/CodeGenCUDA/managed-var.cu @@ -52,15 +52,15 @@ // NORDC-D-DAG: @_ZL2sx.managed = addrspace(1) externally_initialized global i32 1, align 4 // NORDC-D-DAG: @_ZL2sx = addrspace(1) externally_initialized global i32 addrspace(1)* null -// RDC-D-DAG: @_ZL2sx__static__[[HASH:.*]].managed = addrspace(1) externally_initialized global i32 1, align 4 -// RDC-D-DAG: @_ZL2sx__static__[[HASH]] = addrspace(1) externally_initialized global i32 addrspace(1)* null +// RDC-D-DAG: @_ZL2sx.static.[[HASH:.*]].managed = addrspace(1) externally_initialized global i32 1, align 4 +// RDC-D-DAG: @_ZL2sx.static.[[HASH]] = addrspace(1) externally_initialized global i32 addrspace(1)* null // HOST-DAG: @_ZL2sx.managed = internal global i32 1 // HOST-DAG: @_ZL2sx = internal externally_initialized global i32* null // NORDC-DAG: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx\00" -// RDC-DAG: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx__static__[[HASH:.*]]\00" +// RDC-DAG: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx.static.[[HASH:.*]]\00" -// POSTFIX: @_ZL2sx__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 addrspace(1)* null -// POSTFIX: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx__static__[[HASH]]\00" +// POSTFIX: @_ZL2sx.static.[[HASH:.*]] = addrspace(1) externally_initialized global i32 addrspace(1)* null +// POSTFIX: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx.static.[[HASH]]\00" static __managed__ int sx = 1; // DEV-DAG: @llvm.compiler.used Index: clang/test/CodeGenCUDA/kernel-in-anon-ns.cu === --- clang/test/CodeGenCUDA/kernel-in-anon-ns.cu +++ clang/test/CodeGenCUDA/kernel-in-anon-ns.cu @@ -8,17 +8,38 @@ // RUN: cat %t.dev %t.host | FileCheck %s +// RUN: %clang_cc1 -triple nvptx -fcuda-is-device -cuid=abc \ +// RUN: -aux-triple x86_64-unknown-linux-gnu -std=c++11 -fgpu-rdc \ +// RUN: -emit-llvm -o - %s | FileCheck -check-prefix=CUDA %s + #include "Inputs/cuda.h" -// CHECK: define weak_odr {{.*}}void @[[KERN:_ZN12_GLOBAL__N_16kernelEv\.anon\.b04fd23c98500190]]( -// CHECK: @[[STR:.*]] = {{.*}} c"[[KERN]]\00" -// CHECK: call i32 @__hipRegisterFunction({{.*}}@[[STR]] +// CHECK-DAG: define weak_odr {{.*}}void @[[KERN1:_ZN12_GLOBAL__N_16kernelEv\.intern\.b04fd23c98500190]]( +// CHECK-DAG: define weak_odr {{.*}}void
[PATCH] D124189: [CUDA][HIP] Externalize kernels with internal linkage
yaxunl added inline comments. Comment at: clang/lib/AST/ASTContext.cpp:12300 + (D->hasAttr() && + GetGVALinkageForFunction(cast(D), + /*IgnoreCUDAGlobalAttr=*/true) == tra wrote: > yaxunl wrote: > > tra wrote: > > > yaxunl wrote: > > > > tra wrote: > > > > > Perhaps we don't need to change the public AST API and plumb > > > > > `IgnoreCUDAGlobalAttr` through. > > > > > We cold create CUDA-aware static version of > > > > > `GetGVALinkageForCudaKernel` instead, which would call > > > > > `adjustGVALinkageForExternalDefinitionKind(..., > > > > > adjustGVALinkageForAttributes(IgnoreCUDAGlobalAttr=true))`. > > > > We could have a static function but it would be > > > > GetGVALinkageForCUDAKernelWithoutGlobalAttr since we need to know the > > > > linkage of the kernel assuming it has no `__global__` attribute. > > > > > > > > If you think it is OK I can make the change. > > > No point making public what's of no use to anybody other than this > > > particular instance. > > > > > > To think of it, we don't even need a function and could just do > > > ``` > > > if (D->hasAttr() ) { > > > bool OriginalKernelLinkage = > > > adjustGVALinkageForExternalDefinitionKind(..., > > > adjustGVALinkageForAttributes(IgnoreCUDAGlobalAttr=true)); > > > return OriginalKernelLinkage == GVA_Internal; > > > } > > > return (IsStaticVar &&) > > > ``` > > > > > > > > One disadvantage of this approach is that it duplicates the code in > > GetGVALinkageForFunction. In the future, if GetGVALinkageForFunction > > changes, the same change needs to be applied to the duplicated code, which > > is error-prone. > Good point. Looking at the code closer, it appears that what we're > interested in is whether the kernel was internal and *became* externally > visible due to it being a kernel. > > Right now we're checking if the function would normally be `GVA_Internal` > (shouldn't we have considered GVA_DiscardableODR, too, BTW?) > This is a somewhat indirect way of figuring out what we really need. > > The code that determines what we want is essentially this code in > adjustGVALinkageForAttributes that we're trying to enable/disable with > `ConsiderCudaGlobalAttr`. > > It can be easily extracted into a static function, which could then be used > from both `adjustGVALinkageForAttributes`, (which would no longer need > `ConsiderCudaGlobalAttr`) and from here. > > ``` > bool isInternalKernel(ASTContext *Context, Decl *D) { > L=basicGVALinkageForFunction(Context, D); > return (D->hasAttr() && > (L == GVA_DiscardableODR || L == GVA_Internal)); > } > ``` > > This would both avoid logic duplication and would better match our intent. > > Does it make sense? Or did I miss something else? GVA_DiscardableODR usually maps to linkonce_odr linkage in LLVM IR. It follows the ODR, therefore we should not make them unique. If we use isInternalKernel in adjustGVALinkageForAttributes, there will be two calls of basicGVALinkageForFunction when GetGVALinkageForFunction is called, which seems inefficient. I think we can keep GetGVALinkageForFunction as it was, and use basicGVALinkageForFunction directly in mayExternalize. CHANGES SINCE LAST ACTION https://reviews.llvm.org/D124189/new/ https://reviews.llvm.org/D124189 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D124189: [CUDA][HIP] Externalize kernels with internal linkage
tra added inline comments. Comment at: clang/lib/AST/ASTContext.cpp:12300 + (D->hasAttr() && + GetGVALinkageForFunction(cast(D), + /*IgnoreCUDAGlobalAttr=*/true) == yaxunl wrote: > tra wrote: > > yaxunl wrote: > > > tra wrote: > > > > Perhaps we don't need to change the public AST API and plumb > > > > `IgnoreCUDAGlobalAttr` through. > > > > We cold create CUDA-aware static version of > > > > `GetGVALinkageForCudaKernel` instead, which would call > > > > `adjustGVALinkageForExternalDefinitionKind(..., > > > > adjustGVALinkageForAttributes(IgnoreCUDAGlobalAttr=true))`. > > > We could have a static function but it would be > > > GetGVALinkageForCUDAKernelWithoutGlobalAttr since we need to know the > > > linkage of the kernel assuming it has no `__global__` attribute. > > > > > > If you think it is OK I can make the change. > > No point making public what's of no use to anybody other than this > > particular instance. > > > > To think of it, we don't even need a function and could just do > > ``` > > if (D->hasAttr() ) { > > bool OriginalKernelLinkage = > > adjustGVALinkageForExternalDefinitionKind(..., > > adjustGVALinkageForAttributes(IgnoreCUDAGlobalAttr=true)); > > return OriginalKernelLinkage == GVA_Internal; > > } > > return (IsStaticVar &&) > > ``` > > > > > One disadvantage of this approach is that it duplicates the code in > GetGVALinkageForFunction. In the future, if GetGVALinkageForFunction changes, > the same change needs to be applied to the duplicated code, which is > error-prone. Good point. Looking at the code closer, it appears that what we're interested in is whether the kernel was internal and *became* externally visible due to it being a kernel. Right now we're checking if the function would normally be `GVA_Internal` (shouldn't we have considered GVA_DiscardableODR, too, BTW?) This is a somewhat indirect way of figuring out what we really need. The code that determines what we want is essentially this code in adjustGVALinkageForAttributes that we're trying to enable/disable with `ConsiderCudaGlobalAttr`. It can be easily extracted into a static function, which could then be used from both `adjustGVALinkageForAttributes`, (which would no longer need `ConsiderCudaGlobalAttr`) and from here. ``` bool isInternalKernel(ASTContext *Context, Decl *D) { L=basicGVALinkageForFunction(Context, D); return (D->hasAttr() && (L == GVA_DiscardableODR || L == GVA_Internal)); } ``` This would both avoid logic duplication and would better match our intent. Does it make sense? Or did I miss something else? CHANGES SINCE LAST ACTION https://reviews.llvm.org/D124189/new/ https://reviews.llvm.org/D124189 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D124189: [CUDA][HIP] Externalize kernels with internal linkage
yaxunl added inline comments. Comment at: clang/lib/AST/ASTContext.cpp:12300 + (D->hasAttr() && + GetGVALinkageForFunction(cast(D), + /*IgnoreCUDAGlobalAttr=*/true) == tra wrote: > yaxunl wrote: > > tra wrote: > > > Perhaps we don't need to change the public AST API and plumb > > > `IgnoreCUDAGlobalAttr` through. > > > We cold create CUDA-aware static version of `GetGVALinkageForCudaKernel` > > > instead, which would call `adjustGVALinkageForExternalDefinitionKind(..., > > > adjustGVALinkageForAttributes(IgnoreCUDAGlobalAttr=true))`. > > We could have a static function but it would be > > GetGVALinkageForCUDAKernelWithoutGlobalAttr since we need to know the > > linkage of the kernel assuming it has no `__global__` attribute. > > > > If you think it is OK I can make the change. > No point making public what's of no use to anybody other than this particular > instance. > > To think of it, we don't even need a function and could just do > ``` > if (D->hasAttr() ) { > bool OriginalKernelLinkage = > adjustGVALinkageForExternalDefinitionKind(..., > adjustGVALinkageForAttributes(IgnoreCUDAGlobalAttr=true)); > return OriginalKernelLinkage == GVA_Internal; > } > return (IsStaticVar &&) > ``` > > One disadvantage of this approach is that it duplicates the code in GetGVALinkageForFunction. In the future, if GetGVALinkageForFunction changes, the same change needs to be applied to the duplicated code, which is error-prone. CHANGES SINCE LAST ACTION https://reviews.llvm.org/D124189/new/ https://reviews.llvm.org/D124189 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D124189: [CUDA][HIP] Externalize kernels with internal linkage
tra added inline comments. Comment at: clang/lib/AST/ASTContext.cpp:12300 + (D->hasAttr() && + GetGVALinkageForFunction(cast(D), + /*IgnoreCUDAGlobalAttr=*/true) == yaxunl wrote: > tra wrote: > > Perhaps we don't need to change the public AST API and plumb > > `IgnoreCUDAGlobalAttr` through. > > We cold create CUDA-aware static version of `GetGVALinkageForCudaKernel` > > instead, which would call `adjustGVALinkageForExternalDefinitionKind(..., > > adjustGVALinkageForAttributes(IgnoreCUDAGlobalAttr=true))`. > We could have a static function but it would be > GetGVALinkageForCUDAKernelWithoutGlobalAttr since we need to know the linkage > of the kernel assuming it has no `__global__` attribute. > > If you think it is OK I can make the change. No point making public what's of no use to anybody other than this particular instance. To think of it, we don't even need a function and could just do ``` if (D->hasAttr() ) { bool OriginalKernelLinkage = adjustGVALinkageForExternalDefinitionKind(..., adjustGVALinkageForAttributes(IgnoreCUDAGlobalAttr=true)); return OriginalKernelLinkage == GVA_Internal; } return (IsStaticVar &&) ``` CHANGES SINCE LAST ACTION https://reviews.llvm.org/D124189/new/ https://reviews.llvm.org/D124189 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D124189: [CUDA][HIP] Externalize kernels with internal linkage
yaxunl updated this revision to Diff 424280. yaxunl marked an inline comment as done. yaxunl added a comment. use static function CHANGES SINCE LAST ACTION https://reviews.llvm.org/D124189/new/ https://reviews.llvm.org/D124189 Files: clang/lib/AST/ASTContext.cpp clang/lib/CodeGen/CodeGenModule.cpp clang/test/CodeGenCUDA/device-var-linkage.cu clang/test/CodeGenCUDA/kernel-in-anon-ns.cu clang/test/CodeGenCUDA/managed-var.cu clang/test/CodeGenCUDA/static-device-var-rdc.cu Index: clang/test/CodeGenCUDA/static-device-var-rdc.cu === --- clang/test/CodeGenCUDA/static-device-var-rdc.cu +++ clang/test/CodeGenCUDA/static-device-var-rdc.cu @@ -40,6 +40,11 @@ // RUN: -std=c++11 -fgpu-rdc -emit-llvm -o - -x hip %s > %t.host // RUN: cat %t.host | FileCheck -check-prefix=HOST-NEG %s +// Check postfix for CUDA. + +// RUN: %clang_cc1 -no-opaque-pointers -triple nvptx -fcuda-is-device -cuid=abc \ +// RUN: -std=c++11 -fgpu-rdc -emit-llvm -o - %s | FileCheck \ +// RUN: -check-prefixes=CUDA %s #include "Inputs/cuda.h" @@ -55,11 +60,12 @@ // INT-HOST-DAG: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x\00" // Test externalized static device variables -// EXT-DEV-DAG: @_ZL1x__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 0 -// EXT-HOST-DAG: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x__static__[[HASH:.*]]\00" +// EXT-DEV-DAG: @_ZL1x.static.[[HASH:.*]] = addrspace(1) externally_initialized global i32 0 +// EXT-HOST-DAG: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x.static.[[HASH:.*]]\00" +// CUDA-DAG: @_ZL1x__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 0 -// POSTFIX: @_ZL1x__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 0 -// POSTFIX: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x__static__[[HASH]]\00" +// POSTFIX: @_ZL1x.static.[[HASH:.*]] = addrspace(1) externally_initialized global i32 0 +// POSTFIX: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x.static.[[HASH]]\00" static __device__ int x; @@ -73,8 +79,8 @@ // INT-HOST-DAG: @[[DEVNAMEY:[0-9]+]] = {{.*}}c"_ZL1y\00" // Test externalized static device variables -// EXT-DEV-DAG: @_ZL1y__static__[[HASH]] = addrspace(4) externally_initialized global i32 0 -// EXT-HOST-DAG: @[[DEVNAMEY:[0-9]+]] = {{.*}}c"_ZL1y__static__[[HASH]]\00" +// EXT-DEV-DAG: @_ZL1y.static.[[HASH]] = addrspace(4) externally_initialized global i32 0 +// EXT-HOST-DAG: @[[DEVNAMEY:[0-9]+]] = {{.*}}c"_ZL1y.static.[[HASH]]\00" static __constant__ int y; Index: clang/test/CodeGenCUDA/managed-var.cu === --- clang/test/CodeGenCUDA/managed-var.cu +++ clang/test/CodeGenCUDA/managed-var.cu @@ -52,15 +52,15 @@ // NORDC-D-DAG: @_ZL2sx.managed = addrspace(1) externally_initialized global i32 1, align 4 // NORDC-D-DAG: @_ZL2sx = addrspace(1) externally_initialized global i32 addrspace(1)* null -// RDC-D-DAG: @_ZL2sx__static__[[HASH:.*]].managed = addrspace(1) externally_initialized global i32 1, align 4 -// RDC-D-DAG: @_ZL2sx__static__[[HASH]] = addrspace(1) externally_initialized global i32 addrspace(1)* null +// RDC-D-DAG: @_ZL2sx.static.[[HASH:.*]].managed = addrspace(1) externally_initialized global i32 1, align 4 +// RDC-D-DAG: @_ZL2sx.static.[[HASH]] = addrspace(1) externally_initialized global i32 addrspace(1)* null // HOST-DAG: @_ZL2sx.managed = internal global i32 1 // HOST-DAG: @_ZL2sx = internal externally_initialized global i32* null // NORDC-DAG: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx\00" -// RDC-DAG: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx__static__[[HASH:.*]]\00" +// RDC-DAG: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx.static.[[HASH:.*]]\00" -// POSTFIX: @_ZL2sx__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 addrspace(1)* null -// POSTFIX: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx__static__[[HASH]]\00" +// POSTFIX: @_ZL2sx.static.[[HASH:.*]] = addrspace(1) externally_initialized global i32 addrspace(1)* null +// POSTFIX: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx.static.[[HASH]]\00" static __managed__ int sx = 1; // DEV-DAG: @llvm.compiler.used Index: clang/test/CodeGenCUDA/kernel-in-anon-ns.cu === --- clang/test/CodeGenCUDA/kernel-in-anon-ns.cu +++ clang/test/CodeGenCUDA/kernel-in-anon-ns.cu @@ -8,17 +8,38 @@ // RUN: cat %t.dev %t.host | FileCheck %s +// RUN: %clang_cc1 -triple nvptx -fcuda-is-device -cuid=abc \ +// RUN: -aux-triple x86_64-unknown-linux-gnu -std=c++11 -fgpu-rdc \ +// RUN: -emit-llvm -o - %s | FileCheck -check-prefix=CUDA %s + #include "Inputs/cuda.h" -// CHECK: define weak_odr {{.*}}void @[[KERN:_ZN12_GLOBAL__N_16kernelEv\.anon\.b04fd23c98500190]]( -// CHECK: @[[STR:.*]] = {{.*}} c"[[KERN]]\00" -// CHECK: call i32 @__hipRegisterFunction({{.*}}@[[STR]] +// CHECK-DAG: define weak_odr {{.*}}void @[[KERN1:_ZN12_GLOBAL__N_16kernelEv\.intern\.b04fd23c98500190]]( +// CHECK-DAG: define weak_odr
[PATCH] D124189: [CUDA][HIP] Externalize kernels with internal linkage
yaxunl marked an inline comment as done. yaxunl added inline comments. Comment at: clang/lib/AST/ASTContext.cpp:11322 + } else if (Context.getLangOpts().CUDA && Context.getLangOpts().CUDAIsDevice && + !IgnoreCUDAGlobalAttr) { // Device-side functions with __global__ attribute must always be tra wrote: > Nit: I'd phrase it as a positive assertion `ConsiderCudaGlobalAttr` and > default it to true. > > `DontDoX` always trips me and gets me to question it -- "what *are* we doing > then? what else is there besides X?". > With a `DoX` things are usually simpler and limited to `X` -- we're either > doing X or not. > will do Comment at: clang/lib/AST/ASTContext.cpp:12300 + (D->hasAttr() && + GetGVALinkageForFunction(cast(D), + /*IgnoreCUDAGlobalAttr=*/true) == tra wrote: > Perhaps we don't need to change the public AST API and plumb > `IgnoreCUDAGlobalAttr` through. > We cold create CUDA-aware static version of `GetGVALinkageForCudaKernel` > instead, which would call `adjustGVALinkageForExternalDefinitionKind(..., > adjustGVALinkageForAttributes(IgnoreCUDAGlobalAttr=true))`. We could have a static function but it would be GetGVALinkageForCUDAKernelWithoutGlobalAttr since we need to know the linkage of the kernel assuming it has no `__global__` attribute. If you think it is OK I can make the change. CHANGES SINCE LAST ACTION https://reviews.llvm.org/D124189/new/ https://reviews.llvm.org/D124189 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D124189: [CUDA][HIP] Externalize kernels with internal linkage
tra added inline comments. Comment at: clang/lib/AST/ASTContext.cpp:11322 + } else if (Context.getLangOpts().CUDA && Context.getLangOpts().CUDAIsDevice && + !IgnoreCUDAGlobalAttr) { // Device-side functions with __global__ attribute must always be Nit: I'd phrase it as a positive assertion `ConsiderCudaGlobalAttr` and default it to true. `DontDoX` always trips me and gets me to question it -- "what *are* we doing then? what else is there besides X?". With a `DoX` things are usually simpler and limited to `X` -- we're either doing X or not. Comment at: clang/lib/AST/ASTContext.cpp:12300 + (D->hasAttr() && + GetGVALinkageForFunction(cast(D), + /*IgnoreCUDAGlobalAttr=*/true) == Perhaps we don't need to change the public AST API and plumb `IgnoreCUDAGlobalAttr` through. We cold create CUDA-aware static version of `GetGVALinkageForCudaKernel` instead, which would call `adjustGVALinkageForExternalDefinitionKind(..., adjustGVALinkageForAttributes(IgnoreCUDAGlobalAttr=true))`. CHANGES SINCE LAST ACTION https://reviews.llvm.org/D124189/new/ https://reviews.llvm.org/D124189 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D124189: [CUDA][HIP] Externalize kernels with internal linkage
yaxunl created this revision. yaxunl added a reviewer: tra. Herald added a subscriber: mattd. Herald added a project: All. yaxunl requested review of this revision. This patch is a continuation of https://reviews.llvm.org/D123353. Not only kernels in anonymous namespace, but also template kernels with template arguments in anonymous namespace need to be externalized. To be more generic, this patch checks the linkage of a kernel assuming the kernel does not have `__global__` attribute. If the linkage is internal then clang will externalize it. https://reviews.llvm.org/D124189 Files: clang/include/clang/AST/ASTContext.h clang/lib/AST/ASTContext.cpp clang/lib/CodeGen/CodeGenModule.cpp clang/test/CodeGenCUDA/device-var-linkage.cu clang/test/CodeGenCUDA/kernel-in-anon-ns.cu clang/test/CodeGenCUDA/managed-var.cu clang/test/CodeGenCUDA/static-device-var-rdc.cu Index: clang/test/CodeGenCUDA/static-device-var-rdc.cu === --- clang/test/CodeGenCUDA/static-device-var-rdc.cu +++ clang/test/CodeGenCUDA/static-device-var-rdc.cu @@ -40,6 +40,11 @@ // RUN: -std=c++11 -fgpu-rdc -emit-llvm -o - -x hip %s > %t.host // RUN: cat %t.host | FileCheck -check-prefix=HOST-NEG %s +// Check postfix for CUDA. + +// RUN: %clang_cc1 -no-opaque-pointers -triple nvptx -fcuda-is-device -cuid=abc \ +// RUN: -std=c++11 -fgpu-rdc -emit-llvm -o - %s | FileCheck \ +// RUN: -check-prefixes=CUDA %s #include "Inputs/cuda.h" @@ -55,11 +60,12 @@ // INT-HOST-DAG: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x\00" // Test externalized static device variables -// EXT-DEV-DAG: @_ZL1x__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 0 -// EXT-HOST-DAG: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x__static__[[HASH:.*]]\00" +// EXT-DEV-DAG: @_ZL1x.static.[[HASH:.*]] = addrspace(1) externally_initialized global i32 0 +// EXT-HOST-DAG: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x.static.[[HASH:.*]]\00" +// CUDA-DAG: @_ZL1x__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 0 -// POSTFIX: @_ZL1x__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 0 -// POSTFIX: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x__static__[[HASH]]\00" +// POSTFIX: @_ZL1x.static.[[HASH:.*]] = addrspace(1) externally_initialized global i32 0 +// POSTFIX: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x.static.[[HASH]]\00" static __device__ int x; @@ -73,8 +79,8 @@ // INT-HOST-DAG: @[[DEVNAMEY:[0-9]+]] = {{.*}}c"_ZL1y\00" // Test externalized static device variables -// EXT-DEV-DAG: @_ZL1y__static__[[HASH]] = addrspace(4) externally_initialized global i32 0 -// EXT-HOST-DAG: @[[DEVNAMEY:[0-9]+]] = {{.*}}c"_ZL1y__static__[[HASH]]\00" +// EXT-DEV-DAG: @_ZL1y.static.[[HASH]] = addrspace(4) externally_initialized global i32 0 +// EXT-HOST-DAG: @[[DEVNAMEY:[0-9]+]] = {{.*}}c"_ZL1y.static.[[HASH]]\00" static __constant__ int y; Index: clang/test/CodeGenCUDA/managed-var.cu === --- clang/test/CodeGenCUDA/managed-var.cu +++ clang/test/CodeGenCUDA/managed-var.cu @@ -52,15 +52,15 @@ // NORDC-D-DAG: @_ZL2sx.managed = addrspace(1) externally_initialized global i32 1, align 4 // NORDC-D-DAG: @_ZL2sx = addrspace(1) externally_initialized global i32 addrspace(1)* null -// RDC-D-DAG: @_ZL2sx__static__[[HASH:.*]].managed = addrspace(1) externally_initialized global i32 1, align 4 -// RDC-D-DAG: @_ZL2sx__static__[[HASH]] = addrspace(1) externally_initialized global i32 addrspace(1)* null +// RDC-D-DAG: @_ZL2sx.static.[[HASH:.*]].managed = addrspace(1) externally_initialized global i32 1, align 4 +// RDC-D-DAG: @_ZL2sx.static.[[HASH]] = addrspace(1) externally_initialized global i32 addrspace(1)* null // HOST-DAG: @_ZL2sx.managed = internal global i32 1 // HOST-DAG: @_ZL2sx = internal externally_initialized global i32* null // NORDC-DAG: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx\00" -// RDC-DAG: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx__static__[[HASH:.*]]\00" +// RDC-DAG: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx.static.[[HASH:.*]]\00" -// POSTFIX: @_ZL2sx__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 addrspace(1)* null -// POSTFIX: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx__static__[[HASH]]\00" +// POSTFIX: @_ZL2sx.static.[[HASH:.*]] = addrspace(1) externally_initialized global i32 addrspace(1)* null +// POSTFIX: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx.static.[[HASH]]\00" static __managed__ int sx = 1; // DEV-DAG: @llvm.compiler.used Index: clang/test/CodeGenCUDA/kernel-in-anon-ns.cu === --- clang/test/CodeGenCUDA/kernel-in-anon-ns.cu +++ clang/test/CodeGenCUDA/kernel-in-anon-ns.cu @@ -8,17 +8,38 @@ // RUN: cat %t.dev %t.host | FileCheck %s +// RUN: %clang_cc1 -triple nvptx -fcuda-is-device -cuid=abc \ +// RUN: -aux-triple x86_64-unknown-linux-gnu -std=c++11 -fgpu-rdc \ +// RUN: -emit-llvm -o - %s | FileCheck