MaskRay created this revision.
MaskRay added reviewers: jlebar, tra.
Herald added subscribers: cfe-commits, yaxunl.
Herald added a project: clang.
MaskRay requested review of this revision.

Host-side shadow variables of external declarations of device-side global
variables have internal linkage and are referenced by `__cuda_register_globals`.

If such a variable has a comdat group (e.g. a C++17 inline variable), we need to
suppress the comdat group when the linkage is changed to internal (which
represents a local symbol), because:

- the copy in this translation unit can be discarded (a copy from another 
translation unit is picked)
- accessing a discarded local symbol from outside the section group is 
disallowed by the ELF specification:

> A symbol table entry with STB_LOCAL binding that is defined relative to one 
> of a group's sections, and that is contained in a symbol table section that 
> is not part of the group, must be discarded if the group members are 
> discarded. References to this symbol table entry from outside the group are 
> not allowed.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D88786

Files:
  clang/lib/CodeGen/CodeGenModule.cpp
  clang/test/CodeGenCUDA/device-stub.cu


Index: clang/test/CodeGenCUDA/device-stub.cu
===================================================================
--- clang/test/CodeGenCUDA/device-stub.cu
+++ clang/test/CodeGenCUDA/device-stub.cu
@@ -29,6 +29,10 @@
 // RUN:     -target-sdk-version=9.2 -fgpu-rdc -fcuda-include-gpubinary %t -o - 
\
 // RUN:   | FileCheck %s -allow-deprecated-dag-overlap \
 // RUN:       --check-prefixes=ALL,LNX,RDC,CUDA,CUDARDC,CUDA_NEW
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -std=c++17 \
+// RUN:     -target-sdk-version=9.2 -fgpu-rdc -fcuda-include-gpubinary %t -o - 
\
+// RUN:   | FileCheck %s -allow-deprecated-dag-overlap \
+// RUN:       --check-prefixes=ALL,LNX,RDC,CUDA,CUDARDC,CUDA_NEW,LNX_17
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
 // RUN:     -target-sdk-version=9.2 -o - \
 // RUN:   | FileCheck -allow-deprecated-dag-overlap %s -check-prefix=NOGPUBIN
@@ -91,9 +95,17 @@
 // WIN-DAG: @"?ext_constant_var_def@@3HA" = internal global i32 undef
 __constant__ int ext_constant_var_def = 2;
 
+#if __cplusplus > 201402L
+/// The local symbol _ZN1C10inline_varE cannot have a comdat, because it is
+/// referenced from a section (__cuda_module_ctor's section) outside the 
section group.
+// LNX_17: @_ZN1C10inline_varE = internal constant i32 undef, align 4{{$}}
+struct C {
+  __device__ static constexpr int inline_var = 17;
+};
+#endif
 
 void use_pointers() {
-  int *p;
+  const int *p;
   p = &device_var;
   p = &constant_var;
   p = &shared_var;
@@ -101,6 +113,9 @@
   p = &ext_device_var;
   p = &ext_constant_var;
   p = &ext_host_var;
+#if __cplusplus > 201402L
+  p = &C::inline_var;
+#endif
 }
 
 // Make sure that all parts of GPU code init/cleanup are there:
@@ -185,6 +200,7 @@
 // ALL-DAG: call void {{.*}}[[PREFIX]]RegisterVar(i8** %0, 
{{.*}}constant_var{{[^,]*}}, {{[^@]*}}@2, {{.*}}i32 0, {{i32|i64}} 4, i32 1, 
i32 0
 // ALL-DAG: call void {{.*}}[[PREFIX]]RegisterVar(i8** %0, 
{{.*}}ext_device_var_def{{[^,]*}}, {{[^@]*}}@3, {{.*}}i32 0, {{i32|i64}} 4, i32 
0, i32 0
 // ALL-DAG: call void {{.*}}[[PREFIX]]RegisterVar(i8** %0, 
{{.*}}ext_constant_var_def{{[^,]*}}, {{[^@]*}}@4, {{.*}}i32 0, {{i32|i64}} 4, 
i32 1, i32 0
+// LNX_17:  call void {{.*}}[[PREFIX]]RegisterVar(i8** %0, 
{{.*}}@_ZN1C10inline_varE{{[^,]*}}, {{[^@]*}}@5, {{.*}}i32 0, {{i32|i64}} 4, 
i32 0, i32 0
 // ALL: ret void
 
 // Test that we've built a constructor.
Index: clang/lib/CodeGen/CodeGenModule.cpp
===================================================================
--- clang/lib/CodeGen/CodeGenModule.cpp
+++ clang/lib/CodeGen/CodeGenModule.cpp
@@ -4114,6 +4114,7 @@
   // Is accessible from all the threads within the grid and from the host
   // through the runtime library (cudaGetSymbolAddress() / cudaGetSymbolSize()
   // / cudaMemcpyToSymbol() / cudaMemcpyFromSymbol())."
+  bool CudaModuleCtorReferenced = false;
   if (GV && LangOpts.CUDA) {
     if (LangOpts.CUDAIsDevice) {
       if (Linkage != llvm::GlobalValue::InternalLinkage &&
@@ -4128,10 +4129,16 @@
         Linkage = llvm::GlobalValue::InternalLinkage;
         // Shadow variables and their properties must be registered with CUDA
         // runtime. Skip Extern global variables, which will be registered in
-        // the TU where they are defined.
-        if (!D->hasExternalStorage())
+        // the TU where they are defined. The variable cannot be placed in a
+        // comdat, because the copy in this translation unit can be discarded
+        // and referencing a discarded local symbol from outside the comdat
+        // (__cuda_module_ctor is in a different section) is disallowed by the
+        // ELF spec.
+        if (!D->hasExternalStorage()) {
           getCUDARuntime().registerDeviceVar(D, *GV, !D->hasDefinition(),
                                              D->hasAttr<CUDAConstantAttr>());
+          CudaModuleCtorReferenced = true;
+        }
       } else if (D->hasAttr<CUDASharedAttr>()) {
         // __shared__ variables are odd. Shadows do get created, but
         // they are not registered with the CUDA runtime, so they
@@ -4235,7 +4242,8 @@
     setTLSMode(GV, *D);
   }
 
-  maybeSetTrivialComdat(*D, *GV);
+  if (!CudaModuleCtorReferenced)
+    maybeSetTrivialComdat(*D, *GV);
 
   // Emit the initializer function if necessary.
   if (NeedsGlobalCtor || NeedsGlobalDtor)


Index: clang/test/CodeGenCUDA/device-stub.cu
===================================================================
--- clang/test/CodeGenCUDA/device-stub.cu
+++ clang/test/CodeGenCUDA/device-stub.cu
@@ -29,6 +29,10 @@
 // RUN:     -target-sdk-version=9.2 -fgpu-rdc -fcuda-include-gpubinary %t -o - \
 // RUN:   | FileCheck %s -allow-deprecated-dag-overlap \
 // RUN:       --check-prefixes=ALL,LNX,RDC,CUDA,CUDARDC,CUDA_NEW
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -std=c++17 \
+// RUN:     -target-sdk-version=9.2 -fgpu-rdc -fcuda-include-gpubinary %t -o - \
+// RUN:   | FileCheck %s -allow-deprecated-dag-overlap \
+// RUN:       --check-prefixes=ALL,LNX,RDC,CUDA,CUDARDC,CUDA_NEW,LNX_17
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
 // RUN:     -target-sdk-version=9.2 -o - \
 // RUN:   | FileCheck -allow-deprecated-dag-overlap %s -check-prefix=NOGPUBIN
@@ -91,9 +95,17 @@
 // WIN-DAG: @"?ext_constant_var_def@@3HA" = internal global i32 undef
 __constant__ int ext_constant_var_def = 2;
 
+#if __cplusplus > 201402L
+/// The local symbol _ZN1C10inline_varE cannot have a comdat, because it is
+/// referenced from a section (__cuda_module_ctor's section) outside the section group.
+// LNX_17: @_ZN1C10inline_varE = internal constant i32 undef, align 4{{$}}
+struct C {
+  __device__ static constexpr int inline_var = 17;
+};
+#endif
 
 void use_pointers() {
-  int *p;
+  const int *p;
   p = &device_var;
   p = &constant_var;
   p = &shared_var;
@@ -101,6 +113,9 @@
   p = &ext_device_var;
   p = &ext_constant_var;
   p = &ext_host_var;
+#if __cplusplus > 201402L
+  p = &C::inline_var;
+#endif
 }
 
 // Make sure that all parts of GPU code init/cleanup are there:
@@ -185,6 +200,7 @@
 // ALL-DAG: call void {{.*}}[[PREFIX]]RegisterVar(i8** %0, {{.*}}constant_var{{[^,]*}}, {{[^@]*}}@2, {{.*}}i32 0, {{i32|i64}} 4, i32 1, i32 0
 // ALL-DAG: call void {{.*}}[[PREFIX]]RegisterVar(i8** %0, {{.*}}ext_device_var_def{{[^,]*}}, {{[^@]*}}@3, {{.*}}i32 0, {{i32|i64}} 4, i32 0, i32 0
 // ALL-DAG: call void {{.*}}[[PREFIX]]RegisterVar(i8** %0, {{.*}}ext_constant_var_def{{[^,]*}}, {{[^@]*}}@4, {{.*}}i32 0, {{i32|i64}} 4, i32 1, i32 0
+// LNX_17:  call void {{.*}}[[PREFIX]]RegisterVar(i8** %0, {{.*}}@_ZN1C10inline_varE{{[^,]*}}, {{[^@]*}}@5, {{.*}}i32 0, {{i32|i64}} 4, i32 0, i32 0
 // ALL: ret void
 
 // Test that we've built a constructor.
Index: clang/lib/CodeGen/CodeGenModule.cpp
===================================================================
--- clang/lib/CodeGen/CodeGenModule.cpp
+++ clang/lib/CodeGen/CodeGenModule.cpp
@@ -4114,6 +4114,7 @@
   // Is accessible from all the threads within the grid and from the host
   // through the runtime library (cudaGetSymbolAddress() / cudaGetSymbolSize()
   // / cudaMemcpyToSymbol() / cudaMemcpyFromSymbol())."
+  bool CudaModuleCtorReferenced = false;
   if (GV && LangOpts.CUDA) {
     if (LangOpts.CUDAIsDevice) {
       if (Linkage != llvm::GlobalValue::InternalLinkage &&
@@ -4128,10 +4129,16 @@
         Linkage = llvm::GlobalValue::InternalLinkage;
         // Shadow variables and their properties must be registered with CUDA
         // runtime. Skip Extern global variables, which will be registered in
-        // the TU where they are defined.
-        if (!D->hasExternalStorage())
+        // the TU where they are defined. The variable cannot be placed in a
+        // comdat, because the copy in this translation unit can be discarded
+        // and referencing a discarded local symbol from outside the comdat
+        // (__cuda_module_ctor is in a different section) is disallowed by the
+        // ELF spec.
+        if (!D->hasExternalStorage()) {
           getCUDARuntime().registerDeviceVar(D, *GV, !D->hasDefinition(),
                                              D->hasAttr<CUDAConstantAttr>());
+          CudaModuleCtorReferenced = true;
+        }
       } else if (D->hasAttr<CUDASharedAttr>()) {
         // __shared__ variables are odd. Shadows do get created, but
         // they are not registered with the CUDA runtime, so they
@@ -4235,7 +4242,8 @@
     setTLSMode(GV, *D);
   }
 
-  maybeSetTrivialComdat(*D, *GV);
+  if (!CudaModuleCtorReferenced)
+    maybeSetTrivialComdat(*D, *GV);
 
   // Emit the initializer function if necessary.
   if (NeedsGlobalCtor || NeedsGlobalDtor)
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
  • [PATCH] D88786: [CUDA] Suppre... Fangrui Song via Phabricator via cfe-commits

Reply via email to