yaxunl created this revision.
yaxunl added a reviewer: tra.
yaxunl requested review of this revision.

For -fgpu-rdc, shadow variables should not be internalized, otherwise
they cannot be accessed by other TUs. This is necessary because
the shadow variable of external device variables are always
emitted as undefined symbols, which need to resolve to a global
symbols.

Managed variables need to be emitted as undefined symbols
in device compilations.


https://reviews.llvm.org/D95901

Files:
  clang/lib/AST/ASTContext.cpp
  clang/lib/CodeGen/CGCUDANV.cpp
  clang/lib/CodeGen/CodeGenModule.cpp
  clang/test/CodeGenCUDA/device-stub.cu
  clang/test/CodeGenCUDA/device-var-linkage.cu
  clang/test/CodeGenCUDA/managed-var.cu

Index: clang/test/CodeGenCUDA/managed-var.cu
===================================================================
--- clang/test/CodeGenCUDA/managed-var.cu
+++ clang/test/CodeGenCUDA/managed-var.cu
@@ -10,17 +10,19 @@
 
 // RUN: %clang_cc1 -triple x86_64-gnu-linux -std=c++11 \
 // RUN:   -emit-llvm -o - -x hip %s | FileCheck \
-// RUN:   -check-prefixes=HOST %s
+// RUN:   -check-prefixes=HOST,NORDC %s
 
 // RUN: %clang_cc1 -triple x86_64-gnu-linux -std=c++11 \
 // RUN:   -emit-llvm -fgpu-rdc -o - -x hip %s | FileCheck \
-// RUN:   -check-prefixes=HOST %s
+// RUN:   -check-prefixes=HOST,RDC %s
 
 #include "Inputs/cuda.h"
 
-// DEV-DAG: @x = {{.*}}addrspace(1) externally_initialized global i32 undef
-// HOST-DAG: @x = internal global i32 1
-// HOST-DAG: @x.managed = internal global i32* null
+// DEV-DAG: @x = external addrspace(1) externally_initialized global i32
+// NORDC-DAG: @x = internal global i32 1
+// RDC-DAG: @x = dso_local global i32 1
+// NORDC-DAG: @x.managed = internal global i32* null
+// RDC-DAG: @x.managed = dso_local global i32* null
 // HOST-DAG: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"x\00"
 
 struct vec {
@@ -31,11 +33,28 @@
 __managed__ vec v[100];
 __managed__ vec v2[100] = {{1, 1, 1}};
 
+// DEV-DAG: @ex = external addrspace(1) global i32
+// HOST-DAG: @ex = external global i32
+extern __managed__ int ex;
+
+// DEV-DAG: @_ZL2sx = external addrspace(1) externally_initialized global i32
+// HOST-DAG: @_ZL2sx = internal global i32 1
+// HOST-DAG: @_ZL2sx.managed = internal global i32* null
+static __managed__ int sx = 1;
+
+// HOST-NOT: @ex.managed
+
+// Force ex and sx mitted in device compilation.
 __global__ void foo(int *z) {
-  *z = x;
+  *z = x + ex + sx;
   v[1].x = 2;
 }
 
+// Force ex and sx emitted in host compilatioin.
+int foo2() {
+  return ex + sx;
+}
+
 // HOST-LABEL: define {{.*}}@_Z4loadv()
 // HOST:  %ld.managed = load i32*, i32** @x.managed, align 4
 // HOST:  %0 = load i32, i32* %ld.managed, align 4
@@ -97,4 +116,6 @@
 }
 
 // HOST-DAG: __hipRegisterManagedVar({{.*}}@x.managed {{.*}}@x {{.*}}@[[DEVNAMEX]]{{.*}}, i64 4, i32 4)
+// HOST-DAG: __hipRegisterManagedVar({{.*}}@_ZL2sx.managed {{.*}}@_ZL2sx
+// HOST-NOT: __hipRegisterManagedVar({{.*}}@ex.managed {{.*}}@ex
 // HOST-DAG: declare void @__hipRegisterManagedVar(i8**, i8*, i8*, i8*, i64, i32)
Index: clang/test/CodeGenCUDA/device-var-linkage.cu
===================================================================
--- /dev/null
+++ clang/test/CodeGenCUDA/device-var-linkage.cu
@@ -0,0 +1,65 @@
+// RUN: %clang_cc1 -triple nvptx -fcuda-is-device \
+// RUN:   -emit-llvm -o - -x hip %s \
+// RUN:   | FileCheck -check-prefixes=DEV,NORDC %s
+// RUN: %clang_cc1 -triple nvptx -fcuda-is-device \
+// RUN:   -fgpu-rdc -emit-llvm -o - -x hip %s \
+// RUN:   | FileCheck -check-prefixes=DEV,RDC %s
+// RUN: %clang_cc1 -triple nvptx \
+// RUN:   -emit-llvm -o - -x hip %s \
+// RUN:   | FileCheck -check-prefixes=HOST,NORDC-H %s
+// RUN: %clang_cc1 -triple nvptx \
+// RUN:   -fgpu-rdc -emit-llvm -o - -x hip %s \
+// RUN:   | FileCheck -check-prefixes=HOST,RDC-H %s
+
+#include "Inputs/cuda.h"
+
+// DEV-DAG: @v1 = dso_local addrspace(1) externally_initialized global i32 0
+// NORDC-H-DAG: @v1 = internal global i32 undef
+// RDC-H-DAG: @v1 = dso_local global i32 undef
+__device__ int v1;
+// DEV-DAG: @v2 = dso_local addrspace(4) externally_initialized global i32 0
+// NORDC-H-DAG: @v2 = internal global i32 undef
+// RDC-H-DAG: @v2 = dso_local global i32 undef
+__constant__ int v2;
+// DEV-DAG: @v3 = external addrspace(1) externally_initialized global i32
+// NORDC-H-DAG: @v3 = internal global i32 0
+// RDC-H-DAG: @v3 = dso_local global i32 0
+__managed__ int v3;
+
+// DEV-DAG: @ev1 = external addrspace(1) global i32
+// HOST-DAG: @ev1 = external global i32
+extern __device__ int ev1;
+// DEV-DAG: @ev2 = external addrspace(4) global i32
+// HOST-DAG: @ev2 = external global i32
+extern __constant__ int ev2;
+// DEV-DAG: @ev3 = external addrspace(1) global i32
+// HOST-DAG: @ev3 = external global i32
+extern __managed__ int ev3;
+
+// NORDC-DAG: @_ZL3sv1 = dso_local addrspace(1) externally_initialized global i32 0
+// RDC-DAG: @_ZL3sv1 = internal addrspace(1) global i32 0
+// HOST-DAG: @_ZL3sv1 = internal global i32 undef
+static __device__ int sv1;
+// NORDC-DAG: @_ZL3sv2 = dso_local addrspace(4) externally_initialized global i32 0
+// RDC-DAG: @_ZL3sv2 = internal addrspace(4) global i32 0
+// HOST-DAG: @_ZL3sv2 = internal global i32 undef
+static __constant__ int sv2;
+// DEV-DAG: @_ZL3sv3 = external addrspace(1) externally_initialized global i32
+// HOST-DAG: @_ZL3sv3 = internal global i32 0
+static __managed__ int sv3;
+
+__device__ __host__ int work(int *x);
+
+__device__ __host__ int fun1() {
+  return work(&ev1) + work(&ev2) + work(&ev3) + work(&sv1) + work(&sv2) + work(&sv3);
+}
+
+// HOST: hipRegisterVar({{.*}}@v1
+// HOST: hipRegisterVar({{.*}}@v2
+// HOST: hipRegisterManagedVar({{.*}}@v3
+// HOST-NOT: hipRegisterVar({{.*}}@ev1
+// HOST-NOT: hipRegisterVar({{.*}}@ev2
+// HOST-NOT: hipRegisterManagedVar({{.*}}@ev3
+// HOST: hipRegisterVar({{.*}}@_ZL3sv1
+// HOST: hipRegisterVar({{.*}}@_ZL3sv2
+// HOST: hipRegisterManagedVar({{.*}}@_ZL3sv3
Index: clang/test/CodeGenCUDA/device-stub.cu
===================================================================
--- clang/test/CodeGenCUDA/device-stub.cu
+++ clang/test/CodeGenCUDA/device-stub.cu
@@ -30,9 +30,13 @@
 // RUN:   | FileCheck %s -allow-deprecated-dag-overlap \
 // RUN:       --check-prefixes=ALL,LNX,RDC,CUDA,CUDARDC,CUDA-NEW
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -std=c++17 \
+// RUN:     -target-sdk-version=9.2 -fcuda-include-gpubinary %t -o - \
+// RUN:   | FileCheck %s -allow-deprecated-dag-overlap \
+// RUN:       --check-prefixes=ALL,LNX,NORDC,CUDA,CUDANORDC,CUDA-NEW,LNX_17,NORDC17
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -std=c++17 \
 // RUN:     -target-sdk-version=9.2 -fgpu-rdc -fcuda-include-gpubinary %t -o - \
 // RUN:   | FileCheck %s -allow-deprecated-dag-overlap \
-// RUN:       --check-prefixes=ALL,LNX,RDC,CUDA,CUDARDC,CUDA-NEW,LNX_17
+// RUN:       --check-prefixes=ALL,LNX,RDC,CUDA,CUDARDC,CUDA-NEW,LNX_17,RDC17
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
 // RUN:     -target-sdk-version=9.2 -o - \
 // RUN:   | FileCheck -allow-deprecated-dag-overlap %s -check-prefix=NOGPUBIN
@@ -45,7 +49,7 @@
 // RUN:   | FileCheck -allow-deprecated-dag-overlap %s -check-prefixes=NOGLOBALS,HIPNOGLOBALS
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
 // RUN:     -fgpu-rdc -fcuda-include-gpubinary %t -o - -x hip \
-// RUN:   | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,LNX,NORDC,HIP,HIPEF
+// RUN:   | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,LNX,RDC,HIP,HIPEF
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - -x hip\
 // RUN:   | FileCheck -allow-deprecated-dag-overlap %s -check-prefixes=ALL,LNX,NORDC,HIP,HIPNEF
 
@@ -56,15 +60,18 @@
 #include "Inputs/cuda.h"
 
 #ifndef NOGLOBALS
-// LNX-DAG: @device_var = internal global i32
+// NORDC-DAG: @device_var = internal global i32
+// RDC-DAG: @device_var = dso_local global i32
 // WIN-DAG: @"?device_var@@3HA" = internal global i32
 __device__ int device_var;
 
-// LNX-DAG: @constant_var = internal global i32
+// NORDC-DAG: @constant_var = internal global i32
+// RDC-DAG: @constant_var = dso_local global i32
 // WIN-DAG: @"?constant_var@@3HA" = internal global i32
 __constant__ int constant_var;
 
-// LNX-DAG: @shared_var = internal global i32
+// NORDC-DAG: @shared_var = internal global i32
+// RDC-DAG: @shared_var = dso_local global i32
 // WIN-DAG: @"?shared_var@@3HA" = internal global i32
 __shared__ int shared_var;
 
@@ -87,18 +94,21 @@
 
 // external device-side variables with definitions should generate
 // definitions for the shadows.
-// LNX-DAG: @ext_device_var_def = internal global i32 undef,
+// NORDC-DAG: @ext_device_var_def = internal global i32 undef,
+// RDC-DAG: @ext_device_var_def = dso_local global i32 undef,
 // WIN-DAG: @"?ext_device_var_def@@3HA" = internal global i32 undef
 extern __device__ int ext_device_var_def;
 __device__ int ext_device_var_def = 1;
-// LNX-DAG: @ext_device_var_def = internal global i32 undef,
+// NORDC-DAG: @ext_device_var_def = internal global i32 undef,
+// RDC-DAG: @ext_device_var_def = dso_local global i32 undef,
 // WIN-DAG: @"?ext_constant_var_def@@3HA" = internal global i32 undef
 __constant__ int ext_constant_var_def = 2;
 
 #if __cplusplus > 201402L
-/// FIXME: Reject __device__ constexpr and inline variables in Sema.
-// LNX_17: @inline_var = internal global i32 undef, comdat, align 4{{$}}
-// LNX_17: @_ZN1C17member_inline_varE = internal constant i32 undef, comdat, align 4{{$}}
+// NORDC17: @inline_var = internal global i32 undef, comdat, align 4{{$}}
+// RDC17: @inline_var = linkonce_odr global i32 undef, comdat, align 4{{$}}
+// NORDC17: @_ZN1C17member_inline_varE = internal constant i32 undef, comdat, align 4{{$}}
+// RDC17: @_ZN1C17member_inline_varE = linkonce_odr constant i32 undef, comdat, align 4{{$}}
 __device__ inline int inline_var = 3;
 struct C {
   __device__ static constexpr int member_inline_var = 4;
@@ -151,13 +161,13 @@
 // CUDANORDC: @__[[PREFIX]]_gpubin_handle = internal global i8** null
 // HIPNEF: @__[[PREFIX]]_gpubin_handle = linkonce hidden global i8** null
 // * constant unnamed string with NVModuleID
-// RDC: [[MODULE_ID_GLOBAL:@.*]] = private constant
+// CUDARDC: [[MODULE_ID_GLOBAL:@.*]] = private constant
 // CUDARDC-SAME: c"[[MODULE_ID:.+]]\00", section "__nv_module_id", align 32
 // * Make sure our constructor was added to global ctor list.
 // LNX: @llvm.global_ctors = appending global {{.*}}@__[[PREFIX]]_module_ctor
 // * Alias to global symbol containing the NVModuleID.
-// RDC: @__fatbinwrap[[MODULE_ID]] ={{.*}} alias { i32, i32, i8*, i8* }
-// RDC-SAME: { i32, i32, i8*, i8* }* @__[[PREFIX]]_fatbin_wrapper
+// CUDARDC: @__fatbinwrap[[MODULE_ID]] ={{.*}} alias { i32, i32, i8*, i8* }
+// CUDARDC-SAME: { i32, i32, i8*, i8* }* @__[[PREFIX]]_fatbin_wrapper
 
 // Test that we build the correct number of calls to cudaSetupArgument followed
 // by a call to cudaLaunch.
@@ -214,25 +224,33 @@
 // HIP-NEXT: icmp eq i8** {{.*}}, null
 // HIP-NEXT: br i1 {{.*}}, label %if, label %exit
 // HIP: if:
-// NORDC: call{{.*}}[[PREFIX]]RegisterFatBinary{{.*}}__[[PREFIX]]_fatbin_wrapper
+// CUDANORDC: call{{.*}}[[PREFIX]]RegisterFatBinary{{.*}}__[[PREFIX]]_fatbin_wrapper
+//   .. stores return value in __[[PREFIX]]_gpubin_handle
+// CUDANORDC-NEXT: store{{.*}}__[[PREFIX]]_gpubin_handle
+//   .. and then calls __[[PREFIX]]_register_globals
+// HIP: call{{.*}}[[PREFIX]]RegisterFatBinary{{.*}}__[[PREFIX]]_fatbin_wrapper
 //   .. stores return value in __[[PREFIX]]_gpubin_handle
-// NORDC-NEXT: store{{.*}}__[[PREFIX]]_gpubin_handle
+// HIP-NEXT: store{{.*}}__[[PREFIX]]_gpubin_handle
 //   .. and then calls __[[PREFIX]]_register_globals
 // HIP-NEXT: br label %exit
 // HIP: exit:
 // HIP-NEXT: load i8**, i8*** @__hip_gpubin_handle
-// NORDC-NEXT: call void @__[[PREFIX]]_register_globals
+// CUDANORDC-NEXT: call void @__[[PREFIX]]_register_globals
+// HIP-NEXT: call void @__[[PREFIX]]_register_globals
 // * In separate mode we also register a destructor.
-// NORDC-NEXT: call i32 @atexit(void (i8*)* @__[[PREFIX]]_module_dtor)
+// CUDANORDC-NEXT: call i32 @atexit(void (i8*)* @__[[PREFIX]]_module_dtor)
+// HIP-NEXT: call i32 @atexit(void (i8*)* @__[[PREFIX]]_module_dtor)
 
 // With relocatable device code we call __[[PREFIX]]RegisterLinkedBinary%NVModuleID%
-// RDC: call{{.*}}__[[PREFIX]]RegisterLinkedBinary[[MODULE_ID]](
-// RDC-SAME: __[[PREFIX]]_register_globals, {{.*}}__[[PREFIX]]_fatbin_wrapper
-// RDC-SAME: [[MODULE_ID_GLOBAL]]
+// CUDARDC: call{{.*}}__[[PREFIX]]RegisterLinkedBinary[[MODULE_ID]](
+// CUDARDC-SAME: __[[PREFIX]]_register_globals, {{.*}}__[[PREFIX]]_fatbin_wrapper
+// CUDARDC-SAME: [[MODULE_ID_GLOBAL]]
 
 // Test that we've created destructor.
-// NORDC: define internal void @__[[PREFIX]]_module_dtor
-// NORDC: load{{.*}}__[[PREFIX]]_gpubin_handle
+// CUDANORDC: define internal void @__[[PREFIX]]_module_dtor
+// HIP: define internal void @__[[PREFIX]]_module_dtor
+// CUDANORDC: load{{.*}}__[[PREFIX]]_gpubin_handle
+// HIP: load{{.*}}__[[PREFIX]]_gpubin_handle
 // CUDANORDC-NEXT: call void @__[[PREFIX]]UnregisterFatBinary
 // HIP-NEXT: icmp ne i8** {{.*}}, null
 // HIP-NEXT: br i1 {{.*}}, label %if, label %exit
Index: clang/lib/CodeGen/CodeGenModule.cpp
===================================================================
--- clang/lib/CodeGen/CodeGenModule.cpp
+++ clang/lib/CodeGen/CodeGenModule.cpp
@@ -4183,8 +4183,12 @@
   bool NeedsGlobalDtor =
       D->needsDestruction(getContext()) == QualType::DK_cxx_destructor;
 
+  bool IsHIPManagedVarOnDevice =
+      getLangOpts().CUDAIsDevice && D->hasAttr<HIPManagedAttr>();
+
   const VarDecl *InitDecl;
-  const Expr *InitExpr = D->getAnyInitializer(InitDecl);
+  const Expr *InitExpr =
+      IsHIPManagedVarOnDevice ? nullptr : D->getAnyInitializer(InitDecl);
 
   Optional<ConstantEmitter> emitter;
 
@@ -4204,8 +4208,6 @@
       (D->getType()->isCUDADeviceBuiltinSurfaceType() ||
        D->getType()->isCUDADeviceBuiltinTextureType() ||
        D->hasAttr<HIPManagedAttr>());
-  // HIP pinned shadow of initialized host-side global variables are also
-  // left undefined.
   if (getLangOpts().CUDA &&
       (IsCUDASharedVar || IsCUDAShadowVar || IsCUDADeviceShadowVar))
     Init = llvm::UndefValue::get(getTypes().ConvertType(ASTTy));
@@ -4316,7 +4318,10 @@
     }
   }
 
-  GV->setInitializer(Init);
+  // HIP managed variables need to be emitted as declarations in device
+  // compilation.
+  if (!IsHIPManagedVarOnDevice)
+    GV->setInitializer(Init);
   if (emitter)
     emitter->finalize(GV);
 
Index: clang/lib/CodeGen/CGCUDANV.cpp
===================================================================
--- clang/lib/CodeGen/CGCUDANV.cpp
+++ clang/lib/CodeGen/CGCUDANV.cpp
@@ -538,6 +538,8 @@
             /*Init=*/llvm::ConstantPointerNull::get(Var->getType()),
             Twine(Var->getName() + ".managed"), /*InsertBefore=*/nullptr,
             llvm::GlobalVariable::NotThreadLocal);
+        ManagedVar->setDSOLocal(Var->isDSOLocal());
+        ManagedVar->setVisibility(Var->getVisibility());
         replaceManagedVar(Var, ManagedVar);
         llvm::Value *Args[] = {
             &GpuBinaryHandlePtr,
@@ -924,11 +926,16 @@
 
 void CGNVCUDARuntime::internalizeDeviceSideVar(
     const VarDecl *D, llvm::GlobalValue::LinkageTypes &Linkage) {
-  // Host-side shadows of external declarations of device-side
-  // global variables become internal definitions. These have to
-  // be internal in order to prevent name conflicts with global
-  // host variables with the same name in a different TUs.
+  // For -fno-gpu-rdc, host-side shadows of external declarations of device-side
+  // global variables become internal definitions. These have to be internal in
+  // order to prevent name conflicts with global host variables with the same
+  // name in a different TUs.
   //
+  // For -fgpu-rdc, the shadow variables should not be internalized because
+  // they may be accessed by different TU.
+  if (CGM.getLangOpts().GPURelocatableDeviceCode)
+    return;
+
   // __shared__ variables are odd. Shadows do get created, but
   // they are not registered with the CUDA runtime, so they
   // can't really be used to access their device-side
Index: clang/lib/AST/ASTContext.cpp
===================================================================
--- clang/lib/AST/ASTContext.cpp
+++ clang/lib/AST/ASTContext.cpp
@@ -11434,16 +11434,17 @@
 }
 
 bool ASTContext::mayExternalizeStaticVar(const Decl *D) const {
-  return !getLangOpts().GPURelocatableDeviceCode &&
-         ((D->hasAttr<CUDADeviceAttr>() &&
-           !D->getAttr<CUDADeviceAttr>()->isImplicit()) ||
-          (D->hasAttr<CUDAConstantAttr>() &&
-           !D->getAttr<CUDAConstantAttr>()->isImplicit()) ||
+  return ((!getLangOpts().GPURelocatableDeviceCode &&
+           ((D->hasAttr<CUDADeviceAttr>() &&
+             !D->getAttr<CUDADeviceAttr>()->isImplicit()) ||
+            (D->hasAttr<CUDAConstantAttr>() &&
+             !D->getAttr<CUDAConstantAttr>()->isImplicit()))) ||
           D->hasAttr<HIPManagedAttr>()) &&
          isa<VarDecl>(D) && cast<VarDecl>(D)->getStorageClass() == SC_Static;
 }
 
 bool ASTContext::shouldExternalizeStaticVar(const Decl *D) const {
   return mayExternalizeStaticVar(D) &&
-         CUDAStaticDeviceVarReferencedByHost.count(cast<VarDecl>(D));
+         (D->hasAttr<HIPManagedAttr>() ||
+          CUDAStaticDeviceVarReferencedByHost.count(cast<VarDecl>(D)));
 }
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to