[PATCH] D42922: [CUDA] Register relocatable GPU binaries

2018-04-20 Thread Jonas Hahnfeld via Phabricator via cfe-commits
This revision was automatically updated to reflect the committed changes.
Closed by commit rC330425: [CUDA] Register relocatable GPU binaries (authored by Hahnfeld).

Repository:
  rC Clang

https://reviews.llvm.org/D42922

Files:
  lib/CodeGen/CGCUDANV.cpp
  test/CodeGenCUDA/device-stub.cu

Index: test/CodeGenCUDA/device-stub.cu
===
--- test/CodeGenCUDA/device-stub.cu
+++ test/CodeGenCUDA/device-stub.cu
@@ -1,33 +1,40 @@
 // RUN: echo "GPU binary would be here" > %t
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -fcuda-include-gpubinary %t -o - | FileCheck %s
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -fcuda-include-gpubinary %t -o -  -DNOGLOBALS \
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
+// RUN: -fcuda-include-gpubinary %t -o - \
+// RUN:   | FileCheck %s --check-prefixes=ALL,NORDC
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
+// RUN: -fcuda-include-gpubinary %t -o -  -DNOGLOBALS \
 // RUN:   | FileCheck %s -check-prefix=NOGLOBALS
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - | FileCheck %s -check-prefix=NOGPUBIN
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
+// RUN: -fcuda-rdc -fcuda-include-gpubinary %t -o - \
+// RUN:   | FileCheck %s --check-prefixes=ALL,RDC
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - \
+// RUN:   | FileCheck %s -check-prefix=NOGPUBIN
 
 #include "Inputs/cuda.h"
 
 #ifndef NOGLOBALS
-// CHECK-DAG: @device_var = internal global i32
+// ALL-DAG: @device_var = internal global i32
 __device__ int device_var;
 
-// CHECK-DAG: @constant_var = internal global i32
+// ALL-DAG: @constant_var = internal global i32
 __constant__ int constant_var;
 
-// CHECK-DAG: @shared_var = internal global i32
+// ALL-DAG: @shared_var = internal global i32
 __shared__ int shared_var;
 
 // Make sure host globals don't get internalized...
-// CHECK-DAG: @host_var = global i32
+// ALL-DAG: @host_var = global i32
 int host_var;
 // ... and that extern vars remain external.
-// CHECK-DAG: @ext_host_var = external global i32
+// ALL-DAG: @ext_host_var = external global i32
 extern int ext_host_var;
 
 // Shadows for external device-side variables are *definitions* of
 // those variables.
-// CHECK-DAG: @ext_device_var = internal global i32
+// ALL-DAG: @ext_device_var = internal global i32
 extern __device__ int ext_device_var;
-// CHECK-DAG: @ext_device_var = internal global i32
+// ALL-DAG: @ext_device_var = internal global i32
 extern __constant__ int ext_constant_var;
 
 void use_pointers() {
@@ -43,59 +50,73 @@
 
 // Make sure that all parts of GPU code init/cleanup are there:
 // * constant unnamed string with the kernel name
-// CHECK: private unnamed_addr constant{{.*}}kernelfunc{{.*}}\00"
+// ALL: private unnamed_addr constant{{.*}}kernelfunc{{.*}}\00"
 // * constant unnamed string with GPU binary
-// CHECK: private unnamed_addr constant{{.*GPU binary would be here.*}}\00"
-// CHECK-SAME: section ".nv_fatbin", align 8
+// ALL: private unnamed_addr constant{{.*GPU binary would be here.*}}\00"
+// NORDC-SAME: section ".nv_fatbin", align 8
+// RDC-SAME: section "__nv_relfatbin", align 8
 // * constant struct that wraps GPU binary
-// CHECK: @__cuda_fatbin_wrapper = internal constant { i32, i32, i8*, i8* } 
-// CHECK-SAME: { i32 1180844977, i32 1, {{.*}}, i8* null }
-// CHECK-SAME: section ".nvFatBinSegment"
+// ALL: @__cuda_fatbin_wrapper = internal constant { i32, i32, i8*, i8* } 
+// ALL-SAME: { i32 1180844977, i32 1, {{.*}}, i8* null }
+// ALL-SAME: section ".nvFatBinSegment"
 // * variable to save GPU binary handle after initialization
-// CHECK: @__cuda_gpubin_handle = internal global i8** null
-// * Make sure our constructor/destructor was added to global ctor/dtor list.
-// CHECK: @llvm.global_ctors = appending global {{.*}}@__cuda_module_ctor
-// CHECK: @llvm.global_dtors = appending global {{.*}}@__cuda_module_dtor
+// NORDC: @__cuda_gpubin_handle = internal global i8** null
+// * constant unnamed string with NVModuleID
+// RDC: [[MODULE_ID_GLOBAL:@.*]] = private unnamed_addr constant
+// RDC-SAME: c"[[MODULE_ID:.+]]\00", section "__nv_module_id", align 32
+// * Make sure our constructor was added to global ctor list.
+// ALL: @llvm.global_ctors = appending global {{.*}}@__cuda_module_ctor
+// * In separate mode we also register a destructor.
+// NORDC: @llvm.global_dtors = appending global {{.*}}@__cuda_module_dtor
+// * Alias to global symbol containing the NVModuleID.
+// RDC: @__fatbinwrap[[MODULE_ID]] = alias { i32, i32, i8*, i8* }
+// RDC-SAME: { i32, i32, i8*, i8* }* @__cuda_fatbin_wrapper
 
 // Test that we build the correct number of calls to cudaSetupArgument followed
 // by a call to cudaLaunch.
 
-// CHECK: define{{.*}}kernelfunc
-// CHECK: call{{.*}}cudaSetupArgument
-// CHECK: call{{.*}}cudaSetupArgument
-// CHECK: call{{.*}}cudaSetupArgument
-// CHECK: call{{.*}}cudaLaunch
+// 

[PATCH] D42922: [CUDA] Register relocatable GPU binaries

2018-04-19 Thread Jonas Hahnfeld via Phabricator via cfe-commits
Hahnfeld updated this revision to Diff 143145.
Hahnfeld marked an inline comment as done.
Hahnfeld added a comment.

Move module ID to corresponding `else` branch.


https://reviews.llvm.org/D42922

Files:
  lib/CodeGen/CGCUDANV.cpp
  test/CodeGenCUDA/device-stub.cu

Index: test/CodeGenCUDA/device-stub.cu
===
--- test/CodeGenCUDA/device-stub.cu
+++ test/CodeGenCUDA/device-stub.cu
@@ -1,33 +1,40 @@
 // RUN: echo "GPU binary would be here" > %t
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -fcuda-include-gpubinary %t -o - | FileCheck %s
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -fcuda-include-gpubinary %t -o -  -DNOGLOBALS \
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
+// RUN: -fcuda-include-gpubinary %t -o - \
+// RUN:   | FileCheck %s --check-prefixes=ALL,NORDC
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
+// RUN: -fcuda-include-gpubinary %t -o -  -DNOGLOBALS \
 // RUN:   | FileCheck %s -check-prefix=NOGLOBALS
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - | FileCheck %s -check-prefix=NOGPUBIN
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
+// RUN: -fcuda-rdc -fcuda-include-gpubinary %t -o - \
+// RUN:   | FileCheck %s --check-prefixes=ALL,RDC
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - \
+// RUN:   | FileCheck %s -check-prefix=NOGPUBIN
 
 #include "Inputs/cuda.h"
 
 #ifndef NOGLOBALS
-// CHECK-DAG: @device_var = internal global i32
+// ALL-DAG: @device_var = internal global i32
 __device__ int device_var;
 
-// CHECK-DAG: @constant_var = internal global i32
+// ALL-DAG: @constant_var = internal global i32
 __constant__ int constant_var;
 
-// CHECK-DAG: @shared_var = internal global i32
+// ALL-DAG: @shared_var = internal global i32
 __shared__ int shared_var;
 
 // Make sure host globals don't get internalized...
-// CHECK-DAG: @host_var = global i32
+// ALL-DAG: @host_var = global i32
 int host_var;
 // ... and that extern vars remain external.
-// CHECK-DAG: @ext_host_var = external global i32
+// ALL-DAG: @ext_host_var = external global i32
 extern int ext_host_var;
 
 // Shadows for external device-side variables are *definitions* of
 // those variables.
-// CHECK-DAG: @ext_device_var = internal global i32
+// ALL-DAG: @ext_device_var = internal global i32
 extern __device__ int ext_device_var;
-// CHECK-DAG: @ext_device_var = internal global i32
+// ALL-DAG: @ext_device_var = internal global i32
 extern __constant__ int ext_constant_var;
 
 void use_pointers() {
@@ -43,59 +50,73 @@
 
 // Make sure that all parts of GPU code init/cleanup are there:
 // * constant unnamed string with the kernel name
-// CHECK: private unnamed_addr constant{{.*}}kernelfunc{{.*}}\00"
+// ALL: private unnamed_addr constant{{.*}}kernelfunc{{.*}}\00"
 // * constant unnamed string with GPU binary
-// CHECK: private unnamed_addr constant{{.*GPU binary would be here.*}}\00"
-// CHECK-SAME: section ".nv_fatbin", align 8
+// ALL: private unnamed_addr constant{{.*GPU binary would be here.*}}\00"
+// NORDC-SAME: section ".nv_fatbin", align 8
+// RDC-SAME: section "__nv_relfatbin", align 8
 // * constant struct that wraps GPU binary
-// CHECK: @__cuda_fatbin_wrapper = internal constant { i32, i32, i8*, i8* } 
-// CHECK-SAME: { i32 1180844977, i32 1, {{.*}}, i8* null }
-// CHECK-SAME: section ".nvFatBinSegment"
+// ALL: @__cuda_fatbin_wrapper = internal constant { i32, i32, i8*, i8* } 
+// ALL-SAME: { i32 1180844977, i32 1, {{.*}}, i8* null }
+// ALL-SAME: section ".nvFatBinSegment"
 // * variable to save GPU binary handle after initialization
-// CHECK: @__cuda_gpubin_handle = internal global i8** null
-// * Make sure our constructor/destructor was added to global ctor/dtor list.
-// CHECK: @llvm.global_ctors = appending global {{.*}}@__cuda_module_ctor
-// CHECK: @llvm.global_dtors = appending global {{.*}}@__cuda_module_dtor
+// NORDC: @__cuda_gpubin_handle = internal global i8** null
+// * constant unnamed string with NVModuleID
+// RDC: [[MODULE_ID_GLOBAL:@.*]] = private unnamed_addr constant
+// RDC-SAME: c"[[MODULE_ID:.+]]\00", section "__nv_module_id", align 32
+// * Make sure our constructor was added to global ctor list.
+// ALL: @llvm.global_ctors = appending global {{.*}}@__cuda_module_ctor
+// * In separate mode we also register a destructor.
+// NORDC: @llvm.global_dtors = appending global {{.*}}@__cuda_module_dtor
+// * Alias to global symbol containing the NVModuleID.
+// RDC: @__fatbinwrap[[MODULE_ID]] = alias { i32, i32, i8*, i8* }
+// RDC-SAME: { i32, i32, i8*, i8* }* @__cuda_fatbin_wrapper
 
 // Test that we build the correct number of calls to cudaSetupArgument followed
 // by a call to cudaLaunch.
 
-// CHECK: define{{.*}}kernelfunc
-// CHECK: call{{.*}}cudaSetupArgument
-// CHECK: call{{.*}}cudaSetupArgument
-// CHECK: call{{.*}}cudaSetupArgument
-// CHECK: call{{.*}}cudaLaunch
+// ALL: define{{.*}}kernelfunc
+// ALL: 

[PATCH] D42922: [CUDA] Register relocatable GPU binaries

2018-04-19 Thread Artem Belevich via Phabricator via cfe-commits
tra added inline comments.



Comment at: lib/CodeGen/CGCUDANV.cpp:364-377
+  llvm::Constant *NVModuleIDConstant;
+  SmallString<64> NVModuleID;
+  if (RelocatableDeviceCode) {
+    // Generate a unique module ID.
+    llvm::raw_svector_ostream OS(NVModuleID);
+    OS << "__nv_" << llvm::format("%x", FatbinWrapper->getGUID());
+    NVModuleIDConstant =

This can all be folded into the 'else' branch of the 'if' below.

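Not patch code, but for readers skimming the thread: a self-contained C++ sketch of what the quoted hunk computes (the unique module ID), built on the LLVM support APIs; the restructuring asked for above only moves these lines into the `else` branch. The module ID is "__nv_" followed by the hex GUID (an MD5-based hash) of the fatbin wrapper global; the global name below is a stand-in, and the hunk itself formats the value with llvm::format("%x", ...).

#include "llvm/ADT/SmallString.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Support/raw_ostream.h"

int main() {
  // GUID is a uint64_t hash of the global's name.
  llvm::GlobalValue::GUID Guid =
      llvm::GlobalValue::getGUID("__cuda_fatbin_wrapper");

  llvm::SmallString<64> NVModuleID;
  llvm::raw_svector_ostream OS(NVModuleID);
  OS << "__nv_";
  OS.write_hex(Guid);                 // e.g. "__nv_3b1f..."
  llvm::outs() << NVModuleID << "\n";
  return 0;
}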

https://reviews.llvm.org/D42922

[PATCH] D42922: [CUDA] Register relocatable GPU binaries

2018-04-19 Thread Jonas Hahnfeld via Phabricator via cfe-commits
Hahnfeld added inline comments.



Comment at: lib/CodeGen/CGCUDANV.cpp:283-285
+  llvm::FunctionType *RegisterGlobalsFnTy;
+  llvm::FunctionType *RegisterLinkedBinaryFnTy;
+  llvm::Function *DummyCallback;

tra wrote:
> Instead of tracking these through the conditionals of a pretty long function,
> could we make these pointers class fields, init them in the constructor, have
> accessors return them and, possibly, assert that they are only used if RDC is
> enabled?
I've removed the caching entirely because `llvm::FunctionType::get()` already
uniques the types per context. The `FunctionType::get()` calls now live in new
helper methods to avoid duplication.

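To make the `llvm::FunctionType::get()` point concrete, here is a minimal standalone sketch against the plain LLVM C++ API (not patch code): function types are uniqued per `LLVMContext`, so repeated calls with the same signature return the same pointer and caching it in member variables would be redundant.

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include <cassert>

int main() {
  llvm::LLVMContext Ctx;
  llvm::Type *VoidTy = llvm::Type::getVoidTy(Ctx);
  llvm::Type *VoidPtrPtrTy = llvm::Type::getInt8PtrTy(Ctx)->getPointerTo();

  // Same signature as getRegisterGlobalsFnTy() in the patch: void(void **).
  llvm::FunctionType *A = llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false);
  llvm::FunctionType *B = llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false);
  assert(A == B && "FunctionType::get returns the uniqued instance");
  (void)A; (void)B; // silence unused warnings in release builds
  return 0;
}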

https://reviews.llvm.org/D42922

[PATCH] D42922: [CUDA] Register relocatable GPU binaries

2018-04-19 Thread Jonas Hahnfeld via Phabricator via cfe-commits
Hahnfeld updated this revision to Diff 143136.
Hahnfeld marked 2 inline comments as done.
Hahnfeld added a comment.

Move `FunctionType`s to methods and change test prefixes.


https://reviews.llvm.org/D42922

Files:
  lib/CodeGen/CGCUDANV.cpp
  test/CodeGenCUDA/device-stub.cu

Index: test/CodeGenCUDA/device-stub.cu
===
--- test/CodeGenCUDA/device-stub.cu
+++ test/CodeGenCUDA/device-stub.cu
@@ -1,33 +1,40 @@
 // RUN: echo "GPU binary would be here" > %t
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -fcuda-include-gpubinary %t -o - | FileCheck %s
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -fcuda-include-gpubinary %t -o -  -DNOGLOBALS \
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
+// RUN: -fcuda-include-gpubinary %t -o - \
+// RUN:   | FileCheck %s --check-prefixes=ALL,NORDC
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
+// RUN: -fcuda-include-gpubinary %t -o -  -DNOGLOBALS \
 // RUN:   | FileCheck %s -check-prefix=NOGLOBALS
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - | FileCheck %s -check-prefix=NOGPUBIN
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
+// RUN: -fcuda-rdc -fcuda-include-gpubinary %t -o - \
+// RUN:   | FileCheck %s --check-prefixes=ALL,RDC
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - \
+// RUN:   | FileCheck %s -check-prefix=NOGPUBIN
 
 #include "Inputs/cuda.h"
 
 #ifndef NOGLOBALS
-// CHECK-DAG: @device_var = internal global i32
+// ALL-DAG: @device_var = internal global i32
 __device__ int device_var;
 
-// CHECK-DAG: @constant_var = internal global i32
+// ALL-DAG: @constant_var = internal global i32
 __constant__ int constant_var;
 
-// CHECK-DAG: @shared_var = internal global i32
+// ALL-DAG: @shared_var = internal global i32
 __shared__ int shared_var;
 
 // Make sure host globals don't get internalized...
-// CHECK-DAG: @host_var = global i32
+// ALL-DAG: @host_var = global i32
 int host_var;
 // ... and that extern vars remain external.
-// CHECK-DAG: @ext_host_var = external global i32
+// ALL-DAG: @ext_host_var = external global i32
 extern int ext_host_var;
 
 // Shadows for external device-side variables are *definitions* of
 // those variables.
-// CHECK-DAG: @ext_device_var = internal global i32
+// ALL-DAG: @ext_device_var = internal global i32
 extern __device__ int ext_device_var;
-// CHECK-DAG: @ext_device_var = internal global i32
+// ALL-DAG: @ext_device_var = internal global i32
 extern __constant__ int ext_constant_var;
 
 void use_pointers() {
@@ -43,59 +50,73 @@
 
 // Make sure that all parts of GPU code init/cleanup are there:
 // * constant unnamed string with the kernel name
-// CHECK: private unnamed_addr constant{{.*}}kernelfunc{{.*}}\00"
+// ALL: private unnamed_addr constant{{.*}}kernelfunc{{.*}}\00"
 // * constant unnamed string with GPU binary
-// CHECK: private unnamed_addr constant{{.*GPU binary would be here.*}}\00"
-// CHECK-SAME: section ".nv_fatbin", align 8
+// ALL: private unnamed_addr constant{{.*GPU binary would be here.*}}\00"
+// NORDC-SAME: section ".nv_fatbin", align 8
+// RDC-SAME: section "__nv_relfatbin", align 8
 // * constant struct that wraps GPU binary
-// CHECK: @__cuda_fatbin_wrapper = internal constant { i32, i32, i8*, i8* } 
-// CHECK-SAME: { i32 1180844977, i32 1, {{.*}}, i8* null }
-// CHECK-SAME: section ".nvFatBinSegment"
+// ALL: @__cuda_fatbin_wrapper = internal constant { i32, i32, i8*, i8* } 
+// ALL-SAME: { i32 1180844977, i32 1, {{.*}}, i8* null }
+// ALL-SAME: section ".nvFatBinSegment"
 // * variable to save GPU binary handle after initialization
-// CHECK: @__cuda_gpubin_handle = internal global i8** null
-// * Make sure our constructor/destructor was added to global ctor/dtor list.
-// CHECK: @llvm.global_ctors = appending global {{.*}}@__cuda_module_ctor
-// CHECK: @llvm.global_dtors = appending global {{.*}}@__cuda_module_dtor
+// NORDC: @__cuda_gpubin_handle = internal global i8** null
+// * constant unnamed string with NVModuleID
+// RDC: [[MODULE_ID_GLOBAL:@.*]] = private unnamed_addr constant
+// RDC-SAME: c"[[MODULE_ID:.+]]\00", section "__nv_module_id", align 32
+// * Make sure our constructor was added to global ctor list.
+// ALL: @llvm.global_ctors = appending global {{.*}}@__cuda_module_ctor
+// * In separate mode we also register a destructor.
+// NORDC: @llvm.global_dtors = appending global {{.*}}@__cuda_module_dtor
+// * Alias to global symbol containing the NVModuleID.
+// RDC: @__fatbinwrap[[MODULE_ID]] = alias { i32, i32, i8*, i8* }
+// RDC-SAME: { i32, i32, i8*, i8* }* @__cuda_fatbin_wrapper
 
 // Test that we build the correct number of calls to cudaSetupArgument followed
 // by a call to cudaLaunch.
 
-// CHECK: define{{.*}}kernelfunc
-// CHECK: call{{.*}}cudaSetupArgument
-// CHECK: call{{.*}}cudaSetupArgument
-// CHECK: call{{.*}}cudaSetupArgument
-// CHECK: call{{.*}}cudaLaunch
+// ALL: define{{.*}}kernelfunc

[PATCH] D42922: [CUDA] Register relocatable GPU binaries

2018-04-19 Thread Artem Belevich via Phabricator via cfe-commits
tra added inline comments.



Comment at: lib/CodeGen/CGCUDANV.cpp:283-285
+  llvm::FunctionType *RegisterGlobalsFnTy;
+  llvm::FunctionType *RegisterLinkedBinaryFnTy;
+  llvm::Function *DummyCallback;

Instead of tracking these through the conditionals of a pretty long function,
could we make these pointers class fields, init them in the constructor, have
accessors return them and, possibly, assert that they are only used if RDC is
enabled?



Comment at: test/CodeGenCUDA/device-stub.cu:3-7
+// RUN:   | FileCheck %s --check-prefixes=CHECK,DEFAULT
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -fcuda-include-gpubinary %t -o -  -DNOGLOBALS \
 // RUN:   | FileCheck %s -check-prefix=NOGLOBALS
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -fcuda-rdc -fcuda-include-gpubinary %t -o - \
+// RUN:   | FileCheck %s --check-prefixes=CHECK,RDC

Labels could be a bit more descriptive:
CHECK -> ALL
DEFAULT -> NORDC

Long RUN lines could use some re-wrapping.


https://reviews.llvm.org/D42922

[PATCH] D42922: [CUDA] Register relocatable GPU binaries

2018-04-19 Thread Jonas Hahnfeld via Phabricator via cfe-commits
Hahnfeld added a comment.

Ping


https://reviews.llvm.org/D42922

[PATCH] D42922: [CUDA] Register relocatable GPU binaries

2018-04-09 Thread Jonas Hahnfeld via Phabricator via cfe-commits
Hahnfeld updated this revision to Diff 141698.
Hahnfeld added a comment.

Correct test check prefix.


https://reviews.llvm.org/D42922

Files:
  lib/CodeGen/CGCUDANV.cpp
  test/CodeGenCUDA/device-stub.cu

Index: test/CodeGenCUDA/device-stub.cu
===
--- test/CodeGenCUDA/device-stub.cu
+++ test/CodeGenCUDA/device-stub.cu
@@ -1,7 +1,10 @@
 // RUN: echo "GPU binary would be here" > %t
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -fcuda-include-gpubinary %t -o - | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -fcuda-include-gpubinary %t -o - \
+// RUN:   | FileCheck %s --check-prefixes=CHECK,DEFAULT
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -fcuda-include-gpubinary %t -o -  -DNOGLOBALS \
 // RUN:   | FileCheck %s -check-prefix=NOGLOBALS
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -fcuda-rdc -fcuda-include-gpubinary %t -o - \
+// RUN:   | FileCheck %s --check-prefixes=CHECK,RDC
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - | FileCheck %s -check-prefix=NOGPUBIN
 
 #include "Inputs/cuda.h"
@@ -46,16 +49,24 @@
 // CHECK: private unnamed_addr constant{{.*}}kernelfunc{{.*}}\00"
 // * constant unnamed string with GPU binary
 // CHECK: private unnamed_addr constant{{.*GPU binary would be here.*}}\00"
-// CHECK-SAME: section ".nv_fatbin", align 8
+// DEFAULT-SAME: section ".nv_fatbin", align 8
+// RDC-SAME: section "__nv_relfatbin", align 8
 // * constant struct that wraps GPU binary
 // CHECK: @__cuda_fatbin_wrapper = internal constant { i32, i32, i8*, i8* } 
 // CHECK-SAME: { i32 1180844977, i32 1, {{.*}}, i8* null }
 // CHECK-SAME: section ".nvFatBinSegment"
 // * variable to save GPU binary handle after initialization
-// CHECK: @__cuda_gpubin_handle = internal global i8** null
-// * Make sure our constructor/destructor was added to global ctor/dtor list.
+// DEFAULT: @__cuda_gpubin_handle = internal global i8** null
+// * constant unnamed string with NVModuleID
+// RDC: [[MODULE_ID_GLOBAL:@.*]] = private unnamed_addr constant
+// RDC-SAME: c"[[MODULE_ID:.+]]\00", section "__nv_module_id", align 32
+// * Make sure our constructor was added to global ctor list.
 // CHECK: @llvm.global_ctors = appending global {{.*}}@__cuda_module_ctor
-// CHECK: @llvm.global_dtors = appending global {{.*}}@__cuda_module_dtor
+// * In separate mode we also register a destructor.
+// DEFAULT: @llvm.global_dtors = appending global {{.*}}@__cuda_module_dtor
+// * Alias to global symbol containing the NVModuleID.
+// RDC: @__fatbinwrap[[MODULE_ID]] = alias { i32, i32, i8*, i8* }
+// RDC-SAME: { i32, i32, i8*, i8* }* @__cuda_fatbin_wrapper
 
 // Test that we build the correct number of calls to cudaSetupArgument followed
 // by a call to cudaLaunch.
@@ -83,19 +94,25 @@
 // CHECK-DAG: call{{.*}}cudaRegisterVar(i8** %0, {{.*}}ext_constant_var{{.*}}i32 1, i32 4, i32 1, i32 0
 // CHECK: ret void
 
-// Test that we've built constructor..
+// Test that we've built a constructor.
 // CHECK: define internal void @__cuda_module_ctor
-//   .. that calls __cudaRegisterFatBinary(&__cuda_fatbin_wrapper)
-// CHECK: call{{.*}}cudaRegisterFatBinary{{.*}}__cuda_fatbin_wrapper
+
+// In separate mode it calls __cudaRegisterFatBinary(&__cuda_fatbin_wrapper)
+// DEFAULT: call{{.*}}cudaRegisterFatBinary{{.*}}__cuda_fatbin_wrapper
 //   .. stores return value in __cuda_gpubin_handle
-// CHECK-NEXT: store{{.*}}__cuda_gpubin_handle
+// DEFAULT-NEXT: store{{.*}}__cuda_gpubin_handle
 //   .. and then calls __cuda_register_globals
-// CHECK-NEXT: call void @__cuda_register_globals
+// DEFAULT-NEXT: call void @__cuda_register_globals
+
+// With relocatable device code we call __cudaRegisterLinkedBinary%NVModuleID%
+// RDC: call{{.*}}__cudaRegisterLinkedBinary[[MODULE_ID]](
+// RDC-SAME: __cuda_register_globals, {{.*}}__cuda_fatbin_wrapper
+// RDC-SAME: [[MODULE_ID_GLOBAL]]
 
 // Test that we've created destructor.
-// CHECK: define internal void @__cuda_module_dtor
-// CHECK: load{{.*}}__cuda_gpubin_handle
-// CHECK-NEXT: call void @__cudaUnregisterFatBinary
+// DEFAULT: define internal void @__cuda_module_dtor
+// DEFAULT: load{{.*}}__cuda_gpubin_handle
+// DEFAULT-NEXT: call void @__cudaUnregisterFatBinary
 
 // There should be no __cuda_register_globals if we have no
 // device-side globals, but we still need to register GPU binary.
Index: lib/CodeGen/CGCUDANV.cpp
===
--- lib/CodeGen/CGCUDANV.cpp
+++ lib/CodeGen/CGCUDANV.cpp
@@ -15,12 +15,13 @@
 #include "CGCUDARuntime.h"
 #include "CodeGenFunction.h"
 #include "CodeGenModule.h"
-#include "clang/CodeGen/ConstantInitBuilder.h"
 #include "clang/AST/Decl.h"
+#include "clang/CodeGen/ConstantInitBuilder.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
+#include "llvm/Support/Format.h"
 
 using namespace clang;
 using namespace CodeGen;

[PATCH] D42922: [CUDA] Register relocatable GPU binaries

2018-04-09 Thread Jonas Hahnfeld via Phabricator via cfe-commits
Hahnfeld updated this revision to Diff 141685.
Hahnfeld added a comment.

Sorry for the long delay. This update rebases the patch against current trunk 
and adapts the regression test.


https://reviews.llvm.org/D42922

Files:
  lib/CodeGen/CGCUDANV.cpp
  test/CodeGenCUDA/device-stub.cu

Index: test/CodeGenCUDA/device-stub.cu
===
--- test/CodeGenCUDA/device-stub.cu
+++ test/CodeGenCUDA/device-stub.cu
@@ -1,7 +1,10 @@
 // RUN: echo "GPU binary would be here" > %t
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -fcuda-include-gpubinary %t -o - | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -fcuda-include-gpubinary %t -o - \
+// RUN:   | FileCheck %s --check-prefixes=CHECK,SEPARATE
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -fcuda-include-gpubinary %t -o -  -DNOGLOBALS \
 // RUN:   | FileCheck %s -check-prefix=NOGLOBALS
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -fcuda-rdc -fcuda-include-gpubinary %t -o - \
+// RUN:   | FileCheck %s --check-prefixes=CHECK,RDC
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - | FileCheck %s -check-prefix=NOGPUBIN
 
 #include "Inputs/cuda.h"
@@ -46,16 +49,24 @@
 // CHECK: private unnamed_addr constant{{.*}}kernelfunc{{.*}}\00"
 // * constant unnamed string with GPU binary
 // CHECK: private unnamed_addr constant{{.*GPU binary would be here.*}}\00"
-// CHECK-SAME: section ".nv_fatbin", align 8
+// SEPARATE-SAME: section ".nv_fatbin", align 8
+// RDC-SAME: section "__nv_relfatbin", align 8
 // * constant struct that wraps GPU binary
 // CHECK: @__cuda_fatbin_wrapper = internal constant { i32, i32, i8*, i8* } 
 // CHECK-SAME: { i32 1180844977, i32 1, {{.*}}, i8* null }
 // CHECK-SAME: section ".nvFatBinSegment"
 // * variable to save GPU binary handle after initialization
-// CHECK: @__cuda_gpubin_handle = internal global i8** null
-// * Make sure our constructor/destructor was added to global ctor/dtor list.
+// SEPARATE: @__cuda_gpubin_handle = internal global i8** null
+// * constant unnamed string with NVModuleID
+// RDC: [[MODULE_ID_GLOBAL:@.*]] = private unnamed_addr constant
+// RDC-SAME: c"[[MODULE_ID:.+]]\00", section "__nv_module_id", align 32
+// * Make sure our constructor was added to global ctor list.
 // CHECK: @llvm.global_ctors = appending global {{.*}}@__cuda_module_ctor
-// CHECK: @llvm.global_dtors = appending global {{.*}}@__cuda_module_dtor
+// * In separate mode we also register a destructor.
+// SEPARATE: @llvm.global_dtors = appending global {{.*}}@__cuda_module_dtor
+// * Alias to global symbol containing the NVModuleID.
+// RDC: @__fatbinwrap[[MODULE_ID]] = alias { i32, i32, i8*, i8* }
+// RDC-SAME: { i32, i32, i8*, i8* }* @__cuda_fatbin_wrapper
 
 // Test that we build the correct number of calls to cudaSetupArgument followed
 // by a call to cudaLaunch.
@@ -83,19 +94,25 @@
 // CHECK-DAG: call{{.*}}cudaRegisterVar(i8** %0, {{.*}}ext_constant_var{{.*}}i32 1, i32 4, i32 1, i32 0
 // CHECK: ret void
 
-// Test that we've built constructor..
+// Test that we've built a constructor.
 // CHECK: define internal void @__cuda_module_ctor
-//   .. that calls __cudaRegisterFatBinary(&__cuda_fatbin_wrapper)
-// CHECK: call{{.*}}cudaRegisterFatBinary{{.*}}__cuda_fatbin_wrapper
+
+// In separate mode it calls __cudaRegisterFatBinary(&__cuda_fatbin_wrapper)
+// SEPARATE: call{{.*}}cudaRegisterFatBinary{{.*}}__cuda_fatbin_wrapper
 //   .. stores return value in __cuda_gpubin_handle
-// CHECK-NEXT: store{{.*}}__cuda_gpubin_handle
+// SEPARATE-NEXT: store{{.*}}__cuda_gpubin_handle
 //   .. and then calls __cuda_register_globals
-// CHECK-NEXT: call void @__cuda_register_globals
+// SEPARATE-NEXT: call void @__cuda_register_globals
+
+// With relocatable device code we call __cudaRegisterLinkedBinary%NVModuleID%
+// RDC: call{{.*}}__cudaRegisterLinkedBinary[[MODULE_ID]](
+// RDC-SAME: __cuda_register_globals, {{.*}}__cuda_fatbin_wrapper
+// RDC-SAME: [[MODULE_ID_GLOBAL]]
 
 // Test that we've created destructor.
-// CHECK: define internal void @__cuda_module_dtor
-// CHECK: load{{.*}}__cuda_gpubin_handle
-// CHECK-NEXT: call void @__cudaUnregisterFatBinary
+// SEPARATE: define internal void @__cuda_module_dtor
+// SEPARATE: load{{.*}}__cuda_gpubin_handle
+// SEPARATE-NEXT: call void @__cudaUnregisterFatBinary
 
 // There should be no __cuda_register_globals if we have no
 // device-side globals, but we still need to register GPU binary.
Index: lib/CodeGen/CGCUDANV.cpp
===
--- lib/CodeGen/CGCUDANV.cpp
+++ lib/CodeGen/CGCUDANV.cpp
@@ -15,12 +15,13 @@
 #include "CGCUDARuntime.h"
 #include "CodeGenFunction.h"
 #include "CodeGenModule.h"
-#include "clang/CodeGen/ConstantInitBuilder.h"
 #include "clang/AST/Decl.h"
+#include "clang/CodeGen/ConstantInitBuilder.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"

[PATCH] D42922: [CUDA] Register relocatable GPU binaries

2018-02-16 Thread Jonas Hahnfeld via Phabricator via cfe-commits
Hahnfeld marked an inline comment as done.
Hahnfeld added inline comments.



Comment at: lib/CodeGen/CGCUDANV.cpp:330-331
   // the GPU side.
  for (const std::string &GpuBinaryFileName :
       CGM.getCodeGenOpts().CudaGpuBinaryFileNames) {
    llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> GpuBinaryOrErr =

tra wrote:
> Hahnfeld wrote:
> > tra wrote:
> > > Hahnfeld wrote:
> > > > Can we actually have multiple GPU binaries here? If yes, how do I get 
> > > > there?
> > > Yes. `clang --cuda-gpu-arch=sm_35 --cuda-gpu-arch=sm_50...` will compile 
> > > for sm_35 and sm_50 and then will pass the names of GPU-side objects to 
> > > the host compilation via `-fcuda-include-gpubinary`.
> > I'm not sure if that's true anymore: I think they are now combined by 
> > `fatbinary`. This seems to be confirmed by `test/Driver/cuda-options.cu`. 
> > If that was the only use case, we may try to get rid of this possibility, 
> > let me check this.
> You are correct. All GPU binaries are in the single fatbin now.
> That said, you could still pass extra -fcuda-include-gpubinary to cc1 
> manually, but I see no practical reason to do it -- single fatbin serves the 
> purpose better.
> 
> We should remove this loop and make 
> CGM.getCodeGenOpts().CudaGpuBinaryFileNames a scalar.
> 
Ok, I'll work on this as a preparation patch and rebase this on top. That
actually explains why my changes have always been working even though they
didn't handle the loop correctly :-)


https://reviews.llvm.org/D42922

[PATCH] D42922: [CUDA] Register relocatable GPU binaries

2018-02-16 Thread Artem Belevich via Phabricator via cfe-commits
tra added inline comments.



Comment at: lib/CodeGen/CGCUDANV.cpp:330-331
   // the GPU side.
  for (const std::string &GpuBinaryFileName :
       CGM.getCodeGenOpts().CudaGpuBinaryFileNames) {
    llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> GpuBinaryOrErr =

Hahnfeld wrote:
> tra wrote:
> > Hahnfeld wrote:
> > > Can we actually have multiple GPU binaries here? If yes, how do I get 
> > > there?
> > Yes. `clang --cuda-gpu-arch=sm_35 --cuda-gpu-arch=sm_50...` will compile 
> > for sm_35 and sm_50 and then will pass the names of GPU-side objects to the 
> > host compilation via `-fcuda-include-gpubinary`.
> I'm not sure if that's true anymore: I think they are now combined by 
> `fatbinary`. This seems to be confirmed by `test/Driver/cuda-options.cu`. If 
> that was the only use case, we may try to get rid of this possibility, let me 
> check this.
You are correct. All GPU binaries are in the single fatbin now.
That said, you could still pass extra -fcuda-include-gpubinary to cc1 manually, 
but I see no practical reason to do it -- single fatbin serves the purpose 
better.

We should remove this loop and make CGM.getCodeGenOpts().CudaGpuBinaryFileNames 
a scalar.



https://reviews.llvm.org/D42922

[PATCH] D42922: [CUDA] Register relocatable GPU binaries

2018-02-16 Thread Jonas Hahnfeld via Phabricator via cfe-commits
Hahnfeld added inline comments.



Comment at: lib/CodeGen/CGCUDANV.cpp:330-331
   // the GPU side.
  for (const std::string &GpuBinaryFileName :
       CGM.getCodeGenOpts().CudaGpuBinaryFileNames) {
    llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> GpuBinaryOrErr =

tra wrote:
> Hahnfeld wrote:
> > Can we actually have multiple GPU binaries here? If yes, how do I get there?
> Yes. `clang --cuda-gpu-arch=sm_35 --cuda-gpu-arch=sm_50...` will compile for 
> sm_35 and sm_50 and then will pass the names of GPU-side objects to the host 
> compilation via `-fcuda-include-gpubinary`.
I'm not sure if that's true anymore: I think they are now combined by 
`fatbinary`. This seems to be confirmed by `test/Driver/cuda-options.cu`. If 
that was the only use case, we may try to get rid of this possibility, let me 
check this.


https://reviews.llvm.org/D42922

[PATCH] D42922: [CUDA] Register relocatable GPU binaries

2018-02-16 Thread Artem Belevich via Phabricator via cfe-commits
tra added inline comments.



Comment at: lib/CodeGen/CGCUDANV.cpp:330-331
   // the GPU side.
  for (const std::string &GpuBinaryFileName :
       CGM.getCodeGenOpts().CudaGpuBinaryFileNames) {
    llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> GpuBinaryOrErr =

Hahnfeld wrote:
> Can we actually have multiple GPU binaries here? If yes, how do I get there?
Yes. `clang --cuda-gpu-arch=sm_35 --cuda-gpu-arch=sm_50...` will compile for 
sm_35 and sm_50 and then will pass the names of GPU-side objects to the host 
compilation via `-fcuda-include-gpubinary`.


https://reviews.llvm.org/D42922

[PATCH] D42922: [CUDA] Register relocatable GPU binaries

2018-02-12 Thread Jonas Hahnfeld via Phabricator via cfe-commits
Hahnfeld planned changes to this revision.
Hahnfeld added a comment.

Still no regression tests.

I did some functional tests though (https://reviews.llvm.org/F5822023): with
this patch, Clang can generate valid object files containing relocatable device
code. For linking I still defer to `nvcc`, and I'm not sure I'm interested in
reverse-engineering the tools needed to make this fully work with Clang's
driver: I think the biggest advantage of CUDA in Clang is using LLVM's CodeGen.
Note that (in my simple tests) Clang's object files were compatible enough to
be mixed with other objects generated by `nvcc` (see `Makefile.mixed`)!


https://reviews.llvm.org/D42922

[PATCH] D42922: [CUDA] Register relocatable GPU binaries

2018-02-12 Thread Jonas Hahnfeld via Phabricator via cfe-commits
Hahnfeld updated this revision to Diff 133831.
Hahnfeld added a comment.

Rebase and fix `Debug` build.


https://reviews.llvm.org/D42922

Files:
  lib/CodeGen/CGCUDANV.cpp

Index: lib/CodeGen/CGCUDANV.cpp
===
--- lib/CodeGen/CGCUDANV.cpp
+++ lib/CodeGen/CGCUDANV.cpp
@@ -15,12 +15,13 @@
 #include "CGCUDARuntime.h"
 #include "CodeGenFunction.h"
 #include "CodeGenModule.h"
-#include "clang/CodeGen/ConstantInitBuilder.h"
 #include "clang/AST/Decl.h"
+#include "clang/CodeGen/ConstantInitBuilder.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
+#include "llvm/Support/Format.h"
 
 using namespace clang;
 using namespace CodeGen;
@@ -45,9 +46,12 @@
   /// ModuleCtorFunction() and used to create corresponding cleanup calls in
   /// ModuleDtorFunction()
   llvm::SmallVector<llvm::GlobalVariable *, 16> GpuBinaryHandles;
+  /// Whether we generate relocatable device code.
+  bool RelocatableDeviceCode;
 
   llvm::Constant *getSetupArgumentFn() const;
   llvm::Constant *getLaunchFn() const;
+  llvm::FunctionType *getRegisterGlobalsFnTy() const;
 
   /// Creates a function to register all kernel stubs generated in this module.
   llvm::Function *makeRegisterGlobalsFn();
@@ -71,7 +75,23 @@
 
     return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(),
                                                 ConstStr.getPointer(), Zeros);
- }
+  }
+
+  /// Helper function that generates an empty dummy function returning void.
+  llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) {
+    assert(FnTy->getReturnType()->isVoidTy() &&
+           "Can only generate dummy functions returning void!");
+    llvm::Function *DummyFunc = llvm::Function::Create(
+        FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule);
+
+    llvm::BasicBlock *DummyBlock =
+        llvm::BasicBlock::Create(Context, "", DummyFunc);
+    CGBuilderTy FuncBuilder(CGM, Context);
+    FuncBuilder.SetInsertPoint(DummyBlock);
+    FuncBuilder.CreateRetVoid();
+
+    return DummyFunc;
+  }
 
   void emitDeviceStubBody(CodeGenFunction &CGF, FunctionArgList &Args);
 
@@ -93,7 +113,8 @@
 
 CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
     : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
-      TheModule(CGM.getModule()) {
+      TheModule(CGM.getModule()),
+      RelocatableDeviceCode(CGM.getLangOpts().CUDARelocatableDeviceCode) {
   CodeGen::CodeGenTypes &Types = CGM.getTypes();
   ASTContext &Ctx = CGM.getContext();
 
@@ -161,6 +182,10 @@
   CGF.EmitBlock(EndBlock);
 }
 
+llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const {
+  return llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false);
+}
+
 /// Creates a function that sets up state on the host side for CUDA objects that
 /// have a presence on both the host and device sides. Specifically, registers
 /// the host side of kernel functions and device global variables with the CUDA
@@ -181,8 +206,8 @@
 return nullptr;
 
   llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
-      llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
-      llvm::GlobalValue::InternalLinkage, "__cuda_register_globals", &TheModule);
+      getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage,
+      "__cuda_register_globals", &TheModule);
   llvm::BasicBlock *EntryBB =
       llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);
   CGBuilderTy Builder(CGM, Context);
@@ -257,8 +282,29 @@
   if (CGM.getCodeGenOpts().CudaGpuBinaryFileNames.empty())
 return nullptr;
 
+  llvm::FunctionType *RegisterGlobalsFnTy;
+  llvm::FunctionType *RegisterLinkedBinaryFnTy;
+  llvm::Function *DummyCallback;
+  if (RelocatableDeviceCode) {
+    RegisterGlobalsFnTy = getRegisterGlobalsFnTy();
+
+    auto CallbackFnTy = llvm::FunctionType::get(VoidTy, VoidPtrTy, false);
+    DummyCallback = makeDummyFunction(CallbackFnTy);
+
+    // void __cudaRegisterLinkedBinary%NVModuleID%(void (*)(void *), void *,
+    // void *, void (*)(void **))
+    llvm::Type *Params[] = {RegisterGlobalsFnTy, VoidPtrTy, VoidPtrTy,
+                            CallbackFnTy};
+    RegisterLinkedBinaryFnTy = llvm::FunctionType::get(VoidTy, Params, false);
+  }
+
   // void __cuda_register_globals(void* handle);
   llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
+  // We always need a function to pass in as callback. Create a dummy
+  // implementation if we don't need to register anything.
+  if (RelocatableDeviceCode && !RegisterGlobalsFunc)
+    RegisterGlobalsFunc = makeDummyFunction(RegisterGlobalsFnTy);
+
   // void ** __cudaRegisterFatBinary(void *);
   llvm::Constant *RegisterFatbinFunc = CGM.CreateRuntimeFunction(
   llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false),
@@ -291,11 +337,18 @@
   continue;
 }
 
-    const char *FatbinConstantName =
-        CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
+    const char *FatbinConstantName;
+    if 

[PATCH] D42922: [CUDA] Register relocatable GPU binaries

2018-02-05 Thread Jonas Hahnfeld via Phabricator via cfe-commits
Hahnfeld planned changes to this revision.
Hahnfeld added a comment.

I didn't write tests for this yet, but I wanted to get some early feedback on 
this and show what I have in mind.




Comment at: lib/CodeGen/CGCUDANV.cpp:330-331
   // the GPU side.
  for (const std::string &GpuBinaryFileName :
       CGM.getCodeGenOpts().CudaGpuBinaryFileNames) {
    llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> GpuBinaryOrErr =

Can we actually have multiple GPU binaries here? If yes, how do I get there?



Comment at: lib/CodeGen/CGCUDANV.cpp:342-343
+    if (RelocatableDeviceCode)
+      // TODO: Figure out how this is called on mac OS!
+      FatbinConstantName = "__nv_relfatbin";
+    else

@jlebar Could you help me here, as I don't have a Mac? I'd guess it's
`__NV_CUDA,__nv_relfatbin`, but I'd feel better if I could get a confirmation...



Comment at: lib/CodeGen/CGCUDANV.cpp:350-351
         CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";
+    // TODO: Figure out how this is called on mac OS!
+    const char *NVModuleIDSectionName = "__nv_module_id";
 

@jlebar The same here, probably `__NV_CUDA,__nv_module_id`?


Repository:
  rC Clang

https://reviews.llvm.org/D42922

[PATCH] D42922: [CUDA] Register relocatable GPU binaries

2018-02-05 Thread Jonas Hahnfeld via Phabricator via cfe-commits
Hahnfeld created this revision.
Hahnfeld added reviewers: jlebar, tra, hfinkel.
Herald added a subscriber: cfe-commits.

nvcc generates a unique registration function for each object file
that contains relocatable device code. Unique names are achieved
with a module id that is also reflected in the function's name.

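Since the summary is brief, here is a hedged, self-contained C++ sketch of the host-side registration flow the patch emits for a translation unit built with `-fcuda-rdc`, pieced together from the FileCheck expectations in test/CodeGenCUDA/device-stub.cu below. The real `__cudaRegisterLinkedBinary%NVModuleID%` entry point is resolved later when the relocatable device code is linked; it is mocked here so the sketch compiles, and the module ID `__nv_1234abcd` is an invented placeholder.

#include <cstdio>

// Stand-ins for the globals the patch emits (__cuda_fatbin_wrapper and the
// module ID string placed in section "__nv_module_id").
struct FatbinWrapper { int Magic, Version; const void *Data, *Unused; };
static const char ModuleID[] = "__nv_1234abcd";
static FatbinWrapper CudaFatbinWrapper = {0x466243b1, 1, nullptr, nullptr};

// Corresponds to the generated __cuda_register_globals: registers kernel stubs
// and device variables once the runtime hands back a fatbin handle.
static void RegisterGlobals(void **FatbinHandle) {
  std::printf("registering globals for handle %p\n", (void *)FatbinHandle);
}
static void DummyCallback(void *) {} // empty callback the patch also emits

// Mock of __cudaRegisterLinkedBinary__nv_1234abcd; it eventually invokes the
// callback that registers this TU's globals.
static void RegisterLinkedBinary(void (*RegGlobals)(void **), void *Wrapper,
                                 void *ModID, void (*Callback)(void *)) {
  void *Handle = Wrapper; // a real runtime would create a fatbin handle here
  RegGlobals(&Handle);
  Callback(ModID);
}

int main() {
  // Roughly what __cuda_module_ctor boils down to in RDC mode: no direct
  // __cudaRegisterFatBinary call, hence no handle to unregister in a dtor.
  RegisterLinkedBinary(RegisterGlobals, &CudaFatbinWrapper,
                       (void *)ModuleID, DummyCallback);
  return 0;
}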

Repository:
  rC Clang

https://reviews.llvm.org/D42922

Files:
  lib/CodeGen/CGCUDANV.cpp

Index: lib/CodeGen/CGCUDANV.cpp
===
--- lib/CodeGen/CGCUDANV.cpp
+++ lib/CodeGen/CGCUDANV.cpp
@@ -15,12 +15,13 @@
 #include "CGCUDARuntime.h"
 #include "CodeGenFunction.h"
 #include "CodeGenModule.h"
-#include "clang/CodeGen/ConstantInitBuilder.h"
 #include "clang/AST/Decl.h"
+#include "clang/CodeGen/ConstantInitBuilder.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
+#include "llvm/Support/Format.h"
 
 using namespace clang;
 using namespace CodeGen;
@@ -45,9 +46,12 @@
   /// ModuleCtorFunction() and used to create corresponding cleanup calls in
   /// ModuleDtorFunction()
   llvm::SmallVector<llvm::GlobalVariable *, 16> GpuBinaryHandles;
+  /// Whether we generate relocatable device code.
+  bool RelocatableDeviceCode;
 
   llvm::Constant *getSetupArgumentFn() const;
   llvm::Constant *getLaunchFn() const;
+  llvm::FunctionType *getRegisterGlobalsFnTy() const;
 
   /// Creates a function to register all kernel stubs generated in this module.
   llvm::Function *makeRegisterGlobalsFn();
@@ -71,7 +75,23 @@
 
     return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(),
                                                 ConstStr.getPointer(), Zeros);
- }
+  }
+
+  /// Helper function that generates an empty dummy function returning void.
+  llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) {
+    assert(FnTy->getReturnType()->isVoidType() &&
+           "Can only generate dummy functions returning void!");
+    llvm::Function *DummyFunc = llvm::Function::Create(
+        FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule);
+
+    llvm::BasicBlock *DummyBlock =
+        llvm::BasicBlock::Create(Context, "", DummyFunc);
+    CGBuilderTy FuncBuilder(CGM, Context);
+    FuncBuilder.SetInsertPoint(DummyBlock);
+    FuncBuilder.CreateRetVoid();
+
+    return DummyFunc;
+  }
 
   void emitDeviceStubBody(CodeGenFunction &CGF, FunctionArgList &Args);
 
@@ -93,7 +113,8 @@
 
 CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
     : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
-      TheModule(CGM.getModule()) {
+      TheModule(CGM.getModule()),
+      RelocatableDeviceCode(CGM.getLangOpts().CUDARelocatableDeviceCode) {
   CodeGen::CodeGenTypes &Types = CGM.getTypes();
   ASTContext &Ctx = CGM.getContext();
 
@@ -161,6 +182,10 @@
   CGF.EmitBlock(EndBlock);
 }
 
+llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const {
+  return llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false);
+}
+
 /// Creates a function that sets up state on the host side for CUDA objects that
 /// have a presence on both the host and device sides. Specifically, registers
 /// the host side of kernel functions and device global variables with the CUDA
@@ -181,8 +206,8 @@
 return nullptr;
 
   llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
-      llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
-      llvm::GlobalValue::InternalLinkage, "__cuda_register_globals", &TheModule);
+      getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage,
+      "__cuda_register_globals", &TheModule);
   llvm::BasicBlock *EntryBB =
       llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);
   CGBuilderTy Builder(CGM, Context);
@@ -257,8 +282,29 @@
   if (CGM.getCodeGenOpts().CudaGpuBinaryFileNames.empty())
 return nullptr;
 
+  llvm::FunctionType *RegisterGlobalsFnTy;
+  llvm::FunctionType *RegisterLinkedBinaryFnTy;
+  llvm::Function *DummyCallback;
+  if (RelocatableDeviceCode) {
+    RegisterGlobalsFnTy = getRegisterGlobalsFnTy();
+
+    auto CallbackFnTy = llvm::FunctionType::get(VoidTy, VoidPtrTy, false);
+    DummyCallback = makeDummyFunction(CallbackFnTy);
+
+    // void __cudaRegisterLinkedBinary%NVModuleID%(void (*)(void *), void *,
+    // void *, void (*)(void **))
+    llvm::Type *Params[] = {RegisterGlobalsFnTy, VoidPtrTy, VoidPtrTy,
+                            CallbackFnTy};
+    RegisterLinkedBinaryFnTy = llvm::FunctionType::get(VoidTy, Params, false);
+  }
+
   // void __cuda_register_globals(void* handle);
   llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
+  // We always need a function to pass in as callback. Create a dummy
+  // implementation if we don't need to register anything.
+  if (RelocatableDeviceCode && !RegisterGlobalsFunc)
+    RegisterGlobalsFunc = makeDummyFunction(RegisterGlobalsFnTy);
+
   // void ** __cudaRegisterFatBinary(void *);
   llvm::Constant *RegisterFatbinFunc = CGM.CreateRuntimeFunction(
   llvm::FunctionType::get(VoidPtrPtrTy,