[clang] [llvm] [Offload] Change unregister library to use `atexit` instead of destructor (PR #86830)

2024-03-27 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 closed 
https://github.com/llvm/llvm-project/pull/86830
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [Offload] Change unregister library to use `atexit` instead of destructor (PR #86830)

2024-03-27 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 updated 
https://github.com/llvm/llvm-project/pull/86830

>From 875ed36029851a2423c97b28bd5bf34975efb016 Mon Sep 17 00:00:00 2001
From: Joseph Huber 
Date: Wed, 27 Mar 2024 11:43:37 -0500
Subject: [PATCH 1/2] [Offload] Change unregister library to use `atexit`
 instead of destructor

Summary:
The 'new driver' sets up the lifetime of a registered liftime using
global constructors and destructors. Currently, this is put at priority
1 which isn't strictly conformant as it will conflict with system
utilities. We now use 101 as this is the loweest suggested for
non-system constructors and will still run before user constructors.

Secondly, there were issues with the CUDA runtime when destructed with a
global destructor. Because the global ones are in any order and
potentially run before other things we were hitting an edge case where
the OpenMP runtime was uninitialized *after* `_dl_fini` was called. This
would result in us erroring when we call into a destroyed `libcuda.so`
instance. using `atexit` is what CUDA / HIP use and it prevents this
from happening. Most everything uses `atexit` except system utilities
and because of the constructor priority it will be unregistered *after*
everything else but not after `_fl_fini`.
---
 clang/test/Driver/linker-wrapper-image.c  |  8 +--
 .../Frontend/Offloading/OffloadWrapper.cpp| 70 ++-
 2 files changed, 41 insertions(+), 37 deletions(-)

diff --git a/clang/test/Driver/linker-wrapper-image.c 
b/clang/test/Driver/linker-wrapper-image.c
index 75475264135224..d01445e3aed04e 100644
--- a/clang/test/Driver/linker-wrapper-image.c
+++ b/clang/test/Driver/linker-wrapper-image.c
@@ -26,11 +26,11 @@
 //  OPENMP: @.omp_offloading.device_image = internal unnamed_addr constant 
[[[SIZE:[0-9]+]] x i8] c"\10\FF\10\AD{{.*}}", section ".llvm.offloading", align 
8
 // OPENMP-NEXT: @.omp_offloading.device_images = internal unnamed_addr 
constant [1 x %__tgt_device_image] [%__tgt_device_image { ptr getelementptr 
inbounds ([[[BEGIN:[0-9]+]] x i8], ptr @.omp_offloading.device_image, i64 1, 
i64 0), ptr getelementptr inbounds ([[[END:[0-9]+]] x i8], ptr 
@.omp_offloading.device_image, i64 1, i64 0), ptr 
@__start_omp_offloading_entries, ptr @__stop_omp_offloading_entries }]
 // OPENMP-NEXT: @.omp_offloading.descriptor = internal constant 
%__tgt_bin_desc { i32 1, ptr @.omp_offloading.device_images, ptr 
@__start_omp_offloading_entries, ptr @__stop_omp_offloading_entries }
-// OPENMP-NEXT: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] 
[{ i32, ptr, ptr } { i32 1, ptr @.omp_offloading.descriptor_reg, ptr null }]
-// OPENMP-NEXT: @llvm.global_dtors = appending global [1 x { i32, ptr, ptr }] 
[{ i32, ptr, ptr } { i32 1, ptr @.omp_offloading.descriptor_unreg, ptr null }]
+// OPENMP-NEXT: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] 
[{ i32, ptr, ptr } { i32 101, ptr @.omp_offloading.descriptor_reg, ptr null }]
 
 //  OPENMP: define internal void @.omp_offloading.descriptor_reg() section 
".text.startup" {
 // OPENMP-NEXT: entry:
+// OPENMP-NEXT:   %0 = call i32 @atexit(ptr @.omp_offloading.descriptor_unreg)
 // OPENMP-NEXT:   call void @__tgt_register_lib(ptr 
@.omp_offloading.descriptor)
 // OPENMP-NEXT:   ret void
 // OPENMP-NEXT: }
@@ -62,7 +62,7 @@
 // CUDA-NEXT: @.fatbin_wrapper = internal constant %fatbin_wrapper { i32 
1180844977, i32 1, ptr @.fatbin_image, ptr null }, section ".nvFatBinSegment", 
align 8
 // CUDA-NEXT: @.cuda.binary_handle = internal global ptr null
 
-// CUDA: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, 
ptr, ptr } { i32 1, ptr @.cuda.fatbin_reg, ptr null }]
+// CUDA: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, 
ptr, ptr } { i32 101, ptr @.cuda.fatbin_reg, ptr null }]
 
 //  CUDA: define internal void @.cuda.fatbin_reg() section ".text.startup" 
{
 // CUDA-NEXT: entry:
@@ -162,7 +162,7 @@
 // HIP-NEXT: @.fatbin_wrapper = internal constant %fatbin_wrapper { i32 
1212764230, i32 1, ptr @.fatbin_image, ptr null }, section ".hipFatBinSegment", 
align 8
 // HIP-NEXT: @.hip.binary_handle = internal global ptr null
 
-// HIP: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, 
ptr, ptr } { i32 1, ptr @.hip.fatbin_reg, ptr null }]
+// HIP: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, 
ptr, ptr } { i32 101, ptr @.hip.fatbin_reg, ptr null }]
 
 //  HIP: define internal void @.hip.fatbin_reg() section ".text.startup" {
 // HIP-NEXT: entry:
diff --git a/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp 
b/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp
index fec1bdbe9d8c74..86e8712ce95ae6 100644
--- a/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp
+++ b/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp
@@ -186,57 +186,62 @@ GlobalVariable *createBinDesc(Module , 
ArrayRef> Bufs,
 ".omp_offloading.descriptor" + Suffix);
 }
 
-void createRegisterFunction(Module , 

[clang] [llvm] [Offload] Change unregister library to use `atexit` instead of destructor (PR #86830)

2024-03-27 Thread Joseph Huber via cfe-commits


@@ -186,57 +186,62 @@ GlobalVariable *createBinDesc(Module , 
ArrayRef> Bufs,
 ".omp_offloading.descriptor" + Suffix);
 }
 
-void createRegisterFunction(Module , GlobalVariable *BinDesc,
-StringRef Suffix) {
+Function *createUnregisterFunction(Module , GlobalVariable *BinDesc,
+   StringRef Suffix) {
   LLVMContext  = M.getContext();
   auto *FuncTy = FunctionType::get(Type::getVoidTy(C), /*isVarArg*/ false);
-  auto *Func = Function::Create(FuncTy, GlobalValue::InternalLinkage,
-".omp_offloading.descriptor_reg" + Suffix, );
+  auto *Func =
+  Function::Create(FuncTy, GlobalValue::InternalLinkage,
+   ".omp_offloading.descriptor_unreg" + Suffix, );
   Func->setSection(".text.startup");
 
-  // Get __tgt_register_lib function declaration.
-  auto *RegFuncTy = FunctionType::get(Type::getVoidTy(C), getBinDescPtrTy(M),
-  /*isVarArg*/ false);
-  FunctionCallee RegFuncC =
-  M.getOrInsertFunction("__tgt_register_lib", RegFuncTy);
+  // Get __tgt_unregister_lib function declaration.
+  auto *UnRegFuncTy = FunctionType::get(Type::getVoidTy(C), getBinDescPtrTy(M),
+/*isVarArg*/ false);
+  FunctionCallee UnRegFuncC =
+  M.getOrInsertFunction("__tgt_unregister_lib", UnRegFuncTy);
 
   // Construct function body
   IRBuilder<> Builder(BasicBlock::Create(C, "entry", Func));
-  Builder.CreateCall(RegFuncC, BinDesc);
+  Builder.CreateCall(UnRegFuncC, BinDesc);
   Builder.CreateRetVoid();
 
-  // Add this function to constructors.
-  // Set priority to 1 so that __tgt_register_lib is executed AFTER
-  // __tgt_register_requires (we want to know what requirements have been
-  // asked for before we load a libomptarget plugin so that by the time the
-  // plugin is loaded it can report how many devices there are which can
-  // satisfy these requirements).
-  appendToGlobalCtors(M, Func, /*Priority*/ 1);
+  return Func;
 }
 
-void createUnregisterFunction(Module , GlobalVariable *BinDesc,
-  StringRef Suffix) {
+void createRegisterFunction(Module , GlobalVariable *BinDesc,
+StringRef Suffix) {
   LLVMContext  = M.getContext();
   auto *FuncTy = FunctionType::get(Type::getVoidTy(C), /*isVarArg*/ false);
-  auto *Func =
-  Function::Create(FuncTy, GlobalValue::InternalLinkage,
-   ".omp_offloading.descriptor_unreg" + Suffix, );
+  auto *Func = Function::Create(FuncTy, GlobalValue::InternalLinkage,
+".omp_offloading.descriptor_reg" + Suffix, );
   Func->setSection(".text.startup");
 
-  // Get __tgt_unregister_lib function declaration.
-  auto *UnRegFuncTy = FunctionType::get(Type::getVoidTy(C), getBinDescPtrTy(M),
-/*isVarArg*/ false);
-  FunctionCallee UnRegFuncC =
-  M.getOrInsertFunction("__tgt_unregister_lib", UnRegFuncTy);
+  // Get __tgt_register_lib function declaration.
+  auto *RegFuncTy = FunctionType::get(Type::getVoidTy(C), getBinDescPtrTy(M),
+  /*isVarArg*/ false);
+  FunctionCallee RegFuncC =
+  M.getOrInsertFunction("__tgt_register_lib", RegFuncTy);
+
+  auto *AtExitTy = FunctionType::get(
+  Type::getInt32Ty(C), PointerType::getUnqual(C), /*isVarArg=*/false);
+  FunctionCallee AtExit = M.getOrInsertFunction("atexit", AtExitTy);
+
+  Function *UnregFunc = createUnregisterFunction(M, BinDesc, Suffix);
 
   // Construct function body
   IRBuilder<> Builder(BasicBlock::Create(C, "entry", Func));
-  Builder.CreateCall(UnRegFuncC, BinDesc);
+
+  // Register the destructors with 'atexit', This is expected by the CUDA

jhuber6 wrote:

```suggestion
  // Register the destructors with 'atexit'. This is expected by the CUDA
```

https://github.com/llvm/llvm-project/pull/86830
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [Offload] Change unregister library to use `atexit` instead of destructor (PR #86830)

2024-03-27 Thread Joseph Huber via cfe-commits


@@ -186,57 +186,62 @@ GlobalVariable *createBinDesc(Module , 
ArrayRef> Bufs,
 ".omp_offloading.descriptor" + Suffix);
 }
 
-void createRegisterFunction(Module , GlobalVariable *BinDesc,
-StringRef Suffix) {
+Function *createUnregisterFunction(Module , GlobalVariable *BinDesc,
+   StringRef Suffix) {
   LLVMContext  = M.getContext();
   auto *FuncTy = FunctionType::get(Type::getVoidTy(C), /*isVarArg*/ false);
-  auto *Func = Function::Create(FuncTy, GlobalValue::InternalLinkage,
-".omp_offloading.descriptor_reg" + Suffix, );
+  auto *Func =
+  Function::Create(FuncTy, GlobalValue::InternalLinkage,
+   ".omp_offloading.descriptor_unreg" + Suffix, );
   Func->setSection(".text.startup");
 
-  // Get __tgt_register_lib function declaration.
-  auto *RegFuncTy = FunctionType::get(Type::getVoidTy(C), getBinDescPtrTy(M),
-  /*isVarArg*/ false);
-  FunctionCallee RegFuncC =
-  M.getOrInsertFunction("__tgt_register_lib", RegFuncTy);
+  // Get __tgt_unregister_lib function declaration.
+  auto *UnRegFuncTy = FunctionType::get(Type::getVoidTy(C), getBinDescPtrTy(M),
+/*isVarArg*/ false);
+  FunctionCallee UnRegFuncC =
+  M.getOrInsertFunction("__tgt_unregister_lib", UnRegFuncTy);
 
   // Construct function body
   IRBuilder<> Builder(BasicBlock::Create(C, "entry", Func));
-  Builder.CreateCall(RegFuncC, BinDesc);
+  Builder.CreateCall(UnRegFuncC, BinDesc);
   Builder.CreateRetVoid();
 
-  // Add this function to constructors.
-  // Set priority to 1 so that __tgt_register_lib is executed AFTER
-  // __tgt_register_requires (we want to know what requirements have been
-  // asked for before we load a libomptarget plugin so that by the time the
-  // plugin is loaded it can report how many devices there are which can
-  // satisfy these requirements).
-  appendToGlobalCtors(M, Func, /*Priority*/ 1);
+  return Func;
 }
 
-void createUnregisterFunction(Module , GlobalVariable *BinDesc,
-  StringRef Suffix) {
+void createRegisterFunction(Module , GlobalVariable *BinDesc,
+StringRef Suffix) {
   LLVMContext  = M.getContext();
   auto *FuncTy = FunctionType::get(Type::getVoidTy(C), /*isVarArg*/ false);
-  auto *Func =
-  Function::Create(FuncTy, GlobalValue::InternalLinkage,
-   ".omp_offloading.descriptor_unreg" + Suffix, );
+  auto *Func = Function::Create(FuncTy, GlobalValue::InternalLinkage,
+".omp_offloading.descriptor_reg" + Suffix, );
   Func->setSection(".text.startup");
 
-  // Get __tgt_unregister_lib function declaration.
-  auto *UnRegFuncTy = FunctionType::get(Type::getVoidTy(C), getBinDescPtrTy(M),
-/*isVarArg*/ false);
-  FunctionCallee UnRegFuncC =
-  M.getOrInsertFunction("__tgt_unregister_lib", UnRegFuncTy);
+  // Get __tgt_register_lib function declaration.
+  auto *RegFuncTy = FunctionType::get(Type::getVoidTy(C), getBinDescPtrTy(M),
+  /*isVarArg*/ false);
+  FunctionCallee RegFuncC =
+  M.getOrInsertFunction("__tgt_register_lib", RegFuncTy);
+
+  auto *AtExitTy = FunctionType::get(
+  Type::getInt32Ty(C), PointerType::getUnqual(C), /*isVarArg=*/false);
+  FunctionCallee AtExit = M.getOrInsertFunction("atexit", AtExitTy);
+
+  Function *UnregFunc = createUnregisterFunction(M, BinDesc, Suffix);
 
   // Construct function body
   IRBuilder<> Builder(BasicBlock::Create(C, "entry", Func));
-  Builder.CreateCall(UnRegFuncC, BinDesc);
+
+  // Register the destructors with 'atexit', This is expected by the CUDA

jhuber6 wrote:

I think that's actually in this file somewhere for the CUDA wrapper portion.

https://github.com/llvm/llvm-project/pull/86830
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [Offload] Change unregister library to use `atexit` instead of destructor (PR #86830)

2024-03-27 Thread Artem Belevich via cfe-commits

https://github.com/Artem-B edited 
https://github.com/llvm/llvm-project/pull/86830
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [Offload] Change unregister library to use `atexit` instead of destructor (PR #86830)

2024-03-27 Thread Artem Belevich via cfe-commits


@@ -186,57 +186,62 @@ GlobalVariable *createBinDesc(Module , 
ArrayRef> Bufs,
 ".omp_offloading.descriptor" + Suffix);
 }
 
-void createRegisterFunction(Module , GlobalVariable *BinDesc,
-StringRef Suffix) {
+Function *createUnregisterFunction(Module , GlobalVariable *BinDesc,
+   StringRef Suffix) {
   LLVMContext  = M.getContext();
   auto *FuncTy = FunctionType::get(Type::getVoidTy(C), /*isVarArg*/ false);
-  auto *Func = Function::Create(FuncTy, GlobalValue::InternalLinkage,
-".omp_offloading.descriptor_reg" + Suffix, );
+  auto *Func =
+  Function::Create(FuncTy, GlobalValue::InternalLinkage,
+   ".omp_offloading.descriptor_unreg" + Suffix, );
   Func->setSection(".text.startup");
 
-  // Get __tgt_register_lib function declaration.
-  auto *RegFuncTy = FunctionType::get(Type::getVoidTy(C), getBinDescPtrTy(M),
-  /*isVarArg*/ false);
-  FunctionCallee RegFuncC =
-  M.getOrInsertFunction("__tgt_register_lib", RegFuncTy);
+  // Get __tgt_unregister_lib function declaration.
+  auto *UnRegFuncTy = FunctionType::get(Type::getVoidTy(C), getBinDescPtrTy(M),
+/*isVarArg*/ false);
+  FunctionCallee UnRegFuncC =
+  M.getOrInsertFunction("__tgt_unregister_lib", UnRegFuncTy);
 
   // Construct function body
   IRBuilder<> Builder(BasicBlock::Create(C, "entry", Func));
-  Builder.CreateCall(RegFuncC, BinDesc);
+  Builder.CreateCall(UnRegFuncC, BinDesc);
   Builder.CreateRetVoid();
 
-  // Add this function to constructors.
-  // Set priority to 1 so that __tgt_register_lib is executed AFTER
-  // __tgt_register_requires (we want to know what requirements have been
-  // asked for before we load a libomptarget plugin so that by the time the
-  // plugin is loaded it can report how many devices there are which can
-  // satisfy these requirements).
-  appendToGlobalCtors(M, Func, /*Priority*/ 1);
+  return Func;
 }
 
-void createUnregisterFunction(Module , GlobalVariable *BinDesc,
-  StringRef Suffix) {
+void createRegisterFunction(Module , GlobalVariable *BinDesc,
+StringRef Suffix) {
   LLVMContext  = M.getContext();
   auto *FuncTy = FunctionType::get(Type::getVoidTy(C), /*isVarArg*/ false);
-  auto *Func =
-  Function::Create(FuncTy, GlobalValue::InternalLinkage,
-   ".omp_offloading.descriptor_unreg" + Suffix, );
+  auto *Func = Function::Create(FuncTy, GlobalValue::InternalLinkage,
+".omp_offloading.descriptor_reg" + Suffix, );
   Func->setSection(".text.startup");
 
-  // Get __tgt_unregister_lib function declaration.
-  auto *UnRegFuncTy = FunctionType::get(Type::getVoidTy(C), getBinDescPtrTy(M),
-/*isVarArg*/ false);
-  FunctionCallee UnRegFuncC =
-  M.getOrInsertFunction("__tgt_unregister_lib", UnRegFuncTy);
+  // Get __tgt_register_lib function declaration.
+  auto *RegFuncTy = FunctionType::get(Type::getVoidTy(C), getBinDescPtrTy(M),
+  /*isVarArg*/ false);
+  FunctionCallee RegFuncC =
+  M.getOrInsertFunction("__tgt_register_lib", RegFuncTy);
+
+  auto *AtExitTy = FunctionType::get(
+  Type::getInt32Ty(C), PointerType::getUnqual(C), /*isVarArg=*/false);
+  FunctionCallee AtExit = M.getOrInsertFunction("atexit", AtExitTy);
+
+  Function *UnregFunc = createUnregisterFunction(M, BinDesc, Suffix);
 
   // Construct function body
   IRBuilder<> Builder(BasicBlock::Create(C, "entry", Func));
-  Builder.CreateCall(UnRegFuncC, BinDesc);
+
+  // Register the destructors with 'atexit', This is expected by the CUDA

Artem-B wrote:

Typo. `,` -> `.`

https://github.com/llvm/llvm-project/pull/86830
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [Offload] Change unregister library to use `atexit` instead of destructor (PR #86830)

2024-03-27 Thread Artem Belevich via cfe-commits


@@ -186,57 +186,62 @@ GlobalVariable *createBinDesc(Module , 
ArrayRef> Bufs,
 ".omp_offloading.descriptor" + Suffix);
 }
 
-void createRegisterFunction(Module , GlobalVariable *BinDesc,
-StringRef Suffix) {
+Function *createUnregisterFunction(Module , GlobalVariable *BinDesc,
+   StringRef Suffix) {
   LLVMContext  = M.getContext();
   auto *FuncTy = FunctionType::get(Type::getVoidTy(C), /*isVarArg*/ false);
-  auto *Func = Function::Create(FuncTy, GlobalValue::InternalLinkage,
-".omp_offloading.descriptor_reg" + Suffix, );
+  auto *Func =
+  Function::Create(FuncTy, GlobalValue::InternalLinkage,
+   ".omp_offloading.descriptor_unreg" + Suffix, );
   Func->setSection(".text.startup");
 
-  // Get __tgt_register_lib function declaration.
-  auto *RegFuncTy = FunctionType::get(Type::getVoidTy(C), getBinDescPtrTy(M),
-  /*isVarArg*/ false);
-  FunctionCallee RegFuncC =
-  M.getOrInsertFunction("__tgt_register_lib", RegFuncTy);
+  // Get __tgt_unregister_lib function declaration.
+  auto *UnRegFuncTy = FunctionType::get(Type::getVoidTy(C), getBinDescPtrTy(M),
+/*isVarArg*/ false);
+  FunctionCallee UnRegFuncC =
+  M.getOrInsertFunction("__tgt_unregister_lib", UnRegFuncTy);
 
   // Construct function body
   IRBuilder<> Builder(BasicBlock::Create(C, "entry", Func));
-  Builder.CreateCall(RegFuncC, BinDesc);
+  Builder.CreateCall(UnRegFuncC, BinDesc);
   Builder.CreateRetVoid();
 
-  // Add this function to constructors.
-  // Set priority to 1 so that __tgt_register_lib is executed AFTER
-  // __tgt_register_requires (we want to know what requirements have been
-  // asked for before we load a libomptarget plugin so that by the time the
-  // plugin is loaded it can report how many devices there are which can
-  // satisfy these requirements).
-  appendToGlobalCtors(M, Func, /*Priority*/ 1);
+  return Func;
 }
 
-void createUnregisterFunction(Module , GlobalVariable *BinDesc,
-  StringRef Suffix) {
+void createRegisterFunction(Module , GlobalVariable *BinDesc,
+StringRef Suffix) {
   LLVMContext  = M.getContext();
   auto *FuncTy = FunctionType::get(Type::getVoidTy(C), /*isVarArg*/ false);
-  auto *Func =
-  Function::Create(FuncTy, GlobalValue::InternalLinkage,
-   ".omp_offloading.descriptor_unreg" + Suffix, );
+  auto *Func = Function::Create(FuncTy, GlobalValue::InternalLinkage,
+".omp_offloading.descriptor_reg" + Suffix, );
   Func->setSection(".text.startup");
 
-  // Get __tgt_unregister_lib function declaration.
-  auto *UnRegFuncTy = FunctionType::get(Type::getVoidTy(C), getBinDescPtrTy(M),
-/*isVarArg*/ false);
-  FunctionCallee UnRegFuncC =
-  M.getOrInsertFunction("__tgt_unregister_lib", UnRegFuncTy);
+  // Get __tgt_register_lib function declaration.
+  auto *RegFuncTy = FunctionType::get(Type::getVoidTy(C), getBinDescPtrTy(M),
+  /*isVarArg*/ false);
+  FunctionCallee RegFuncC =
+  M.getOrInsertFunction("__tgt_register_lib", RegFuncTy);
+
+  auto *AtExitTy = FunctionType::get(
+  Type::getInt32Ty(C), PointerType::getUnqual(C), /*isVarArg=*/false);
+  FunctionCallee AtExit = M.getOrInsertFunction("atexit", AtExitTy);
+
+  Function *UnregFunc = createUnregisterFunction(M, BinDesc, Suffix);
 
   // Construct function body
   IRBuilder<> Builder(BasicBlock::Create(C, "entry", Func));
-  Builder.CreateCall(UnRegFuncC, BinDesc);
+
+  // Register the destructors with 'atexit', This is expected by the CUDA

Artem-B wrote:

> This is expected by the CUDA runtime

I'd add a reference to clang/lib/CodeGen/CGCUDANV.cpp which provides some 
history why we switched to `atexit`.

https://github.com/llvm/llvm-project/pull/86830
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [Offload] Change unregister library to use `atexit` instead of destructor (PR #86830)

2024-03-27 Thread Artem Belevich via cfe-commits

https://github.com/Artem-B approved this pull request.


https://github.com/llvm/llvm-project/pull/86830
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [Offload] Change unregister library to use `atexit` instead of destructor (PR #86830)

2024-03-27 Thread Joseph Huber via cfe-commits

jhuber6 wrote:

Fixed, I neglected the fact that OpenMP registers more destructors inside of 
the constructor itself. Passes all the tests now.

https://github.com/llvm/llvm-project/pull/86830
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [Offload] Change unregister library to use `atexit` instead of destructor (PR #86830)

2024-03-27 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 updated 
https://github.com/llvm/llvm-project/pull/86830

>From 875ed36029851a2423c97b28bd5bf34975efb016 Mon Sep 17 00:00:00 2001
From: Joseph Huber 
Date: Wed, 27 Mar 2024 11:43:37 -0500
Subject: [PATCH] [Offload] Change unregister library to use `atexit` instead
 of destructor

Summary:
The 'new driver' sets up the lifetime of a registered liftime using
global constructors and destructors. Currently, this is put at priority
1 which isn't strictly conformant as it will conflict with system
utilities. We now use 101 as this is the loweest suggested for
non-system constructors and will still run before user constructors.

Secondly, there were issues with the CUDA runtime when destructed with a
global destructor. Because the global ones are in any order and
potentially run before other things we were hitting an edge case where
the OpenMP runtime was uninitialized *after* `_dl_fini` was called. This
would result in us erroring when we call into a destroyed `libcuda.so`
instance. using `atexit` is what CUDA / HIP use and it prevents this
from happening. Most everything uses `atexit` except system utilities
and because of the constructor priority it will be unregistered *after*
everything else but not after `_fl_fini`.
---
 clang/test/Driver/linker-wrapper-image.c  |  8 +--
 .../Frontend/Offloading/OffloadWrapper.cpp| 70 ++-
 2 files changed, 41 insertions(+), 37 deletions(-)

diff --git a/clang/test/Driver/linker-wrapper-image.c 
b/clang/test/Driver/linker-wrapper-image.c
index 75475264135224..d01445e3aed04e 100644
--- a/clang/test/Driver/linker-wrapper-image.c
+++ b/clang/test/Driver/linker-wrapper-image.c
@@ -26,11 +26,11 @@
 //  OPENMP: @.omp_offloading.device_image = internal unnamed_addr constant 
[[[SIZE:[0-9]+]] x i8] c"\10\FF\10\AD{{.*}}", section ".llvm.offloading", align 
8
 // OPENMP-NEXT: @.omp_offloading.device_images = internal unnamed_addr 
constant [1 x %__tgt_device_image] [%__tgt_device_image { ptr getelementptr 
inbounds ([[[BEGIN:[0-9]+]] x i8], ptr @.omp_offloading.device_image, i64 1, 
i64 0), ptr getelementptr inbounds ([[[END:[0-9]+]] x i8], ptr 
@.omp_offloading.device_image, i64 1, i64 0), ptr 
@__start_omp_offloading_entries, ptr @__stop_omp_offloading_entries }]
 // OPENMP-NEXT: @.omp_offloading.descriptor = internal constant 
%__tgt_bin_desc { i32 1, ptr @.omp_offloading.device_images, ptr 
@__start_omp_offloading_entries, ptr @__stop_omp_offloading_entries }
-// OPENMP-NEXT: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] 
[{ i32, ptr, ptr } { i32 1, ptr @.omp_offloading.descriptor_reg, ptr null }]
-// OPENMP-NEXT: @llvm.global_dtors = appending global [1 x { i32, ptr, ptr }] 
[{ i32, ptr, ptr } { i32 1, ptr @.omp_offloading.descriptor_unreg, ptr null }]
+// OPENMP-NEXT: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] 
[{ i32, ptr, ptr } { i32 101, ptr @.omp_offloading.descriptor_reg, ptr null }]
 
 //  OPENMP: define internal void @.omp_offloading.descriptor_reg() section 
".text.startup" {
 // OPENMP-NEXT: entry:
+// OPENMP-NEXT:   %0 = call i32 @atexit(ptr @.omp_offloading.descriptor_unreg)
 // OPENMP-NEXT:   call void @__tgt_register_lib(ptr 
@.omp_offloading.descriptor)
 // OPENMP-NEXT:   ret void
 // OPENMP-NEXT: }
@@ -62,7 +62,7 @@
 // CUDA-NEXT: @.fatbin_wrapper = internal constant %fatbin_wrapper { i32 
1180844977, i32 1, ptr @.fatbin_image, ptr null }, section ".nvFatBinSegment", 
align 8
 // CUDA-NEXT: @.cuda.binary_handle = internal global ptr null
 
-// CUDA: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, 
ptr, ptr } { i32 1, ptr @.cuda.fatbin_reg, ptr null }]
+// CUDA: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, 
ptr, ptr } { i32 101, ptr @.cuda.fatbin_reg, ptr null }]
 
 //  CUDA: define internal void @.cuda.fatbin_reg() section ".text.startup" 
{
 // CUDA-NEXT: entry:
@@ -162,7 +162,7 @@
 // HIP-NEXT: @.fatbin_wrapper = internal constant %fatbin_wrapper { i32 
1212764230, i32 1, ptr @.fatbin_image, ptr null }, section ".hipFatBinSegment", 
align 8
 // HIP-NEXT: @.hip.binary_handle = internal global ptr null
 
-// HIP: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, 
ptr, ptr } { i32 1, ptr @.hip.fatbin_reg, ptr null }]
+// HIP: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, 
ptr, ptr } { i32 101, ptr @.hip.fatbin_reg, ptr null }]
 
 //  HIP: define internal void @.hip.fatbin_reg() section ".text.startup" {
 // HIP-NEXT: entry:
diff --git a/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp 
b/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp
index fec1bdbe9d8c74..86e8712ce95ae6 100644
--- a/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp
+++ b/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp
@@ -186,57 +186,62 @@ GlobalVariable *createBinDesc(Module , 
ArrayRef> Bufs,
 ".omp_offloading.descriptor" + Suffix);
 }
 
-void createRegisterFunction(Module , 

[clang] [llvm] [Offload] Change unregister library to use `atexit` instead of destructor (PR #86830)

2024-03-27 Thread Joseph Huber via cfe-commits

jhuber6 wrote:

So, looking into `libomptarget` when this is applied is doing something weird. 
`atexit` is functionally a stack, so that means the first in is the last out. 
However, it seems that the static global constructor created inside of the CUDA 
plugin is being unregistered *after* this one for unknown reasons. Will need to 
look into that.

https://github.com/llvm/llvm-project/pull/86830
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [Offload] Change unregister library to use `atexit` instead of destructor (PR #86830)

2024-03-27 Thread Joseph Huber via cfe-commits


@@ -186,57 +186,60 @@ GlobalVariable *createBinDesc(Module , 
ArrayRef> Bufs,
 ".omp_offloading.descriptor" + Suffix);
 }
 
-void createRegisterFunction(Module , GlobalVariable *BinDesc,
-StringRef Suffix) {
+Function *createUnregisterFunction(Module , GlobalVariable *BinDesc,

jhuber6 wrote:

No, since I need to call this function from `createRegisterFunction` now. I 
could forward declare it but I don't think there's a point given it's inside an 
anonymous namespace.

https://github.com/llvm/llvm-project/pull/86830
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [Offload] Change unregister library to use `atexit` instead of destructor (PR #86830)

2024-03-27 Thread Yaxun Liu via cfe-commits


@@ -186,57 +186,60 @@ GlobalVariable *createBinDesc(Module , 
ArrayRef> Bufs,
 ".omp_offloading.descriptor" + Suffix);
 }
 
-void createRegisterFunction(Module , GlobalVariable *BinDesc,
-StringRef Suffix) {
+Function *createUnregisterFunction(Module , GlobalVariable *BinDesc,

yxsamliu wrote:

It seems the order of createRegisterFunction and createUnregisterFunction is 
swapped. This causes some artificial differences. Is it OK to keep their 
original order.

https://github.com/llvm/llvm-project/pull/86830
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [Offload] Change unregister library to use `atexit` instead of destructor (PR #86830)

2024-03-27 Thread via cfe-commits

llvmbot wrote:




@llvm/pr-subscribers-clang

Author: Joseph Huber (jhuber6)


Changes

Summary:
The 'new driver' sets up the lifetime of a registered liftime using
global constructors and destructors. Currently, this is put at priority
1 which isn't strictly conformant as it will conflict with system
utilities. We now use 101 as this is the loweest suggested for
non-system constructors and will still run before user constructors.

Secondly, there were issues with the CUDA runtime when destructed with a
global destructor. Because the global ones are in any order and
potentially run before other things we were hitting an edge case where
the OpenMP runtime was uninitialized *after* `_dl_fini` was called. This
would result in us erroring when we call into a destroyed `libcuda.so`
instance. using `atexit` is what CUDA / HIP use and it prevents this
from happening. Most everything uses `atexit` except system utilities
and because of the constructor priority it will be unregistered *after*
everything else but not after `_fl_fini`.


---
Full diff: https://github.com/llvm/llvm-project/pull/86830.diff


2 Files Affected:

- (modified) clang/test/Driver/linker-wrapper-image.c (+4-4) 
- (modified) llvm/lib/Frontend/Offloading/OffloadWrapper.cpp (+35-33) 


``diff
diff --git a/clang/test/Driver/linker-wrapper-image.c 
b/clang/test/Driver/linker-wrapper-image.c
index 75475264135224..5d5d62805e174d 100644
--- a/clang/test/Driver/linker-wrapper-image.c
+++ b/clang/test/Driver/linker-wrapper-image.c
@@ -26,12 +26,12 @@
 //  OPENMP: @.omp_offloading.device_image = internal unnamed_addr constant 
[[[SIZE:[0-9]+]] x i8] c"\10\FF\10\AD{{.*}}", section ".llvm.offloading", align 
8
 // OPENMP-NEXT: @.omp_offloading.device_images = internal unnamed_addr 
constant [1 x %__tgt_device_image] [%__tgt_device_image { ptr getelementptr 
inbounds ([[[BEGIN:[0-9]+]] x i8], ptr @.omp_offloading.device_image, i64 1, 
i64 0), ptr getelementptr inbounds ([[[END:[0-9]+]] x i8], ptr 
@.omp_offloading.device_image, i64 1, i64 0), ptr 
@__start_omp_offloading_entries, ptr @__stop_omp_offloading_entries }]
 // OPENMP-NEXT: @.omp_offloading.descriptor = internal constant 
%__tgt_bin_desc { i32 1, ptr @.omp_offloading.device_images, ptr 
@__start_omp_offloading_entries, ptr @__stop_omp_offloading_entries }
-// OPENMP-NEXT: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] 
[{ i32, ptr, ptr } { i32 1, ptr @.omp_offloading.descriptor_reg, ptr null }]
-// OPENMP-NEXT: @llvm.global_dtors = appending global [1 x { i32, ptr, ptr }] 
[{ i32, ptr, ptr } { i32 1, ptr @.omp_offloading.descriptor_unreg, ptr null }]
+// OPENMP-NEXT: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] 
[{ i32, ptr, ptr } { i32 101, ptr @.omp_offloading.descriptor_reg, ptr null }]
 
 //  OPENMP: define internal void @.omp_offloading.descriptor_reg() section 
".text.startup" {
 // OPENMP-NEXT: entry:
 // OPENMP-NEXT:   call void @__tgt_register_lib(ptr 
@.omp_offloading.descriptor)
+// OPENMP-NEXT:   %0 = call i32 @atexit(ptr @.omp_offloading.descriptor_unreg)
 // OPENMP-NEXT:   ret void
 // OPENMP-NEXT: }
 
@@ -62,7 +62,7 @@
 // CUDA-NEXT: @.fatbin_wrapper = internal constant %fatbin_wrapper { i32 
1180844977, i32 1, ptr @.fatbin_image, ptr null }, section ".nvFatBinSegment", 
align 8
 // CUDA-NEXT: @.cuda.binary_handle = internal global ptr null
 
-// CUDA: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, 
ptr, ptr } { i32 1, ptr @.cuda.fatbin_reg, ptr null }]
+// CUDA: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, 
ptr, ptr } { i32 101, ptr @.cuda.fatbin_reg, ptr null }]
 
 //  CUDA: define internal void @.cuda.fatbin_reg() section ".text.startup" 
{
 // CUDA-NEXT: entry:
@@ -162,7 +162,7 @@
 // HIP-NEXT: @.fatbin_wrapper = internal constant %fatbin_wrapper { i32 
1212764230, i32 1, ptr @.fatbin_image, ptr null }, section ".hipFatBinSegment", 
align 8
 // HIP-NEXT: @.hip.binary_handle = internal global ptr null
 
-// HIP: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, 
ptr, ptr } { i32 1, ptr @.hip.fatbin_reg, ptr null }]
+// HIP: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, 
ptr, ptr } { i32 101, ptr @.hip.fatbin_reg, ptr null }]
 
 //  HIP: define internal void @.hip.fatbin_reg() section ".text.startup" {
 // HIP-NEXT: entry:
diff --git a/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp 
b/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp
index fec1bdbe9d8c74..4f9494d2143fce 100644
--- a/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp
+++ b/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp
@@ -186,57 +186,60 @@ GlobalVariable *createBinDesc(Module , 
ArrayRef> Bufs,
 ".omp_offloading.descriptor" + Suffix);
 }
 
-void createRegisterFunction(Module , GlobalVariable *BinDesc,
-StringRef Suffix) {
+Function *createUnregisterFunction(Module , GlobalVariable *BinDesc,
+  

[clang] [llvm] [Offload] Change unregister library to use `atexit` instead of destructor (PR #86830)

2024-03-27 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 created 
https://github.com/llvm/llvm-project/pull/86830

Summary:
The 'new driver' sets up the lifetime of a registered liftime using
global constructors and destructors. Currently, this is put at priority
1 which isn't strictly conformant as it will conflict with system
utilities. We now use 101 as this is the loweest suggested for
non-system constructors and will still run before user constructors.

Secondly, there were issues with the CUDA runtime when destructed with a
global destructor. Because the global ones are in any order and
potentially run before other things we were hitting an edge case where
the OpenMP runtime was uninitialized *after* `_dl_fini` was called. This
would result in us erroring when we call into a destroyed `libcuda.so`
instance. using `atexit` is what CUDA / HIP use and it prevents this
from happening. Most everything uses `atexit` except system utilities
and because of the constructor priority it will be unregistered *after*
everything else but not after `_fl_fini`.


>From 1583db25c7a24e88fc3439820538ff2ef8e24429 Mon Sep 17 00:00:00 2001
From: Joseph Huber 
Date: Wed, 27 Mar 2024 11:43:37 -0500
Subject: [PATCH] [Offload] Change unregister library to use `atexit` instead
 of destructor

Summary:
The 'new driver' sets up the lifetime of a registered liftime using
global constructors and destructors. Currently, this is put at priority
1 which isn't strictly conformant as it will conflict with system
utilities. We now use 101 as this is the loweest suggested for
non-system constructors and will still run before user constructors.

Secondly, there were issues with the CUDA runtime when destructed with a
global destructor. Because the global ones are in any order and
potentially run before other things we were hitting an edge case where
the OpenMP runtime was uninitialized *after* `_dl_fini` was called. This
would result in us erroring when we call into a destroyed `libcuda.so`
instance. using `atexit` is what CUDA / HIP use and it prevents this
from happening. Most everything uses `atexit` except system utilities
and because of the constructor priority it will be unregistered *after*
everything else but not after `_fl_fini`.
---
 clang/test/Driver/linker-wrapper-image.c  |  8 +--
 .../Frontend/Offloading/OffloadWrapper.cpp| 68 ++-
 2 files changed, 39 insertions(+), 37 deletions(-)

diff --git a/clang/test/Driver/linker-wrapper-image.c 
b/clang/test/Driver/linker-wrapper-image.c
index 75475264135224..5d5d62805e174d 100644
--- a/clang/test/Driver/linker-wrapper-image.c
+++ b/clang/test/Driver/linker-wrapper-image.c
@@ -26,12 +26,12 @@
 //  OPENMP: @.omp_offloading.device_image = internal unnamed_addr constant 
[[[SIZE:[0-9]+]] x i8] c"\10\FF\10\AD{{.*}}", section ".llvm.offloading", align 
8
 // OPENMP-NEXT: @.omp_offloading.device_images = internal unnamed_addr 
constant [1 x %__tgt_device_image] [%__tgt_device_image { ptr getelementptr 
inbounds ([[[BEGIN:[0-9]+]] x i8], ptr @.omp_offloading.device_image, i64 1, 
i64 0), ptr getelementptr inbounds ([[[END:[0-9]+]] x i8], ptr 
@.omp_offloading.device_image, i64 1, i64 0), ptr 
@__start_omp_offloading_entries, ptr @__stop_omp_offloading_entries }]
 // OPENMP-NEXT: @.omp_offloading.descriptor = internal constant 
%__tgt_bin_desc { i32 1, ptr @.omp_offloading.device_images, ptr 
@__start_omp_offloading_entries, ptr @__stop_omp_offloading_entries }
-// OPENMP-NEXT: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] 
[{ i32, ptr, ptr } { i32 1, ptr @.omp_offloading.descriptor_reg, ptr null }]
-// OPENMP-NEXT: @llvm.global_dtors = appending global [1 x { i32, ptr, ptr }] 
[{ i32, ptr, ptr } { i32 1, ptr @.omp_offloading.descriptor_unreg, ptr null }]
+// OPENMP-NEXT: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] 
[{ i32, ptr, ptr } { i32 101, ptr @.omp_offloading.descriptor_reg, ptr null }]
 
 //  OPENMP: define internal void @.omp_offloading.descriptor_reg() section 
".text.startup" {
 // OPENMP-NEXT: entry:
 // OPENMP-NEXT:   call void @__tgt_register_lib(ptr 
@.omp_offloading.descriptor)
+// OPENMP-NEXT:   %0 = call i32 @atexit(ptr @.omp_offloading.descriptor_unreg)
 // OPENMP-NEXT:   ret void
 // OPENMP-NEXT: }
 
@@ -62,7 +62,7 @@
 // CUDA-NEXT: @.fatbin_wrapper = internal constant %fatbin_wrapper { i32 
1180844977, i32 1, ptr @.fatbin_image, ptr null }, section ".nvFatBinSegment", 
align 8
 // CUDA-NEXT: @.cuda.binary_handle = internal global ptr null
 
-// CUDA: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, 
ptr, ptr } { i32 1, ptr @.cuda.fatbin_reg, ptr null }]
+// CUDA: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, 
ptr, ptr } { i32 101, ptr @.cuda.fatbin_reg, ptr null }]
 
 //  CUDA: define internal void @.cuda.fatbin_reg() section ".text.startup" 
{
 // CUDA-NEXT: entry:
@@ -162,7 +162,7 @@
 // HIP-NEXT: @.fatbin_wrapper = internal constant %fatbin_wrapper { i32 
1212764230, i32 1, ptr