Hi eliben,

Added support for CUDA __launch_bounds__ attribute to CodeGen.
Sema does have a CUDALaunchBoundsAttr, but CodeGen was doing nothing with it.
This change translates CUDALaunchBoundsAttr to maxntidx and minctasm
metadata, which NVPTX then translates to the correct PTX directives.

http://reviews.llvm.org/D3318

Files:
  lib/CodeGen/TargetInfo.cpp
  test/CodeGenCUDA/launch-bounds.cu
Index: lib/CodeGen/TargetInfo.cpp
===================================================================
--- lib/CodeGen/TargetInfo.cpp
+++ lib/CodeGen/TargetInfo.cpp
@@ -4764,7 +4764,8 @@
   void SetTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
                            CodeGen::CodeGenModule &M) const override;
 private:
-  static void addKernelMetadata(llvm::Function *F);
+  static void addNVPTXMetadata(llvm::Function *F, StringRef Name,
+                               const int Operand);
 };
 
 ABIArgInfo NVPTXABIInfo::classifyReturnType(QualType RetTy) const {
@@ -4823,7 +4824,7 @@
     // By default, all functions are device functions
     if (FD->hasAttr<OpenCLKernelAttr>()) {
       // OpenCL __kernel functions get kernel metadata
-      addKernelMetadata(F);
+      addNVPTXMetadata(F, "kernel", 1);
       // And kernel functions are not subject to inlining
       F->addFnAttr(llvm::Attribute::NoInline);
     }
@@ -4834,28 +4835,39 @@
     // CUDA __global__ functions get a kernel metadata entry.  Since
     // __global__ functions cannot be called from the device, we do not
     // need to set the noinline attribute.
-    if (FD->hasAttr<CUDAGlobalAttr>())
-      addKernelMetadata(F);
+    if (FD->hasAttr<CUDAGlobalAttr>()) {
+      // Create !{<func-ref>, metadata !"kernel", i32 1} node
+      addNVPTXMetadata(F, "kernel", 1);
+    }
+    if (FD->hasAttr<CUDALaunchBoundsAttr>()) {
+      // Create !{<func-ref>, metadata !"maxntidx", i32 <val>} node
+      addNVPTXMetadata(F, "maxntidx",
+                       FD->getAttr<CUDALaunchBoundsAttr>()->getMaxThreads());
+      const int minctasm = FD->getAttr<CUDALaunchBoundsAttr>()->getMinBlocks();
+      if (minctasm > 0) {
+        // Create !{<func-ref>, metadata !"minctasm", i32 <val>} node
+        addNVPTXMetadata(F, "minctasm", minctasm);
+      }
+    }
   }
 }
 
-void NVPTXTargetCodeGenInfo::addKernelMetadata(llvm::Function *F) {
+void NVPTXTargetCodeGenInfo::addNVPTXMetadata(llvm::Function *F, StringRef Name,
+                                              const int Operand) {
   llvm::Module *M = F->getParent();
   llvm::LLVMContext &Ctx = M->getContext();
 
   // Get "nvvm.annotations" metadata node
   llvm::NamedMDNode *MD = M->getOrInsertNamedMetadata("nvvm.annotations");
 
-  // Create !{<func-ref>, metadata !"kernel", i32 1} node
   llvm::SmallVector<llvm::Value *, 3> MDVals;
   MDVals.push_back(F);
-  MDVals.push_back(llvm::MDString::get(Ctx, "kernel"));
-  MDVals.push_back(llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1));
-
+  MDVals.push_back(llvm::MDString::get(Ctx, Name));
+  MDVals.push_back(
+      llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), Operand));
   // Append metadata to nvvm.annotations
   MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
 }
-
 }
 
 //===----------------------------------------------------------------------===//
Index: test/CodeGenCUDA/launch-bounds.cu
===================================================================
--- /dev/null
+++ test/CodeGenCUDA/launch-bounds.cu
@@ -0,0 +1,30 @@
+// RUN: %clang_cc1 %s -triple nvptx-unknown-unknown -fcuda-is-device -emit-llvm -o - | FileCheck %s
+
+#include "../SemaCUDA/cuda.h"
+
+#define MAX_THREADS_PER_BLOCK 256
+#define MIN_BLOCKS_PER_MP     2
+
+// Test both max threads per block and Min cta per sm.
+extern "C" {
+__global__ void
+__launch_bounds__( MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP )
+Kernel1()
+{
+}
+}
+
+// CHECK: !{{[0-9]+}} = metadata !{void ()* @Kernel1, metadata !"maxntidx", i32 256}
+// CHECK: !{{[0-9]+}} = metadata !{void ()* @Kernel1, metadata !"minctasm", i32 2}
+
+// Test only max threads per block. Min cta per sm defaults to 0, and
+// CodeGen doesn't output a zero value for minctasm.
+extern "C" {
+__global__ void
+__launch_bounds__( MAX_THREADS_PER_BLOCK )
+Kernel2()
+{
+}
+}
+
+// CHECK: !{{[0-9]+}} = metadata !{void ()* @Kernel2, metadata !"maxntidx", i32 256}
_______________________________________________
cfe-commits mailing list
[email protected]
http://lists.cs.uiuc.edu/mailman/listinfo/cfe-commits

Reply via email to