Hi eliben, echristo,
Added the ptxwrap utility to help incorporate PTX into the host-side object file.
Device-side CUDA compilation produces a text file containing PTX
assembly. For the GPU code to be usable, it must be passed to the GPU
driver, which then JIT-compiles it for the appropriate GPU hardware.
Currently we rely on CUDA runtime to launch kernels from the host
side. cudaLaunch() function uses host-side address of the kernel we
want to launch and expects corresponding GPU kernel to be registered
with CUDA runtime by the time kernel launch is attempted.
Before we can register kernels, we have to load GPU code which is
expected to be in 'fatbin' container.
ptxwrap takes a file with PTX assembly and encapsulates it in a 'fatbin'
container. If the -fatbin flag is passed, it produces a fatbin binary. If
the -stub flag is passed (the default), ptxwrap generates kernel
registration code which embeds the fatbin bits as a string, loads them,
and registers all the kernels it finds in the PTX. The output can be
included in the host-side compilation or can be compiled and linked
separately.
Caveats: most fatbin parameters are currently hardcoded and were only
tested to work with CUDA-7.0 on sm_35 hardware.
http://reviews.llvm.org/D8397
Files:
tools/CMakeLists.txt
tools/ptxwrap/CMakeLists.txt
tools/ptxwrap/PtxWrap.cpp
tools/ptxwrap/PtxWrap.h
tools/ptxwrap/ptxwrap_main.cpp
EMAIL PREFERENCES
http://reviews.llvm.org/settings/panel/emailpreferences/
Index: tools/CMakeLists.txt
===================================================================
--- tools/CMakeLists.txt
+++ tools/CMakeLists.txt
@@ -15,6 +15,8 @@
add_subdirectory(clang-check)
endif()
+add_subdirectory(ptxwrap)
+
# We support checking out the clang-tools-extra repository into the 'extra'
# subdirectory. It contains tools developed as part of the Clang/LLVM project
# on top of the Clang tooling platform. We keep them in a separate repository
Index: tools/ptxwrap/CMakeLists.txt
===================================================================
--- /dev/null
+++ tools/ptxwrap/CMakeLists.txt
@@ -0,0 +1,16 @@
+set(LLVM_LINK_COMPONENTS support)
+
+add_clang_executable(clang-ptxwrap
+  ptxwrap_main.cpp
+  PtxWrap.cpp
+  )
+# Libraries linked into clang-ptxwrap (named after this tool, not clang-format).
+set(CLANG_PTXWRAP_LIB_DEPS
+  clangBasic
+  )
+
+target_link_libraries(clang-ptxwrap
+  ${CLANG_PTXWRAP_LIB_DEPS}
+  )
+
+install(TARGETS clang-ptxwrap RUNTIME DESTINATION bin)
Index: tools/ptxwrap/PtxWrap.cpp
===================================================================
--- /dev/null
+++ tools/ptxwrap/PtxWrap.cpp
@@ -0,0 +1,181 @@
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/raw_ostream.h"
+#include "PtxWrap.h"
+
+using namespace llvm;
+
+// Scans PtxText for every ".entry KERNEL_NAME (" and records KERNEL_NAME.
+void PtxBlob::FindKernels() {
+  const StringRef EntryKw = ".entry";
+  size_t EntryEnd = 0;
+  while (true) {
+    const size_t EntryStart = PtxText.find(EntryKw, EntryEnd);
+    if (EntryStart == StringRef::npos)
+      break;
+    // The kernel name runs up to the '(' that opens the parameter list.
+    EntryEnd = PtxText.find('(', EntryStart);
+    // Stop on a truncated entry: without this check slice() would run to the
+    // end of the buffer and register the rest of the file as a kernel name.
+    if (EntryEnd == StringRef::npos)
+      break;
+    KnownKernels.insert(
+        PtxText.slice(EntryStart + EntryKw.size(), EntryEnd).trim());
+  }
+}
+
+void PtxBlob::getKnownKernels(StringSet<> &Names) { // Adds every kernel name recorded for this blob to Names.
+ for (const auto &Kernel : KnownKernels) {
+ Names.insert(Kernel.first()); // first() is the set entry's key, i.e. the kernel name
+ }
+}
+
+void PtxWrapper::CreateFatBinHeader(raw_string_ostream &FatBinStream,
+ size_t FatBinDataSize) { // Writes the top-level FatBinHeader describing FatBinDataSize bytes of payload.
+ FatBinHeader hdr; // constructor fills in magic, version and header size
+ hdr.DataSize = FatBinDataSize; // NOTE(review): size_t is narrowed to the 32-bit DataSize field
+ FatBinStream.write(reinterpret_cast<const char *>(&hdr), sizeof(hdr)); // raw struct bytes in host layout
+}
+
+// Builds one fatbin file entry: a FatBinFileHeader plus space-padded PTX.
+std::string PtxWrapper::BuildFatBinPtx(const PtxBlob &Ptx) {
+  std::string PtxString;
+  raw_string_ostream PtxStream(PtxString);
+  const StringRef Text = Ptx.getPtxText();
+  // Round the payload up to FatBinFileAlignment so the size recorded in the
+  // header equals the number of bytes actually emitted after it.
+  const size_t Mask = FatBinFileAlignment - 1;
+  const size_t DataSize = (Text.size() + Mask) & ~Mask;
+  FatBinFileHeader Header(DataSize, Ptx.getFlags());
+  PtxStream.write(reinterpret_cast<const char *>(&Header), sizeof(Header));
+  PtxStream << Text;
+  for (size_t Written = Text.size(); Written < DataSize; ++Written)
+    PtxStream << ' '; // pad with spaces up to the aligned size
+  return PtxStream.str();
+}
+
+void PtxWrapper::WritePrologue() { // Emits the stub's #includes and opens extern "C" and the __load_ptx() constructor; WriteEpilogue() closes both.
+ *OS << R"XX(
+#include "fatBinaryCtl.h"
+#include <sys/types.h>
+#define __CUDA_INTERNAL_COMPILATION__
+#include <crt/host_runtime.h>
+
+extern "C" {
+
+__attribute__((constructor))
+static void __load_ptx(void) {
+)XX";
+}
+
+void PtxWrapper::WriteFatBinArray(const std::string &FatBinString) { // Emits the fatbin as an escaped string literal, then the wrapper struct and the __cudaRegisterFatBinary call.
+ *OS << "__attribute__ ((section (\".nv_fatbin\"))) \n"
+ << "static const char __fatbin_array[] = \n\"";
+ OS->write_escaped(FatBinString); // binary bytes are written as C escape sequences inside the literal
+ *OS << "\";\n";
+ *OS << R"XX(
+ __attribute__ ((aligned (8)))
+ __attribute__ ((section (".nvFatBinSegment")))
+ static const __fatBinC_Wrapper_t __fatbin_wrapper =
+ {0x466243b1, 1, (const unsigned long long*)__fatbin_array, 0};
+ __cudaFatCubinHandle = __cudaRegisterFatBinary((void*)&__fatbin_wrapper);
+ {
+ volatile static void **__ref __attribute__((unused));
+ __ref = (volatile void **)__cudaFatCubinHandle;
+ };
+)XX";
+}
+
+void PtxWrapper::WriteRegistrationCode() { // Emits one __cudaRegisterFunction call per distinct kernel name across all input PTX blobs.
+ llvm::StringSet<> KernelNames;
+ for (auto &PtxBlob : InputPtx) {
+ PtxBlob.getKnownKernels(KernelNames); // the set merges names that occur in more than one blob
+ }
+
+ int kernel_count = 0; // numeric suffix that keeps the generated identifiers unique
+ for (const auto &X : KernelNames) {
+ *OS << "extern void __kernel_launch_func" << kernel_count << "(void) asm(\"" // declaration bound to the kernel's symbol name via an asm label
+ << X.first() << "\");\n";
+ *OS << "static char __kernel_name" << kernel_count << "[] = \"" << X.first() // array holding the kernel name, passed to the registration call below
+ << "\";\n";
+ *OS << " __cudaRegisterFunction(__cudaFatCubinHandle, "
+ << "(const char *)__kernel_launch_func" << kernel_count
+ << ", __kernel_name" << kernel_count << ", \"" << X.first() << "\", "
+ << "-1, (uint3*)0, (uint3*)0, (dim3*)0, (dim3*)0, (int*)0);\n";
+ ++kernel_count;
+ }
+}
+
+void PtxWrapper::WriteEpilogue() { // Emits the atexit() unregistration hook and closes __load_ptx() and the extern "C" block opened by WritePrologue().
+ *OS << R"XX(
+ atexit(__cudaUnregisterBinaryUtil);
+}
+
+} // extern "C"
+)XX";
+}
+
+std::string PtxWrapper::CreateFatbin() { // Returns the complete fatbin image: a FatBinHeader followed by one entry per input PTX blob.
+ std::string FatBinString;
+ raw_string_ostream FatBinStream(FatBinString);
+ llvm::SmallVector<std::string, 2> FatBinPieces;
+ size_t FatBinDataSize = 0;
+
+ // Collect fatbin parts for each PTX blob so we know total size
+ for (auto &PtxBlob : InputPtx) {
+ FatBinPieces.push_back(BuildFatBinPtx(PtxBlob));
+ llvm::errs() << "PTX size " << FatBinPieces.back().size() << "\n"; // NOTE(review): printed to stderr on every run; looks like leftover debug output
+ FatBinDataSize += FatBinPieces.back().size();
+ }
+ CreateFatBinHeader(FatBinStream, FatBinDataSize); // the header has to precede the entries, hence the two passes
+ for (std::string &FatBinPiece : FatBinPieces) {
+ FatBinStream << FatBinPiece;
+ }
+ return FatBinString; // NOTE(review): FatBinStream is not flushed explicitly before this read; confirm the stream is unbuffered
+}
+
+void PtxWrapper::AddPtxToFatbin(std::unique_ptr<llvm::MemoryBuffer> PtxBuf) { // Takes ownership of PtxBuf, wraps it in a PtxBlob and records its kernel names.
+ InputPtx.push_back(PtxBlob(std::move(PtxBuf))); // the PtxBlob constructor scans the text for kernels
+ InputPtx.back().getKnownKernels(KnownKernels); // merge this blob's kernel names into the wrapper-wide set
+}
+
+bool PtxWrapper::Wrap(StringRef FileName) { // Loads FileName as a PTX input; returns true on error, false on success.
+ ErrorOr<std::unique_ptr<MemoryBuffer>> CodeOrErr =
+ MemoryBuffer::getFileOrSTDIN(FileName);
+ if (std::error_code EC = CodeOrErr.getError()) {
+ llvm::errs() << EC.message() << "\n"; // NOTE(review): the message does not mention which file failed
+ return true;
+ }
+
+ AddPtxToFatbin(std::move(CodeOrErr.get())); // hand the buffer over to the wrapper
+ return false;
+}
+
+bool PtxWrapper::Write() { // Writes the raw fatbin or the generated stub source to OS, depending on Mode; always returns false.
+ std::string FatBinString;
+ FatBinString = CreateFatbin();
+
+ if (Mode == GenFatbin) {
+ *OS << FatBinString; // raw fatbin bytes
+ } else {
+ WritePrologue(); // the four calls below together form one __load_ptx() definition, so order matters
+ WriteFatBinArray(FatBinString);
+ WriteRegistrationCode();
+ WriteEpilogue();
+ }
+ OS->flush();
+ return false; // NOTE(review): stream write errors are not checked here
+}
+
+void PtxWrapper::Init() { // Opens OutputFileName and stores the stream in OS; terminates the process if that fails.
+ std::error_code EC;
+ std::unique_ptr<raw_fd_ostream> OutFile(
+ new raw_fd_ostream(OutputFileName, EC, sys::fs::F_RW));
+ if (EC) {
+ errs() << "Error opening '" << OutputFileName << "': " << EC.message()
+ << '\n';
+ exit(1); // NOTE(review): called from the PtxWrapper constructor, so construction can end the process
+ }
+ OS = std::move(OutFile);
+}
Index: tools/ptxwrap/PtxWrap.h
===================================================================
--- /dev/null
+++ tools/ptxwrap/PtxWrap.h
@@ -0,0 +1,118 @@
+#ifndef __PTXWRAP_H__
+#define __PTXWRAP_H__
+
+#include "llvm/ADT/StringSet.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/MemoryBuffer.h"
+
+enum WrapMode { GenFatbin, GenStub };
+
+const uint32_t FatBinMagicValue = 0xba55ed50;
+const uint16_t FatBinVersionValue = 1;
+const uint16_t FatBinFileAlignment = 8;
+
+struct FatBinHeader { // Top-level fatbin container header; CreateFatBinHeader() writes it verbatim as raw bytes.
+ uint32_t Magic; // 0x00 - FatBinMagicValue
+ uint16_t Version; // 0x04 - FatBinVersionValue
+ uint16_t HeaderSize; // 0x06 - sizeof(FatBinHeader)
+ uint32_t DataSize; // 0x08 - zero by default; set by the caller to the payload size
+ uint32_t _unused; // 0x0c - always written as zero
+public:
+ FatBinHeader()
+ : Magic(FatBinMagicValue), Version(FatBinVersionValue),
+ HeaderSize(sizeof(*this)), DataSize(0), _unused(0) {}
+};
+
+enum FatBinFileKind { FatBinFilePtx = 1 }; // value stored in FatBinFileHeader::Kind; PTX is the only kind defined here
+enum FatBinFlags { // bit flags for FatBinFileHeader::Flags; PtxBlob::getFlags() ORs several together
+ AddressSize64 = 0x01,
+ HasDebugInfo = 0x02,
+ ProducerCuda = 0x04,
+ ProducerOpenCL = 0x08,
+ HostLinux = 0x10,
+ HostMac = 0x20,
+ HostWindows = 0x40,
+ Compressed = 0x200
+};
+
+struct FatBinFileHeader { // Per-file header placed before each PTX payload; BuildFatBinPtx() writes it verbatim as raw bytes.
+ uint16_t Kind; // 0x00 - FatBinFilePtx
+ uint16_t unknown02; // 0x02 - written as 0x0101
+ uint32_t HeaderSize; // 0x04 - sizeof(FatBinFileHeader)
+ uint32_t DataSize; // 0x08 - payload size supplied by the caller
+ uint32_t unknown0c; // 0x0c
+ uint32_t CompressedSize; // 0x10 - written as 0
+ uint32_t SubHeaderSize; // 0x14 - HeaderSize - 8
+ uint16_t VersionMinor; // 0x18 - hardcoded 2
+ uint16_t VersionMajor; // 0x1a - hardcoded 4
+ uint32_t CudaArch; // 0x1c - hardcoded 35
+ uint32_t unknown20; // 0x20
+ uint32_t unknown24; // 0x24
+ uint32_t Flags; // 0x28 - FatBinFlags bits supplied by the caller
+ uint32_t unknown2c; // 0x2c
+ uint32_t unknown30; // 0x30
+ uint32_t unknown34; // 0x34
+ uint32_t UncompressedSize; // 0x38 - written as 0
+ uint32_t unknown3c; // 0x3c
+ uint32_t unknown40; // 0x40
+ uint32_t unknown44; // 0x44
+ FatBinFileHeader(uint32_t _DataSize, uint32_t _Flags)
+ : Kind(FatBinFilePtx), unknown02(0x0101), HeaderSize(sizeof(*this)),
+ DataSize(_DataSize), unknown0c(0), CompressedSize(0),
+ SubHeaderSize(HeaderSize - 8), VersionMinor(2), VersionMajor(4), // HeaderSize is declared earlier, so it is already initialized here
+ CudaArch(35), unknown20(0), unknown24(0), Flags(_Flags), unknown2c(0),
+ unknown30(0), unknown34(0), UncompressedSize(0), unknown3c(0),
+ unknown40(0), unknown44(0) {}
+};
+
+class PtxBlob { // Owns one PTX input buffer together with the set of kernel names found in it.
+ std::unique_ptr<llvm::MemoryBuffer> PtxBuf;
+ llvm::StringRef PtxText; // non-owning view of PtxBuf's contents
+ llvm::StringSet<> KnownKernels; // filled by FindKernels()
+
+public:
+ PtxBlob(std::unique_ptr<llvm::MemoryBuffer> Ptx) // NOTE(review): single-argument constructor is not marked explicit
+ : PtxBuf(std::move(Ptx)), PtxText(PtxBuf->getBuffer()) { // PtxBuf is declared first, so it is valid when PtxText is initialized
+ FindKernels();
+ }
+ const llvm::StringRef getPtxText() const { return PtxBuf->getBuffer(); }
+ void getKnownKernels(llvm::StringSet<> &Names); // adds this blob's kernel names to Names
+ uint32_t getFlags() const { return AddressSize64 | ProducerCuda | HostLinux; } // fixed flag set, independent of the input
+
+private:
+ void FindKernels(); // scans PtxText for ".entry NAME (" and records NAME
+};
+
+class PtxWrapper { // Collects PTX inputs and writes them out as a fatbin or as host-side registration stub source.
+ WrapMode Mode;
+ const llvm::StringRef OutputFileName; // NOTE(review): non-owning; the caller's string must outlive this object
+ llvm::SmallVector<PtxBlob, 2> InputPtx;
+ llvm::StringSet<> KnownKernels; // kernel names gathered across all inputs by AddPtxToFatbin()
+ std::unique_ptr<llvm::raw_ostream> OS; // output stream opened by Init()
+
+public:
+ PtxWrapper(WrapMode _Mode, llvm::StringRef _OutputFileName)
+ : Mode(_Mode), OutputFileName(_OutputFileName), OS(nullptr) {
+ Init(); // opens the output file; exits the process on failure
+ }
+
+ ~PtxWrapper(){};
+
+ // Processes given PTX file.
+ bool Wrap(llvm::StringRef FileName); // returns true on error
+ bool Write(); // emits the output selected by Mode; returns false
+
+private:
+ void Init();
+ void AddPtxToFatbin(std::unique_ptr<llvm::MemoryBuffer> PtxBuf);
+ std::string CreateFatbin();
+ void WritePrologue();
+ void WriteFatBinArray(const std::string &FatBinString);
+ void WriteRegistrationCode();
+ void WriteEpilogue();
+ void CreateFatBinHeader(llvm::raw_string_ostream &FatBinStream,
+ size_t FatBinDataSize);
+ std::string BuildFatBinPtx(const PtxBlob &Ptx);
+ void UpdateFatBinHeader(llvm::raw_string_ostream &FatBinStream); // NOTE(review): declared here but no definition appears in PtxWrap.cpp
+};
+#endif
Index: tools/ptxwrap/ptxwrap_main.cpp
===================================================================
--- /dev/null
+++ tools/ptxwrap/ptxwrap_main.cpp
@@ -0,0 +1,52 @@
+#include "clang/Basic/Version.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Signals.h"
+#include "PtxWrap.h"
+
+using namespace llvm;
+
+static cl::opt<bool> Help("h", cl::desc("Alias for -help"), cl::Hidden); // checked in main() after option parsing
+static cl::opt<std::string> OutputFilename("o",
+ cl::desc("Specify output filename"),
+ cl::value_desc("filename"),
+ cl::Required);
+static cl::list<std::string> InputFilenames(cl::desc("<file> [<file> ...]"),
+ cl::Positional, cl::Required,
+ cl::OneOrMore);
+cl::opt<WrapMode> Mode( // NOTE(review): not static, unlike the options above; its default is assigned in main()
+ cl::desc("Choose wrap mode:"),
+ cl::values(clEnumValN(GenFatbin, "fatbin", "Produce fatbin file."),
+ clEnumValN(GenStub, "stub", "Produce host-side source code."),
+ clEnumValEnd));
+
+static void PrintVersion() { // Version printer installed in main(); writes the tool's version string to stdout.
+ raw_ostream &OS = outs();
+ OS << clang::getClangToolFullVersion("clang-ptxwrap") << '\n';
+}
+
+int main(int argc, char *argv[]) { // Parses options, wraps every input file and writes the output; returns nonzero on error.
+ llvm::sys::PrintStackTraceOnErrorSignal();
+
+ Mode = GenStub; // default mode, assigned before the command line is parsed
+
+ cl::SetVersionPrinter(PrintVersion);
+ cl::ParseCommandLineOptions(
+ argc, argv, "A tool to generate wrapper code for PTX assembly which\n"
+ "would produce includable C++ code to register kernels\n"
+ "with CUDA runtime.");
+ if (Help)
+ cl::PrintHelpMessage(); // NOTE(review): -o and the inputs are cl::Required, so -h alone presumably fails parsing before reaching here - confirm
+
+ PtxWrapper Wrapper(Mode, OutputFilename); // the constructor opens the output file
+ bool Error = false;
+ for (const auto &Filename : InputFilenames)
+ Error |= Wrapper.Wrap(Filename); // keep going after a failure so every bad input is reported
+
+ if (!Error)
+ Error = Wrapper.Write();
+
+ return Error; // bool converts to exit status 0 or 1
+}
_______________________________________________
cfe-commits mailing list
[email protected]
http://lists.cs.uiuc.edu/mailman/listinfo/cfe-commits