Issue 172426
Summary [Clang][CUDA] Driver incorrectly passes bitcode as 'kind=elf' to fatbinary when using -S -emit-llvm
Labels clang
Assignees
Reporter cseslowpoke
    When compiling a CUDA file with `clang++ -S -emit-llvm`, the compilation fails at the `fatbinary` stage with an `elf mismatch error`.
### Reproduction
```
echo "__global__ void k(){}" > a.cu
clang++ -S -emit-llvm --cuda-gpu-arch=sm_52 a.cu
```
### Output
```
nvFatbin error: fatbinary elf mismatch: elf size doesn't match user-specified size
clang++: error: fatbinary command failed with exit code 1 (use -v to see invocation)
```
### -###
```
clang version 22.0.0git (https://github.com/llvm/llvm-project.git 941809bccc30eff7e0dc2b467d3360b27f0a50d2)
Target: x86_64-unknown-linux-gnu
Thread model: posix
InstalledDir: /home/phlai/Workspace/llvm-project/build/bin
Build config: +unoptimized, +assertions
clang++: warning: CUDA version 12.9 is only partially supported [-Wunknown-cuda-version]
 "/home/phlai/Workspace/llvm-project/build/bin/clang-22" "-cc1" "-triple" "nvptx64-nvidia-cuda" "-aux-triple" "x86_64-unknown-linux-gnu" "-emit-llvm-bc" "-emit-llvm-uselists" "-disable-free" "-clear-ast-before-backend" "-main-file-name" "a.cu" "-mrelocation-model" "static" "-mframe-pointer=all" "-fno-rounding-math" "-no-integrated-as" "-aux-target-cpu" "x86-64" "-fcuda-is-device" "-mllvm" "-enable-memcpyopt-without-libcalls" "-fno-threadsafe-statics" "-fcuda-allow-variadic-functions" "-mlink-builtin-bitcode" "/usr/local/cuda-12.9/nvvm/libdevice/libdevice.10.bc" "-target-sdk-version=12.9" "-target-cpu" "sm_52" "-target-feature" "+ptx88" "-debugger-tuning=gdb" "-fno-dwarf-directory-asm" "-fdebug-compilation-dir=/tmp" "-resource-dir" "/home/phlai/Workspace/llvm-project/build/lib/clang/22" "-internal-isystem" "/home/phlai/Workspace/llvm-project/build/lib/clang/22/include/cuda_wrappers" "-include" "__clang_cuda_runtime_wrapper.h" "-internal-isystem" "/usr/lib64/gcc/x86_64-suse-linux/15/../../../../include/c++/15" "-internal-isystem" "/usr/lib64/gcc/x86_64-suse-linux/15/../../../../include/c++/15/x86_64-suse-linux" "-internal-isystem" "/usr/lib64/gcc/x86_64-suse-linux/15/../../../../include/c++/15/backward" "-internal-isystem" "/usr/lib64/gcc/x86_64-suse-linux/15/../../../../include/c++/15" "-internal-isystem" "/usr/lib64/gcc/x86_64-suse-linux/15/../../../../include/c++/15/x86_64-suse-linux" "-internal-isystem" "/usr/lib64/gcc/x86_64-suse-linux/15/../../../../include/c++/15/backward" "-internal-isystem" "/home/phlai/Workspace/llvm-project/build/lib/clang/22/include" "-internal-isystem" "/usr/local/include" "-internal-isystem" "/usr/lib64/gcc/x86_64-suse-linux/15/../../../../x86_64-suse-linux/include" "-internal-externc-isystem" "/include" "-internal-externc-isystem" "/usr/include" "-internal-isystem" "/usr/local/cuda-12.9/include" "-internal-isystem" "/home/phlai/Workspace/llvm-project/build/lib/clang/22/include" "-internal-isystem" "/usr/local/include" "-internal-isystem" "/usr/lib64/gcc/x86_64-suse-linux/15/../../../../x86_64-suse-linux/include" "-internal-externc-isystem" "/include" "-internal-externc-isystem" "/usr/include" "-fdeprecated-macro" "-fno-autolink" "-ferror-limit" "19" "-fmessage-length=185" "--offload-new-driver" "-fgnuc-version=4.2.1" "-fskip-odr-check-in-gmf" "-fcxx-exceptions" "-fexceptions" "-fcolor-diagnostics" "-cuid=ff1d3311489858cf" "-fdwarf2-cfi-asm" "-o" "/tmp/a-sm_52-f20187.bc" "-x" "cuda" "a.cu"
 "/usr/local/cuda-12.9/bin/fatbinary" "-64" "--create" "/tmp/a-ae5b50.fatbin" "--image3=kind=elf,sm=52,file=/tmp/a-sm_52-f20187.bc"
 "/home/phlai/Workspace/llvm-project/build/bin/clang-22" "-cc1" "-triple" "x86_64-unknown-linux-gnu" "-target-sdk-version=12.9" "-fcuda-allow-variadic-functions" "-aux-triple" "nvptx64-nvidia-cuda" "-emit-llvm" "-disable-free" "-clear-ast-before-backend" "-main-file-name" "a.cu" "-mrelocation-model" "pic" "-pic-level" "2" "-pic-is-pie" "-mframe-pointer=all" "-fmath-errno" "-ffp-contract=on" "-fno-rounding-math" "-mconstructor-aliases" "-funwind-tables=2" "-target-cpu" "x86-64" "-tune-cpu" "generic" "-debugger-tuning=gdb" "-fdebug-compilation-dir=/tmp" "-fcoverage-compilation-dir=/tmp" "-resource-dir" "/home/phlai/Workspace/llvm-project/build/lib/clang/22" "-internal-isystem" "/home/phlai/Workspace/llvm-project/build/lib/clang/22/include/cuda_wrappers" "-include" "__clang_cuda_runtime_wrapper.h" "-internal-isystem" "/usr/lib64/gcc/x86_64-suse-linux/15/../../../../include/c++/15" "-internal-isystem" "/usr/lib64/gcc/x86_64-suse-linux/15/../../../../include/c++/15/x86_64-suse-linux" "-internal-isystem" "/usr/lib64/gcc/x86_64-suse-linux/15/../../../../include/c++/15/backward" "-internal-isystem" "/usr/lib64/gcc/x86_64-suse-linux/15/../../../../include/c++/15" "-internal-isystem" "/usr/lib64/gcc/x86_64-suse-linux/15/../../../../include/c++/15/x86_64-suse-linux" "-internal-isystem" "/usr/lib64/gcc/x86_64-suse-linux/15/../../../../include/c++/15/backward" "-internal-isystem" "/home/phlai/Workspace/llvm-project/build/lib/clang/22/include" "-internal-isystem" "/usr/local/include" "-internal-isystem" "/usr/lib64/gcc/x86_64-suse-linux/15/../../../../x86_64-suse-linux/include" "-internal-externc-isystem" "/include" "-internal-externc-isystem" "/usr/include" "-internal-isystem" "/home/phlai/Workspace/llvm-project/build/lib/clang/22/include" "-internal-isystem" "/usr/local/include" "-internal-isystem" "/usr/lib64/gcc/x86_64-suse-linux/15/../../../../x86_64-suse-linux/include" "-internal-externc-isystem" "/include" "-internal-externc-isystem" "/usr/include" "-internal-isystem" "/usr/local/cuda-12.9/include" "-fdeprecated-macro" "-ferror-limit" "19" "-fmessage-length=185" "--offload-new-driver" "-fgnuc-version=4.2.1" "-fskip-odr-check-in-gmf" "-fcxx-exceptions" "-fexceptions" "-fcolor-diagnostics" "-fcuda-include-gpubinary" "/tmp/a-ae5b50.fatbin" "-cuid=ff1d3311489858cf" "-faddrsig" "-fdwarf2-cfi-asm" "-o" "a.ll" "-x" "cuda" "a.cu"
```

### Expected Behavior

Historically (and in the legacy driver), -S -emit-llvm might have dumped IR for both host and device. However, for the current driver architecture, the expected behavior should arguably be one of the following:

1. Output separate IR files: Generate LLVM IR for both Host and Device separately (similar to the legacy driver behavior).
2. Embed PTX/SASS: Compile device code fully to PTX/SASS (ignoring -emit-llvm for the device phase) so it can be embedded in the Host IR.
3. Error: Report a diagnostic error that mixed-mode IR emission is unsupported.

### Implementation Status

I am interested in working on a fix for this. However, since the desired behavior involves design choices (separate files vs. embedding), I would like to discuss the preferred solution path before proceeding.
_______________________________________________
llvm-bugs mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs

Reply via email to