**Issue:** 128853
**Summary:** NVPTX produces incorrect PTX with an 8-bit integer vector input, which results in a wrong answer
**Labels:** new issue
**Assignees:**
**Reporter:** rashedmyt
This issue was created as suggested in https://github.com/llvm/llvm-project/issues/107219.
https://cuda.godbolt.org/z/1ebcMfv87 highlights the differences in the PTX generated for the reproducer IR given below between LLVM 16 and the current tip of the main branch.
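For context, here is a minimal sketch that isolates the pattern involved: both lanes of a `<2 x i8>` kernel parameter are extracted, widened with `sitofp`, and stored as a `<2 x double>`. The kernel name `@REDUCED` and the omitted datalayout are illustrative only, and this reduction has not been verified to reproduce the miscompile on its own; the full reproducer IR follows below.

```llvm
; Illustrative reduction (assumption): the suspect pattern is the lowering of
; a <2 x i8> kernel parameter whose lanes are extracted and widened to double.
target triple = "nvptx64-nvidia-cuda"

define void @REDUCED(ptr addrspace(1) %out, <2 x i8> %in) addrspace(1) {
entry:
  %lo = extractelement <2 x i8> %in, i64 0   ; first lane (real part)
  %hi = extractelement <2 x i8> %in, i64 1   ; second lane (imaginary part)
  %lo.fp = sitofp i8 %lo to double
  %hi.fp = sitofp i8 %hi to double
  %v0 = insertelement <2 x double> undef, double %lo.fp, i64 0
  %v1 = insertelement <2 x double> %v0, double %hi.fp, i64 1
  store <2 x double> %v1, ptr addrspace(1) %out, align 16
  ret void
}

!nvvm.annotations = !{!0}
!0 = !{ptr addrspace(1) @REDUCED, !"kernel", i32 1}
```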
<details>
<summary>Original LLVM IR</summary>
```llvm
; LLVM version:16.0.6
; ModuleID = 'module'
source_filename = "module"
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
define void @KERNEL(ptr addrspace(1) align 8 %out0, <2 x i8> %in0, i32 %arraySize) addrspace(1) {
check:
%tid_x = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
%ntid_x = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
%ctaid_x = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
%ctaid_y = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
%nctaid_x = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
%0 = mul i32 %nctaid_x, %ctaid_y
%1 = add i32 %0, %ctaid_x
%2 = mul i32 %1, %ntid_x
%linearTid = add i32 %2, %tid_x
%3 = icmp ult i32 %linearTid, %arraySize
%4 = extractelement <2 x i8> %in0, i64 0
%5 = sitofp i8 %4 to double
%6 = extractelement <2 x i8> %in0, i64 1
%7 = sitofp i8 %6 to double
%8 = insertelement <2 x double> undef, double %5, i64 0
%9 = insertelement <2 x double> %8, double %7, i64 1
br i1 %3, label %StraightLineCodeBlock, label %exit
exit: ; preds = %StraightLineCodeBlock, %check
ret void
StraightLineCodeBlock: ; preds = %check
store <2 x double> %9, ptr addrspace(1) %out0, align 16
br label %exit
}
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #0
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #0
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #0
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() #0
attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
!nvvm.annotations = !{!0}
!0 = !{ptr addrspace(1) @KERNEL, !"kernel", i32 1}
```
</details>
<details>
<summary>C++ Driver Code</summary>
```cpp
#include <array>
#include <complex>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
#include <cuda.h>
inline void checkCudaErrors(CUresult err) {
if (err != CUDA_SUCCESS)
std::cout << std::string("CUDA Error: ") + std::to_string(err) << std::endl;
}
#define ALIGN_UP(offset, alignment) \
(offset) = ((offset) + (alignment) - 1) & ~((alignment) - 1)
#define ADD_TO_PARAM_BUFFER(value, alignment) \
do { \
ALIGN_UP(paramBufferSize, alignment); \
memcpy(paramBuffer + paramBufferSize, &(value), sizeof(value)); \
paramBufferSize += sizeof(value); \
} while (0)
template <typename T, typename std::enable_if<std::is_pointer<T>::value,
T>::type * = nullptr>
void ProcessInput(CUdeviceptr &devPtr, T input, int const numel,
char *paramBuffer, size_t &paramBufferSize) {
checkCudaErrors(cuMemAlloc(&devPtr, sizeof(*input) * numel));
checkCudaErrors(cuMemcpyHtoD(devPtr, input, sizeof(*input) * numel));
ADD_TO_PARAM_BUFFER(devPtr, __alignof(devPtr));
}
template <typename T, typename std::enable_if<!std::is_pointer<T>::value,
T>::type * = nullptr>
void ProcessInput(CUdeviceptr &devPtr, T input, int const numel,
char *paramBuffer, size_t &paramBufferSize) {
ADD_TO_PARAM_BUFFER(input, __alignof(input));
}
template <typename T>
CUdeviceptr ProcessEachInput(T input, int const numel, char *paramBuffer,
size_t &paramBufferSize) {
CUdeviceptr devPtr(0LL);
ProcessInput<T>(devPtr, input, numel, paramBuffer, paramBufferSize);
return devPtr;
}
// Forward declare
template <typename OutputType, typename... InputTypes>
int RunPtx(std::string const &ptx, int const numel, int deviceID,
std::vector<OutputType *> const &outputPtrs, InputTypes... inputs);
/*
* Overload to convert single outputs to vector of outputs
*/
template <typename OutputType, typename... InputTypes>
inline int RunPtx(std::string const &ptx, int const numel, int deviceID,
OutputType *output, InputTypes... inputs) {
return RunPtx(ptx, numel, deviceID, std::vector<OutputType *>{output},
inputs...);
}
template <typename OutputType, typename... InputTypes>
inline int RunPtx(std::string const &ptx, int const numel, int deviceID,
std::vector<OutputType *> const &outputPtrs,
InputTypes... inputs) {
CUdevice device;
CUmodule cudaModule;
CUcontext context;
CUfunction function;
int devCount;
checkCudaErrors(cuInit(0));
checkCudaErrors(cuDeviceGetCount(&devCount));
checkCudaErrors(cuDeviceGet(&device, deviceID));
char name[128];
checkCudaErrors(cuDeviceGetName(name, 128, device));
// std::cout << "Using CUDA Device [0]: " << name << "\n";
int devMajor, devMinor;
checkCudaErrors(cuDeviceGetAttribute(
&devMajor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device));
checkCudaErrors(cuDeviceGetAttribute(
&devMinor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device));
// std::cout << "Device Compute Capability: " << devMajor << "." << devMinor
// << "\n";
if (devMajor < 2) {
std::cerr << "ERROR: Device 0 is not SM 2.0 or greater\n";
return 1;
}
checkCudaErrors(cuCtxCreate(&context, 0, device));
checkCudaErrors(cuModuleLoadDataEx(&cudaModule, ptx.c_str(), 0, 0, 0));
/* Get kernel function */
checkCudaErrors(cuModuleGetFunction(&function, cudaModule, "KERNEL"));
CUdeviceptr numelDevPtr;
checkCudaErrors(cuMemAlloc(&numelDevPtr, sizeof(numel)));
checkCudaErrors(cuMemcpyHtoD(numelDevPtr, &numel, sizeof(numel)));
/* Kernel parameters using EXTRA field */
char paramBuffer[1024];
size_t paramBufferSize = 0;
/*Set up output and array size info*/
size_t numOutputs = outputPtrs.size();
std::vector<CUdeviceptr> outputDevPtrs(numOutputs);
for (size_t i = 0; i < numOutputs; i++) {
checkCudaErrors(cuMemAlloc(&outputDevPtrs[i], sizeof(OutputType) * numel));
CUdeviceptr opPtr =
outputDevPtrs[i]; // Fails to build on windows without this
ADD_TO_PARAM_BUFFER(opPtr, __alignof(opPtr));
}
// Expand parameter pack of inputs
CUdeviceptr inputDevPtrs[] = {
ProcessEachInput(inputs, numel, paramBuffer, paramBufferSize)...};
ADD_TO_PARAM_BUFFER(numel, __alignof(numel));
unsigned blockSizeX = unsigned(numel);
unsigned blockSizeY, blockSizeZ, gridSizeX, gridSizeY, gridSizeZ;
blockSizeY = blockSizeZ = gridSizeX = gridSizeY = gridSizeZ = 1;
void *extra[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, paramBuffer,
CU_LAUNCH_PARAM_BUFFER_SIZE, &paramBufferSize,
CU_LAUNCH_PARAM_END};
// std::cout << "Launching kernel\n";
/* Kernel launch */
checkCudaErrors(cuLaunchKernel(function, gridSizeX, gridSizeY, gridSizeZ,
blockSizeX, blockSizeY, blockSizeZ, 0, NULL,
NULL, extra));
cudaDeviceSynchronize();
for (size_t i = 0; i < numOutputs; i++) {
checkCudaErrors(cuMemcpyDtoH(outputPtrs[i], outputDevPtrs[i],
sizeof(OutputType) * numel));
checkCudaErrors(cuMemFree(outputDevPtrs[i]));
}
for (unsigned i = 0; i < sizeof...(InputTypes); i++) {
checkCudaErrors(cuMemFree(inputDevPtrs[i]));
}
checkCudaErrors(cuModuleUnload(cudaModule));
checkCudaErrors(cuCtxDestroy(context));
return 0;
}
bool readFileToString(std::string &ptx, std::string const &pathToFile) {
try {
std::ifstream t(pathToFile);
std::stringstream buffer;
buffer << t.rdbuf();
ptx = buffer.str();
return true;
} catch (...) {
return false;
}
}
int main(void) {
std::string llvm_16_ptx, llvm_19_ptx;
readFileToString(llvm_16_ptx, "llvm16.ptx");
readFileToString(llvm_19_ptx, "llvm19.ptx");
int const numel = 1;
std::complex<int8_t> input{1, 2};
std::array<std::complex<double>, numel> output;
output.fill(std::complex<double>{0.0, 0.0});
std::cout << "Input:" << std::endl << static_cast<std::complex<int>>(input) << std::endl;
RunPtx(llvm_16_ptx, numel, /*deviceID=*/0, output.data(), input);
std::cout << "Output with LLVM 16:" << std::endl;
for (int i = 0; i < numel; i++) {
std::cout << output[i] << std::endl;
}
RunPtx(llvm_19_ptx, numel, /*deviceID=*/0, output.data(), input);
std::cout << "Output with LLVM 19:" << std::endl;
for (int i = 0; i < numel; i++) {
std::cout << output[i] << std::endl;
}
}
```
</details>
**Build commands**
```bash
# Generate PTX from the repro LLVM IR with LLVM 16 and LLVM 19
<LLVM_16_LOCATION>/bin/llc -mcpu=sm_86 repro.ll -o llvm16.ptx
<LLVM_19_LOCATION>/bin/llc -mcpu=sm_86 repro.ll -o llvm19.ptx
# Generate the executable with NVCC
<CUDA_LOCATION>/bin/nvcc -lcuda -std=c++17 repro.cu -o repro.out
```
**Execution**
```bash
$ ./repro.out
Input:
(1,2)
Output with LLVM 16:
(1,2)
Output with LLVM 19:
(1,0)
```
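With LLVM 19 the second lane of the `<2 x i8>` input (the imaginary part, 2) is dropped from the result, whereas LLVM 16 produces the expected `(1,2)`.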