Issue 173588
Summary [opt] Incorrect global id (gid) computation when launching kernels via hipExtModuleLaunchKernel with non-divisible global size
Labels new issue
Assignees
Reporter 0oyyo0
    **Environment**
- Backend: AMDGPU
- Compiler: Clang / LLVM 17
- Runtime: HIP (ROCm)
- Kernel launch API: hipExtModuleLaunchKernel
- Global size not divisible by block size (tail workgroup exists)

**Problem Description**
When launching a kernel via hipExtModuleLaunchKernel with:
`totalThreads % blockSize != 0`
the kernel exhibits incorrect global id (gid) computation:
- Some lower gid values are executed twice
- This only happens when a tail workgroup is present(i.e., global size is not divisible by block size)
When the global size is divisible by block size, the problem disappears.

**demo**
hipcc --offload-device-only kernel.hip -o kernel.co -fno-offload-uniform-block #-mcode-object-version=5
hipcc main.cpp -o main_test
./main_test

kernel.hip
`#include <hip/hip_runtime.h>
#include <stdint.h>

extern "C" __global__
void write_gid_kernel(uint64_t* out)
{
    uint64_t gid = blockIdx.x * blockDim.x + threadIdx.x;
 printf("gid = %lu\n", gid);
    out[gid] = gid;
}
`
main.cpp
`#include <hip/hip_runtime.h>
#include <hip/hip_ext.h>
#include <cstdio>
#include <cstdlib>
#include <vector>

#define HIP_CHECK(cmd)                                  \
do { \
    hipError_t e = cmd; \
    if (e != hipSuccess) { \
        printf("HIP error %s:%d: %s\n", \
               __FILE__, __LINE__, hipGetErrorString(e)); \
 exit(1);                                        \
    } \
} while (0)

int main()
{
 hipModule_t module;
    hipFunction_t kernel;

 HIP_CHECK(hipModuleLoad(&module, "kernel.co"));
 HIP_CHECK(hipModuleGetFunction(&kernel, module, "write_gid_kernel"));

 // const int block_size    = 256;
    // const int total_threads = 1000;

    const int block_size    = 512;
    const int total_threads = 822528; // 822528 % 512 = 256 

    printf("logical threads = %d\n", total_threads);
    printf("block size      = %d\n", block_size);

 uint64_t* d_out;
    HIP_CHECK(hipMalloc(&d_out, total_threads * sizeof(uint64_t)));

    void* args[] = {
        (void*)&d_out
 };

    HIP_CHECK(
        hipExtModuleLaunchKernel(
 kernel,
            total_threads, 1, 1,   
            block_size, 1, 1, 
            0,
            nullptr,
            args,
 nullptr
        )
    );

    HIP_CHECK(hipDeviceSynchronize());

 std::vector<uint64_t> h_out(total_threads, 0);
    HIP_CHECK(hipMemcpy(
 h_out.data(), d_out,
        total_threads * sizeof(uint64_t),
 hipMemcpyDeviceToHost));

 printf("\n-------------------------------------------------\n");
    // for (int i = 0; i < total_threads; ++i) {
    //     printf("h_out[%d] = %lu\n", i, h_out[i]);
    // }

    hipFree(d_out);
 hipModuleUnload(module);
    return 0;
}
`
**Observed Behavior**
When running the above demo, 
- gid in the range:[411136, 411391] is printed twice
- gid in the final tail region: [411392, totalThreads) is not printed at all
- 
**Expected Behavior**
Each gid in [0, totalThreads) should be printed exactly once



_______________________________________________
llvm-bugs mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs

Reply via email to